1 #####################################################################
3 # The Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 use English qw( -no_match_vars );
26 our $VERSION = '20220613';
28 use constant DEVEL_MODE => 0;
29 use constant EMPTY_STRING => q{};
30 use constant SPACE => q{ };
32 use Perl::Tidy::LineBuffer;
35 # PACKAGE VARIABLES for processing an entire FILE.
36 # These must be package variables because most may get localized during
37 # processing. Most are initialized in sub prepare_for_a_new_file.
43 $last_nonblank_block_type
51 %user_function_prototype
53 %is_block_list_function
54 %saw_function_definition
65 @nesting_sequence_number
66 @current_sequence_number
68 @paren_semicolon_count
69 @paren_structural_type
71 @brace_structural_type
75 @square_bracket_structural_type
78 @nested_statement_type
79 @starting_line_of_current_depth
82 # GLOBAL CONSTANTS for routines in this package,
83 # Initialized in a BEGIN block.
85 %is_indirect_object_taker
87 %expecting_operator_token
88 %expecting_operator_types
93 %is_file_test_operator
99 %is_sort_map_grep_eval_do
105 %is_keyword_taking_list
106 %is_keyword_taking_optional_arg
107 %is_keyword_rejecting_slash_as_pattern_delimiter
108 %is_keyword_rejecting_question_as_pattern_delimiter
109 %is_q_qq_qx_qr_s_y_tr_m
110 %is_q_qq_qw_qx_qr_s_y_tr_m
113 %is_comma_question_colon
115 %is_if_elsif_unless_case_when
117 %is_END_DATA_format_sub
119 $code_skipping_pattern_begin
120 $code_skipping_pattern_end
123 # GLOBAL VARIABLES which are constant after being configured by user-supplied
124 # parameters. They remain constant as a file is being processed.
127 $rOpts_code_skipping,
128 $code_skipping_pattern_begin,
129 $code_skipping_pattern_end,
132 # possible values of operator_expected()
133 use constant TERM => -1;
134 use constant UNKNOWN => 0;
135 use constant OPERATOR => 1;
137 # possible values of context
138 use constant SCALAR_CONTEXT => -1;
139 use constant UNKNOWN_CONTEXT => 0;
140 use constant LIST_CONTEXT => 1;
142 # Maximum number of little messages; probably need not be changed.
143 use constant MAX_NAG_MESSAGES => 6;
147 # Array index names for $self.
148 # Do not combine with other BEGIN blocks (c101).
151 _rhere_target_list_ => $i++,
152 _in_here_doc_ => $i++,
153 _here_doc_target_ => $i++,
154 _here_quote_character_ => $i++,
160 _in_skipped_ => $i++,
161 _in_attribute_list_ => $i++,
163 _quote_target_ => $i++,
164 _line_start_quote_ => $i++,
165 _starting_level_ => $i++,
166 _know_starting_level_ => $i++,
168 _indent_columns_ => $i++,
169 _look_for_hash_bang_ => $i++,
171 _continuation_indentation_ => $i++,
172 _outdent_labels_ => $i++,
173 _last_line_number_ => $i++,
174 _saw_perl_dash_P_ => $i++,
175 _saw_perl_dash_w_ => $i++,
176 _saw_use_strict_ => $i++,
177 _saw_v_string_ => $i++,
179 _look_for_autoloader_ => $i++,
180 _look_for_selfloader_ => $i++,
181 _saw_autoloader_ => $i++,
182 _saw_selfloader_ => $i++,
183 _saw_hash_bang_ => $i++,
186 _saw_negative_indentation_ => $i++,
187 _started_tokenizing_ => $i++,
188 _line_buffer_object_ => $i++,
189 _debugger_object_ => $i++,
190 _diagnostics_object_ => $i++,
191 _logger_object_ => $i++,
192 _unexpected_error_count_ => $i++,
193 _started_looking_for_here_target_at_ => $i++,
194 _nearly_matched_here_target_at_ => $i++,
195 _line_of_text_ => $i++,
196 _rlower_case_labels_at_ => $i++,
197 _extended_syntax_ => $i++,
198 _maximum_level_ => $i++,
199 _true_brace_error_count_ => $i++,
200 _rOpts_maximum_level_errors_ => $i++,
201 _rOpts_maximum_unexpected_errors_ => $i++,
202 _rOpts_logfile_ => $i++,
207 { ## closure for subs to count instances
209 # methods to count instances
211 sub get_count { return $_count; }
212 sub _increment_count { return ++$_count }
213 sub _decrement_count { return --$_count }
218 $self->_decrement_count();
224 # Catch any undefined sub calls so that we are sure to get
225 # some diagnostic information. This sub should never be called
226 # except for a programming error.
228 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
229 my ( $pkg, $fname, $lno ) = caller();
230 my $my_package = __PACKAGE__;
232 ======================================================================
233 Error detected in package '$my_package', version $VERSION
234 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
235 Called from package: '$pkg'
236 Called from File '$fname' at line '$lno'
237 This error is probably due to a recent programming change
238 ======================================================================
241 } ## end sub AUTOLOAD
245 Perl::Tidy::Die($msg);
246 croak "unexpected return from Perl::Tidy::Die";
252 # This routine is called for errors that really should not occur
253 # except if there has been a bug introduced by a recent program change.
254 # Please add comments at calls to Fault to explain why the call
255 # should not occur, and where to look to fix it.
256 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
257 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
258 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
259 my $input_stream_name = get_input_stream_name();
262 ==============================================================================
263 While operating on input stream with name: '$input_stream_name'
264 A fault was detected at line $line0 of sub '$subroutine1'
266 which was called from line $line1 of sub '$subroutine2'
268 This is probably an error introduced by a recent programming change.
269 Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
270 ==============================================================================
273 # We shouldn't get here, but this return is to keep Perl-Critic from
280 # See if a pattern will compile. We have to use a string eval here,
281 # but it should be safe because the pattern has been constructed
284 eval "'##'=~/$pattern/";
288 sub make_code_skipping_pattern {
289 my ( $rOpts, $opt_name, $default ) = @_;
290 my $param = $rOpts->{$opt_name};
291 unless ($param) { $param = $default }
292 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
293 if ( $param !~ /^#/ ) {
294 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
296 my $pattern = '^\s*' . $param . '\b';
297 if ( bad_pattern($pattern) ) {
299 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
303 } ## end sub make_code_skipping_pattern
307 # Check Tokenizer parameters
313 %is_END_DATA_format_sub = (
320 # Install any aliases to 'sub'
321 if ( $rOpts->{'sub-alias-list'} ) {
323 # Note that any 'sub-alias-list' has been preprocessed to
324 # be a trimmed, space-separated list which includes 'sub'
325 # for example, it might be 'sub method fun'
326 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
327 foreach my $word (@sub_alias_list) {
329 $is_END_DATA_format_sub{$word} = 1;
334 if ( $rOpts->{'grep-alias-list'} ) {
336 # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
337 # space-separated list
338 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
339 @{is_grep_alias}{@q} = (1) x scalar(@q);
342 $rOpts_code_skipping = $rOpts->{'code-skipping'};
343 $code_skipping_pattern_begin =
344 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
345 $code_skipping_pattern_end =
346 make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
348 } ## end sub check_options
352 my ( $class, @args ) = @_;
354 # Note: 'tabs' and 'indent_columns' are temporary and should be
357 source_object => undef,
358 debugger_object => undef,
359 diagnostics_object => undef,
360 logger_object => undef,
361 starting_level => undef,
364 look_for_hash_bang => 0,
366 look_for_autoloader => 1,
367 look_for_selfloader => 1,
368 starting_line_number => 1,
369 extended_syntax => 0,
372 my %args = ( %defaults, @args );
374 # we are given an object with a get_line() method to supply source lines
375 my $source_object = $args{source_object};
376 my $rOpts = $args{rOpts};
378 # we create another object with a get_line() and peek_ahead() method
379 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
381 # Tokenizer state data is as follows:
382 # _rhere_target_list_ reference to list of here-doc targets
383 # _here_doc_target_ the target string for a here document
384 # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
385 # to determine if interpolation is done
386 # _quote_target_ character we seek if chasing a quote
387 # _line_start_quote_ line where we started looking for a long quote
388 # _in_here_doc_ flag indicating if we are in a here-doc
389 # _in_pod_ flag set if we are in pod documentation
390 # _in_skipped_ flag set if we are in a skipped section
391 # _in_error_ flag set if we saw severe error (binary in script)
392 # _in_data_ flag set if we are in __DATA__ section
393 # _in_end_ flag set if we are in __END__ section
394 # _in_format_ flag set if we are in a format description
395 # _in_attribute_list_ flag telling if we are looking for attributes
396 # _in_quote_ flag telling if we are chasing a quote
397 # _starting_level_ indentation level of first line
398 # _line_buffer_object_ object with get_line() method to supply source code
399 # _diagnostics_object_ place to write debugging information
400 # _unexpected_error_count_ error count used to limit output
401 # _lower_case_labels_at_ line numbers where lower case labels seen
402 # _hit_bug_ program bug detected
405 $self->[_rhere_target_list_] = [];
406 $self->[_in_here_doc_] = 0;
407 $self->[_here_doc_target_] = EMPTY_STRING;
408 $self->[_here_quote_character_] = EMPTY_STRING;
409 $self->[_in_data_] = 0;
410 $self->[_in_end_] = 0;
411 $self->[_in_format_] = 0;
412 $self->[_in_error_] = 0;
413 $self->[_in_pod_] = 0;
414 $self->[_in_skipped_] = 0;
415 $self->[_in_attribute_list_] = 0;
416 $self->[_in_quote_] = 0;
417 $self->[_quote_target_] = EMPTY_STRING;
418 $self->[_line_start_quote_] = -1;
419 $self->[_starting_level_] = $args{starting_level};
420 $self->[_know_starting_level_] = defined( $args{starting_level} );
421 $self->[_tabsize_] = $args{tabsize};
422 $self->[_indent_columns_] = $args{indent_columns};
423 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
424 $self->[_trim_qw_] = $args{trim_qw};
425 $self->[_continuation_indentation_] = $args{continuation_indentation};
426 $self->[_outdent_labels_] = $args{outdent_labels};
427 $self->[_last_line_number_] = $args{starting_line_number} - 1;
428 $self->[_saw_perl_dash_P_] = 0;
429 $self->[_saw_perl_dash_w_] = 0;
430 $self->[_saw_use_strict_] = 0;
431 $self->[_saw_v_string_] = 0;
432 $self->[_hit_bug_] = 0;
433 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
434 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
435 $self->[_saw_autoloader_] = 0;
436 $self->[_saw_selfloader_] = 0;
437 $self->[_saw_hash_bang_] = 0;
438 $self->[_saw_end_] = 0;
439 $self->[_saw_data_] = 0;
440 $self->[_saw_negative_indentation_] = 0;
441 $self->[_started_tokenizing_] = 0;
442 $self->[_line_buffer_object_] = $line_buffer_object;
443 $self->[_debugger_object_] = $args{debugger_object};
444 $self->[_diagnostics_object_] = $args{diagnostics_object};
445 $self->[_logger_object_] = $args{logger_object};
446 $self->[_unexpected_error_count_] = 0;
447 $self->[_started_looking_for_here_target_at_] = 0;
448 $self->[_nearly_matched_here_target_at_] = undef;
449 $self->[_line_of_text_] = EMPTY_STRING;
450 $self->[_rlower_case_labels_at_] = undef;
451 $self->[_extended_syntax_] = $args{extended_syntax};
452 $self->[_maximum_level_] = 0;
453 $self->[_true_brace_error_count_] = 0;
454 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
455 $self->[_rOpts_maximum_unexpected_errors_] =
456 $rOpts->{'maximum-unexpected-errors'};
457 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
458 $self->[_rOpts_] = $rOpts;
460 # These vars are used for guessing indentation and must be positive
461 $self->[_tabsize_] = 8 if ( !$self->[_tabsize_] );
462 $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
466 $tokenizer_self = $self;
468 prepare_for_a_new_file();
469 find_starting_indentation_level();
471 # This is not a full class yet, so die if an attempt is made to
472 # create more than one object.
474 if ( _increment_count() > 1 ) {
476 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
483 # interface to Perl::Tidy::Logger routines
486 my $logger_object = $tokenizer_self->[_logger_object_];
487 if ($logger_object) {
488 $logger_object->warning($msg);
493 sub get_input_stream_name {
494 my $input_stream_name = EMPTY_STRING;
495 my $logger_object = $tokenizer_self->[_logger_object_];
496 if ($logger_object) {
497 $input_stream_name = $logger_object->get_input_stream_name();
499 return $input_stream_name;
504 my $logger_object = $tokenizer_self->[_logger_object_];
505 if ($logger_object) {
506 my $input_line_number = $tokenizer_self->[_last_line_number_] + 1;
507 $msg = "Line $input_line_number: $msg";
508 $logger_object->complain($msg);
511 } ## end sub complain
513 sub write_logfile_entry {
515 my $logger_object = $tokenizer_self->[_logger_object_];
516 if ($logger_object) {
517 $logger_object->write_logfile_entry($msg);
522 sub interrupt_logfile {
523 my $logger_object = $tokenizer_self->[_logger_object_];
524 if ($logger_object) {
525 $logger_object->interrupt_logfile();
531 my $logger_object = $tokenizer_self->[_logger_object_];
532 if ($logger_object) {
533 $logger_object->resume_logfile();
538 sub increment_brace_error {
539 my $logger_object = $tokenizer_self->[_logger_object_];
540 if ($logger_object) {
541 $logger_object->increment_brace_error();
546 sub report_definite_bug {
547 $tokenizer_self->[_hit_bug_] = 1;
548 my $logger_object = $tokenizer_self->[_logger_object_];
549 if ($logger_object) {
550 $logger_object->report_definite_bug();
557 my $logger_object = $tokenizer_self->[_logger_object_];
558 if ($logger_object) {
559 $logger_object->brace_warning($msg);
564 sub get_saw_brace_error {
565 my $logger_object = $tokenizer_self->[_logger_object_];
566 if ($logger_object) {
567 return $logger_object->get_saw_brace_error();
574 sub get_unexpected_error_count {
576 return $self->[_unexpected_error_count_];
579 # interface to Perl::Tidy::Diagnostics routines
580 sub write_diagnostics {
582 if ( $tokenizer_self->[_diagnostics_object_] ) {
583 $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
588 sub get_maximum_level {
589 return $tokenizer_self->[_maximum_level_];
592 sub report_tokenization_errors {
596 # Report any tokenization errors and return a flag '$severe_error'.
597 # Set $severe_error = 1 if the tokenization errors are so severe that
598 # the formatter should not attempt to format the file. Instead, it will
599 # just output the file verbatim.
601 # set severe error flag if tokenizer has encountered file reading problems
602 # (i.e. unexpected binary characters)
603 my $severe_error = $self->[_in_error_];
605 my $maxle = $self->[_rOpts_maximum_level_errors_];
606 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
607 $maxle = 1 unless defined($maxle);
608 $maxue = 0 unless defined($maxue);
610 my $level = get_indentation_level();
611 if ( $level != $tokenizer_self->[_starting_level_] ) {
612 warning("final indentation level: $level\n");
613 my $level_diff = $tokenizer_self->[_starting_level_] - $level;
614 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
616 # Set severe error flag if the level error is greater than 1.
617 # The formatter can function for any level error but it is probably
618 # best not to attempt formatting for a high level error.
619 if ( $maxle >= 0 && $level_diff > $maxle ) {
622 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
627 check_final_nesting_depths();
629 # Likewise, large numbers of brace errors usually indicate non-perl
630 # scripts, so set the severe error flag at a low number. This is similar
631 # to the level check, but different because braces may balance but be
632 # incorrectly interlaced.
633 if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
637 if ( $tokenizer_self->[_look_for_hash_bang_]
638 && !$tokenizer_self->[_saw_hash_bang_] )
641 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
644 if ( $tokenizer_self->[_in_format_] ) {
645 warning("hit EOF while in format description\n");
648 if ( $tokenizer_self->[_in_skipped_] ) {
650 "hit EOF while in lines skipped with --code-skipping\n");
653 if ( $tokenizer_self->[_in_pod_] ) {
655 # Just write log entry if this is after __END__ or __DATA__
656 # because this happens to often, and it is not likely to be
658 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
660 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
666 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
672 if ( $tokenizer_self->[_in_here_doc_] ) {
674 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
675 my $started_looking_for_here_target_at =
676 $tokenizer_self->[_started_looking_for_here_target_at_];
677 if ($here_doc_target) {
679 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
684 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
685 (Perl will match to the end of file but this may not be intended).
688 my $nearly_matched_here_target_at =
689 $tokenizer_self->[_nearly_matched_here_target_at_];
690 if ($nearly_matched_here_target_at) {
692 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
697 # Something is seriously wrong if we ended inside a quote
698 if ( $tokenizer_self->[_in_quote_] ) {
700 my $line_start_quote = $tokenizer_self->[_line_start_quote_];
701 my $quote_target = $tokenizer_self->[_quote_target_];
703 ( $tokenizer_self->[_in_attribute_list_] )
707 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
711 if ( $tokenizer_self->[_hit_bug_] ) {
715 # Multiple "unexpected" type tokenization errors usually indicate parsing
716 # non-perl scripts, or that something is seriously wrong, so we should
717 # avoid formatting them. This can happen for example if we run perltidy on
718 # a shell script or an html file. But unfortunately this check can
719 # interfere with some extended syntaxes, such as RPerl, so it has to be off
721 my $ue_count = $tokenizer_self->[_unexpected_error_count_];
722 if ( $maxue > 0 && $ue_count > $maxue ) {
724 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
729 unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
731 write_logfile_entry("Suggest including '-w parameter'\n");
734 write_logfile_entry("Suggest including 'use warnings;'\n");
738 if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
739 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
742 unless ( $tokenizer_self->[_saw_use_strict_] ) {
743 write_logfile_entry("Suggest including 'use strict;'\n");
746 # it is suggested that labels have at least one upper case character
747 # for legibility and to avoid code breakage as new keywords are introduced
748 if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
749 my @lower_case_labels_at =
750 @{ $tokenizer_self->[_rlower_case_labels_at_] };
752 "Suggest using upper case characters in label(s)\n");
753 local $LIST_SEPARATOR = ')(';
754 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
756 return $severe_error;
757 } ## end sub report_tokenization_errors
759 sub report_v_string {
761 # warn if this version can't handle v-strings
763 unless ( $tokenizer_self->[_saw_v_string_] ) {
764 $tokenizer_self->[_saw_v_string_] =
765 $tokenizer_self->[_last_line_number_];
769 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
773 } ## end sub report_v_string
775 sub is_valid_token_type {
777 return $is_valid_token_type{$type};
780 sub get_input_line_number {
781 return $tokenizer_self->[_last_line_number_];
784 # returns the next tokenized line
789 # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
790 # $square_bracket_depth, $paren_depth
792 my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
793 $tokenizer_self->[_line_of_text_] = $input_line;
795 return unless ($input_line);
797 my $input_line_number = ++$tokenizer_self->[_last_line_number_];
799 my $write_logfile_entry = sub {
801 write_logfile_entry("Line $input_line_number: $msg");
805 # Find and remove what characters terminate this line, including any
807 my $input_line_separator = EMPTY_STRING;
808 if ( chomp($input_line) ) {
809 $input_line_separator = $INPUT_RECORD_SEPARATOR;
812 # The first test here very significantly speeds things up, but be sure to
813 # keep the regex and hash %other_line_endings the same.
814 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
815 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
816 $input_line_separator = $2 . $input_line_separator;
820 # for backwards compatibility we keep the line text terminated with
821 # a newline character
823 $tokenizer_self->[_line_of_text_] = $input_line; # update
825 # create a data structure describing this line which will be
826 # returned to the caller.
828 # _line_type codes are:
829 # SYSTEM - system-specific code before hash-bang line
830 # CODE - line of perl code (including comments)
831 # POD_START - line starting pod, such as '=head'
832 # POD - pod documentation text
833 # POD_END - last line of pod section, '=cut'
834 # HERE - text of here-document
835 # HERE_END - last line of here-doc (target word)
836 # FORMAT - format section
837 # FORMAT_END - last line of format section, '.'
838 # SKIP - code skipping section
839 # SKIP_END - last line of code skipping section, '#>>V'
840 # DATA_START - __DATA__ line
841 # DATA - unidentified text following __DATA__
842 # END_START - __END__ line
843 # END - unidentified text following __END__
844 # ERROR - we are in big trouble, probably not a perl script
847 # _curly_brace_depth - depth of curly braces at start of line
848 # _square_bracket_depth - depth of square brackets at start of line
849 # _paren_depth - depth of parens at start of line
850 # _starting_in_quote - this line continues a multi-line quote
851 # (so don't trim leading blanks!)
852 # _ending_in_quote - this line ends in a multi-line quote
853 # (so don't trim trailing blanks!)
854 my $line_of_tokens = {
856 _line_text => $input_line,
857 _line_number => $input_line_number,
858 _guessed_indentation_level => 0,
859 _curly_brace_depth => $brace_depth,
860 _square_bracket_depth => $square_bracket_depth,
861 _paren_depth => $paren_depth,
862 _quote_character => EMPTY_STRING,
863 ## _rtoken_type => undef,
864 ## _rtokens => undef,
865 ## _rlevels => undef,
866 ## _rblock_type => undef,
867 ## _rcontainer_type => undef,
868 ## _rcontainer_environment => undef,
869 ## _rtype_sequence => undef,
870 ## _rnesting_tokens => undef,
871 ## _rci_levels => undef,
872 ## _rnesting_blocks => undef,
873 ## _starting_in_quote => 0,
874 ## _ending_in_quote => 0,
877 # must print line unchanged if we are in a here document
878 if ( $tokenizer_self->[_in_here_doc_] ) {
880 $line_of_tokens->{_line_type} = 'HERE';
881 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
882 my $here_quote_character = $tokenizer_self->[_here_quote_character_];
883 my $candidate_target = $input_line;
884 chomp $candidate_target;
886 # Handle <<~ targets, which are indicated here by a leading space on
887 # the here quote character
888 if ( $here_quote_character =~ /^\s/ ) {
889 $candidate_target =~ s/^\s*//;
891 if ( $candidate_target eq $here_doc_target ) {
892 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
893 $line_of_tokens->{_line_type} = 'HERE_END';
894 $write_logfile_entry->("Exiting HERE document $here_doc_target\n");
896 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
897 if ( @{$rhere_target_list} ) { # there can be multiple here targets
898 ( $here_doc_target, $here_quote_character ) =
899 @{ shift @{$rhere_target_list} };
900 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
901 $tokenizer_self->[_here_quote_character_] =
902 $here_quote_character;
903 $write_logfile_entry->(
904 "Entering HERE document $here_doc_target\n");
905 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
906 $tokenizer_self->[_started_looking_for_here_target_at_] =
910 $tokenizer_self->[_in_here_doc_] = 0;
911 $tokenizer_self->[_here_doc_target_] = EMPTY_STRING;
912 $tokenizer_self->[_here_quote_character_] = EMPTY_STRING;
916 # check for error of extra whitespace
917 # note for PERL6: leading whitespace is allowed
919 $candidate_target =~ s/\s*$//;
920 $candidate_target =~ s/^\s*//;
921 if ( $candidate_target eq $here_doc_target ) {
922 $tokenizer_self->[_nearly_matched_here_target_at_] =
926 return $line_of_tokens;
929 # Print line unchanged if we are in a format section
930 elsif ( $tokenizer_self->[_in_format_] ) {
932 if ( $input_line =~ /^\.[\s#]*$/ ) {
934 # Decrement format depth count at a '.' after a 'format'
935 $tokenizer_self->[_in_format_]--;
937 # This is the end when count reaches 0
938 if ( !$tokenizer_self->[_in_format_] ) {
939 $write_logfile_entry->("Exiting format section\n");
940 $line_of_tokens->{_line_type} = 'FORMAT_END';
944 $line_of_tokens->{_line_type} = 'FORMAT';
945 if ( $input_line =~ /^\s*format\s+\w+/ ) {
947 # Increment format depth count at a 'format' within a 'format'
948 # This is a simple way to handle nested formats (issue c019).
949 $tokenizer_self->[_in_format_]++;
952 return $line_of_tokens;
955 # must print line unchanged if we are in pod documentation
956 elsif ( $tokenizer_self->[_in_pod_] ) {
958 $line_of_tokens->{_line_type} = 'POD';
959 if ( $input_line =~ /^=cut/ ) {
960 $line_of_tokens->{_line_type} = 'POD_END';
961 $write_logfile_entry->("Exiting POD section\n");
962 $tokenizer_self->[_in_pod_] = 0;
964 if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
966 "Hash-bang in pod can cause older versions of perl to fail! \n"
970 return $line_of_tokens;
973 # print line unchanged if in skipped section
974 elsif ( $tokenizer_self->[_in_skipped_] ) {
976 $line_of_tokens->{_line_type} = 'SKIP';
977 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
978 $line_of_tokens->{_line_type} = 'SKIP_END';
979 $write_logfile_entry->("Exiting code-skipping section\n");
980 $tokenizer_self->[_in_skipped_] = 0;
982 return $line_of_tokens;
985 # must print line unchanged if we have seen a severe error (i.e., we
986 # are seeing illegal tokens and cannot continue. Syntax errors do
987 # not pass this route). Calling routine can decide what to do, but
988 # the default can be to just pass all lines as if they were after __END__
989 elsif ( $tokenizer_self->[_in_error_] ) {
990 $line_of_tokens->{_line_type} = 'ERROR';
991 return $line_of_tokens;
994 # print line unchanged if we are __DATA__ section
995 elsif ( $tokenizer_self->[_in_data_] ) {
997 # ...but look for POD
998 # Note that the _in_data and _in_end flags remain set
999 # so that we return to that state after seeing the
1000 # end of a pod section
1001 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1002 $line_of_tokens->{_line_type} = 'POD_START';
1003 $write_logfile_entry->("Entering POD section\n");
1004 $tokenizer_self->[_in_pod_] = 1;
1005 return $line_of_tokens;
1008 $line_of_tokens->{_line_type} = 'DATA';
1009 return $line_of_tokens;
1013 # print line unchanged if we are in __END__ section
1014 elsif ( $tokenizer_self->[_in_end_] ) {
1016 # ...but look for POD
1017 # Note that the _in_data and _in_end flags remain set
1018 # so that we return to that state after seeing the
1019 # end of a pod section
1020 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1021 $line_of_tokens->{_line_type} = 'POD_START';
1022 $write_logfile_entry->("Entering POD section\n");
1023 $tokenizer_self->[_in_pod_] = 1;
1024 return $line_of_tokens;
1027 $line_of_tokens->{_line_type} = 'END';
1028 return $line_of_tokens;
1032 # check for a hash-bang line if we haven't seen one
1033 if ( !$tokenizer_self->[_saw_hash_bang_] ) {
1034 if ( $input_line =~ /^\#\!.*perl\b/ ) {
1035 $tokenizer_self->[_saw_hash_bang_] = $input_line_number;
1037 # check for -w and -P flags
1038 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1039 $tokenizer_self->[_saw_perl_dash_P_] = 1;
1042 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1043 $tokenizer_self->[_saw_perl_dash_w_] = 1;
1047 $input_line_number > 1
1049 # leave any hash bang in a BEGIN block alone
1050 # i.e. see 'debugger-duck_type.t'
1052 $last_nonblank_block_type
1053 && $last_nonblank_block_type eq 'BEGIN'
1055 && !$tokenizer_self->[_look_for_hash_bang_]
1057 # Try to avoid giving a false alarm at a simple comment.
1058 # These look like valid hash-bang lines:
1062 #!c:\perl\bin\perl.exe
1064 # These are comments:
1066 #! sunos does not yet provide a /usr/bin/perl
1068 # Comments typically have multiple spaces, which suggests
1070 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1074 # this is helpful for VMS systems; we may have accidentally
1075 # tokenized some DCL commands
1076 if ( $tokenizer_self->[_started_tokenizing_] ) {
1078 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1082 complain("Useless hash-bang after line 1\n");
1086 # Report the leading hash-bang as a system line
1087 # This will prevent -dac from deleting it
1089 $line_of_tokens->{_line_type} = 'SYSTEM';
1090 return $line_of_tokens;
1095 # wait for a hash-bang before parsing if the user invoked us with -x
1096 if ( $tokenizer_self->[_look_for_hash_bang_]
1097 && !$tokenizer_self->[_saw_hash_bang_] )
1099 $line_of_tokens->{_line_type} = 'SYSTEM';
1100 return $line_of_tokens;
1103 # a first line of the form ': #' will be marked as SYSTEM
1104 # since lines of this form may be used by tcsh
1105 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1106 $line_of_tokens->{_line_type} = 'SYSTEM';
1107 return $line_of_tokens;
1110 # now we know that it is ok to tokenize the line...
1111 # the line tokenizer will modify any of these private variables:
1112 # _rhere_target_list_
1120 my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
1121 tokenize_this_line($line_of_tokens);
1123 # Now finish defining the return structure and return it
1124 $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
1126 # handle severe error (binary data in script)
1127 if ( $tokenizer_self->[_in_error_] ) {
1128 $tokenizer_self->[_in_quote_] = 0; # to avoid any more messages
1129 warning("Giving up after error\n");
1130 $line_of_tokens->{_line_type} = 'ERROR';
1131 reset_indentation_level(0); # avoid error messages
1132 return $line_of_tokens;
1135 # handle start of pod documentation
1136 if ( $tokenizer_self->[_in_pod_] ) {
1138 # This gets tricky..above a __DATA__ or __END__ section, perl
1139 # accepts '=cut' as the start of pod section. But afterwards,
1140 # only pod utilities see it and they may ignore an =cut without
1141 # leading =head. In any case, this isn't good.
1142 if ( $input_line =~ /^=cut\b/ ) {
1143 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
1145 complain("=cut while not in pod ignored\n");
1146 $tokenizer_self->[_in_pod_] = 0;
1147 $line_of_tokens->{_line_type} = 'POD_END';
1150 $line_of_tokens->{_line_type} = 'POD_START';
1152 "=cut starts a pod section .. this can fool pod utilities.\n"
1153 ) unless (DEVEL_MODE);
1154 $write_logfile_entry->("Entering POD section\n");
1159 $line_of_tokens->{_line_type} = 'POD_START';
1160 $write_logfile_entry->("Entering POD section\n");
1163 return $line_of_tokens;
1166 # handle start of skipped section
1167 if ( $tokenizer_self->[_in_skipped_] ) {
1169 $line_of_tokens->{_line_type} = 'SKIP';
1170 $write_logfile_entry->("Entering code-skipping section\n");
1171 return $line_of_tokens;
1174 # see if this line contains here doc targets
1175 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
1176 if ( @{$rhere_target_list} ) {
1178 my ( $here_doc_target, $here_quote_character ) =
1179 @{ shift @{$rhere_target_list} };
1180 $tokenizer_self->[_in_here_doc_] = 1;
1181 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
1182 $tokenizer_self->[_here_quote_character_] = $here_quote_character;
1183 $write_logfile_entry->("Entering HERE document $here_doc_target\n");
1184 $tokenizer_self->[_started_looking_for_here_target_at_] =
1188 # NOTE: __END__ and __DATA__ statements are written unformatted
1189 # because they can theoretically contain additional characters
1190 # which are not tokenized (and cannot be read with <DATA> either!).
1191 if ( $tokenizer_self->[_in_data_] ) {
1192 $line_of_tokens->{_line_type} = 'DATA_START';
1193 $write_logfile_entry->("Starting __DATA__ section\n");
1194 $tokenizer_self->[_saw_data_] = 1;
1196 # keep parsing after __DATA__ if use SelfLoader was seen
1197 if ( $tokenizer_self->[_saw_selfloader_] ) {
1198 $tokenizer_self->[_in_data_] = 0;
1199 $write_logfile_entry->(
1200 "SelfLoader seen, continuing; -nlsl deactivates\n");
1203 return $line_of_tokens;
1206 elsif ( $tokenizer_self->[_in_end_] ) {
1207 $line_of_tokens->{_line_type} = 'END_START';
1208 $write_logfile_entry->("Starting __END__ section\n");
1209 $tokenizer_self->[_saw_end_] = 1;
1211 # keep parsing after __END__ if use AutoLoader was seen
1212 if ( $tokenizer_self->[_saw_autoloader_] ) {
1213 $tokenizer_self->[_in_end_] = 0;
1214 $write_logfile_entry->(
1215 "AutoLoader seen, continuing; -nlal deactivates\n");
1217 return $line_of_tokens;
1220 # now, finally, we know that this line is type 'CODE'
1221 $line_of_tokens->{_line_type} = 'CODE';
1223 # remember if we have seen any real code
1224 if ( !$tokenizer_self->[_started_tokenizing_]
1225 && $input_line !~ /^\s*$/
1226 && $input_line !~ /^\s*#/ )
1228 $tokenizer_self->[_started_tokenizing_] = 1;
1231 if ( $tokenizer_self->[_debugger_object_] ) {
1232 $tokenizer_self->[_debugger_object_]
1233 ->write_debug_entry($line_of_tokens);
1236 # Note: if keyword 'format' occurs in this line code, it is still CODE
1237 # (keyword 'format' need not start a line)
1238 if ( $tokenizer_self->[_in_format_] ) {
1239 $write_logfile_entry->("Entering format section\n");
1242 if ( $tokenizer_self->[_in_quote_]
1243 and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
1246 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1247 if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
1250 $tokenizer_self->[_line_start_quote_] = $input_line_number;
1251 $write_logfile_entry->(
1252 "Start multi-line quote or pattern ending in $quote_target\n");
1255 elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
1256 && !$tokenizer_self->[_in_quote_] )
1258 $tokenizer_self->[_line_start_quote_] = -1;
1259 $write_logfile_entry->("End of multi-line quote or pattern\n");
1262 # we are returning a line of CODE
1263 return $line_of_tokens;
1264 } ## end sub get_line
1266 sub find_starting_indentation_level {
1268 # We need to find the indentation level of the first line of the
1269 # script being formatted. Often it will be zero for an entire file,
1270 # but if we are formatting a local block of code (within an editor for
1271 # example) it may not be zero. The user may specify this with the
1272 # -sil=n parameter but normally doesn't so we have to guess.
1274 # USES GLOBAL VARIABLES: $tokenizer_self
1275 my $starting_level = 0;
1277 # use value if given as parameter
1278 if ( $tokenizer_self->[_know_starting_level_] ) {
1279 $starting_level = $tokenizer_self->[_starting_level_];
1282 # if we know there is a hash_bang line, the level must be zero
1283 elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
1284 $tokenizer_self->[_know_starting_level_] = 1;
1287 # otherwise figure it out from the input file
1292 # keep looking at lines until we find a hash bang or piece of code
1293 my $msg = EMPTY_STRING;
1295 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
1298 # if first line is #! then assume starting level is zero
1299 if ( $i == 1 && $line =~ /^\#\!/ ) {
1300 $starting_level = 0;
1303 next if ( $line =~ /^\s*#/ ); # skip past comments
1304 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1305 $starting_level = guess_old_indentation_level($line);
1308 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1309 write_logfile_entry("$msg");
1311 $tokenizer_self->[_starting_level_] = $starting_level;
1312 reset_indentation_level($starting_level);
1314 } ## end sub find_starting_indentation_level
1316 sub guess_old_indentation_level {
1319 # Guess the indentation level of an input line.
1321 # For the first line of code this result will define the starting
1322 # indentation level. It will mainly be non-zero when perltidy is applied
1323 # within an editor to a local block of code.
1325 # This is an impossible task in general because we can't know what tabs
1326 # meant for the old script and how many spaces were used for one
1327 # indentation level in the given input script. For example it may have
1328 # been previously formatted with -i=7 -et=3. But we can at least try to
1329 # make sure that perltidy guesses correctly if it is applied repeatedly to
1330 # a block of code within an editor, so that the block stays at the same
1331 # level when perltidy is applied repeatedly.
1333 # USES GLOBAL VARIABLES: $tokenizer_self
1336 # find leading tabs, spaces, and any statement label
1338 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1340 # If there are leading tabs, we use the tab scheme for this run, if
1341 # any, so that the code will remain stable when editing.
1342 if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1344 if ($2) { $spaces += length($2) }
1346 # correct for outdented labels
1347 if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1348 $spaces += $tokenizer_self->[_continuation_indentation_];
1352 # compute indentation using the value of -i for this run.
1353 # If -i=0 is used for this run (which is possible) it doesn't matter
1354 # what we do here but we'll guess that the old run used 4 spaces per level.
1355 my $indent_columns = $tokenizer_self->[_indent_columns_];
1356 $indent_columns = 4 if ( !$indent_columns );
1357 $level = int( $spaces / $indent_columns );
1359 } ## end sub guess_old_indentation_level
1361 # This is a currently unused debug routine
1362 sub dump_functions {
1365 foreach my $pkg ( keys %is_user_function ) {
1366 $fh->print("\nnon-constant subs in package $pkg\n");
1368 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1369 my $msg = EMPTY_STRING;
1370 if ( $is_block_list_function{$pkg}{$sub} ) {
1371 $msg = 'block_list';
1374 if ( $is_block_function{$pkg}{$sub} ) {
1377 $fh->print("$sub $msg\n");
1381 foreach my $pkg ( keys %is_constant ) {
1382 $fh->print("\nconstants and constant subs in package $pkg\n");
1384 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1385 $fh->print("$sub\n");
1389 } ## end sub dump_functions
1391 sub prepare_for_a_new_file {
1393 # previous tokens needed to determine what to expect next
1394 $last_nonblank_token = ';'; # the only possible starting state which
1395 $last_nonblank_type = ';'; # will make a leading brace a code block
1396 $last_nonblank_block_type = EMPTY_STRING;
1398 # scalars for remembering statement types across multiple lines
1399 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1400 $in_attribute_list = 0;
1402 # scalars for remembering where we are in the file
1403 $current_package = "main";
1404 $context = UNKNOWN_CONTEXT;
1406 # hashes used to remember function information
1407 %is_constant = (); # user-defined constants
1408 %is_user_function = (); # user-defined functions
1409 %user_function_prototype = (); # their prototypes
1410 %is_block_function = ();
1411 %is_block_list_function = ();
1412 %saw_function_definition = ();
1413 %saw_use_module = ();
1415 # variables used to track depths of various containers
1416 # and report nesting errors
1419 $square_bracket_depth = 0;
1420 @current_depth = (0) x scalar @closing_brace_names;
1423 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1424 @current_sequence_number = ();
1425 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1428 @paren_semicolon_count = ();
1429 @paren_structural_type = ();
1431 @brace_structural_type = ();
1432 @brace_context = ();
1433 @brace_package = ();
1434 @square_bracket_type = ();
1435 @square_bracket_structural_type = ();
1437 @nested_ternary_flag = ();
1438 @nested_statement_type = ();
1439 @starting_line_of_current_depth = ();
1441 $paren_type[$paren_depth] = EMPTY_STRING;
1442 $paren_semicolon_count[$paren_depth] = 0;
1443 $paren_structural_type[$brace_depth] = EMPTY_STRING;
1444 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1445 $brace_structural_type[$brace_depth] = EMPTY_STRING;
1446 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1447 $brace_package[$paren_depth] = $current_package;
1448 $square_bracket_type[$square_bracket_depth] = EMPTY_STRING;
1449 $square_bracket_structural_type[$square_bracket_depth] = EMPTY_STRING;
1451 initialize_tokenizer_state();
1453 } ## end sub prepare_for_a_new_file
1455 { ## closure for sub tokenize_this_line
1457 use constant BRACE => 0;
1458 use constant SQUARE_BRACKET => 1;
1459 use constant PAREN => 2;
1460 use constant QUESTION_COLON => 3;
1462 # TV1: scalars for processing one LINE.
1463 # Re-initialized on each entry to sub tokenize_this_line.
1465 $block_type, $container_type, $expecting,
1466 $i, $i_tok, $input_line,
1467 $input_line_number, $last_nonblank_i, $max_token_index,
1468 $next_tok, $next_type, $peeked_ahead,
1469 $prototype, $rhere_target_list, $rtoken_map,
1470 $rtoken_type, $rtokens, $tok,
1471 $type, $type_sequence, $indent_flag,
1474 # TV2: refs to ARRAYS for processing one LINE
1475 # Re-initialized on each call.
1476 my $routput_token_list = []; # stack of output token indexes
1477 my $routput_token_type = []; # token types
1478 my $routput_block_type = []; # types of code block
1479 my $routput_container_type = []; # paren types, such as if, elsif, ..
1480 my $routput_type_sequence = []; # nesting sequential number
1481 my $routput_indent_flag = []; #
1483 # TV3: SCALARS for quote variables. These are initialized with a
1484 # subroutine call and continually updated as lines are processed.
1485 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1486 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1488 # TV4: SCALARS for multi-line identifiers and
1489 # statements. These are initialized with a subroutine call
1490 # and continually updated as lines are processed.
1491 my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1493 # TV5: SCALARS for tracking indentation level.
1494 # Initialized once and continually updated as lines are
1497 $nesting_token_string, $nesting_type_string,
1498 $nesting_block_string, $nesting_block_flag,
1499 $nesting_list_string, $nesting_list_flag,
1500 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1501 $in_statement_continuation, $level_in_tokenizer,
1502 $slevel_in_tokenizer, $rslevel_stack,
1505 # TV6: SCALARS for remembering several previous
1506 # tokens. Initialized once and continually updated as
1507 # lines are processed.
1509 $last_nonblank_container_type, $last_nonblank_type_sequence,
1510 $last_last_nonblank_token, $last_last_nonblank_type,
1511 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1512 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1515 # ----------------------------------------------------------------
1516 # beginning of tokenizer variable access and manipulation routines
1517 # ----------------------------------------------------------------
1519 sub initialize_tokenizer_state {
1521 # TV1: initialized on each call
1522 # TV2: initialized on each call
1526 $quote_character = EMPTY_STRING;
1529 $quoted_string_1 = EMPTY_STRING;
1530 $quoted_string_2 = EMPTY_STRING;
1531 $allowed_quote_modifiers = EMPTY_STRING;
1534 $id_scan_state = EMPTY_STRING;
1535 $identifier = EMPTY_STRING;
1536 $want_paren = EMPTY_STRING;
1537 $indented_if_level = 0;
1540 $nesting_token_string = EMPTY_STRING;
1541 $nesting_type_string = EMPTY_STRING;
1542 $nesting_block_string = '1'; # initially in a block
1543 $nesting_block_flag = 1;
1544 $nesting_list_string = '0'; # initially not in a list
1545 $nesting_list_flag = 0; # initially not in a list
1546 $ci_string_in_tokenizer = EMPTY_STRING;
1547 $continuation_string_in_tokenizer = "0";
1548 $in_statement_continuation = 0;
1549 $level_in_tokenizer = 0;
1550 $slevel_in_tokenizer = 0;
1551 $rslevel_stack = [];
1554 $last_nonblank_container_type = EMPTY_STRING;
1555 $last_nonblank_type_sequence = EMPTY_STRING;
1556 $last_last_nonblank_token = ';';
1557 $last_last_nonblank_type = ';';
1558 $last_last_nonblank_block_type = EMPTY_STRING;
1559 $last_last_nonblank_container_type = EMPTY_STRING;
1560 $last_last_nonblank_type_sequence = EMPTY_STRING;
1561 $last_nonblank_prototype = EMPTY_STRING;
1563 } ## end sub initialize_tokenizer_state
1565 sub save_tokenizer_state {
1568 $block_type, $container_type, $expecting,
1569 $i, $i_tok, $input_line,
1570 $input_line_number, $last_nonblank_i, $max_token_index,
1571 $next_tok, $next_type, $peeked_ahead,
1572 $prototype, $rhere_target_list, $rtoken_map,
1573 $rtoken_type, $rtokens, $tok,
1574 $type, $type_sequence, $indent_flag,
1578 $routput_token_list, $routput_token_type,
1579 $routput_block_type, $routput_container_type,
1580 $routput_type_sequence, $routput_indent_flag,
1584 $in_quote, $quote_type,
1585 $quote_character, $quote_pos,
1586 $quote_depth, $quoted_string_1,
1587 $quoted_string_2, $allowed_quote_modifiers,
1591 [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1594 $nesting_token_string, $nesting_type_string,
1595 $nesting_block_string, $nesting_block_flag,
1596 $nesting_list_string, $nesting_list_flag,
1597 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1598 $in_statement_continuation, $level_in_tokenizer,
1599 $slevel_in_tokenizer, $rslevel_stack,
1603 $last_nonblank_container_type,
1604 $last_nonblank_type_sequence,
1605 $last_last_nonblank_token,
1606 $last_last_nonblank_type,
1607 $last_last_nonblank_block_type,
1608 $last_last_nonblank_container_type,
1609 $last_last_nonblank_type_sequence,
1610 $last_nonblank_prototype,
1612 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1613 } ## end sub save_tokenizer_state
1615 sub restore_tokenizer_state {
1617 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1619 $block_type, $container_type, $expecting,
1620 $i, $i_tok, $input_line,
1621 $input_line_number, $last_nonblank_i, $max_token_index,
1622 $next_tok, $next_type, $peeked_ahead,
1623 $prototype, $rhere_target_list, $rtoken_map,
1624 $rtoken_type, $rtokens, $tok,
1625 $type, $type_sequence, $indent_flag,
1629 $routput_token_list, $routput_token_type,
1630 $routput_block_type, $routput_container_type,
1631 $routput_type_sequence, $routput_indent_flag,
1635 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1636 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1639 ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1643 $nesting_token_string, $nesting_type_string,
1644 $nesting_block_string, $nesting_block_flag,
1645 $nesting_list_string, $nesting_list_flag,
1646 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1647 $in_statement_continuation, $level_in_tokenizer,
1648 $slevel_in_tokenizer, $rslevel_stack,
1652 $last_nonblank_container_type,
1653 $last_nonblank_type_sequence,
1654 $last_last_nonblank_token,
1655 $last_last_nonblank_type,
1656 $last_last_nonblank_block_type,
1657 $last_last_nonblank_container_type,
1658 $last_last_nonblank_type_sequence,
1659 $last_nonblank_prototype,
1662 } ## end sub restore_tokenizer_state
1664 sub split_pretoken {
1668 # Split the leading $numc characters from the current token (at index=$i)
1669 # which is pre-type 'w' and insert the remainder back into the pretoken
1670 # stream with appropriate settings. Since we are splitting a pre-type 'w',
1671 # there are three cases, depending on if the remainder starts with a digit:
1672 # Case 1: remainder is type 'd', all digits
1673 # Case 2: remainder is type 'd' and type 'w': digits and other characters
1674 # Case 3: remainder is type 'w'
1676 # Examples, for $numc=1:
1677 # $tok => $tok_0 $tok_1 $tok_2
1678 # 'x10' => 'x' '10' # case 1
1679 # 'x10if' => 'x' '10' 'if' # case 2
1680 # '0ne => 'O' 'ne' # case 3
1683 # $tok_1 is a possible string of digits (pre-type 'd')
1684 # $tok_2 is a possible word (pre-type 'w')
1686 # return 1 if successful
1687 # return undef if error (shouldn't happen)
1689 # Calling routine should update '$type' and '$tok' if successful.
1691 my $pretoken = $rtokens->[$i];
1693 && length($pretoken) > $numc
1694 && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1697 # Split $tok into up to 3 tokens:
1698 my $tok_0 = substr( $pretoken, 0, $numc );
1699 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1700 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1702 my $len_0 = length($tok_0);
1703 my $len_1 = length($tok_1);
1704 my $len_2 = length($tok_2);
1706 my $pre_type_0 = 'w';
1707 my $pre_type_1 = 'd';
1708 my $pre_type_2 = 'w';
1710 my $pos_0 = $rtoken_map->[$i];
1711 my $pos_1 = $pos_0 + $len_0;
1712 my $pos_2 = $pos_1 + $len_1;
1714 my $isplice = $i + 1;
1716 # Splice in any digits
1718 splice @{$rtoken_map}, $isplice, 0, $pos_1;
1719 splice @{$rtokens}, $isplice, 0, $tok_1;
1720 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1725 # Splice in any trailing word
1727 splice @{$rtoken_map}, $isplice, 0, $pos_2;
1728 splice @{$rtokens}, $isplice, 0, $tok_2;
1729 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1733 $rtokens->[$i] = $tok_0;
1738 # Shouldn't get here
1741 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1746 } ## end sub split_pretoken
1748 sub get_indentation_level {
1750 # patch to avoid reporting error if indented if is not terminated
1751 if ($indented_if_level) { return $level_in_tokenizer - 1 }
1752 return $level_in_tokenizer;
1755 sub reset_indentation_level {
1756 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1757 push @{$rslevel_stack}, $slevel_in_tokenizer;
1763 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1764 return $peeked_ahead;
1767 # ------------------------------------------------------------
1768 # end of tokenizer variable access and manipulation routines
1769 # ------------------------------------------------------------
1771 #------------------------------
1772 # beginning of tokenizer hashes
1773 #------------------------------
1775 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1777 # These block types terminate statements and do not need a trailing
1779 # patched for SWITCH/CASE/
1780 my %is_zero_continuation_block_type;
1782 @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1783 if elsif else unless while until for foreach switch case given when);
1784 @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1786 my %is_logical_container;
1787 @q = qw(if elsif unless while and or err not && ! || for foreach);
1788 @is_logical_container{@q} = (1) x scalar(@q);
1792 @is_binary_type{@q} = (1) x scalar(@q);
1794 my %is_binary_keyword;
1795 @q = qw(and or err eq ne cmp);
1796 @is_binary_keyword{@q} = (1) x scalar(@q);
1798 # 'L' is token for opening { at hash key
1799 my %is_opening_type;
1801 @is_opening_type{@q} = (1) x scalar(@q);
1803 # 'R' is token for closing } at hash key
1804 my %is_closing_type;
1806 @is_closing_type{@q} = (1) x scalar(@q);
1808 my %is_redo_last_next_goto;
1809 @q = qw(redo last next goto);
1810 @is_redo_last_next_goto{@q} = (1) x scalar(@q);
1813 @q = qw(use require);
1814 @is_use_require{@q} = (1) x scalar(@q);
1816 # This hash holds the array index in $tokenizer_self for these keywords:
1817 # Fix for issue c035: removed 'format' from this hash
1819 '__END__' => _in_end_,
1820 '__DATA__' => _in_data_,
1823 my %is_list_end_type;
1826 @is_list_end_type{@q} = (1) x scalar(@q);
1828 # original ref: camel 3 p 147,
1829 # but perl may accept undocumented flags
1830 # perl 5.10 adds 'p' (preserve)
1831 # Perl version 5.22 added 'n'
1832 # From http://perldoc.perl.org/perlop.html we have
1833 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
1834 # s/PATTERN/REPLACEMENT/msixpodualngcer
1835 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
1836 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
1837 # qr/STRING/msixpodualn
1838 my %quote_modifiers = (
1839 's' => '[msixpodualngcer]',
1842 'm' => '[msixpodualngc]',
1843 'qr' => '[msixpodualn]',
1844 'q' => EMPTY_STRING,
1845 'qq' => EMPTY_STRING,
1846 'qw' => EMPTY_STRING,
1847 'qx' => EMPTY_STRING,
1850 # table showing how many quoted things to look for after quote operator..
1851 # s, y, tr have 2 (pattern and replacement)
1852 # others have 1 (pattern only)
1866 @_ = qw(for foreach);
1867 @is_for_foreach{@_} = (1) x scalar(@_);
1869 my %is_my_our_state;
1870 @_ = qw(my our state);
1871 @is_my_our_state{@_} = (1) x scalar(@_);
1873 # These keywords may introduce blocks after parenthesized expressions,
1875 # keyword ( .... ) { BLOCK }
1876 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1877 my %is_blocktype_with_paren;
1879 qw(if elsif unless while until for foreach switch case given when catch);
1880 @is_blocktype_with_paren{@_} = (1) x scalar(@_);
1882 my %is_case_default;
1883 @_ = qw(case default);
1884 @is_case_default{@_} = (1) x scalar(@_);
1886 #------------------------
1887 # end of tokenizer hashes
1888 #------------------------
1890 # ------------------------------------------------------------
1891 # beginning of various scanner interface routines
1892 # ------------------------------------------------------------
1893 sub scan_replacement_text {
1895 # check for here-docs in replacement text invoked by
1896 # a substitution operator with executable modifier 'e'.
1901 # $rht = reference to any here-doc targets
1902 my ($replacement_text) = @_;
1905 return unless ( $replacement_text =~ /<</ );
1907 write_logfile_entry("scanning replacement text for here-doc targets\n");
1909 # save the logger object for error messages
1910 my $logger_object = $tokenizer_self->[_logger_object_];
1912 # localize all package variables
1914 $tokenizer_self, $last_nonblank_token,
1915 $last_nonblank_type, $last_nonblank_block_type,
1916 $statement_type, $in_attribute_list,
1917 $current_package, $context,
1918 %is_constant, %is_user_function,
1919 %user_function_prototype, %is_block_function,
1920 %is_block_list_function, %saw_function_definition,
1921 $brace_depth, $paren_depth,
1922 $square_bracket_depth, @current_depth,
1923 @total_depth, $total_depth,
1924 @nesting_sequence_number, @current_sequence_number,
1925 @paren_type, @paren_semicolon_count,
1926 @paren_structural_type, @brace_type,
1927 @brace_structural_type, @brace_context,
1928 @brace_package, @square_bracket_type,
1929 @square_bracket_structural_type, @depth_array,
1930 @starting_line_of_current_depth, @nested_ternary_flag,
1931 @nested_statement_type, $next_sequence_number,
1934 # save all lexical variables
1935 my $rstate = save_tokenizer_state();
1936 _decrement_count(); # avoid error check for multiple tokenizers
1938 # make a new tokenizer
1940 my $rpending_logfile_message;
1941 my $source_object = Perl::Tidy::LineSource->new(
1942 input_file => \$replacement_text,
1944 rpending_logfile_message => $rpending_logfile_message,
1946 my $tokenizer = Perl::Tidy::Tokenizer->new(
1947 source_object => $source_object,
1948 logger_object => $logger_object,
1949 starting_line_number => $input_line_number,
1952 # scan the replacement text
1953 1 while ( $tokenizer->get_line() );
1955 # remove any here doc targets
1957 if ( $tokenizer_self->[_in_here_doc_] ) {
1961 $tokenizer_self->[_here_doc_target_],
1962 $tokenizer_self->[_here_quote_character_]
1964 if ( $tokenizer_self->[_rhere_target_list_] ) {
1965 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1966 $tokenizer_self->[_rhere_target_list_] = undef;
1968 $tokenizer_self->[_in_here_doc_] = undef;
1971 # now its safe to report errors
1972 my $severe_error = $tokenizer->report_tokenization_errors();
1974 # TODO: Could propagate a severe error up
1976 # restore all tokenizer lexical variables
1977 restore_tokenizer_state($rstate);
1979 # return the here doc targets
1981 } ## end sub scan_replacement_text
1983 sub scan_bare_identifier {
1984 ( $i, $tok, $type, $prototype ) =
1985 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1986 $rtoken_map, $max_token_index );
1990 sub scan_identifier {
1992 $i, $tok, $type, $id_scan_state, $identifier,
1993 my $split_pretoken_flag
1995 = scan_complex_identifier( $i, $id_scan_state, $identifier, $rtokens,
1996 $max_token_index, $expecting, $paren_type[$paren_depth] );
1998 # Check for signal to fix a special variable adjacent to a keyword,
1999 # such as '$^One$0'.
2000 if ($split_pretoken_flag) {
2002 # Try to fix it by splitting the pretoken
2004 && $rtokens->[ $i - 1 ] eq '^'
2005 && split_pretoken(1) )
2007 $identifier = substr( $identifier, 0, 3 );
2012 # This shouldn't happen ...
2013 my $var = substr( $tok, 0, 3 );
2014 my $excess = substr( $tok, 3 );
2015 interrupt_logfile();
2017 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2018 A space may be needed after '$var'.
2024 } ## end sub scan_identifier
2026 use constant VERIFY_FASTSCAN => 0;
2027 my %fast_scan_context;
2030 %fast_scan_context = (
2031 '$' => SCALAR_CONTEXT,
2032 '*' => SCALAR_CONTEXT,
2033 '@' => LIST_CONTEXT,
2034 '%' => LIST_CONTEXT,
2035 '&' => UNKNOWN_CONTEXT,
2039 sub scan_simple_identifier {
2041 # This is a wrapper for sub scan_identifier. It does a fast preliminary
2042 # scan for certain common identifiers:
2043 # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2044 # If it does not find one of these, or this is a restart, it calls the
2045 # original scanner directly.
2047 # This gives the same results as the full scanner in about 1/4 the
2048 # total runtime for a typical input stream.
2051 my $tok_begin = $tok;
2054 ###############################
2055 # quick scan with leading sigil
2056 ###############################
2057 if ( !$id_scan_state
2058 && $i + 1 <= $max_token_index
2059 && $fast_scan_context{$tok} )
2061 $context = $fast_scan_context{$tok};
2063 # look for $var, @var, ...
2064 if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
2065 my $pretype_next = EMPTY_STRING;
2066 my $i_next = $i + 2;
2067 if ( $i_next <= $max_token_index ) {
2068 if ( $rtoken_type->[$i_next] eq 'b'
2069 && $i_next < $max_token_index )
2073 $pretype_next = $rtoken_type->[$i_next];
2075 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2077 # Found type 'i' like '$var', '@var', or '%var'
2078 $identifier = $tok . $rtokens->[ $i + 1 ];
2082 $fast_scan_type = $type;
2086 # Look for @{ or %{ .
2087 # But we must let the full scanner handle things ${ because it may
2088 # keep going to get a complete identifier like '${#}' .
2090 $rtoken_type->[ $i + 1 ] eq '{'
2091 && ( $tok_begin eq '@'
2092 || $tok_begin eq '%' )
2098 $fast_scan_type = $type;
2102 ############################
2103 # Quick scan with leading ->
2104 # Look for ->[ and ->{
2105 ############################
2108 && $i < $max_token_index
2109 && ( $rtokens->[ $i + 1 ] eq '{'
2110 || $rtokens->[ $i + 1 ] eq '[' )
2114 $fast_scan_type = $type;
2116 $context = UNKNOWN_CONTEXT;
2119 #######################################
2120 # Verify correctness during development
2121 #######################################
2122 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2124 # We will call the full method
2125 my $identifier_simple = $identifier;
2126 my $tok_simple = $tok;
2128 my $context_simple = $context;
2134 if ( $tok ne $tok_simple
2135 || $type ne $fast_scan_type
2137 || $identifier ne $identifier_simple
2139 || $context ne $context_simple )
2142 scan_simple_identifier differs from scan_identifier:
2143 simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2144 full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2149 ###################################################
2150 # call full scanner if fast method did not succeed
2151 ###################################################
2152 if ( !$fast_scan_type ) {
2156 } ## end sub scan_simple_identifier
2159 ( $i, $tok, $type, $id_scan_state ) =
2160 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2161 $id_scan_state, $max_token_index );
2167 ( $i, $type, $number ) =
2168 scan_number_do( $input_line, $i, $rtoken_map, $type,
2173 use constant VERIFY_FASTNUM => 0;
2175 sub scan_number_fast {
2177 # This is a wrapper for sub scan_number. It does a fast preliminary
2178 # scan for a simple integer. It calls the original scan_number if it
2179 # does not find one.
2182 my $tok_begin = $tok;
2185 ##################################
2186 # Quick check for (signed) integer
2187 ##################################
2189 # This will be the string of digits:
2192 my $typ_d = $rtoken_type->[$i_d];
2194 # check for signed integer
2195 my $sign = EMPTY_STRING;
2197 && ( $typ_d eq '+' || $typ_d eq '-' )
2198 && $i_d < $max_token_index )
2202 $tok_d = $rtokens->[$i_d];
2203 $typ_d = $rtoken_type->[$i_d];
2210 $i_d == $max_token_index
2211 || ( $i_d < $max_token_index
2212 && $rtoken_type->[ $i_d + 1 ] ne '.'
2213 && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2217 # Let let full scanner handle multi-digit integers beginning with
2218 # '0' because there could be error messages. For example, '009' is
2219 # not a valid number.
2221 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2222 $number = $sign . $tok_d;
2228 #######################################
2229 # Verify correctness during development
2230 #######################################
2231 if ( VERIFY_FASTNUM && defined($number) ) {
2233 # We will call the full method
2234 my $type_simple = $type;
2236 my $number_simple = $number;
2240 $number = scan_number();
2242 if ( $type ne $type_simple
2243 || ( $i != $i_simple && $i <= $max_token_index )
2244 || $number ne $number_simple )
2247 scan_number_fast differs from scan_number:
2248 simple: i=$i_simple, type=$type_simple, number=$number_simple
2249 full: i=$i, type=$type, number=$number
2254 #########################################
2255 # call full scanner if may not be integer
2256 #########################################
2257 if ( !defined($number) ) {
2258 $number = scan_number();
2261 } ## end sub scan_number_fast
2263 # a sub to warn if token found where term expected
2264 sub error_if_expecting_TERM {
2265 if ( $expecting == TERM ) {
2266 if ( $really_want_term{$last_nonblank_type} ) {
2267 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2268 $rtoken_map, $rtoken_type, $input_line );
2273 } ## end sub error_if_expecting_TERM
2275 # a sub to warn if token found where operator expected
2276 sub error_if_expecting_OPERATOR {
2278 if ( $expecting == OPERATOR ) {
2279 if ( !defined($thing) ) { $thing = $tok }
2280 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2281 $rtoken_map, $rtoken_type, $input_line );
2282 if ( $i_tok == 0 ) {
2283 interrupt_logfile();
2284 warning("Missing ';' or ',' above?\n");
2290 } ## end sub error_if_expecting_OPERATOR
2292 # ------------------------------------------------------------
2293 # end scanner interfaces
2294 # ------------------------------------------------------------
2299 sub do_GREATER_THAN_SIGN {
2302 error_if_expecting_TERM()
2303 if ( $expecting == TERM );
2307 sub do_VERTICAL_LINE {
2310 error_if_expecting_TERM()
2311 if ( $expecting == TERM );
2315 sub do_DOLLAR_SIGN {
2318 # start looking for a scalar
2319 error_if_expecting_OPERATOR("Scalar")
2320 if ( $expecting == OPERATOR );
2321 scan_simple_identifier();
2323 if ( $identifier eq '$^W' ) {
2324 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2327 # Check for identifier in indirect object slot
2328 # (vorboard.pl, sort.t). Something like:
2329 # /^(print|printf|sort|exec|system)$/
2331 $is_indirect_object_taker{$last_nonblank_token}
2332 || ( ( $last_nonblank_token eq '(' )
2333 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2334 || ( $last_nonblank_type eq 'w'
2335 || $last_nonblank_type eq 'U' ) # possible object
2339 # An identifier followed by '->' is not indirect object;
2340 # fixes b1175, b1176
2341 my ( $next_nonblank_type, $i_next ) =
2342 find_next_noncomment_type( $i, $rtokens, $max_token_index );
2343 $type = 'Z' if ( $next_nonblank_type ne '->' );
2346 } ## end sub do_DOLLAR_SIGN
2348 sub do_LEFT_PARENTHESIS {
2352 $paren_semicolon_count[$paren_depth] = 0;
2354 $container_type = $want_paren;
2355 $want_paren = EMPTY_STRING;
2357 elsif ( $statement_type =~ /^sub\b/ ) {
2358 $container_type = $statement_type;
2361 $container_type = $last_nonblank_token;
2363 # We can check for a syntax error here of unexpected '(',
2364 # but this is going to get messy...
2366 $expecting == OPERATOR
2368 # Be sure this is not a method call of the form
2369 # &method(...), $method->(..), &{method}(...),
2370 # $ref[2](list) is ok & short for $ref[2]->(list)
2371 # NOTE: at present, braces in something like &{ xxx }
2372 # are not marked as a block, we might have a method call.
2373 # Added ')' to fix case c017, something like ()()()
2374 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2379 # ref: camel 3 p 703.
2380 if ( $last_last_nonblank_token eq 'do' ) {
2382 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2387 # if this is an empty list, (), then it is not an
2388 # error; for example, we might have a constant pi and
2389 # invoke it with pi() or just pi;
2390 my ( $next_nonblank_token, $i_next ) =
2391 find_next_nonblank_token( $i, $rtokens,
2394 # Patch for c029: give up error check if
2395 # a side comment follows
2396 if ( $next_nonblank_token ne ')'
2397 && $next_nonblank_token ne '#' )
2401 error_if_expecting_OPERATOR('(');
2403 if ( $last_nonblank_type eq 'C' ) {
2405 "$last_nonblank_token has a void prototype\n";
2407 elsif ( $last_nonblank_type eq 'i' ) {
2409 && $last_nonblank_token =~ /^\$/ )
2412 "Do you mean '$last_nonblank_token->(' ?\n";
2416 interrupt_logfile();
2420 } ## end if ( $next_nonblank_token...
2421 } ## end else [ if ( $last_last_nonblank_token...
2422 } ## end if ( $expecting == OPERATOR...
2424 $paren_type[$paren_depth] = $container_type;
2425 ( $type_sequence, $indent_flag ) =
2426 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2428 # propagate types down through nested parens
2429 # for example: the second paren in 'if ((' would be structural
2430 # since the first is.
2432 if ( $last_nonblank_token eq '(' ) {
2433 $type = $last_nonblank_type;
2436 # We exclude parens as structural after a ',' because it
2437 # causes subtle problems with continuation indentation for
2438 # something like this, where the first 'or' will not get
2443 # ( not defined $check )
2445 # or $check eq "new"
2446 # or $check eq "old",
2449 # Likewise, we exclude parens where a statement can start
2450 # because of problems with continuation indentation, like
2453 # ($firstline =~ /^#\!.*perl/)
2454 # and (print $File::Find::name, "\n")
2457 # (ref($usage_fref) =~ /CODE/)
2459 # : (&blast_usage, &blast_params, &blast_general_params);
2465 if ( $last_nonblank_type eq ')' ) {
2467 "Syntax error? found token '$last_nonblank_type' then '('\n");
2469 $paren_structural_type[$paren_depth] = $type;
2472 } ## end sub do_LEFT_PARENTHESIS
2474 sub do_RIGHT_PARENTHESIS {
2477 ( $type_sequence, $indent_flag ) =
2478 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2480 if ( $paren_structural_type[$paren_depth] eq '{' ) {
2484 $container_type = $paren_type[$paren_depth];
2486 # restore statement type as 'sub' at closing paren of a signature
2487 # so that a subsequent ':' is identified as an attribute
2488 if ( $container_type =~ /^sub\b/ ) {
2489 $statement_type = $container_type;
2493 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2494 my $num_sc = $paren_semicolon_count[$paren_depth];
2495 if ( $num_sc > 0 && $num_sc != 2 ) {
2496 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2500 if ( $paren_depth > 0 ) { $paren_depth-- }
2502 } ## end sub do_RIGHT_PARENTHESIS
2507 if ( $last_nonblank_type eq ',' ) {
2508 complain("Repeated ','s \n");
2511 # Note that we have to check both token and type here because a
2512 # comma following a qw list can have last token='(' but type = 'q'
2513 elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2514 warning("Unexpected leading ',' after a '('\n");
2517 # patch for operator_expected: note if we are in the list (use.t)
2518 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2521 } ## end sub do_COMMA
2526 $context = UNKNOWN_CONTEXT;
2527 $statement_type = EMPTY_STRING;
2528 $want_paren = EMPTY_STRING;
2531 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2532 { # mark ; in for loop
2534 # Be careful: we do not want a semicolon such as the
2535 # following to be included:
2537 # for (sort {strcoll($a,$b);} keys %investments) {
2539 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2540 && $square_bracket_depth ==
2541 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2545 $paren_semicolon_count[$paren_depth]++;
2549 } ## end sub do_SEMICOLON
2551 sub do_QUOTATION_MARK {
2554 error_if_expecting_OPERATOR("String")
2555 if ( $expecting == OPERATOR );
2558 $allowed_quote_modifiers = EMPTY_STRING;
2560 } ## end sub do_QUOTATION_MARK
2565 error_if_expecting_OPERATOR("String")
2566 if ( $expecting == OPERATOR );
2569 $allowed_quote_modifiers = EMPTY_STRING;
2571 } ## end sub do_APOSTROPHE
2576 error_if_expecting_OPERATOR("String")
2577 if ( $expecting == OPERATOR );
2580 $allowed_quote_modifiers = EMPTY_STRING;
2582 } ## end sub do_BACKTICK
2589 # a pattern cannot follow certain keywords which take optional
2590 # arguments, like 'shift' and 'pop'. See also '?'.
2592 $last_nonblank_type eq 'k'
2593 && $is_keyword_rejecting_slash_as_pattern_delimiter{
2594 $last_nonblank_token}
2599 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2601 ( $is_pattern, $msg ) =
2602 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2606 write_diagnostics("DIVIDE:$msg\n");
2607 write_logfile_entry($msg);
2610 else { $is_pattern = ( $expecting == TERM ) }
2615 $allowed_quote_modifiers = '[msixpodualngc]';
2617 else { # not a pattern; check for a /= token
2619 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2625 #DEBUG - collecting info on what tokens follow a divide
2626 # for development of guessing algorithm
2627 #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2628 # #write_diagnostics( "DIVIDE? $input_line\n" );
2632 } ## end sub do_SLASH
2634 sub do_LEFT_CURLY_BRACKET {
2637 # if we just saw a ')', we will label this block with
2638 # its type. We need to do this to allow sub
2639 # code_block_type to determine if this brace starts a
2640 # code block or anonymous hash. (The type of a paren
2641 # pair is the preceding token, such as 'if', 'else',
2643 $container_type = EMPTY_STRING;
2645 # ATTRS: for a '{' following an attribute list, reset
2646 # things to look like we just saw the sub name
2647 if ( $statement_type =~ /^sub\b/ ) {
2648 $last_nonblank_token = $statement_type;
2649 $last_nonblank_type = 'i';
2650 $statement_type = EMPTY_STRING;
2653 # patch for SWITCH/CASE: hide these keywords from an immediately
2654 # following opening brace
2655 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2656 && $statement_type eq $last_nonblank_token )
2658 $last_nonblank_token = ";";
2661 elsif ( $last_nonblank_token eq ')' ) {
2662 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2664 # defensive move in case of a nesting error (pbug.t)
2665 # in which this ')' had no previous '('
2666 # this nesting error will have been caught
2667 if ( !defined($last_nonblank_token) ) {
2668 $last_nonblank_token = 'if';
2671 # check for syntax error here;
2672 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2673 if ( $tokenizer_self->[_extended_syntax_] ) {
2675 # we append a trailing () to mark this as an unknown
2676 # block type. This allows perltidy to format some
2677 # common extensions of perl syntax.
2678 # This is used by sub code_block_type
2679 $last_nonblank_token .= '()';
2683 join( SPACE, sort keys %is_blocktype_with_paren );
2685 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2691 # patch for paren-less for/foreach glitch, part 2.
2692 # see note below under 'qw'
2693 elsif ($last_nonblank_token eq 'qw'
2694 && $is_for_foreach{$want_paren} )
2696 $last_nonblank_token = $want_paren;
2697 if ( $last_last_nonblank_token eq $want_paren ) {
2699 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2703 $want_paren = EMPTY_STRING;
2706 # now identify which of the three possible types of
2707 # curly braces we have: hash index container, anonymous
2708 # hash reference, or code block.
2710 # non-structural (hash index) curly brace pair
2711 # get marked 'L' and 'R'
2712 if ( is_non_structural_brace() ) {
2715 # patch for SWITCH/CASE:
2716 # allow paren-less identifier after 'when'
2717 # if the brace is preceded by a space
2718 if ( $statement_type eq 'when'
2719 && $last_nonblank_type eq 'i'
2720 && $last_last_nonblank_type eq 'k'
2721 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2724 $block_type = $statement_type;
2728 # code and anonymous hash have the same type, '{', but are
2729 # distinguished by 'block_type',
2730 # which will be blank for an anonymous hash
2733 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2736 # patch to promote bareword type to function taking block
2738 && $last_nonblank_type eq 'w'
2739 && $last_nonblank_i >= 0 )
2741 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2742 $routput_token_type->[$last_nonblank_i] =
2743 $is_grep_alias{$block_type} ? 'k' : 'G';
2747 # patch for SWITCH/CASE: if we find a stray opening block brace
2748 # where we might accept a 'case' or 'when' block, then take it
2749 if ( $statement_type eq 'case'
2750 || $statement_type eq 'when' )
2752 if ( !$block_type || $block_type eq '}' ) {
2753 $block_type = $statement_type;
2758 $brace_type[ ++$brace_depth ] = $block_type;
2759 $brace_package[$brace_depth] = $current_package;
2760 $brace_structural_type[$brace_depth] = $type;
2761 $brace_context[$brace_depth] = $context;
2762 ( $type_sequence, $indent_flag ) =
2763 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2765 } ## end sub do_LEFT_CURLY_BRACKET
2767 sub do_RIGHT_CURLY_BRACKET {
2770 $block_type = $brace_type[$brace_depth];
2771 if ($block_type) { $statement_type = EMPTY_STRING }
2772 if ( defined( $brace_package[$brace_depth] ) ) {
2773 $current_package = $brace_package[$brace_depth];
2776 # can happen on brace error (caught elsewhere)
2779 ( $type_sequence, $indent_flag ) =
2780 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2782 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2786 # propagate type information for 'do' and 'eval' blocks, and also
2787 # for smartmatch operator. This is necessary to enable us to know
2788 # if an operator or term is expected next.
2789 if ( $is_block_operator{$block_type} ) {
2793 $context = $brace_context[$brace_depth];
2794 if ( $brace_depth > 0 ) { $brace_depth--; }
2796 } ## end sub do_RIGHT_CURLY_BRACKET
2800 # '&' = maybe sub call? start looking
2801 # We have to check for sub call unless we are sure we
2802 # are expecting an operator. This example from s2p
2803 # got mistaken as a q operator in an early version:
2804 # print BODY &q(<<'EOT');
2805 if ( $expecting != OPERATOR ) {
2807 # But only look for a sub call if we are expecting a term or
2808 # if there is no existing space after the &.
2809 # For example we probably don't want & as sub call here:
2810 # Fcntl::S_IRUSR & $mode;
2811 if ( $expecting == TERM || $next_type ne 'b' ) {
2812 scan_simple_identifier();
2818 } ## end sub do_AMPERSAND
2820 sub do_LESS_THAN_SIGN {
2822 # '<' - angle operator or less than?
2823 if ( $expecting != OPERATOR ) {
2825 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2826 $expecting, $max_token_index );
2828 ## This message is not very helpful and quite confusing if the above
2829 ## routine decided not to write a message with the line number.
2830 ## if ( $type eq '<' && $expecting == TERM ) {
2831 ## error_if_expecting_TERM();
2832 ## interrupt_logfile();
2833 ## warning("Unterminated <> operator?\n");
2834 ## resume_logfile();
2841 } ## end sub do_LESS_THAN_SIGN
2843 sub do_QUESTION_MARK {
2845 # '?' = conditional or starting pattern?
2848 # Patch for rt #126965
2849 # a pattern cannot follow certain keywords which take optional
2850 # arguments, like 'shift' and 'pop'. See also '/'.
2852 $last_nonblank_type eq 'k'
2853 && $is_keyword_rejecting_question_as_pattern_delimiter{
2854 $last_nonblank_token}
2860 # patch for RT#131288, user constant function without prototype
2861 # last type is 'U' followed by ?.
2862 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2865 elsif ( $expecting == UNKNOWN ) {
2867 # In older versions of Perl, a bare ? can be a pattern
2868 # delimiter. In perl version 5.22 this was
2869 # dropped, but we have to support it in order to format
2870 # older programs. See:
2871 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2872 # For example, the following line worked
2874 # ?(.*)? && (print $1,"\n");
2875 # In current versions it would have to be written with slashes:
2876 # /(.*)/ && (print $1,"\n");
2878 ( $is_pattern, $msg ) =
2879 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2882 if ($msg) { write_logfile_entry($msg) }
2884 else { $is_pattern = ( $expecting == TERM ) }
2889 $allowed_quote_modifiers = '[msixpodualngc]';
2892 ( $type_sequence, $indent_flag ) =
2893 increase_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
2896 } ## end sub do_QUESTION_MARK
2900 # '*' = typeglob, or multiply?
2901 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2902 if ( $next_type ne 'b'
2903 && $next_type ne '('
2904 && $next_type ne '#' ) # Fix c036
2909 if ( $expecting == TERM ) {
2910 scan_simple_identifier();
2914 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2919 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2923 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2931 } ## end sub do_STAR
2935 # '.' = what kind of . ?
2936 if ( $expecting != OPERATOR ) {
2938 if ( $type eq '.' ) {
2939 error_if_expecting_TERM()
2940 if ( $expecting == TERM );
2950 # ':' = label, ternary, attribute, ?
2952 # if this is the first nonblank character, call it a label
2953 # since perl seems to just swallow it
2954 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2958 # ATTRS: check for a ':' which introduces an attribute list
2959 # either after a 'sub' keyword or within a paren list
2960 elsif ( $statement_type =~ /^sub\b/ ) {
2962 $in_attribute_list = 1;
2965 # Within a signature, unless we are in a ternary. For example,
2966 # from 't/filter_example.t':
2967 # method foo4 ( $class: $bar ) { $class->bar($bar) }
2968 elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2969 && !is_balanced_closing_container(QUESTION_COLON) )
2972 $in_attribute_list = 1;
2975 # check for scalar attribute, such as
2976 # my $foo : shared = 1;
2977 elsif ($is_my_our_state{$statement_type}
2978 && $current_depth[QUESTION_COLON] == 0 )
2981 $in_attribute_list = 1;
2984 # Look for Switch::Plain syntax if an error would otherwise occur
2985 # here. Note that we do not need to check if the extended syntax
2986 # flag is set because otherwise an error would occur, and we would
2987 # then have to output a message telling the user to set the
2988 # extended syntax flag to avoid the error.
2992 # Note that the line 'default:' will be parsed as a label elsewhere.
2993 elsif ( $is_case_default{$statement_type}
2994 && !is_balanced_closing_container(QUESTION_COLON) )
2996 # mark it as a perltidy label type
3000 # otherwise, it should be part of a ?/: operator
3002 ( $type_sequence, $indent_flag ) =
3003 decrease_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3004 if ( $last_nonblank_token eq '?' ) {
3005 warning("Syntax error near ? :\n");
3009 } ## end sub do_COLON
3013 # '+' = what kind of plus?
3014 if ( $expecting == TERM ) {
3015 my $number = scan_number_fast();
3017 # unary plus is safest assumption if not a number
3018 if ( !defined($number) ) { $type = 'p'; }
3020 elsif ( $expecting == OPERATOR ) {
3023 if ( $next_type eq 'w' ) { $type = 'p' }
3026 } ## end sub do_PLUS_SIGN
3030 # '@' = sigil for array?
3031 error_if_expecting_OPERATOR("Array")
3032 if ( $expecting == OPERATOR );
3033 scan_simple_identifier();
3037 sub do_PERCENT_SIGN {
3039 # '%' = hash or modulo?
3040 # first guess is hash if no following blank or paren
3041 if ( $expecting == UNKNOWN ) {
3042 if ( $next_type ne 'b' && $next_type ne '(' ) {
3046 if ( $expecting == TERM ) {
3047 scan_simple_identifier();
3050 } ## end sub do_PERCENT_SIGN
3052 sub do_LEFT_SQUARE_BRACKET {
3055 $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token;
3056 ( $type_sequence, $indent_flag ) =
3057 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3059 # It may seem odd, but structural square brackets have
3060 # type '{' and '}'. This simplifies the indentation logic.
3061 if ( !is_non_structural_brace() ) {
3064 $square_bracket_structural_type[$square_bracket_depth] = $type;
3066 } ## end sub do_LEFT_SQUARE_BRACKET
3068 sub do_RIGHT_SQUARE_BRACKET {
3071 ( $type_sequence, $indent_flag ) =
3072 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3074 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) {
3078 # propagate type information for smartmatch operator. This is
3079 # necessary to enable us to know if an operator or term is expected
3081 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
3082 $tok = $square_bracket_type[$square_bracket_depth];
3085 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
3087 } ## end sub do_RIGHT_SQUARE_BRACKET
3091 # '-' = what kind of minus?
3092 if ( ( $expecting != OPERATOR )
3093 && $is_file_test_operator{$next_tok} )
3095 my ( $next_nonblank_token, $i_next ) =
3096 find_next_nonblank_token( $i + 1, $rtokens, $max_token_index );
3098 # check for a quoted word like "-w=>xx";
3099 # it is sufficient to just check for a following '='
3100 if ( $next_nonblank_token eq '=' ) {
3109 elsif ( $expecting == TERM ) {
3110 my $number = scan_number_fast();
3112 # maybe part of bareword token? unary is safest
3113 if ( !defined($number) ) { $type = 'm'; }
3116 elsif ( $expecting == OPERATOR ) {
3120 if ( $next_type eq 'w' ) {
3125 } ## end sub do_MINUS_SIGN
3130 # check for special variables like ${^WARNING_BITS}
3131 if ( $expecting == TERM ) {
3133 if ( $last_nonblank_token eq '{'
3134 && ( $next_tok !~ /^\d/ )
3135 && ( $next_tok =~ /^\w/ ) )
3138 if ( $next_tok eq 'W' ) {
3139 $tokenizer_self->[_saw_perl_dash_w_] = 1;
3141 $tok = $tok . $next_tok;
3145 # Optional coding to try to catch syntax errors. This can
3146 # be removed if it ever causes incorrect warning messages.
3147 # The '{^' should be preceded by either by a type or '$#'
3150 # *${^LAST_FH}{NAME} ok
3152 # $hash{^HOWDY} error
3154 # Note that a type sigil '$' may be tokenized as 'Z'
3155 # after something like 'print', so allow type 'Z'
3156 if ( $last_last_nonblank_type ne 't'
3157 && $last_last_nonblank_type ne 'Z'
3158 && $last_last_nonblank_token ne '$#' )
3160 warning("Possible syntax error near '{^'\n");
3165 unless ( error_if_expecting_TERM() ) {
3167 # Something like this is valid but strange:
3169 complain("The '^' seems unusual here\n");
3174 } ## end sub do_CARAT_SIGN
3176 sub do_DOUBLE_COLON {
3178 # '::' = probably a sub call
3179 scan_bare_identifier();
3185 # '<<' = maybe a here-doc?
3187 ## This check removed because it could be a deprecated here-doc with
3188 ## no specified target. See example in log 16 Sep 2020.
3190 ## unless ( $i < $max_token_index )
3191 ## ; # here-doc not possible if end of line
3193 if ( $expecting != OPERATOR ) {
3194 my ( $found_target, $here_doc_target, $here_quote_character,
3197 $found_target, $here_doc_target, $here_quote_character, $i,
3200 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3203 if ($found_target) {
3204 push @{$rhere_target_list},
3205 [ $here_doc_target, $here_quote_character ];
3207 if ( length($here_doc_target) > 80 ) {
3208 my $truncated = substr( $here_doc_target, 0, 80 );
3209 complain("Long here-target: '$truncated' ...\n");
3211 elsif ( !$here_doc_target ) {
3213 'Use of bare << to mean <<"" is deprecated' . "\n" )
3214 unless ($here_quote_character);
3216 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3218 "Unconventional here-target: '$here_doc_target'\n");
3221 elsif ( $expecting == TERM ) {
3222 unless ($saw_error) {
3224 # shouldn't happen..arriving here implies an error in
3225 # the logic in sub 'find_here_doc'
3228 Program bug; didn't find here doc target
3232 "Possible program error: didn't find here doc target\n"
3234 report_definite_bug();
3241 } ## end sub do_LEFT_SHIFT
3243 sub do_NEW_HERE_DOC {
3245 # '<<~' = a here-doc, new type added in v26
3247 unless ( $i < $max_token_index )
3248 ; # here-doc not possible if end of line
3249 if ( $expecting != OPERATOR ) {
3250 my ( $found_target, $here_doc_target, $here_quote_character,
3253 $found_target, $here_doc_target, $here_quote_character, $i,
3256 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3259 if ($found_target) {
3261 if ( length($here_doc_target) > 80 ) {
3262 my $truncated = substr( $here_doc_target, 0, 80 );
3263 complain("Long here-target: '$truncated' ...\n");
3265 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3267 "Unconventional here-target: '$here_doc_target'\n");
3270 # Note that we put a leading space on the here quote
3271 # character indicate that it may be preceded by spaces
3272 $here_quote_character = SPACE . $here_quote_character;
3273 push @{$rhere_target_list},
3274 [ $here_doc_target, $here_quote_character ];
3277 elsif ( $expecting == TERM ) {
3278 unless ($saw_error) {
3280 # shouldn't happen..arriving here implies an error in
3281 # the logic in sub 'find_here_doc'
3284 Program bug; didn't find here doc target
3288 "Possible program error: didn't find here doc target\n"
3290 report_definite_bug();
3295 error_if_expecting_OPERATOR();
3298 } ## end sub do_NEW_HERE_DOC
3303 # if -> points to a bare word, we must scan for an identifier,
3304 # otherwise something like ->y would look like the y operator
3306 # NOTE: this will currently allow things like
3307 # '->@array' '->*VAR' '->%hash'
3308 # to get parsed as identifiers, even though these are not currently
3309 # allowed syntax. To catch syntax errors like this we could first
3310 # check that the next character and skip this call if it is one of
3311 # ' @ % * '. A disadvantage with doing this is that this would
3312 # have to be fixed if the perltidy syntax is ever extended to make
3313 # any of these valid. So for now this check is not done.
3314 scan_simple_identifier();
3316 } ## end sub do_POINTER
3321 # type = 'pp' for pre-increment, '++' for post-increment
3322 if ( $expecting == TERM ) { $type = 'pp' }
3323 elsif ( $expecting == UNKNOWN ) {
3325 my ( $next_nonblank_token, $i_next ) =
3326 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3328 # Fix for c042: look past a side comment
3329 if ( $next_nonblank_token eq '#' ) {
3330 ( $next_nonblank_token, $i_next ) =
3331 find_next_nonblank_token( $max_token_index,
3332 $rtokens, $max_token_index );
3335 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3338 } ## end sub do_PLUS_PLUS
3343 if ( $last_nonblank_type eq $tok ) {
3344 complain("Repeated '=>'s \n");
3347 # patch for operator_expected: note if we are in the list (use.t)
3348 # TODO: make version numbers a new token type
3349 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3351 } ## end sub do_FAT_COMMA
3353 sub do_MINUS_MINUS {
3356 # type = 'mm' for pre-decrement, '--' for post-decrement
3358 if ( $expecting == TERM ) { $type = 'mm' }
3359 elsif ( $expecting == UNKNOWN ) {
3360 my ( $next_nonblank_token, $i_next ) =
3361 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3363 # Fix for c042: look past a side comment
3364 if ( $next_nonblank_token eq '#' ) {
3365 ( $next_nonblank_token, $i_next ) =
3366 find_next_nonblank_token( $max_token_index,
3367 $rtokens, $max_token_index );
3370 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3373 } ## end sub do_MINUS_MINUS
3375 sub do_LOGICAL_AND {
3378 error_if_expecting_TERM()
3379 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3386 error_if_expecting_TERM()
3387 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3391 sub do_SLASH_SLASH {
3394 error_if_expecting_TERM()
3395 if ( $expecting == TERM );
3401 # 'd' = string of digits
3402 error_if_expecting_OPERATOR("Number")
3403 if ( $expecting == OPERATOR );
3405 my $number = scan_number_fast();
3406 if ( !defined($number) ) {
3408 # shouldn't happen - we should always get a number
3411 non-number beginning with digit--program bug
3415 "Unexpected error condition: non-number beginning with digit\n"
3417 report_definite_bug();
3420 } ## end sub do_DIGITS
3422 sub do_ATTRIBUTE_LIST {
3424 my ($next_nonblank_token) = @_;
3426 # Called at a bareword encountered while in an attribute list
3427 # returns 'is_attribute':
3428 # true if attribute found
3429 # false if an attribute (continue parsing bareword)
3431 # treat bare word followed by open paren like qw(
3432 if ( $next_nonblank_token eq '(' ) {
3434 # For something like:
3436 # we should let do_scan_sub see it so that it can see
3437 # the prototype. All other attributes get parsed as a
3439 if ( $tok eq 'prototype' ) {
3440 $id_scan_state = 'prototype';
3442 # start just after the word 'prototype'
3444 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3446 input_line => $input_line,
3451 rtokens => $rtokens,
3452 rtoken_map => $rtoken_map,
3453 id_scan_state => $id_scan_state,
3454 max_token_index => $max_token_index
3458 # If successful, mark as type 'q' to be consistent
3459 # with other attributes. Type 'w' would also work.
3460 if ( $i > $i_beg ) {
3465 # If not successful, continue and parse as a quote.
3468 # All other attribute lists must be parsed as quotes
3469 # (see 'signatures.t' for good examples)
3470 $in_quote = $quote_items{'q'};
3471 $allowed_quote_modifiers = $quote_modifiers{'q'};
3477 # handle bareword not followed by open paren
3483 # attribute not found
3485 } ## end sub do_ATTRIBUTE_LIST
3487 sub do_QUOTED_BAREWORD {
3489 # find type of a bareword followed by a '=>'
3490 if ( $is_constant{$current_package}{$tok} ) {
3493 elsif ( $is_user_function{$current_package}{$tok} ) {
3495 $prototype = $user_function_prototype{$current_package}{$tok};
3497 elsif ( $tok =~ /^v\d+$/ ) {
3499 report_v_string($tok);
3503 # Bareword followed by a fat comma - see 'git18.in'
3504 # If tok is something like 'x17' then it could
3505 # actually be operator x followed by number 17.
3506 # For example, here:
3507 # 123x17 => [ 792, 1224 ],
3508 # (a key of 123 repeated 17 times, perhaps not
3509 # what was intended). We will mark x17 as type
3510 # 'n' and it will be split. If the previous token
3511 # was also a bareword then it is not very clear is
3512 # going on. In this case we will not be sure that
3513 # an operator is expected, so we just mark it as a
3514 # bareword. Perl is a little murky in what it does
3515 # with stuff like this, and its behavior can change
3516 # over time. Something like
3517 # a x18 => [792, 1224], will compile as
3518 # a key with 18 a's. But something like
3519 # push @array, a x18;
3520 # is a syntax error.
3522 $expecting == OPERATOR
3523 && substr( $tok, 0, 1 ) eq 'x'
3524 && ( length($tok) == 1
3525 || substr( $tok, 1, 1 ) =~ /^\d/ )
3529 if ( split_pretoken(1) ) {
3538 error_if_expecting_OPERATOR();
3542 } ## end sub do_QUOTED_BAREWORD
3546 if ( $tok eq 'x' ) {
3547 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3558 # Split a pretoken like 'x10' into 'x' and '10'.
3559 # Note: In previous versions of perltidy it was marked
3560 # as a number, $type = 'n', and fixed downstream by the
3563 if ( split_pretoken(1) ) {
3569 } ## end sub do_X_OPERATOR
3571 sub do_USE_CONSTANT {
3572 scan_bare_identifier();
3573 my ( $next_nonblank_tok2, $i_next2 ) =
3574 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3576 if ($next_nonblank_tok2) {
3578 if ( $is_keyword{$next_nonblank_tok2} ) {
3580 # Assume qw is used as a quote and okay, as in:
3581 # use constant qw{ DEBUG 0 };
3582 # Not worth trying to parse for just a warning
3584 # NOTE: This warning is deactivated because recent
3585 # versions of perl do not complain here, but
3586 # the coding is retained for reference.
3587 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
3589 "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
3595 $is_constant{$current_package}{$next_nonblank_tok2} = 1;
3599 } ## end sub do_USE_CONSTANT
3603 # found a keyword - set any associated flags
3606 # Since for and foreach may not be followed immediately
3607 # by an opening paren, we have to remember which keyword
3608 # is associated with the next '('
3609 if ( $is_for_foreach{$tok} ) {
3610 if ( new_statement_ok() ) {
3615 # recognize 'use' statements, which are special
3616 elsif ( $is_use_require{$tok} ) {
3617 $statement_type = $tok;
3618 error_if_expecting_OPERATOR()
3619 if ( $expecting == OPERATOR );
3622 # remember my and our to check for trailing ": shared"
3623 elsif ( $is_my_our_state{$tok} ) {
3624 $statement_type = $tok;
3627 # Check for misplaced 'elsif' and 'else', but allow isolated
3628 # else or elsif blocks to be formatted. This is indicated
3629 # by a last noblank token of ';'
3630 elsif ( $tok eq 'elsif' ) {
3632 $last_nonblank_token ne ';'
3634 ## !~ /^(if|elsif|unless)$/
3635 && !$is_if_elsif_unless{$last_nonblank_block_type}
3639 "expecting '$tok' to follow one of 'if|elsif|unless'\n");
3642 elsif ( $tok eq 'else' ) {
3644 # patched for SWITCH/CASE
3646 $last_nonblank_token ne ';'
3648 ## !~ /^(if|elsif|unless|case|when)$/
3649 && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
3651 # patch to avoid an unwanted error message for
3652 # the case of a parenless 'case' (RT 105484):
3653 # switch ( 1 ) { case x { 2 } else { } }
3654 ## !~ /^(if|elsif|unless|case|when)$/
3655 && !$is_if_elsif_unless_case_when{$statement_type}
3659 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3663 elsif ( $tok eq 'continue' ) {
3664 if ( $last_nonblank_token ne ';'
3665 && $last_nonblank_block_type !~
3666 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
3669 # note: ';' '{' and '}' in list above
3670 # because continues can follow bare blocks;
3671 # ':' is labeled block
3673 ############################################
3674 # NOTE: This check has been deactivated because
3675 # continue has an alternative usage for given/when
3676 # blocks in perl 5.10
3677 ## warning("'$tok' should follow a block\n");
3678 ############################################
3682 # patch for SWITCH/CASE if 'case' and 'when are
3683 # treated as keywords. Also 'default' for Switch::Plain
3684 elsif ($tok eq 'when'
3686 || $tok eq 'default' )
3688 $statement_type = $tok; # next '{' is block
3692 # indent trailing if/unless/while/until
3693 # outdenting will be handled by later indentation loop
3694 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
3706 ## if ( $tok =~ /^(if|unless|while|until)$/
3707 ## && $next_nonblank_token ne '(' )
3709 ## $indent_flag = 1;
3712 } ## end sub do_KEYWORD
3714 sub do_QUOTE_OPERATOR {
3716 if ( $expecting == OPERATOR ) {
3718 # Be careful not to call an error for a qw quote
3719 # where a parenthesized list is allowed. For example,
3720 # it could also be a for/foreach construct such as
3722 # foreach my $key qw\Uno Due Tres Quadro\ {
3723 # print "Set $key\n";
3727 # Or it could be a function call.
3728 # NOTE: Braces in something like &{ xxx } are not
3729 # marked as a block, we might have a method call.
3730 # &method(...), $method->(..), &{method}(...),
3731 # $ref[2](list) is ok & short for $ref[2]->(list)
3733 # See notes in 'sub code_block_type' and
3734 # 'sub is_non_structural_brace'
3738 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3739 || $is_for_foreach{$want_paren} )
3742 error_if_expecting_OPERATOR();
3745 $in_quote = $quote_items{$tok};
3746 $allowed_quote_modifiers = $quote_modifiers{$tok};
3748 # All quote types are 'Q' except possibly qw quotes.
3749 # qw quotes are special in that they may generally be trimmed
3750 # of leading and trailing whitespace. So they are given a
3751 # separate type, 'q', unless requested otherwise.
3753 ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3756 $quote_type = $type;
3758 } ## end sub do_QUOTE_OPERATOR
3760 sub do_UNKNOWN_BAREWORD {
3762 my ($next_nonblank_token) = @_;
3764 scan_bare_identifier();
3766 if ( $statement_type eq 'use'
3767 && $last_nonblank_token eq 'use' )
3769 $saw_use_module{$current_package}->{$tok} = 1;
3772 if ( $type eq 'w' ) {
3774 if ( $expecting == OPERATOR ) {
3776 # Patch to avoid error message for RPerl overloaded
3777 # operator functions: use overload
3782 # FIXME: this should eventually be generalized
3783 if ( $saw_use_module{$current_package}->{'RPerl'}
3784 && $tok =~ /^sse_(mul|div|add|sub)$/ )
3789 # Fix part 1 for git #63 in which a comment falls
3790 # between an -> and the following word. An
3791 # alternate fix would be to change operator_expected
3792 # to return an UNKNOWN for this type.
3793 elsif ( $last_nonblank_type eq '->' ) {
3797 # don't complain about possible indirect object
3801 # sub new($) { ... }
3802 # $b = new A::; # calls A::new
3803 # $c = new A; # same thing but suspicious
3804 # This will call A::new but we have a 'new' in
3805 # main:: which looks like a constant.
3807 elsif ( $last_nonblank_type eq 'C' ) {
3808 if ( $tok !~ /::$/ ) {
3810 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3811 Maybe indirectet object notation?
3816 error_if_expecting_OPERATOR("bareword");
3820 # mark bare words immediately followed by a paren as
3822 $next_tok = $rtokens->[ $i + 1 ];
3823 if ( $next_tok eq '(' ) {
3825 # Fix part 2 for git #63. Leave type as 'w' to keep
3826 # the type the same as if the -> were not separated
3827 $type = 'U' unless ( $last_nonblank_type eq '->' );
3830 # underscore after file test operator is file handle
3831 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3835 # patch for SWITCH/CASE if 'case' and 'when are
3836 # not treated as keywords:
3838 ( $tok eq 'case' && $brace_type[$brace_depth] eq 'switch' )
3840 && $brace_type[$brace_depth] eq 'given' )
3843 $statement_type = $tok; # next '{' is block
3844 $type = 'k'; # for keyword syntax coloring
3847 # patch for SWITCH/CASE if switch and given not keywords
3848 # Switch is not a perl 5 keyword, but we will gamble
3849 # and mark switch followed by paren as a keyword. This
3850 # is only necessary to get html syntax coloring nice,
3851 # and does not commit this as being a switch/case.
3852 if ( $next_nonblank_token eq '('
3853 && ( $tok eq 'switch' || $tok eq 'given' ) )
3855 $type = 'k'; # for keyword syntax coloring
3859 } ## end sub do_UNKNOWN_BAREWORD
3861 sub sub_attribute_ok_here {
3863 my ( $tok_kw, $next_nonblank_token, $i_next ) = @_;
3865 # Decide if 'sub :' can be the start of a sub attribute list.
3866 # We will decide based on if the colon is followed by a
3867 # bareword which is not a keyword.
3868 # Changed inext+1 to inext to fixed case b1190.
3869 my $sub_attribute_ok_here;
3870 if ( $is_sub{$tok_kw}
3871 && $expecting != OPERATOR
3872 && $next_nonblank_token eq ':' )
3874 my ( $nn_nonblank_token, $i_nn ) =
3875 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
3876 $sub_attribute_ok_here =
3877 $nn_nonblank_token =~ /^\w/
3878 && $nn_nonblank_token !~ /^\d/
3879 && !$is_keyword{$nn_nonblank_token};
3881 return $sub_attribute_ok_here;
3882 } ## end sub sub_attribute_ok_here
3886 my ($is_END_or_DATA) = @_;
3888 # handle a bareword token:
3890 # true if this token ends the current line
3893 # Patch for c043, part 3: A bareword after '->' expects a TERM
3894 # FIXME: It would be cleaner to give method calls a new type 'M'
3895 # and update sub operator_expected to handle this.
3896 if ( $last_nonblank_type eq '->' ) {
3900 my ( $next_nonblank_token, $i_next ) =
3901 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3903 # a bare word immediately followed by :: is not a keyword;
3904 # use $tok_kw when testing for keywords to avoid a mistake
3906 if ( $rtokens->[ $i + 1 ] eq ':'
3907 && $rtokens->[ $i + 2 ] eq ':' )
3912 if ($in_attribute_list) {
3913 my $is_attribute = do_ATTRIBUTE_LIST($next_nonblank_token);
3914 return if ($is_attribute);
3917 #----------------------------------------
3918 # Starting final if-elsif- chain of tests
3919 #----------------------------------------
3921 # This is the return flag:
3922 # true => this is the last token on the line
3923 # false => keep tokenizing the line
3926 # The following blocks of code must update these vars:
3927 # $type - the final token type, must always be set
3929 # In addition, if additional pretokens are added:
3930 # $tok - the final token
3931 # $i - the index of the last pretoken
3933 # They may also need to check and set various flags
3935 # Quote a word followed by => operator
3936 # unless the word __END__ or __DATA__ and the only word on
3938 if ( !$is_END_or_DATA
3939 && $next_nonblank_token eq '='
3940 && $rtokens->[ $i_next + 1 ] eq '>' )
3942 do_QUOTED_BAREWORD();
3945 # quote a bare word within braces..like xxx->{s}; note that we
3946 # must be sure this is not a structural brace, to avoid
3947 # mistaking {s} in the following for a quoted bare word:
3948 # for(@[){s}bla}BLA}
3949 # Also treat q in something like var{-q} as a bare word, not
3952 $next_nonblank_token eq '}'
3954 $last_nonblank_type eq 'L'
3955 || ( $last_nonblank_type eq 'm'
3956 && $last_last_nonblank_type eq 'L' )
3963 # Scan a bare word following a -> as an identifier; it could
3964 # have a long package name. Fixes c037, c041.
3965 elsif ( $last_nonblank_token eq '->' ) {
3966 scan_bare_identifier();
3968 # Patch for c043, part 4; use type 'w' after a '->'.
3969 # This is just a safety check on sub scan_bare_identifier,
3970 # which should get this case correct.
3974 # handle operator x (now we know it isn't $x=)
3976 $expecting == OPERATOR
3977 && substr( $tok, 0, 1 ) eq 'x'
3978 && ( length($tok) == 1
3979 || substr( $tok, 1, 1 ) =~ /^\d/ )
3984 elsif ( $tok_kw eq 'CORE::' ) {
3985 $type = $tok = $tok_kw;
3988 elsif ( ( $tok eq 'strict' )
3989 and ( $last_nonblank_token eq 'use' ) )
3991 $tokenizer_self->[_saw_use_strict_] = 1;
3992 scan_bare_identifier();
3995 elsif ( ( $tok eq 'warnings' )
3996 and ( $last_nonblank_token eq 'use' ) )
3998 $tokenizer_self->[_saw_perl_dash_w_] = 1;
4000 # scan as identifier, so that we pick up something like:
4001 # use warnings::register
4002 scan_bare_identifier();
4006 $tok eq 'AutoLoader'
4007 && $tokenizer_self->[_look_for_autoloader_]
4009 $last_nonblank_token eq 'use'
4011 # these regexes are from AutoSplit.pm, which we want
4013 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4014 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4018 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4019 $tokenizer_self->[_saw_autoloader_] = 1;
4020 $tokenizer_self->[_look_for_autoloader_] = 0;
4021 scan_bare_identifier();
4025 $tok eq 'SelfLoader'
4026 && $tokenizer_self->[_look_for_selfloader_]
4027 && ( $last_nonblank_token eq 'use'
4028 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4029 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4032 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4033 $tokenizer_self->[_saw_selfloader_] = 1;
4034 $tokenizer_self->[_look_for_selfloader_] = 0;
4035 scan_bare_identifier();
4038 elsif ( ( $tok eq 'constant' )
4039 and ( $last_nonblank_token eq 'use' ) )
4044 # various quote operators
4045 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4046 do_QUOTE_OPERATOR();
4049 # check for a statement label
4051 ( $next_nonblank_token eq ':' )
4052 && ( $rtokens->[ $i_next + 1 ] ne ':' )
4053 && ( $i_next <= $max_token_index ) # colon on same line
4055 # like 'sub : lvalue' ?
4056 ##&& !$sub_attribute_ok_here # like 'sub : lvalue' ?
4057 && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
4061 if ( $tok !~ /[A-Z]/ ) {
4062 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4071 elsif ( $is_sub{$tok_kw} ) {
4072 error_if_expecting_OPERATOR()
4073 if ( $expecting == OPERATOR );
4074 initialize_subname();
4079 elsif ( $is_package{$tok_kw} ) {
4080 error_if_expecting_OPERATOR()
4081 if ( $expecting == OPERATOR );
4085 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4086 # more restrictive. Require a new statement to be ok here.
4087 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4088 $type = ';'; # make tokenizer look for TERM next
4089 $tokenizer_self->[_in_format_] = 1;
4090 $is_last = 1; ## is last token on this line
4093 # Note on token types for format, __DATA__, __END__:
4094 # It simplifies things to give these type ';', so that when we
4095 # start rescanning we will be expecting a token of type TERM.
4096 # We will switch to type 'k' before outputting the tokens.
4097 elsif ( $is_END_DATA{$tok_kw} ) {
4098 $type = ';'; # make tokenizer look for TERM next
4100 # Remember that we are in one of these three sections
4101 $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4102 $is_last = 1; ## is last token on this line
4105 elsif ( $is_keyword{$tok_kw} ) {
4109 # check for inline label following
4110 # /^(redo|last|next|goto)$/
4111 elsif (( $last_nonblank_type eq 'k' )
4112 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4119 do_UNKNOWN_BAREWORD($next_nonblank_token);
4124 } ## end sub do_BAREWORD
4126 sub do_FOLLOW_QUOTE {
4128 # Continue following a quote on a new line
4129 $type = $quote_type;
4131 unless ( @{$routput_token_list} ) { # initialize if continuation line
4132 push( @{$routput_token_list}, $i );
4133 $routput_token_type->[$i] = $type;
4137 # Removed to fix b1280. This is not needed and was causing the
4138 # starting type 'qw' to be lost, leading to mis-tokenization of
4139 # a trailing block brace in a parenless for stmt 'for .. qw.. {'
4140 ##$tok = $quote_character if ($quote_character);
4142 # scan for the end of the quote or pattern
4144 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
4145 $quoted_string_1, $quoted_string_2
4148 $i, $in_quote, $quote_character,
4149 $quote_pos, $quote_depth, $quoted_string_1,
4150 $quoted_string_2, $rtokens, $rtoken_map,
4154 # all done if we didn't find it
4155 if ($in_quote) { return }
4157 # save pattern and replacement text for rescanning
4158 my $qs1 = $quoted_string_1;
4160 # re-initialize for next search
4161 $quote_character = EMPTY_STRING;
4164 $quoted_string_1 = EMPTY_STRING;
4165 $quoted_string_2 = EMPTY_STRING;
4166 if ( ++$i > $max_token_index ) { return }
4168 # look for any modifiers
4169 if ($allowed_quote_modifiers) {
4171 # check for exact quote modifiers
4172 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4173 my $str = $rtokens->[$i];
4175 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4176 my $pos = pos($str);
4177 my $char = substr( $str, $pos - 1, 1 );
4178 $saw_modifier_e ||= ( $char eq 'e' );
4181 # For an 'e' quote modifier we must scan the replacement
4182 # text for here-doc targets...
4183 # but if the modifier starts a new line we can skip
4184 # this because either the here doc will be fully
4185 # contained in the replacement text (so we can
4186 # ignore it) or Perl will not find it.
4187 # See test 'here2.in'.
4188 if ( $saw_modifier_e && $i_tok >= 0 ) {
4190 my $rht = scan_replacement_text($qs1);
4192 # Change type from 'Q' to 'h' for quotes with
4193 # here-doc targets so that the formatter (see sub
4194 # process_line_of_CODE) will not make any line
4195 # breaks after this point.
4197 push @{$rhere_target_list}, @{$rht};
4200 my $ilast = $routput_token_list->[-1];
4201 $routput_token_type->[$ilast] = $type;
4206 if ( defined( pos($str) ) ) {
4209 if ( pos($str) == length($str) ) {
4210 if ( ++$i > $max_token_index ) { return }
4213 # Looks like a joined quote modifier
4214 # and keyword, maybe something like
4215 # s/xxx/yyy/gefor @k=...
4216 # Example is "galgen.pl". Would have to split
4217 # the word and insert a new token in the
4218 # pre-token list. This is so rare that I haven't
4219 # done it. Will just issue a warning citation.
4221 # This error might also be triggered if my quote
4222 # modifier characters are incomplete
4226 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4227 Please put a space between quote modifiers and trailing keywords.
4230 # print "token $rtokens->[$i]\n";
4231 # my $num = length($str) - pos($str);
4232 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4233 # print "continuing with new token $rtokens->[$i]\n";
4235 # skipping past this token does least damage
4236 if ( ++$i > $max_token_index ) { return }
4241 # example file: rokicki4.pl
4242 # This error might also be triggered if my quote
4243 # modifier characters are incomplete
4244 write_logfile_entry(
4245 "Note: found word $str at quote modifier location\n");
4250 $allowed_quote_modifiers = EMPTY_STRING;
4253 } ## end sub do_FOLLOW_QUOTE
4255 # ------------------------------------------------------------
4256 # begin hash of code for handling most token types
4257 # ------------------------------------------------------------
4258 my $tokenization_code = {
4260 '>' => \&do_GREATER_THAN_SIGN,
4261 '|' => \&do_VERTICAL_LINE,
4262 '$' => \&do_DOLLAR_SIGN,
4263 '(' => \&do_LEFT_PARENTHESIS,
4264 ')' => \&do_RIGHT_PARENTHESIS,
4266 ';' => \&do_SEMICOLON,
4267 '"' => \&do_QUOTATION_MARK,
4268 "'" => \&do_APOSTROPHE,
4269 '`' => \&do_BACKTICK,
4271 '{' => \&do_LEFT_CURLY_BRACKET,
4272 '}' => \&do_RIGHT_CURLY_BRACKET,
4273 '&' => \&do_AMPERSAND,
4274 '<' => \&do_LESS_THAN_SIGN,
4275 '?' => \&do_QUESTION_MARK,
4279 '+' => \&do_PLUS_SIGN,
4280 '@' => \&do_AT_SIGN,
4281 '%' => \&do_PERCENT_SIGN,
4282 '[' => \&do_LEFT_SQUARE_BRACKET,
4283 ']' => \&do_RIGHT_SQUARE_BRACKET,
4284 '-' => \&do_MINUS_SIGN,
4285 '^' => \&do_CARAT_SIGN,
4286 '::' => \&do_DOUBLE_COLON,
4287 '<<' => \&do_LEFT_SHIFT,
4288 '<<~' => \&do_NEW_HERE_DOC,
4289 '->' => \&do_POINTER,
4290 '++' => \&do_PLUS_PLUS,
4291 '=>' => \&do_FAT_COMMA,
4292 '--' => \&do_MINUS_MINUS,
4293 '&&' => \&do_LOGICAL_AND,
4294 '||' => \&do_LOGICAL_OR,
4295 '//' => \&do_SLASH_SLASH,
4297 # No special code for these types yet, but syntax checks
4332 # ------------------------------------------------------------
4333 # end hash of code for handling individual token types
4334 # ------------------------------------------------------------
4336 use constant DEBUG_TOKENIZE => 0;
4338 sub tokenize_this_line {
4340 # This routine breaks a line of perl code into tokens which are of use in
4341 # indentation and reformatting. One of my goals has been to define tokens
4342 # such that a newline may be inserted between any pair of tokens without
4343 # changing or invalidating the program. This version comes close to this,
4344 # although there are necessarily a few exceptions which must be caught by
4345 # the formatter. Many of these involve the treatment of bare words.
4347 # The tokens and their types are returned in arrays. See previous
4348 # routine for their names.
4350 # See also the array "valid_token_types" in the BEGIN section for an
4353 # To simplify things, token types are either a single character, or they
4354 # are identical to the tokens themselves.
4356 # As a debugging aid, the -D flag creates a file containing a side-by-side
4357 # comparison of the input string and its tokenization for each line of a file.
4358 # This is an invaluable debugging aid.
4360 # In addition to tokens, and some associated quantities, the tokenizer
4361 # also returns flags indication any special line types. These include
4362 # quotes, here_docs, formats.
4364 # -----------------------------------------------------------------------
4366 # How to add NEW_TOKENS:
4368 # New token types will undoubtedly be needed in the future both to keep up
4369 # with changes in perl and to help adapt the tokenizer to other applications.
4371 # Here are some notes on the minimal steps. I wrote these notes while
4372 # adding the 'v' token type for v-strings, which are things like version
4373 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
4374 # can use your editor to search for the string "NEW_TOKENS" to find the
4375 # appropriate sections to change):
4377 # *. Try to talk somebody else into doing it! If not, ..
4379 # *. Make a backup of your current version in case things don't work out!
4381 # *. Think of a new, unused character for the token type, and add to
4382 # the array @valid_token_types in the BEGIN section of this package.
4383 # For example, I used 'v' for v-strings.
4385 # *. Implement coding to recognize the $type of the token in this routine.
4386 # This is the hardest part, and is best done by imitating or modifying
4387 # some of the existing coding. For example, to recognize v-strings, I
4388 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4389 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4391 # *. Update sub operator_expected. This update is critically important but
4392 # the coding is trivial. Look at the comments in that routine for help.
4393 # For v-strings, which should behave like numbers, I just added 'v' to the
4394 # regex used to handle numbers and strings (types 'n' and 'Q').
4396 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4397 # Perl::Tidy::Formatter for breaking lines around this token type. You can
4398 # skip this step and take the default at first, then adjust later to get
4399 # desired results. For adding type 'v', I looked at sub bond_strength and
4400 # saw that number type 'n' was using default strengths, so I didn't do
4401 # anything. I may tune it up someday if I don't like the way line
4402 # breaks with v-strings look.
4404 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4405 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
4406 # and saw that type 'n' used spaces on both sides, so I just added 'v'
4407 # to the array @spaces_both_sides.
4409 # *. Update HtmlWriter package so that users can colorize the token as
4410 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
4411 # that package. For v-strings, I initially chose to use a default color
4412 # equal to the default for numbers, but it might be nice to change that
4415 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4417 # *. Run lots and lots of debug tests. Start with special files designed
4418 # to test the new token type. Run with the -D flag to create a .DEBUG
4419 # file which shows the tokenization. When these work ok, test as many old
4420 # scripts as possible. Start with all of the '.t' files in the 'test'
4421 # directory of the distribution file. Compare .tdy output with previous
4422 # version and updated version to see the differences. Then include as
4423 # many more files as possible. My own technique has been to collect a huge
4424 # number of perl scripts (thousands!) into one directory and run perltidy
4425 # *, then run diff between the output of the previous version and the
4428 # *. For another example, search for the smartmatch operator '~~'
4429 # with your editor to see where updates were made for it.
4431 # -----------------------------------------------------------------------
4433 my $line_of_tokens = shift;
4434 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4436 # patch while coding change is underway
4437 # make callers private data to allow access
4438 # $tokenizer_self = $caller_tokenizer_self;
4440 # extract line number for use in error messages
4441 $input_line_number = $line_of_tokens->{_line_number};
4443 # reinitialize for multi-line quote
4444 $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
4446 # check for pod documentation
4447 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4448 && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4451 # must not be in multi-line quote
4452 # and must not be in an equation
4454 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4456 $tokenizer_self->[_in_pod_] = 1;
4461 $input_line = $untrimmed_input_line;
4465 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4466 # This will be used below to avoid quoting a bare word followed by
4470 # trim start of this line unless we are continuing a quoted line
4471 # do not trim end because we might end in a quote (test: deken4.pl)
4472 # Perl::Tidy::Formatter will delete needless trailing blanks
4473 unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
4474 $input_line =~ s/^(\s+)//; # trim left end
4476 # calculate a guessed level for nonblank lines to avoid calls to
4477 # sub guess_old_indentation_level()
4478 if ( $input_line && $1 ) {
4479 my $leading_spaces = $1;
4480 my $spaces = length($leading_spaces);
4482 # handle leading tabs
4483 if ( ord( substr( $leading_spaces, 0, 1 ) ) == 9
4484 && $leading_spaces =~ /^(\t+)/ )
4486 my $tabsize = $tokenizer_self->[_tabsize_];
4487 $spaces += length($1) * ( $tabsize - 1 );
4490 my $indent_columns = $tokenizer_self->[_indent_columns_];
4491 $line_of_tokens->{_guessed_indentation_level} =
4492 int( $spaces / $indent_columns );
4495 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4496 && $input_line =~ /^__(END|DATA)__\s*$/;
4499 # update the copy of the line for use in error messages
4500 # This must be exactly what we give the pre_tokenizer
4501 $tokenizer_self->[_line_of_text_] = $input_line;
4503 # re-initialize for the main loop
4504 $routput_token_list = []; # stack of output token indexes
4505 $routput_token_type = []; # token types
4506 $routput_block_type = []; # types of code block
4507 $routput_container_type = []; # paren types, such as if, elsif, ..
4508 $routput_type_sequence = []; # nesting sequential number
4510 $rhere_target_list = [];
4512 $tok = $last_nonblank_token;
4513 $type = $last_nonblank_type;
4514 $prototype = $last_nonblank_prototype;
4515 $last_nonblank_i = -1;
4516 $block_type = $last_nonblank_block_type;
4517 $container_type = $last_nonblank_container_type;
4518 $type_sequence = $last_nonblank_type_sequence;
4522 # This variable signals pre_tokenize to get all tokens.
4523 # But note that it is no longer needed with fast block comment
4525 my $max_tokens_wanted = 0;
4527 # optimize for a full-line comment
4528 if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
4529 $max_tokens_wanted = 1; # no use tokenizing a comment
4531 # and check for skipped section
4532 if ( $rOpts_code_skipping
4533 && $input_line =~ /$code_skipping_pattern_begin/ )
4535 $tokenizer_self->[_in_skipped_] = 1;
4539 # Optional fast processing of a block comment
4541 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4542 my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4543 $line_of_tokens->{_line_type} = 'CODE';
4544 $line_of_tokens->{_rtokens} = [$input_line];
4545 $line_of_tokens->{_rtoken_type} = ['#'];
4546 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
4547 $line_of_tokens->{_rci_levels} = [$ci_string_i];
4548 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
4549 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4550 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4554 tokenizer_main_loop( $max_tokens_wanted, $is_END_or_DATA );
4556 #-----------------------------------------------
4557 # all done tokenizing this line ...
4558 # now prepare the final list of tokens and types
4559 #-----------------------------------------------
4561 tokenizer_wrapup_line($line_of_tokens);
4564 } ## end sub tokenize_this_line
4566 sub tokenizer_main_loop {
4567 my ( $max_tokens_wanted, $is_END_or_DATA ) = @_;
4569 # tokenization is done in two stages..
4570 # stage 1 is a very simple pre-tokenization
4572 # start by breaking the line into pre-tokens
4573 ( $rtokens, $rtoken_map, $rtoken_type ) =
4574 pre_tokenize( $input_line, $max_tokens_wanted );
4576 $max_token_index = scalar( @{$rtokens} ) - 1;
4577 push( @{$rtokens}, SPACE, SPACE, SPACE )
4578 ; # extra whitespace simplifies logic
4579 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
4580 push( @{$rtoken_type}, 'b', 'b', 'b' );
4582 # initialize for main loop
4583 if (0) { #<<< this is not necessary
4584 foreach my $ii ( 0 .. $max_token_index + 3 ) {
4585 $routput_token_type->[$ii] = EMPTY_STRING;
4586 $routput_block_type->[$ii] = EMPTY_STRING;
4587 $routput_container_type->[$ii] = EMPTY_STRING;
4588 $routput_type_sequence->[$ii] = EMPTY_STRING;
4589 $routput_indent_flag->[$ii] = 0;
4596 # ------------------------------------------------------------
4597 # begin main tokenization loop
4598 # ------------------------------------------------------------
4600 # we are looking at each pre-token of one line and combining them
4602 while ( ++$i <= $max_token_index ) {
4604 # continue looking for the end of a quote
4607 last if ( $in_quote || $i > $max_token_index );
4610 if ( $type ne 'b' && $tok ne 'CORE::' ) {
4612 # try to catch some common errors
4613 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
4615 if ( $last_nonblank_token eq 'eq' ) {
4616 complain("Should 'eq' be '==' here ?\n");
4618 elsif ( $last_nonblank_token eq 'ne' ) {
4619 complain("Should 'ne' be '!=' here ?\n");
4623 # fix c090, only rotate vars if a new token will be stored
4624 if ( $i_tok >= 0 ) {
4625 $last_last_nonblank_token = $last_nonblank_token;
4626 $last_last_nonblank_type = $last_nonblank_type;
4627 $last_last_nonblank_block_type = $last_nonblank_block_type;
4628 $last_last_nonblank_container_type =
4629 $last_nonblank_container_type;
4630 $last_last_nonblank_type_sequence =
4631 $last_nonblank_type_sequence;
4633 # Fix part #3 for git82: propagate type 'Z' though L-R pair
4634 unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
4635 $last_nonblank_token = $tok;
4636 $last_nonblank_type = $type;
4638 $last_nonblank_prototype = $prototype;
4639 $last_nonblank_block_type = $block_type;
4640 $last_nonblank_container_type = $container_type;
4641 $last_nonblank_type_sequence = $type_sequence;
4642 $last_nonblank_i = $i_tok;
4645 # Patch for c030: Fix things in case a '->' got separated from
4646 # the subsequent identifier by a side comment. We need the
4647 # last_nonblank_token to have a leading -> to avoid triggering
4648 # an operator expected error message at the next '('. See also
4650 if ( $last_last_nonblank_token eq '->' ) {
4651 if ( $last_nonblank_type eq 'w'
4652 || $last_nonblank_type eq 'i'
4653 && substr( $last_nonblank_token, 0, 1 ) eq '$' )
4655 $last_nonblank_token = '->' . $last_nonblank_token;
4656 $last_nonblank_type = 'i';
4661 # store previous token type
4662 if ( $i_tok >= 0 ) {
4663 $routput_token_type->[$i_tok] = $type;
4664 $routput_block_type->[$i_tok] = $block_type;
4665 $routput_container_type->[$i_tok] = $container_type;
4666 $routput_type_sequence->[$i_tok] = $type_sequence;
4667 $routput_indent_flag->[$i_tok] = $indent_flag;
4670 # get the next pre-token and type
4671 # $tok and $type will be modified to make the output token
4672 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
4673 my $pre_type = $type = $rtoken_type->[$i]; # and type
4675 # remember the starting index of this token; we will be updating $i
4678 # re-initialize various flags for the next output token
4679 $block_type &&= EMPTY_STRING;
4680 $container_type &&= EMPTY_STRING;
4681 $type_sequence &&= EMPTY_STRING;
4683 $prototype &&= EMPTY_STRING;
4685 # this pre-token will start an output token
4686 push( @{$routput_token_list}, $i_tok );
4688 #--------------------------
4689 # handle a whitespace token
4690 #--------------------------
4691 next if ( $pre_type eq 'b' );
4696 last if ( $pre_type eq '#' );
4698 # continue gathering identifier if necessary
4699 if ($id_scan_state) {
4701 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
4708 if ($id_scan_state) {
4710 # Still scanning ...
4711 # Check for side comment between sub and prototype (c061)
4713 # done if nothing left to scan on this line
4714 last if ( $i > $max_token_index );
4716 my ( $next_nonblank_token, $i_next ) =
4717 find_next_nonblank_token_on_this_line( $i, $rtokens,
4720 # done if it was just some trailing space
4721 last if ( $i_next > $max_token_index );
4723 # something remains on the line ... must be a side comment
4727 next if ( ( $i > 0 ) || $type );
4729 # didn't find any token; start over
4734 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE;
4735 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
4737 #-----------------------------------------------------------
4738 # Combine pre-tokens into digraphs and trigraphs if possible
4739 #-----------------------------------------------------------
4741 # See if we can make a digraph...
4742 # The following tokens are excluded and handled specially:
4743 # '/=' is excluded because the / might start a pattern.
4744 # 'x=' is excluded since it might be $x=, with $ on previous line
4745 # '**' and *= might be typeglobs of punctuation variables
4746 # I have allowed tokens starting with <, such as <=,
4747 # because I don't think these could be valid angle operators.
4748 # test file: storrs4.pl
4749 if ( $can_start_digraph{$tok}
4750 && $i < $max_token_index
4751 && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
4755 my $test_tok = $tok . $rtokens->[ $i + 1 ];
4757 # check for special cases which cannot be combined
4759 # '//' must be defined_or operator if an operator is expected.
4760 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
4761 # could be migrated here for clarity
4763 # Patch for RT#102371, misparsing a // in the following snippet:
4764 # state $b //= ccc();
4765 # The solution is to always accept the digraph (or trigraph)
4766 # after type 'Z' (possible file handle). The reason is that
4767 # sub operator_expected gives TERM expected here, which is
4768 # wrong in this case.
4769 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
4771 # note that here $tok = '/' and the next tok and type is '/'
4772 $expecting = operator_expected( [ $prev_type, $tok, '/' ] );
4774 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
4775 $combine_ok = 0 if ( $expecting == TERM );
4778 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
4779 # Accept the digraphs '**' only after type 'Z'
4780 # Otherwise postpone the decision.
4781 if ( $test_tok eq '**' ) {
4782 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
4787 # still ok to combine?
4790 && ( $test_tok ne '/=' ) # might be pattern
4791 && ( $test_tok ne 'x=' ) # might be $x
4792 && ( $test_tok ne '*=' ) # typeglob?
4794 # Moved above as part of fix for
4795 # RT #114359: Missparsing of "print $x ** 0.5;
4796 # && ( $test_tok ne '**' ) # typeglob?
4802 # Now try to assemble trigraphs. Note that all possible
4803 # perl trigraphs can be constructed by appending a character
4805 $test_tok = $tok . $rtokens->[ $i + 1 ];
4807 if ( $is_trigraph{$test_tok} ) {
4812 # The only current tetragraph is the double diamond operator
4813 # and its first three characters are not a trigraph, so
4814 # we do can do a special test for it
4815 elsif ( $test_tok eq '<<>' ) {
4816 $test_tok .= $rtokens->[ $i + 2 ];
4817 if ( $is_tetragraph{$test_tok} ) {
4826 $next_tok = $rtokens->[ $i + 1 ];
4827 $next_type = $rtoken_type->[ $i + 1 ];
4829 DEBUG_TOKENIZE && do {
4830 local $LIST_SEPARATOR = ')(';
4832 $last_nonblank_token, $tok,
4833 $next_tok, $brace_depth,
4834 $brace_type[$brace_depth], $paren_depth,
4835 $paren_type[$paren_depth],
4837 print STDOUT "TOKENIZE:(@debug_list)\n";
4840 # Turn off attribute list on first non-blank, non-bareword.
4841 # Added '#' to fix c038 (later moved above).
4842 if ( $in_attribute_list && $pre_type ne 'w' ) {
4843 $in_attribute_list = 0;
4846 ###############################################################
4847 # We have the next token, $tok.
4848 # Now we have to examine this token and decide what it is
4849 # and define its $type
4851 # section 1: bare words
4852 ###############################################################
4854 if ( $pre_type eq 'w' ) {
4856 operator_expected( [ $prev_type, $tok, $next_type ] );
4857 my $is_last = do_BAREWORD($is_END_or_DATA);
4861 ###############################################################
4862 # section 2: strings of digits
4863 ###############################################################
4864 elsif ( $pre_type eq 'd' ) {
4866 operator_expected( [ $prev_type, $tok, $next_type ] );
4870 ###############################################################
4871 # section 3: all other tokens
4872 ###############################################################
4874 my $code = $tokenization_code->{$tok};
4877 operator_expected( [ $prev_type, $tok, $next_type ] );
4884 # -----------------------------
4885 # end of main tokenization loop
4886 # -----------------------------
4888 # Store the final token
4889 if ( $i_tok >= 0 ) {
4890 $routput_token_type->[$i_tok] = $type;
4891 $routput_block_type->[$i_tok] = $block_type;
4892 $routput_container_type->[$i_tok] = $container_type;
4893 $routput_type_sequence->[$i_tok] = $type_sequence;
4894 $routput_indent_flag->[$i_tok] = $indent_flag;
4897 # Remember last nonblank values
4898 unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
4899 $last_last_nonblank_token = $last_nonblank_token;
4900 $last_last_nonblank_type = $last_nonblank_type;
4901 $last_last_nonblank_block_type = $last_nonblank_block_type;
4902 $last_last_nonblank_container_type = $last_nonblank_container_type;
4903 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
4904 $last_nonblank_token = $tok;
4905 $last_nonblank_type = $type;
4906 $last_nonblank_block_type = $block_type;
4907 $last_nonblank_container_type = $container_type;
4908 $last_nonblank_type_sequence = $type_sequence;
4909 $last_nonblank_prototype = $prototype;
4912 # reset indentation level if necessary at a sub or package
4913 # in an attempt to recover from a nesting error
4914 if ( $level_in_tokenizer < 0 ) {
4915 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4916 reset_indentation_level(0);
4917 brace_warning("resetting level to 0 at $1 $2\n");
4921 $tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
4922 $tokenizer_self->[_in_quote_] = $in_quote;
4923 $tokenizer_self->[_quote_target_] =
4924 $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
4925 $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
4928 } ## end sub tokenizer_main_loop
4930 sub tokenizer_wrapup_line {
4931 my ($line_of_tokens) = @_;
4933 # We have broken the current line into tokens. Now we have to wrap up
4934 # the result for shipping. Most of the remaining work involves
4935 # defining the various indentation parameters that the formatter needs
4936 # (indentation level and continuation indentation). This turns out to
4937 # be somewhat complicated.
4939 my @token_type = (); # stack of output token types
4940 my @block_type = (); # stack of output code block types
4941 my @type_sequence = (); # stack of output type sequence numbers
4942 my @tokens = (); # output tokens
4943 my @levels = (); # structural brace levels of output tokens
4944 my @ci_string = (); # string needed to compute continuation indentation
4945 my $container_environment = EMPTY_STRING;
4946 my $im = -1; # previous $i value
4949 # Count the number of '1's in the string (previously sub ones_count)
4950 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4952 # Computing Token Indentation
4954 # The final section of the tokenizer forms tokens and also computes
4955 # parameters needed to find indentation. It is much easier to do it
4956 # in the tokenizer than elsewhere. Here is a brief description of how
4957 # indentation is computed. Perl::Tidy computes indentation as the sum
4960 # (1) structural indentation, such as if/else/elsif blocks
4961 # (2) continuation indentation, such as long parameter call lists.
4963 # These are occasionally called primary and secondary indentation.
4965 # Structural indentation is introduced by tokens of type '{', although
4966 # the actual tokens might be '{', '(', or '['. Structural indentation
4967 # is of two types: BLOCK and non-BLOCK. Default structural indentation
4968 # is 4 characters if the standard indentation scheme is used.
4970 # Continuation indentation is introduced whenever a line at BLOCK level
4971 # is broken before its termination. Default continuation indentation
4972 # is 2 characters in the standard indentation scheme.
4974 # Both types of indentation may be nested arbitrarily deep and
4975 # interlaced. The distinction between the two is somewhat arbitrary.
4977 # For each token, we will define two variables which would apply if
4978 # the current statement were broken just before that token, so that
4979 # that token started a new line:
4981 # $level = the structural indentation level,
4982 # $ci_level = the continuation indentation level
4984 # The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
4985 # assuming defaults. However, in some special cases it is customary
4986 # to modify $ci_level from this strict value.
4988 # The total structural indentation is easy to compute by adding and
4989 # subtracting 1 from a saved value as types '{' and '}' are seen. The
4990 # running value of this variable is $level_in_tokenizer.
4992 # The total continuation is much more difficult to compute, and requires
4993 # several variables. These variables are:
4995 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4996 # each indentation level, if there are intervening open secondary
4997 # structures just prior to that level.
4998 # $continuation_string_in_tokenizer = a string of 1's and 0's indicating
4999 # if the last token at that level is "continued", meaning that it
5000 # is not the first token of an expression.
5001 # $nesting_block_string = a string of 1's and 0's indicating, for each
5002 # indentation level, if the level is of type BLOCK or not.
5003 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
5004 # $nesting_list_string = a string of 1's and 0's indicating, for each
5005 # indentation level, if it is appropriate for list formatting.
5006 # If so, continuation indentation is used to indent long list items.
5007 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
5008 # @{$rslevel_stack} = a stack of total nesting depths at each
5009 # structural indentation level, where "total nesting depth" means
5010 # the nesting depth that would occur if every nesting token -- '{', '[',
5011 # and '(' -- , regardless of context, is used to compute a nesting
5014 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5016 my ( $ci_string_i, $level_i );
5018 # loop over the list of pre-tokens indexes
5019 foreach my $i ( @{$routput_token_list} ) {
5021 # Get $tok_i, the PRE-token. It only equals the token for symbols
5022 my $type_i = $routput_token_type->[$i];
5023 my $tok_i = $rtokens->[$i];
5025 # Quick handling of indentation levels for blanks and comments
5026 if ( $type_i eq 'b' || $type_i eq '#' ) {
5027 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5028 $level_i = $level_in_tokenizer;
5034 # Check for an invalid token type..
5035 # This can happen by running perltidy on non-scripts although
5036 # it could also be bug introduced by programming change. Perl
5037 # silently accepts a 032 (^Z) and takes it as the end
5038 if ( !$is_valid_token_type{$type_i} ) {
5039 my $val = ord($type_i);
5041 "unexpected character decimal $val ($type_i) in script\n"
5043 $tokenizer_self->[_in_error_] = 1;
5046 # See if we should undo the $forced_indentation_flag.
5047 # Forced indentation after 'if', 'unless', 'while' and 'until'
5048 # expressions without trailing parens is optional and doesn't
5049 # always look good. It is usually okay for a trailing logical
5050 # expression, but if the expression is a function call, code block,
5051 # or some kind of list it puts in an unwanted extra indentation
5052 # level which is hard to remove.
5054 # Example where extra indentation looks ok:
5056 # if $det_a < 0 and $det_b > 0
5057 # or $det_a > 0 and $det_b < 0;
5059 # Example where extra indentation is not needed because
5060 # the eval brace also provides indentation:
5061 # print "not " if defined eval {
5062 # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
5065 # The following rule works fairly well:
5066 # Undo the flag if the end of this line, or start of the next
5067 # line, is an opening container token or a comma.
5068 # This almost always works, but if not after another pass it will
5070 my $forced_indentation_flag = $routput_indent_flag->[$i];
5071 if ( $forced_indentation_flag && $type_i eq 'k' ) {
5073 my $ilast = $routput_token_list->[$ixlast];
5074 my $toklast = $routput_token_type->[$ilast];
5075 if ( $toklast eq '#' ) {
5077 $ilast = $routput_token_list->[$ixlast];
5078 $toklast = $routput_token_type->[$ilast];
5080 if ( $toklast eq 'b' ) {
5082 $ilast = $routput_token_list->[$ixlast];
5083 $toklast = $routput_token_type->[$ilast];
5085 if ( $toklast =~ /^[\{,]$/ ) {
5086 $forced_indentation_flag = 0;
5089 ( $toklast, my $i_next ) =
5090 find_next_nonblank_token( $max_token_index, $rtokens,
5092 if ( $toklast =~ /^[\{,]$/ ) {
5093 $forced_indentation_flag = 0;
5096 } ## end if ( $forced_indentation_flag...)
5098 # if we are already in an indented if, see if we should outdent
5099 if ($indented_if_level) {
5101 # don't try to nest trailing if's - shouldn't happen
5102 if ( $type_i eq 'k' ) {
5103 $forced_indentation_flag = 0;
5106 # check for the normal case - outdenting at next ';'
5107 elsif ( $type_i eq ';' ) {
5108 if ( $level_in_tokenizer == $indented_if_level ) {
5109 $forced_indentation_flag = -1;
5110 $indented_if_level = 0;
5114 # handle case of missing semicolon
5115 elsif ( $type_i eq '}' ) {
5116 if ( $level_in_tokenizer == $indented_if_level ) {
5117 $indented_if_level = 0;
5119 $level_in_tokenizer--;
5120 if ( @{$rslevel_stack} > 1 ) {
5121 pop( @{$rslevel_stack} );
5123 if ( length($nesting_block_string) > 1 )
5124 { # true for valid script
5125 chop $nesting_block_string;
5126 chop $nesting_list_string;
5130 } ## end if ($indented_if_level)
5132 # Now we have the first approximation to the level
5133 $level_i = $level_in_tokenizer;
5135 # set primary indentation levels based on structural braces
5136 # Note: these are set so that the leading braces have a HIGHER
5137 # level than their CONTENTS, which is convenient for indentation
5138 # Also, define continuation indentation for each token.
5141 || $forced_indentation_flag > 0 )
5144 # use environment before updating
5145 $container_environment =
5146 $nesting_block_flag ? 'BLOCK'
5147 : $nesting_list_flag ? 'LIST'
5150 # if the difference between total nesting levels is not 1,
5151 # there are intervening non-structural nesting types between
5152 # this '{' and the previous unclosed '{'
5153 my $intervening_secondary_structure = 0;
5154 if ( @{$rslevel_stack} ) {
5155 $intervening_secondary_structure =
5156 $slevel_in_tokenizer - $rslevel_stack->[-1];
5159 # Continuation Indentation
5161 # Having tried setting continuation indentation both in the formatter and
5162 # in the tokenizer, I can say that setting it in the tokenizer is much,
5163 # much easier. The formatter already has too much to do, and can't
5164 # make decisions on line breaks without knowing what 'ci' will be at
5165 # arbitrary locations.
5167 # But a problem with setting the continuation indentation (ci) here
5168 # in the tokenizer is that we do not know where line breaks will actually
5169 # be. As a result, we don't know if we should propagate continuation
5170 # indentation to higher levels of structure.
5172 # For nesting of only structural indentation, we never need to do this.
5173 # For example, in a long if statement, like this
5175 # if ( !$output_block_type[$i]
5176 # && ($in_statement_continuation) )
5181 # the second line has ci but we do normally give the lines within the BLOCK
5182 # any ci. This would be true if we had blocks nested arbitrarily deeply.
5184 # But consider something like this, where we have created a break after
5185 # an opening paren on line 1, and the paren is not (currently) a
5186 # structural indentation token:
5188 # my $file = $menubar->Menubutton(
5189 # qw/-text File -underline 0 -menuitems/ => [
5191 # Cascade => '~View',
5195 # The second line has ci, so it would seem reasonable to propagate it
5196 # down, giving the third line 1 ci + 1 indentation. This suggests the
5197 # following rule, which is currently used to propagating ci down: if there
5198 # are any non-structural opening parens (or brackets, or braces), before
5199 # an opening structural brace, then ci is propagated down, and otherwise
5200 # not. The variable $intervening_secondary_structure contains this
5201 # information for the current token, and the string
5202 # "$ci_string_in_tokenizer" is a stack of previous values of this
5205 # save the current states
5206 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
5207 $level_in_tokenizer++;
5209 if ( $level_in_tokenizer >
5210 $tokenizer_self->[_maximum_level_] )
5212 $tokenizer_self->[_maximum_level_] =
5213 $level_in_tokenizer;
5216 if ($forced_indentation_flag) {
5218 # break BEFORE '?' when there is forced indentation
5219 if ( $type_i eq '?' ) {
5220 $level_i = $level_in_tokenizer;
5222 if ( $type_i eq 'k' ) {
5223 $indented_if_level = $level_in_tokenizer;
5226 # do not change container environment here if we are not
5227 # at a real list. Adding this check prevents "blinkers"
5228 # often near 'unless" clauses, such as in the following
5233 ## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
5236 $nesting_block_string .= "$nesting_block_flag";
5237 } ## end if ($forced_indentation_flag)
5240 if ( $routput_block_type->[$i] ) {
5241 $nesting_block_flag = 1;
5242 $nesting_block_string .= '1';
5245 $nesting_block_flag = 0;
5246 $nesting_block_string .= '0';
5250 # we will use continuation indentation within containers
5251 # which are not blocks and not logical expressions
5253 if ( !$routput_block_type->[$i] ) {
5255 # propagate flag down at nested open parens
5256 if ( $routput_container_type->[$i] eq '(' ) {
5257 $bit = 1 if $nesting_list_flag;
5260 # use list continuation if not a logical grouping
5261 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5265 $is_logical_container{ $routput_container_type
5269 $nesting_list_string .= $bit;
5270 $nesting_list_flag = $bit;
5272 $ci_string_in_tokenizer .=
5273 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5275 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5276 $continuation_string_in_tokenizer .=
5277 ( $in_statement_continuation > 0 ) ? '1' : '0';
5279 # Sometimes we want to give an opening brace continuation indentation,
5280 # and sometimes not. For code blocks, we don't do it, so that the leading
5281 # '{' gets outdented, like this:
5283 # if ( !$output_block_type[$i]
5284 # && ($in_statement_continuation) )
5287 # For other types, we will give them continuation indentation. For example,
5288 # here is how a list looks with the opening paren indented:
5291 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5292 # [ "homer", "marge", "bart" ], );
5294 # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
5296 my $total_ci = $ci_string_sum;
5298 !$routput_block_type->[$i] # patch: skip for BLOCK
5299 && ($in_statement_continuation)
5300 && !( $forced_indentation_flag && $type_i eq ':' )
5303 $total_ci += $in_statement_continuation
5305 substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5308 $ci_string_i = $total_ci;
5309 $in_statement_continuation = 0;
5310 } ## end if ( $type_i eq '{' ||...})
5312 elsif ($type_i eq '}'
5314 || $forced_indentation_flag < 0 )
5317 # only a nesting error in the script would prevent popping here
5318 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
5320 $level_i = --$level_in_tokenizer;
5322 # restore previous level values
5323 if ( length($nesting_block_string) > 1 )
5324 { # true for valid script
5325 chop $nesting_block_string;
5326 $nesting_block_flag =
5327 substr( $nesting_block_string, -1 ) eq '1';
5328 chop $nesting_list_string;
5329 $nesting_list_flag =
5330 substr( $nesting_list_string, -1 ) eq '1';
5332 chop $ci_string_in_tokenizer;
5334 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5336 $in_statement_continuation =
5337 chop $continuation_string_in_tokenizer;
5339 # zero continuation flag at terminal BLOCK '}' which
5341 my $block_type_i = $routput_block_type->[$i];
5342 if ($block_type_i) {
5344 # ...These include non-anonymous subs
5345 # note: could be sub ::abc { or sub 'abc
5346 if ( $block_type_i =~ m/^sub\s*/gc ) {
5348 # note: older versions of perl require the /gc
5349 # modifier here or else the \G does not work.
5350 if ( $block_type_i =~ /\G('|::|\w)/gc ) {
5351 $in_statement_continuation = 0;
5355 # ...and include all block types except user subs
5356 # with block prototypes and these:
5357 # (sort|grep|map|do|eval)
5359 $is_zero_continuation_block_type{$block_type_i}
5362 $in_statement_continuation = 0;
5365 # ..but these are not terminal types:
5366 # /^(sort|grep|map|do|eval)$/ )
5367 elsif ($is_sort_map_grep_eval_do{$block_type_i}
5368 || $is_grep_alias{$block_type_i} )
5372 # ..and a block introduced by a label
5374 elsif ( $block_type_i =~ /:$/ ) {
5375 $in_statement_continuation = 0;
5378 # user function with block prototype
5380 $in_statement_continuation = 0;
5382 } ## end if ($block_type_i)
5384 # If we are in a list, then
5385 # we must set continuation indentation at the closing
5386 # paren of something like this (paren after $check):
5389 # ( not defined $check )
5391 # or $check eq "new"
5392 # or $check eq "old",
5394 elsif ( $tok_i eq ')' ) {
5395 $in_statement_continuation = 1
5398 $routput_container_type->[$i]
5401 ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5404 elsif ( $tok_i eq ';' ) {
5405 $in_statement_continuation = 0;
5407 } ## end if ( length($nesting_block_string...))
5409 # use environment after updating
5410 $container_environment =
5411 $nesting_block_flag ? 'BLOCK'
5412 : $nesting_list_flag ? 'LIST'
5414 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5415 } ## end elsif ( $type_i eq '}' ||...{)
5417 # not a structural indentation type..
5420 $container_environment =
5421 $nesting_block_flag ? 'BLOCK'
5422 : $nesting_list_flag ? 'LIST'
5425 # zero the continuation indentation at certain tokens so
5426 # that they will be at the same level as its container. For
5427 # commas, this simplifies the -lp indentation logic, which
5428 # counts commas. For ?: it makes them stand out.
5429 if ($nesting_list_flag) {
5430 ## $type_i =~ /^[,\?\:]$/
5431 if ( $is_comma_question_colon{$type_i} ) {
5432 $in_statement_continuation = 0;
5436 # be sure binary operators get continuation indentation
5438 $container_environment
5439 && ( $type_i eq 'k' && $is_binary_keyword{$tok_i}
5440 || $is_binary_type{$type_i} )
5443 $in_statement_continuation = 1;
5446 # continuation indentation is sum of any open ci from
5447 # previous levels plus the current level
5448 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5450 # update continuation flag ...
5452 ## if ( $type_i ne 'b' && $type_i ne '#' ) { # moved above
5454 # if we are in a BLOCK
5455 if ($nesting_block_flag) {
5457 # the next token after a ';' and label starts a new stmt
5458 if ( $type_i eq ';' || $type_i eq 'J' ) {
5459 $in_statement_continuation = 0;
5462 # otherwise, we are continuing the current statement
5464 $in_statement_continuation = 1;
5468 # if we are not in a BLOCK..
5471 # do not use continuation indentation if not list
5472 # environment (could be within if/elsif clause)
5473 if ( !$nesting_list_flag ) {
5474 $in_statement_continuation = 0;
5477 # otherwise, the token after a ',' starts a new term
5479 # Patch FOR RT#99961; no continuation after a ';'
5480 # This is needed because perltidy currently marks
5481 # a block preceded by a type character like % or @
5482 # as a non block, to simplify formatting. But these
5483 # are actually blocks and can have semicolons.
5484 # See code_block_type() and is_non_structural_brace().
5485 elsif ( $type_i eq ',' || $type_i eq ';' ) {
5486 $in_statement_continuation = 0;
5489 # otherwise, we are continuing the current term
5491 $in_statement_continuation = 1;
5493 } ## end else [ if ($nesting_block_flag)]
5495 ##} ## end if ( $type_i ne 'b' ... # (old moved above)
5497 } ## end else [ if ( $type_i eq '{' ||...})]
5499 if ( $level_in_tokenizer < 0 ) {
5500 unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
5501 $tokenizer_self->[_saw_negative_indentation_] = 1;
5502 warning("Starting negative indentation\n");
5506 # set secondary nesting levels based on all containment token
5507 # types Note: these are set so that the nesting depth is the
5508 # depth of the PREVIOUS TOKEN, which is convenient for setting
5509 # the strength of token bonds
5512 if ( $is_opening_type{$type_i} ) {
5513 $slevel_in_tokenizer++;
5514 $nesting_token_string .= $tok_i;
5515 $nesting_type_string .= $type_i;
5519 elsif ( $is_closing_type{$type_i} ) {
5520 $slevel_in_tokenizer--;
5521 my $char = chop $nesting_token_string;
5523 if ( $char ne $matching_start_token{$tok_i} ) {
5524 $nesting_token_string .= $char . $tok_i;
5525 $nesting_type_string .= $type_i;
5528 chop $nesting_type_string;
5532 # apply token type patch:
5533 # - output anonymous 'sub' as keyword (type 'k')
5534 # - output __END__, __DATA__, and format as type 'k' instead
5535 # of ';' to make html colors correct, etc.
5536 # The following hash tests are equivalent to these older tests:
5537 # if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5538 # if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5539 if ( $is_END_DATA_format_sub{$tok_i}
5540 && $is_semicolon_or_t{$type_i} )
5544 } ## end else [ if ( $type_i eq 'b' ||...)]
5546 # Store the values for this token
5547 push( @ci_string, $ci_string_i );
5548 push( @levels, $level_i );
5549 push( @block_type, $routput_block_type->[$i] );
5550 push( @type_sequence, $routput_type_sequence->[$i] );
5551 push( @token_type, $type_i );
5553 # Form and store the previous token
5556 $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
5560 substr( $input_line, $rtoken_map->[$im], $num ) );
5564 # or grab some values for the leading token (needed for log output)
5566 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5570 } ## end foreach my $i ( @{$routput_token_list...})
5572 # Form and store the final token
5573 $num = length($input_line) - $rtoken_map->[$im]; # make the last token
5575 push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
5578 $line_of_tokens->{_rtoken_type} = \@token_type;
5579 $line_of_tokens->{_rtokens} = \@tokens;
5580 $line_of_tokens->{_rblock_type} = \@block_type;
5581 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5582 $line_of_tokens->{_rlevels} = \@levels;
5583 $line_of_tokens->{_rci_levels} = \@ci_string;
5586 } ## end sub tokenizer_wrapup_line
5587 } ## end tokenize_this_line
5589 #########i#############################################################
5590 # Tokenizer routines which assist in identifying token types
5591 #######################################################################
5593 # hash lookup table of operator expected values
5594 my %op_expected_table;
5596 # exceptions to perl's weird parsing rules after type 'Z'
5597 my %is_weird_parsing_rule_exception;
5599 my %is_paren_dollar;
5605 # Always expecting TERM following these types:
5606 # note: this is identical to '@value_requestor_type' defined later.
5608 ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5609 || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5610 &= // >> ~. &. |. ^.
5611 ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5614 push @q, '('; # for completeness, not currently a token type
5615 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5617 # Always UNKNOWN following these types:
5618 # Fix for c030: added '->' to this list
5620 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5622 # Always expecting OPERATOR ...
5623 # 'n' and 'v' are currently excluded because they might be VERSION numbers
5624 # 'i' is currently excluded because it might be a package
5625 # 'q' is currently excluded because it might be a prototype
5626 # Fix for c030: removed '->' from this list:
5627 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
5629 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5631 # Fix for git #62: added '*' and '%'
5633 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5636 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5639 @{is_n_v}{@q} = (1) x scalar(@q);
5643 use constant DEBUG_OPERATOR_EXPECTED => 0;
5645 sub operator_expected {
5647 # Returns a parameter indicating what types of tokens can occur next
5650 # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5652 # $prev_type is the type of the previous token (blank or not)
5653 # $tok is the current token
5654 # $next_type is the type of the next token (blank or not)
5656 # Many perl symbols have two or more meanings. For example, '<<'
5657 # can be a shift operator or a here-doc operator. The
5658 # interpretation of these symbols depends on the current state of
5659 # the tokenizer, which may either be expecting a term or an
5660 # operator. For this example, a << would be a shift if an OPERATOR
5661 # is expected, and a here-doc if a TERM is expected. This routine
5662 # is called to make this decision for any current token. It returns
5663 # one of three possible values:
5665 # OPERATOR - operator expected (or at least, not a term)
5666 # UNKNOWN - can't tell
5667 # TERM - a term is expected (or at least, not an operator)
5669 # The decision is based on what has been seen so far. This
5670 # information is stored in the "$last_nonblank_type" and
5671 # "$last_nonblank_token" variables. For example, if the
5672 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5673 # if $last_nonblank_type is 'n' (numeric), we are expecting an
5676 # If a UNKNOWN is returned, the calling routine must guess. A major
5677 # goal of this tokenizer is to minimize the possibility of returning
5678 # UNKNOWN, because a wrong guess can spoil the formatting of a
5681 # Adding NEW_TOKENS: it is critically important that this routine be
5682 # updated to allow it to determine if an operator or term is to be
5683 # expected after the new token. Doing this simply involves adding
5684 # the new token character to one of the regexes in this routine or
5685 # to one of the hash lists
5686 # that it uses, which are initialized in the BEGIN section.
5687 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5690 # When possible, token types should be selected such that we can determine
5691 # the 'operator_expected' value by a simple hash lookup. If there are
5692 # exceptions, that is an indication that a new type is needed.
5696 my $msg = EMPTY_STRING;
5702 # Many types are can be obtained by a table lookup given the previous type.
5703 # This typically handles half or more of the calls.
5704 my $op_expected = $op_expected_table{$last_nonblank_type};
5705 if ( defined($op_expected) ) {
5706 $msg = "Table lookup";
5710 ######################
5711 # Handle special cases
5712 ######################
5714 $op_expected = UNKNOWN;
5715 my ( $prev_type, $tok, $next_type ) = @{$rarg};
5717 # Types 'k', '}' and 'Z' depend on context
5718 # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
5719 # context but that dependence could eventually be eliminated with better
5720 # token type definition
5723 if ( $last_nonblank_type eq 'i' ) {
5724 $op_expected = OPERATOR;
5726 # FIXME: it would be cleaner to make this a special type
5727 # expecting VERSION or {} after package NAMESPACE
5728 # TODO: maybe mark these words as type 'Y'?
5729 if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
5730 && $statement_type =~ /^package\b/
5731 && $last_nonblank_token =~ /^package\b/ )
5733 $op_expected = TERM;
5738 elsif ( $last_nonblank_type eq 'k' ) {
5739 $op_expected = TERM;
5740 if ( $expecting_operator_token{$last_nonblank_token} ) {
5741 $op_expected = OPERATOR;
5743 elsif ( $expecting_term_token{$last_nonblank_token} ) {
5745 # Exceptions from TERM:
5747 # // may follow perl functions which may be unary operators
5748 # see test file dor.t (defined or);
5751 && $next_type eq '/'
5752 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5753 $last_nonblank_token}
5756 $op_expected = OPERATOR;
5759 # Patch to allow a ? following 'split' to be a deprecated pattern
5760 # delimiter. This patch is coordinated with the omission of split
5762 # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5763 # will force perltidy to guess.
5765 && $last_nonblank_token eq 'split' )
5767 $op_expected = UNKNOWN;
5772 # closing container token...
5774 # Note that the actual token for type '}' may also be a ')'.
5776 # Also note that $last_nonblank_token is not the token corresponding to
5777 # $last_nonblank_type when the type is a closing container. In that
5778 # case it is the token before the corresponding opening container token.
5779 # So for example, for this snippet
5780 # $a = do { BLOCK } / 2;
5781 # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5783 elsif ( $last_nonblank_type eq '}' ) {
5784 $op_expected = UNKNOWN;
5786 # handle something after 'do' and 'eval'
5787 if ( $is_block_operator{$last_nonblank_token} ) {
5789 # something like $a = do { BLOCK } / 2;
5790 $op_expected = OPERATOR; # block mode following }
5793 ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
5794 elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5795 || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5797 $op_expected = OPERATOR;
5798 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5801 # Check for smartmatch operator before preceding brace or square
5802 # bracket. For example, at the ? after the ] in the following
5803 # expressions we are expecting an operator:
5805 # qr/3/ ~~ ['1234'] ? 1 : 0;
5806 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5807 elsif ( $last_nonblank_token eq '~~' ) {
5808 $op_expected = OPERATOR;
5811 # A right brace here indicates the end of a simple block. All
5812 # non-structural right braces have type 'R' all braces associated with
5813 # block operator keywords have been given those keywords as
5814 # "last_nonblank_token" and caught above. (This statement is order
5815 # dependent, and must come after checking $last_nonblank_token).
5818 # patch for dor.t (defined or).
5820 && $next_type eq '/'
5821 && $last_nonblank_token eq ']' )
5823 $op_expected = OPERATOR;
5826 # Patch for RT #116344: misparse a ternary operator after an
5827 # anonymous hash, like this:
5828 # return ref {} ? 1 : 0;
5829 # The right brace should really be marked type 'R' in this case,
5830 # and it is safest to return an UNKNOWN here. Expecting a TERM will
5831 # cause the '?' to always be interpreted as a pattern delimiter
5832 # rather than introducing a ternary operator.
5833 elsif ( $tok eq '?' ) {
5834 $op_expected = UNKNOWN;
5837 $op_expected = TERM;
5842 # number or v-string...
5843 # An exception is for VERSION numbers a 'use' statement. It has the format
5844 # use Module VERSION LIST
5845 # We could avoid this exception by writing a special sub to parse 'use'
5846 # statements and perhaps mark these numbers with a new type V (for VERSION)
5847 ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5848 elsif ( $is_n_v{$last_nonblank_type} ) {
5849 $op_expected = OPERATOR;
5850 if ( $statement_type eq 'use' ) {
5851 $op_expected = UNKNOWN;
5856 # FIXME: labeled prototype words should probably be given type 'A' or maybe
5857 # 'J'; not 'q'; or maybe mark as type 'Y'
5858 elsif ( $last_nonblank_type eq 'q' ) {
5859 $op_expected = OPERATOR;
5860 if ( $last_nonblank_token eq 'prototype' )
5861 ##|| $last_nonblank_token eq 'switch' )
5863 $op_expected = TERM;
5867 # file handle or similar
5868 elsif ( $last_nonblank_type eq 'Z' ) {
5870 $op_expected = UNKNOWN;
5873 if ( $last_nonblank_token =~ /^\w/ ) {
5874 $op_expected = UNKNOWN;
5877 # Exception to weird parsing rules for 'x(' ... see case b1205:
5878 # In something like 'print $vv x(...' the x is an operator;
5879 # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
5880 # otherwise x follows the weird parsing rules.
5881 elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
5882 $op_expected = OPERATOR;
5885 # The 'weird parsing rules' of next section do not work for '<' and '?'
5886 # It is best to mark them as unknown. Test case:
5888 elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5889 $op_expected = UNKNOWN;
5892 # For possible file handle like "$a", Perl uses weird parsing rules.
5894 # print $a/2,"/hi"; - division
5895 # print $a / 2,"/hi"; - division
5896 # print $a/ 2,"/hi"; - division
5897 # print $a /2,"/hi"; - pattern (and error)!
5898 # Some examples where this logic works okay, for '&','*','+':
5899 # print $fh &xsi_protos(@mods);
5900 # my $x = new $CompressClass *FH;
5901 # print $OUT +( $count % 15 ? ", " : "\n\t" );
5902 elsif ($prev_type eq 'b'
5903 && $next_type ne 'b' )
5905 $op_expected = TERM;
5908 # Note that '?' and '<' have been moved above
5909 # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5910 elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5912 # Do not complain in 'use' statements, which have special syntax.
5913 # For example, from RT#130344:
5914 # use lib $FindBin::Bin . '/lib';
5915 if ( $statement_type ne 'use' ) {
5917 "operator in possible indirect object location not recommended\n"
5920 $op_expected = OPERATOR;
5926 $op_expected = UNKNOWN;
5931 DEBUG_OPERATOR_EXPECTED && do {
5933 "OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5936 return $op_expected;
5938 } ## end sub operator_expected
5940 sub new_statement_ok {
5942 # return true if the current token can start a new statement
5943 # USES GLOBAL VARIABLES: $last_nonblank_type
5945 return label_ok() # a label would be ok here
5947 || $last_nonblank_type eq 'J'; # or we follow a label
5949 } ## end sub new_statement_ok
5953 # Decide if a bare word followed by a colon here is a label
5954 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5955 # $brace_depth, @brace_type
5957 # if it follows an opening or closing code block curly brace..
5958 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5959 && $last_nonblank_type eq $last_nonblank_token )
5962 # it is a label if and only if the curly encloses a code block
5963 return $brace_type[$brace_depth];
5966 # otherwise, it is a label if and only if it follows a ';' (real or fake)
5969 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5971 } ## end sub label_ok
5973 sub code_block_type {
5975 # Decide if this is a block of code, and its type.
5976 # Must be called only when $type = $token = '{'
5977 # The problem is to distinguish between the start of a block of code
5978 # and the start of an anonymous hash reference
5979 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5980 # to indicate the type of code block. (For example, 'last_nonblank_token'
5981 # might be 'if' for an if block, 'else' for an else block, etc).
5982 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5983 # $last_nonblank_block_type, $brace_depth, @brace_type
5985 # handle case of multiple '{'s
5987 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5989 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5990 if ( $last_nonblank_token eq '{'
5991 && $last_nonblank_type eq $last_nonblank_token )
5994 # opening brace where a statement may appear is probably
5995 # a code block but might be and anonymous hash reference
5996 if ( $brace_type[$brace_depth] ) {
5997 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6001 # cannot start a code block within an anonymous hash
6003 return EMPTY_STRING;
6007 elsif ( $last_nonblank_token eq ';' ) {
6009 # an opening brace where a statement may appear is probably
6010 # a code block but might be and anonymous hash reference
6011 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6015 # handle case of '}{'
6016 elsif ($last_nonblank_token eq '}'
6017 && $last_nonblank_type eq $last_nonblank_token )
6020 # a } { situation ...
6021 # could be hash reference after code block..(blktype1.t)
6022 if ($last_nonblank_block_type) {
6023 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6027 # must be a block if it follows a closing hash reference
6029 return $last_nonblank_token;
6033 ################################################################
6034 # NOTE: braces after type characters start code blocks, but for
6035 # simplicity these are not identified as such. See also
6036 # sub is_non_structural_brace.
6037 ################################################################
6039 ## elsif ( $last_nonblank_type eq 't' ) {
6040 ## return $last_nonblank_token;
6043 # brace after label:
6044 elsif ( $last_nonblank_type eq 'J' ) {
6045 return $last_nonblank_token;
6048 # otherwise, look at previous token. This must be a code block if
6049 # it follows any of these:
6050 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6051 elsif ($is_code_block_token{$last_nonblank_token}
6052 || $is_grep_alias{$last_nonblank_token} )
6055 # Bug Patch: Note that the opening brace after the 'if' in the following
6056 # snippet is an anonymous hash ref and not a code block!
6057 # print 'hi' if { x => 1, }->{x};
6058 # We can identify this situation because the last nonblank type
6059 # will be a keyword (instead of a closing paren)
6061 $last_nonblank_type eq 'k'
6062 && ( $last_nonblank_token eq 'if'
6063 || $last_nonblank_token eq 'unless' )
6066 return EMPTY_STRING;
6069 return $last_nonblank_token;
6073 # or a sub or package BLOCK
6074 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6075 && $last_nonblank_token =~ /^(sub|package)\b/ )
6077 return $last_nonblank_token;
6081 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6082 && ( $is_sub{$last_nonblank_token} ) )
6087 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6088 return $statement_type;
6091 # user-defined subs with block parameters (like grep/map/eval)
6092 elsif ( $last_nonblank_type eq 'G' ) {
6093 return $last_nonblank_token;
6097 elsif ( $last_nonblank_type eq 'w' ) {
6099 # check for syntax 'use MODULE LIST'
6100 # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6101 return EMPTY_STRING if ( $statement_type eq 'use' );
6103 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6107 # Patch for bug # RT #94338 reported by Daniel Trizen
6108 # for-loop in a parenthesized block-map triggering an error message:
6109 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6110 # Check for a code block within a parenthesized function call
6111 elsif ( $last_nonblank_token eq '(' ) {
6112 my $paren_type = $paren_type[$paren_depth];
6114 # /^(map|grep|sort)$/
6115 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6117 # We will mark this as a code block but use type 't' instead
6118 # of the name of the containing function. This will allow for
6119 # correct parsing but will usually produce better formatting.
6120 # Braces with block type 't' are not broken open automatically
6121 # in the formatter as are other code block types, and this usually
6123 return 't'; # (Not $paren_type)
6126 return EMPTY_STRING;
6130 # handle unknown syntax ') {'
6131 # we previously appended a '()' to mark this case
6132 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6133 return $last_nonblank_token;
6136 # anything else must be anonymous hash reference
6138 return EMPTY_STRING;
6140 } ## end sub code_block_type
6142 sub decide_if_code_block {
6144 # USES GLOBAL VARIABLES: $last_nonblank_token
6145 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6147 my ( $next_nonblank_token, $i_next ) =
6148 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6150 # we are at a '{' where a statement may appear.
6151 # We must decide if this brace starts an anonymous hash or a code
6153 # return "" if anonymous hash, and $last_nonblank_token otherwise
6155 # initialize to be code BLOCK
6156 my $code_block_type = $last_nonblank_token;
6158 # Check for the common case of an empty anonymous hash reference:
6159 # Maybe something like sub { { } }
6160 if ( $next_nonblank_token eq '}' ) {
6161 $code_block_type = EMPTY_STRING;
6166 # To guess if this '{' is an anonymous hash reference, look ahead
6167 # and test as follows:
6169 # it is a hash reference if next come:
6170 # - a string or digit followed by a comma or =>
6171 # - bareword followed by =>
6172 # otherwise it is a code block
6174 # Examples of anonymous hash ref:
6178 # Examples of code blocks:
6179 # {1; print "hello\n", 1;}
6182 # We are only going to look ahead one more (nonblank/comment) line.
6183 # Strange formatting could cause a bad guess, but that's unlikely.
6187 # Ignore the rest of this line if it is a side comment
6188 if ( $next_nonblank_token ne '#' ) {
6189 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
6190 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
6192 my ( $rpre_tokens, $rpre_types ) =
6193 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
6194 # generous, and prevents
6196 # time in mangled files
6197 if ( defined($rpre_types) && @{$rpre_types} ) {
6198 push @pre_types, @{$rpre_types};
6199 push @pre_tokens, @{$rpre_tokens};
6202 # put a sentinel token to simplify stopping the search
6203 push @pre_types, '}';
6204 push @pre_types, '}';
6207 $jbeg = 1 if $pre_types[0] eq 'b';
6209 # first look for one of these
6211 # - bareword with leading -
6215 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
6217 # find the closing quote; don't worry about escapes
6218 my $quote_mark = $pre_types[$j];
6219 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6220 if ( $pre_types[$k] eq $quote_mark ) {
6222 my $next = $pre_types[$j];
6227 elsif ( $pre_types[$j] eq 'd' ) {
6230 elsif ( $pre_types[$j] eq 'w' ) {
6233 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6238 $j++ if $pre_types[$j] eq 'b';
6240 # Patched for RT #95708
6243 # it is a comma which is not a pattern delimiter except for qw
6245 $pre_types[$j] eq ','
6246 ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6247 && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6251 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6254 $code_block_type = EMPTY_STRING;
6258 if ($code_block_type) {
6260 # Patch for cases b1085 b1128: It is uncertain if this is a block.
6261 # If this brace follows a bareword, then append a space as a signal
6262 # to the formatter that this may not be a block brace. To find the
6263 # corresponding code in Formatter.pm search for 'b1085'.
6264 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6268 return $code_block_type;
6269 } ## end sub decide_if_code_block
6271 sub report_unexpected {
6273 # report unexpected token type and show where it is
6274 # USES GLOBAL VARIABLES: $tokenizer_self
6275 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6276 $rpretoken_type, $input_line )
6279 if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6280 my $msg = "found $found where $expecting expected";
6281 my $pos = $rpretoken_map->[$i_tok];
6282 interrupt_logfile();
6283 my $input_line_number = $tokenizer_self->[_last_line_number_];
6284 my ( $offset, $numbered_line, $underline ) =
6285 make_numbered_line( $input_line_number, $input_line, $pos );
6286 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6288 my $trailer = EMPTY_STRING;
6289 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6290 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6292 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6293 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6296 $num = $pos - $pos_prev;
6298 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
6301 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6302 $trailer = " (previous token underlined)";
6304 $underline =~ s/\s+$//;
6305 warning( $numbered_line . "\n" );
6306 warning( $underline . "\n" );
6307 warning( $msg . $trailer . "\n" );
6311 } ## end sub report_unexpected
6313 my %is_sigil_or_paren;
6314 my %is_R_closing_sb;
6318 my @q = qw< $ & % * @ ) >;
6319 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6322 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6325 sub is_non_structural_brace {
6327 # Decide if a brace or bracket is structural or non-structural
6328 # by looking at the previous token and type
6329 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6331 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6332 # Tentatively deactivated because it caused the wrong operator expectation
6334 # $user = @vars[1] / 100;
6335 # Must update sub operator_expected before re-implementing.
6336 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6340 ################################################################
6341 # NOTE: braces after type characters start code blocks, but for
6342 # simplicity these are not identified as such. See also
6343 # sub code_block_type
6344 ################################################################
6346 ##if ($last_nonblank_type eq 't') {return 0}
6348 # otherwise, it is non-structural if it is decorated
6349 # by type information.
6350 # For example, the '{' here is non-structural: ${xxx}
6351 # Removed '::' to fix c074
6352 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6354 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6355 $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6356 || substr( $last_nonblank_token, 0, 2 ) eq '->'
6358 # or if we follow a hash or array closing curly brace or bracket
6359 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6360 # because the first '}' would have been given type 'R'
6361 ##|| $last_nonblank_type =~ /^([R\]])$/
6362 || $is_R_closing_sb{$last_nonblank_type}
6364 } ## end sub is_non_structural_brace
6366 #########i#############################################################
6367 # Tokenizer routines for tracking container nesting depths
6368 #######################################################################
6370 # The following routines keep track of nesting depths of the nesting
6371 # types, ( [ { and ?. This is necessary for determining the indentation
6372 # level, and also for debugging programs. Not only do they keep track of
6373 # nesting depths of the individual brace types, but they check that each
6374 # of the other brace types is balanced within matching pairs. For
6375 # example, if the program sees this sequence:
6379 # then it can determine that there is an extra left paren somewhere
6380 # between the { and the }. And so on with every other possible
6381 # combination of outer and inner brace types. For another
6386 # which has an extra ] within the parens.
6388 # The brace types have indexes 0 .. 3 which are indexes into
6391 # The pair ? : are treated as just another nesting type, with ? acting
6392 # as the opening brace and : acting as the closing brace.
6396 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6398 # saves the nesting depth of brace type $b (where $b is either of the other
6399 # nesting types) when brace type $a enters a new depth. When this depth
6400 # decreases, a check is made that the current depth of brace types $b is
6401 # unchanged, or otherwise there must have been an error. This can
6402 # be very useful for localizing errors, particularly when perl runs to
6403 # the end of a large file (such as this one) and announces that there
6404 # is a problem somewhere.
6406 # A numerical sequence number is maintained for every nesting type,
6407 # so that each matching pair can be uniquely identified in a simple
6410 sub increase_nesting_depth {
6411 my ( $aa, $pos ) = @_;
6413 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6414 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6416 $current_depth[$aa]++;
6418 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6419 my $input_line_number = $tokenizer_self->[_last_line_number_];
6420 my $input_line = $tokenizer_self->[_line_of_text_];
6422 # Sequence numbers increment by number of items. This keeps
6423 # a unique set of numbers but still allows the relative location
6424 # of any type to be determined.
6426 ########################################################################
6427 # OLD SEQNO METHOD for incrementing sequence numbers.
6428 # Keep this coding awhile for possible testing.
6429 ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
6430 ## my $seqno = $nesting_sequence_number[$aa];
6432 # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence
6433 # numbers to be used as array indexes, and allows them to be compared.
6434 my $seqno = $next_sequence_number++;
6435 ########################################################################
6437 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6439 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6440 [ $input_line_number, $input_line, $pos ];
6442 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6443 next if ( $bb == $aa );
6444 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6447 # set a flag for indenting a nested ternary statement
6449 if ( $aa == QUESTION_COLON ) {
6450 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6451 if ( $current_depth[$aa] > 1 ) {
6452 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6453 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6454 if ( $pdepth == $total_depth - 1 ) {
6456 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6462 # Fix part #1 for git82: save last token type for propagation of type 'Z'
6463 $nested_statement_type[$aa][ $current_depth[$aa] ] =
6464 [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6465 $statement_type = EMPTY_STRING;
6466 return ( $seqno, $indent );
6467 } ## end sub increase_nesting_depth
6469 sub is_balanced_closing_container {
6471 # Return true if a closing container can go here without error
6472 # Return false if not
6475 # cannot close if there was no opening
6476 return unless ( $current_depth[$aa] > 0 );
6478 # check that any other brace types $bb contained within would be balanced
6479 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6480 next if ( $bb == $aa );
6482 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6483 $current_depth[$bb] );
6486 # OK, everything will be balanced
6488 } ## end sub is_balanced_closing_container
6490 sub decrease_nesting_depth {
6492 my ( $aa, $pos ) = @_;
6494 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6495 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6498 my $input_line_number = $tokenizer_self->[_last_line_number_];
6499 my $input_line = $tokenizer_self->[_line_of_text_];
6503 if ( $current_depth[$aa] > 0 ) {
6505 # set a flag for un-indenting after seeing a nested ternary statement
6506 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6507 if ( $aa == QUESTION_COLON ) {
6508 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6511 # Fix part #2 for git82: use saved type for propagation of type 'Z'
6512 # through type L-R braces. Perl seems to allow ${bareword}
6513 # as an indirect object, but nothing much more complex than that.
6514 ( $statement_type, my $saved_type, my $saved_token ) =
6515 @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6517 && $saved_type eq 'Z'
6518 && $last_nonblank_type eq 'w'
6519 && $brace_structural_type[$brace_depth] eq 'L' )
6521 $last_nonblank_type = $saved_type;
6524 # check that any brace types $bb contained within are balanced
6525 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6526 next if ( $bb == $aa );
6528 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6529 $current_depth[$bb] )
6532 $current_depth[$bb] -
6533 $depth_array[$aa][$bb][ $current_depth[$aa] ];
6535 # don't whine too many times
6536 my $saw_brace_error = get_saw_brace_error();
6538 $saw_brace_error <= MAX_NAG_MESSAGES
6540 # if too many closing types have occurred, we probably
6541 # already caught this error
6542 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6545 interrupt_logfile();
6547 $starting_line_of_current_depth[$aa]
6548 [ $current_depth[$aa] ];
6550 my $rel = [ $input_line_number, $input_line, $pos ];
6554 if ( $diff == 1 || $diff == -1 ) {
6555 $ess = EMPTY_STRING;
6562 ? $opening_brace_names[$bb]
6563 : $closing_brace_names[$bb];
6564 write_error_indicator_pair( @{$rsl}, '^' );
6566 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6571 $starting_line_of_current_depth[$bb]
6572 [ $current_depth[$bb] ];
6575 " The most recent un-matched $bname is on line $ml\n";
6576 write_error_indicator_pair( @{$rml}, '^' );
6578 write_error_indicator_pair( @{$rel}, '^' );
6582 increment_brace_error();
6585 $current_depth[$aa]--;
6589 my $saw_brace_error = get_saw_brace_error();
6590 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6592 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6594 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6596 increment_brace_error();
6598 # keep track of errors in braces alone (ignoring ternary nesting errors)
6599 $tokenizer_self->[_true_brace_error_count_]++
6600 if ( $closing_brace_names[$aa] ne "':'" );
6602 return ( $seqno, $outdent );
6603 } ## end sub decrease_nesting_depth
6605 sub check_final_nesting_depths {
6607 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6609 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6611 if ( $current_depth[$aa] ) {
6613 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6616 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6617 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6619 indicate_error( $msg, @{$rsl}, '^' );
6620 increment_brace_error();
6624 } ## end sub check_final_nesting_depths
6626 #########i#############################################################
6627 # Tokenizer routines for looking ahead in input stream
6628 #######################################################################
6630 sub peek_ahead_for_n_nonblank_pre_tokens {
6632 # returns next n pretokens if they exist
6633 # returns undef's if hits eof without seeing any pretokens
6634 # USES GLOBAL VARIABLES: $tokenizer_self
6635 my $max_pretokens = shift;
6638 my ( $rpre_tokens, $rmap, $rpre_types );
6641 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6643 $line =~ s/^\s*//; # trim leading blanks
6644 next if ( length($line) <= 0 ); # skip blank
6645 next if ( $line =~ /^#/ ); # skip comment
6646 ( $rpre_tokens, $rmap, $rpre_types ) =
6647 pre_tokenize( $line, $max_pretokens );
6650 return ( $rpre_tokens, $rpre_types );
6651 } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6653 # look ahead for next non-blank, non-comment line of code
6654 sub peek_ahead_for_nonblank_token {
6656 # USES GLOBAL VARIABLES: $tokenizer_self
6657 my ( $rtokens, $max_token_index ) = @_;
6662 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6664 $line =~ s/^\s*//; # trim leading blanks
6665 next if ( length($line) <= 0 ); # skip blank
6666 next if ( $line =~ /^#/ ); # skip comment
6668 # Updated from 2 to 3 to get trigraphs, added for case b1175
6669 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6670 my $j = $max_token_index + 1;
6672 foreach my $tok ( @{$rtok} ) {
6673 last if ( $tok =~ "\n" );
6674 $rtokens->[ ++$j ] = $tok;
6679 } ## end sub peek_ahead_for_nonblank_token
6681 #########i#############################################################
6682 # Tokenizer guessing routines for ambiguous situations
6683 #######################################################################
6685 sub guess_if_pattern_or_conditional {
6687 # this routine is called when we have encountered a ? following an
6688 # unknown bareword, and we must decide if it starts a pattern or not
6690 # $i - token index of the ? starting possible pattern
6691 # output parameters:
6692 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6693 # msg = a warning or diagnostic message
6694 # USES GLOBAL VARIABLES: $last_nonblank_token
6696 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6698 my $msg = "guessing that ? after $last_nonblank_token starts a ";
6700 if ( $i >= $max_token_index ) {
6701 $msg .= "conditional (no end to pattern found on the line)\n";
6706 my $next_token = $rtokens->[$i]; # first token after ?
6708 # look for a possible ending ? on this line..
6710 my $quote_depth = 0;
6711 my $quote_character = EMPTY_STRING;
6715 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6718 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6719 $quote_pos, $quote_depth, $max_token_index );
6723 # we didn't find an ending ? on this line,
6724 # so we bias towards conditional
6726 $msg .= "conditional (no ending ? on this line)\n";
6728 # we found an ending ?, so we bias towards a pattern
6732 # Watch out for an ending ? in quotes, like this
6733 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6737 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6738 my $tok = $rtokens->[$ii];
6739 if ( $tok eq ":" ) { $colons++ }
6740 if ( $tok eq "'" ) { $s_quote++ }
6741 if ( $tok eq '"' ) { $d_quote++ }
6743 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6745 $msg .= "found ending ? but unbalanced quote chars\n";
6747 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6749 $msg .= "pattern (found ending ? and pattern expected)\n";
6752 $msg .= "pattern (uncertain, but found ending ?)\n";
6756 return ( $is_pattern, $msg );
6757 } ## end sub guess_if_pattern_or_conditional
6759 my %is_known_constant;
6760 my %is_known_function;
6764 # Constants like 'pi' in Trig.pm are common
6765 my @q = qw(pi pi2 pi4 pip2 pip4);
6766 @{is_known_constant}{@q} = (1) x scalar(@q);
6768 # parenless calls of 'ok' are common
6770 @{is_known_function}{@q} = (1) x scalar(@q);
6773 sub guess_if_pattern_or_division {
6775 # this routine is called when we have encountered a / following an
6776 # unknown bareword, and we must decide if it starts a pattern or is a
6779 # $i - token index of the / starting possible pattern
6780 # output parameters:
6781 # $is_pattern = 0 if probably division, =1 if probably a pattern
6782 # msg = a warning or diagnostic message
6783 # USES GLOBAL VARIABLES: $last_nonblank_token
6784 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6786 my $msg = "guessing that / after $last_nonblank_token starts a ";
6788 if ( $i >= $max_token_index ) {
6789 $msg .= "division (no end to pattern found on the line)\n";
6793 my $divide_possible =
6794 is_possible_numerator( $i, $rtokens, $max_token_index );
6796 if ( $divide_possible < 0 ) {
6797 $msg = "pattern (division not possible here)\n";
6803 my $next_token = $rtokens->[$i]; # first token after slash
6805 # One of the things we can look at is the spacing around the slash.
6806 # There # are four possible spacings around the first slash:
6808 # return pi/two;#/; -/-
6809 # return pi/ two;#/; -/+
6810 # return pi / two;#/; +/+
6811 # return pi /two;#/; +/- <-- possible pattern
6813 # Spacing rule: a space before the slash but not after the slash
6814 # usually indicates a pattern. We can use this to break ties.
6816 my $is_pattern_by_spacing =
6817 ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6819 # look for a possible ending / on this line..
6821 my $quote_depth = 0;
6822 my $quote_character = EMPTY_STRING;
6826 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6829 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6830 $quote_pos, $quote_depth, $max_token_index );
6834 # we didn't find an ending / on this line, so we bias towards
6836 if ( $divide_possible >= 0 ) {
6838 $msg .= "division (no ending / on this line)\n";
6842 # assuming a multi-line pattern ... this is risky, but division
6843 # does not seem possible. If this fails, it would either be due
6844 # to a syntax error in the code, or the division_expected logic
6845 # needs to be fixed.
6846 $msg = "multi-line pattern (division not possible)\n";
6851 # we found an ending /, so we bias slightly towards a pattern
6854 my $pattern_expected =
6855 pattern_expected( $i, $rtokens, $max_token_index );
6857 if ( $pattern_expected >= 0 ) {
6859 # pattern looks possible...
6860 if ( $divide_possible >= 0 ) {
6862 # Both pattern and divide can work here...
6864 # Increase weight of divide if a pure number follows
6865 $divide_possible += $next_token =~ /^\d+$/;
6867 # Check for known constants in the numerator, like 'pi'
6868 if ( $is_known_constant{$last_nonblank_token} ) {
6870 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6874 # A very common bare word in pattern expressions is 'ok'
6875 elsif ( $is_known_function{$last_nonblank_token} ) {
6877 "pattern (division works too but saw '$last_nonblank_token')\n";
6881 # If one rule is more definite, use it
6882 elsif ( $divide_possible > $pattern_expected ) {
6884 "division (more likely based on following tokens)\n";
6888 # otherwise, use the spacing rule
6889 elsif ($is_pattern_by_spacing) {
6891 "pattern (guess on spacing, but division possible too)\n";
6896 "division (guess on spacing, but pattern is possible too)\n";
6901 # divide_possible < 0 means divide can not work here
6904 $msg .= "pattern (division not possible)\n";
6908 # pattern does not look possible...
6911 if ( $divide_possible >= 0 ) {
6913 $msg .= "division (pattern not possible)\n";
6916 # Neither pattern nor divide look possible...go by spacing
6918 if ($is_pattern_by_spacing) {
6919 $msg .= "pattern (guess on spacing)\n";
6923 $msg .= "division (guess on spacing)\n";
6932 return ( $is_pattern, $msg );
6933 } ## end sub guess_if_pattern_or_division
6935 # try to resolve here-doc vs. shift by looking ahead for
6936 # non-code or the end token (currently only looks for end token)
6937 # returns 1 if it is probably a here doc, 0 if not
6938 sub guess_if_here_doc {
6940 # This is how many lines we will search for a target as part of the
6941 # guessing strategy. It is a constant because there is probably
6942 # little reason to change it.
6943 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6945 my $HERE_DOC_WINDOW = 40;
6947 my $next_token = shift;
6948 my $here_doc_expected = 0;
6951 my $msg = "checking <<";
6954 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6958 if ( $line =~ /^$next_token$/ ) {
6959 $msg .= " -- found target $next_token ahead $k lines\n";
6960 $here_doc_expected = 1; # got it
6963 last if ( $k >= $HERE_DOC_WINDOW );
6966 unless ($here_doc_expected) {
6968 if ( !defined($line) ) {
6969 $here_doc_expected = -1; # hit eof without seeing target
6970 $msg .= " -- must be shift; target $next_token not in file\n";
6973 else { # still unsure..taking a wild guess
6975 if ( !$is_constant{$current_package}{$next_token} ) {
6976 $here_doc_expected = 1;
6978 " -- guessing it's a here-doc ($next_token not a constant)\n";
6982 " -- guessing it's a shift ($next_token is a constant)\n";
6986 write_logfile_entry($msg);
6987 return $here_doc_expected;
6988 } ## end sub guess_if_here_doc
6990 #########i#############################################################
6991 # Tokenizer Routines for scanning identifiers and related items
6992 #######################################################################
6994 sub scan_bare_identifier_do {
6996 # this routine is called to scan a token starting with an alphanumeric
6997 # variable or package separator, :: or '.
6998 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6999 # $last_nonblank_type,@paren_type, $paren_depth
7001 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7005 my $package = undef;
7009 # we have to back up one pretoken at a :: since each : is one pretoken
7010 if ( $tok eq '::' ) { $i_beg-- }
7011 if ( $tok eq '->' ) { $i_beg-- }
7012 my $pos_beg = $rtoken_map->[$i_beg];
7013 pos($input_line) = $pos_beg;
7020 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7022 my $pos = pos($input_line);
7023 my $numc = $pos - $pos_beg;
7024 $tok = substr( $input_line, $pos_beg, $numc );
7026 # type 'w' includes anything without leading type info
7027 # ($,%,@,*) including something like abc::def::ghi
7030 my $sub_name = EMPTY_STRING;
7031 if ( defined($2) ) { $sub_name = $2; }
7032 if ( defined($1) ) {
7035 # patch: don't allow isolated package name which just ends
7036 # in the old style package separator (single quote). Example:
7038 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7042 $package =~ s/\'/::/g;
7043 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7044 $package =~ s/::$//;
7047 $package = $current_package;
7049 # patched for c043, part 1: keyword does not follow '->'
7050 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7055 # if it is a bareword.. patched for c043, part 2: not following '->'
7056 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7058 # check for v-string with leading 'v' type character
7059 # (This seems to have precedence over filehandle, type 'Y')
7060 if ( $tok =~ /^v\d[_\d]*$/ ) {
7062 # we only have the first part - something like 'v101' -
7064 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7065 $pos = pos($input_line);
7066 $numc = $pos - $pos_beg;
7067 $tok = substr( $input_line, $pos_beg, $numc );
7071 # warn if this version can't handle v-strings
7072 report_v_string($tok);
7075 elsif ( $is_constant{$package}{$sub_name} ) {
7079 # bareword after sort has implied empty prototype; for example:
7080 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7081 # This has priority over whatever the user has specified.
7082 elsif ($last_nonblank_token eq 'sort'
7083 && $last_nonblank_type eq 'k' )
7088 # Note: strangely, perl does not seem to really let you create
7089 # functions which act like eval and do, in the sense that eval
7090 # and do may have operators following the final }, but any operators
7091 # that you create with prototype (&) apparently do not allow
7092 # trailing operators, only terms. This seems strange.
7093 # If this ever changes, here is the update
7094 # to make perltidy behave accordingly:
7096 # elsif ( $is_block_function{$package}{$tok} ) {
7097 # $tok='eval'; # patch to do braces like eval - doesn't work
7100 # FIXME: This could become a separate type to allow for different
7102 elsif ( $is_block_function{$package}{$sub_name} ) {
7105 elsif ( $is_block_list_function{$package}{$sub_name} ) {
7108 elsif ( $is_user_function{$package}{$sub_name} ) {
7110 $prototype = $user_function_prototype{$package}{$sub_name};
7113 # check for indirect object
7116 # added 2001-03-27: must not be followed immediately by '('
7118 ( $input_line !~ m/\G\(/gc )
7123 # preceded by keyword like 'print', 'printf' and friends
7124 $is_indirect_object_taker{$last_nonblank_token}
7126 # or preceded by something like 'print(' or 'printf('
7128 ( $last_nonblank_token eq '(' )
7129 && $is_indirect_object_taker{ $paren_type[$paren_depth]
7137 # may not be indirect object unless followed by a space;
7138 # updated 2021-01-16 to consider newline to be a space.
7139 # updated for case b990 to look for either ';' or space
7140 if ( pos($input_line) == length($input_line)
7141 || $input_line =~ m/\G[;\s]/gc )
7146 # Perl's indirect object notation is a very bad
7147 # thing and can cause subtle bugs, especially for
7148 # beginning programmers. And I haven't even been
7149 # able to figure out a sane warning scheme which
7150 # doesn't get in the way of good scripts.
7152 # Complain if a filehandle has any lower case
7153 # letters. This is suggested good practice.
7154 # Use 'sub_name' because something like
7155 # main::MYHANDLE is ok for filehandle
7156 if ( $sub_name =~ /[a-z]/ ) {
7158 # could be bug caused by older perltidy if
7160 if ( $input_line =~ m/\G\s*\(/gc ) {
7162 "Caution: unknown word '$tok' in indirect object slot\n"
7168 # bareword not followed by a space -- may not be filehandle
7169 # (may be function call defined in a 'use' statement)
7176 # Now we must convert back from character position
7177 # to pre_token index.
7178 # I don't think an error flag can occur here ..but who knows
7181 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7183 warning("scan_bare_identifier: Possibly invalid tokenization\n");
7187 # no match but line not blank - could be syntax error
7188 # perl will take '::' alone without complaint
7192 # change this warning to log message if it becomes annoying
7193 warning("didn't find identifier after leading ::\n");
7195 return ( $i, $tok, $type, $prototype );
7196 } ## end sub scan_bare_identifier_do
7200 # This is the new scanner and will eventually replace scan_identifier.
7201 # Only type 'sub' and 'package' are implemented.
7202 # Token types $ * % @ & -> are not yet implemented.
7204 # Scan identifier following a type token.
7205 # The type of call depends on $id_scan_state: $id_scan_state = ''
7206 # for starting call, in which case $tok must be the token defining
7209 # If the type token is the last nonblank token on the line, a value
7210 # of $id_scan_state = $tok is returned, indicating that further
7211 # calls must be made to get the identifier. If the type token is
7212 # not the last nonblank token on the line, the identifier is
7213 # scanned and handled and a value of '' is returned.
7214 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
7215 # $statement_type, $tokenizer_self
7217 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7220 use constant DEBUG_NSCAN => 0;
7221 my $type = EMPTY_STRING;
7222 my ( $i_beg, $pos_beg );
7224 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7225 #my ($a,$b,$c) = caller;
7226 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7228 # on re-entry, start scanning at first token on the line
7229 if ($id_scan_state) {
7231 $type = EMPTY_STRING;
7234 # on initial entry, start scanning just after type token
7237 $id_scan_state = $tok;
7241 # find $i_beg = index of next nonblank token,
7242 # and handle empty lines
7244 my $next_nonblank_token = $rtokens->[$i_beg];
7245 if ( $i_beg > $max_token_index ) {
7250 # only a '#' immediately after a '$' is not a comment
7251 if ( $next_nonblank_token eq '#' ) {
7252 unless ( $tok eq '$' ) {
7257 if ( $next_nonblank_token =~ /^\s/ ) {
7258 ( $next_nonblank_token, $i_beg ) =
7259 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7261 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7267 # handle non-blank line; identifier, if any, must follow
7268 unless ($blank_line) {
7270 if ( $is_sub{$id_scan_state} ) {
7271 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
7273 input_line => $input_line,
7278 rtokens => $rtokens,
7279 rtoken_map => $rtoken_map,
7280 id_scan_state => $id_scan_state,
7281 max_token_index => $max_token_index
7286 elsif ( $is_package{$id_scan_state} ) {
7287 ( $i, $tok, $type ) =
7288 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
7289 $rtoken_map, $max_token_index );
7290 $id_scan_state = EMPTY_STRING;
7294 warning("invalid token in scan_id: $tok\n");
7295 $id_scan_state = EMPTY_STRING;
7299 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
7304 Program bug in scan_id: undefined type but scan_state=$id_scan_state
7308 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7310 report_definite_bug();
7315 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7317 return ( $i, $tok, $type, $id_scan_state );
7318 } ## end sub scan_id_do
7320 sub check_prototype {
7321 my ( $proto, $package, $subname ) = @_;
7322 return unless ( defined($package) && defined($subname) );
7323 if ( defined($proto) ) {
7324 $proto =~ s/^\s*\(\s*//;
7325 $proto =~ s/\s*\)$//;
7327 $is_user_function{$package}{$subname} = 1;
7328 $user_function_prototype{$package}{$subname} = "($proto)";
7330 # prototypes containing '&' must be treated specially..
7331 if ( $proto =~ /\&/ ) {
7333 # right curly braces of prototypes ending in
7334 # '&' may be followed by an operator
7335 if ( $proto =~ /\&$/ ) {
7336 $is_block_function{$package}{$subname} = 1;
7339 # right curly braces of prototypes NOT ending in
7340 # '&' may NOT be followed by an operator
7341 elsif ( $proto !~ /\&$/ ) {
7342 $is_block_list_function{$package}{$subname} = 1;
7347 $is_constant{$package}{$subname} = 1;
7351 $is_user_function{$package}{$subname} = 1;
7354 } ## end sub check_prototype
7356 sub do_scan_package {
7358 # do_scan_package parses a package name
7359 # it is called with $i_beg equal to the index of the first nonblank
7360 # token following a 'package' token.
7361 # USES GLOBAL VARIABLES: $current_package,
7364 # package NAMESPACE VERSION
7365 # package NAMESPACE BLOCK
7366 # package NAMESPACE VERSION BLOCK
7368 # If VERSION is provided, package sets the $VERSION variable in the given
7369 # namespace to a version object with the VERSION provided. VERSION must be
7370 # a "strict" style version number as defined by the version module: a
7371 # positive decimal number (integer or decimal-fraction) without
7372 # exponentiation or else a dotted-decimal v-string with a leading 'v'
7373 # character and at least three components.
7374 # reference http://perldoc.perl.org/functions/package.html
7376 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7379 my $package = undef;
7380 my $pos_beg = $rtoken_map->[$i_beg];
7381 pos($input_line) = $pos_beg;
7383 # handle non-blank line; package name, if any, must follow
7384 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7386 $package = ( defined($1) && $1 ) ? $1 : 'main';
7387 $package =~ s/\'/::/g;
7388 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7389 $package =~ s/::$//;
7390 my $pos = pos($input_line);
7391 my $numc = $pos - $pos_beg;
7392 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7395 # Now we must convert back from character position
7396 # to pre_token index.
7397 # I don't think an error flag can occur here ..but ?
7400 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7401 if ($error) { warning("Possibly invalid package\n") }
7402 $current_package = $package;
7404 # we should now have package NAMESPACE
7405 # now expecting VERSION, BLOCK, or ; to follow ...
7406 # package NAMESPACE VERSION
7407 # package NAMESPACE BLOCK
7408 # package NAMESPACE VERSION BLOCK
7409 my ( $next_nonblank_token, $i_next ) =
7410 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7412 # check that something recognizable follows, but do not parse.
7413 # A VERSION number will be parsed later as a number or v-string in the
7414 # normal way. What is important is to set the statement type if
7415 # everything looks okay so that the operator_expected() routine
7416 # knows that the number is in a package statement.
7417 # Examples of valid primitive tokens that might follow are:
7419 # FIX: added a '#' since a side comment may also follow
7420 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
7421 $statement_type = $tok;
7425 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7430 # no match but line not blank --
7431 # could be a label with name package, like package: , for example.
7436 return ( $i, $tok, $type );
7437 } ## end sub do_scan_package
7439 my %is_special_variable_char;
7443 # These are the only characters which can (currently) form special
7444 # variables, like $^W: (issue c066).
7446 qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7447 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7450 { ## begin closure for sub scan_complex_identifier
7452 use constant DEBUG_SCAN_ID => 0;
7454 # These are the possible states for this scanner:
7455 my $scan_state_SIGIL = '$';
7456 my $scan_state_ALPHA = 'A';
7457 my $scan_state_COLON = ':';
7458 my $scan_state_LPAREN = '(';
7459 my $scan_state_RPAREN = ')';
7460 my $scan_state_AMPERSAND = '&';
7461 my $scan_state_SPLIT = '^';
7463 # Only these non-blank states may be returned to caller:
7464 my %is_returnable_scan_state = (
7465 $scan_state_SIGIL => 1,
7466 $scan_state_AMPERSAND => 1,
7469 # USES GLOBAL VARIABLES:
7470 # $context, $last_nonblank_token, $last_nonblank_type
7475 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7476 $expecting, $container_type );
7478 #-------------------------------------------
7479 # my variables, re-initialized on each call:
7480 #-------------------------------------------
7481 my $i_begin; # starting index $i
7482 my $type; # returned identifier type
7483 my $tok_begin; # starting token
7484 my $tok; # returned token
7485 my $id_scan_state_begin; # starting scan state
7486 my $identifier_begin; # starting identifier
7487 my $i_save; # a last good index, in case of error
7488 my $message; # hold error message for log file
7490 my $last_tok_is_blank;
7491 my $in_prototype_or_signature;
7496 sub initialize_my_scan_id_vars {
7498 # Initialize all 'my' vars on entry
7500 $type = EMPTY_STRING;
7501 $tok_begin = $rtokens->[$i_begin];
7503 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7504 $id_scan_state_begin = $id_scan_state;
7505 $identifier_begin = $identifier;
7508 $message = EMPTY_STRING;
7509 $tok_is_blank = undef; # a flag to speed things up
7510 $last_tok_is_blank = undef;
7512 $in_prototype_or_signature =
7513 $container_type && $container_type =~ /^sub\b/;
7515 # these flags will be used to help figure out the type:
7519 # allow old package separator (') except in 'use' statement
7520 $allow_tick = ( $last_nonblank_token ne 'use' );
7522 } ## end sub initialize_my_scan_id_vars
7524 #----------------------------------
7525 # Routines for handling scan states
7526 #----------------------------------
7527 sub do_id_scan_state_dollar {
7529 # We saw a sigil, now looking to start a variable name
7531 if ( $tok eq '$' ) {
7533 $identifier .= $tok;
7535 # we've got a punctuation variable if end of line (punct.t)
7536 if ( $i == $max_token_index ) {
7538 $id_scan_state = EMPTY_STRING;
7541 elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7543 $id_scan_state = $scan_state_COLON; # now need ::
7544 $identifier .= $tok;
7546 elsif ( $tok eq '::' ) {
7547 $id_scan_state = $scan_state_ALPHA;
7548 $identifier .= $tok;
7551 # POSTDEFREF ->@ ->% ->& ->*
7552 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7553 $identifier .= $tok;
7555 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7557 $id_scan_state = $scan_state_COLON; # now need ::
7558 $identifier .= $tok;
7560 # Perl will accept leading digits in identifiers,
7561 # although they may not always produce useful results.
7562 # Something like $main::0 is ok. But this also works:
7564 # sub howdy::123::bubba{ print "bubba $54321!\n" }
7565 # howdy::123::bubba();
7568 elsif ( $tok eq '#' ) {
7570 my $is_punct_var = $identifier eq '$$';
7572 # side comment or identifier?
7575 # A '#' starts a comment if it follows a space. For example,
7576 # the following is equivalent to $ans=40.
7581 # a # inside a prototype or signature can only start a
7583 && !$in_prototype_or_signature
7585 # these are valid punctuation vars: *# %# @# $#
7586 # May also be '$#array' or POSTDEFREF ->$#
7587 && ( $identifier =~ /^[\%\@\$\*]$/
7588 || $identifier =~ /\$$/ )
7590 # but a '#' after '$$' is a side comment; see c147
7595 $identifier .= $tok; # keep same state, a $ could follow
7599 # otherwise it is a side comment
7600 if ( $identifier eq '->' ) { }
7601 elsif ($is_punct_var) { $type = 'i' }
7602 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7603 else { $type = 'i' }
7605 $id_scan_state = EMPTY_STRING;
7609 elsif ( $tok eq '{' ) {
7611 # check for something like ${#} or ${©}
7615 || $identifier eq '@'
7616 || $identifier eq '$#'
7618 && $i + 2 <= $max_token_index
7619 && $rtokens->[ $i + 2 ] eq '}'
7620 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7623 my $next2 = $rtokens->[ $i + 2 ];
7624 my $next1 = $rtokens->[ $i + 1 ];
7625 $identifier .= $tok . $next1 . $next2;
7627 $id_scan_state = EMPTY_STRING;
7631 # skip something like ${xxx} or ->{
7632 $id_scan_state = EMPTY_STRING;
7634 # if this is the first token of a line, any tokens for this
7635 # identifier have already been accumulated
7636 if ( $identifier eq '$' || $i == 0 ) {
7637 $identifier = EMPTY_STRING;
7643 # space ok after leading $ % * & @
7644 elsif ( $tok =~ /^\s*$/ ) {
7648 # note: an id with a leading '&' does not actually come this way
7649 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7651 if ( length($identifier) > 1 ) {
7652 $id_scan_state = EMPTY_STRING;
7654 $type = 'i'; # probably punctuation variable
7658 # fix c139: trim line-ending type 't'
7659 if ( $i == $max_token_index ) {
7664 # spaces after $'s are common, and space after @
7665 # is harmless, so only complain about space
7666 # after other type characters. Space after $ and
7667 # @ will be removed in formatting. Report space
7668 # after % and * because they might indicate a
7669 # parsing error. In other words '% ' might be a
7670 # modulo operator. Delete this warning if it
7672 elsif ( $identifier !~ /^[\@\$]$/ ) {
7674 "Space in identifier, following $identifier\n";
7677 ## ok: silently accept space after '$' and '@' sigils
7682 elsif ( $identifier eq '->' ) {
7684 # space after '->' is ok except at line end ..
7685 # so trim line-ending in type '->' (fixes c139)
7686 if ( $i == $max_token_index ) {
7692 # stop at space after something other than -> or sigil
7693 # Example of what can arrive here:
7694 # eval { $MyClass->$$ };
7696 $id_scan_state = EMPTY_STRING;
7701 elsif ( $tok eq '^' ) {
7703 # check for some special variables like $^ $^W
7704 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7705 $identifier .= $tok;
7708 # There may be one more character, not a space, after the ^
7709 my $next1 = $rtokens->[ $i + 1 ];
7710 my $chr = substr( $next1, 0, 1 );
7711 if ( $is_special_variable_char{$chr} ) {
7713 # It is something like $^W
7714 # Test case (c066) : $^Oeq'linux'
7716 $identifier .= $next1;
7718 # If pretoken $next1 is more than one character long,
7719 # set a flag indicating that it needs to be split.
7721 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7726 # Simple test case (c065): '$aa=$^if($bb)';
7727 $id_scan_state = EMPTY_STRING;
7731 $id_scan_state = EMPTY_STRING;
7735 else { # something else
7737 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7739 # We might be in an extrusion of
7740 # sub foo2 ( $first, $, $third ) {
7741 # looking at a line starting with a comma, like
7744 # in this case the comma ends the signature variable
7745 # '$' which will have been previously marked type 't'
7747 if ( $i == $i_begin ) {
7748 $identifier = EMPTY_STRING;
7749 $type = EMPTY_STRING;
7752 # at a # we have to mark as type 't' because more may
7753 # follow, otherwise, in a signature we can let '$' be an
7754 # identifier here for better formatting.
7755 # See 'mangle4.in' for a test case.
7758 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
7763 $id_scan_state = EMPTY_STRING;
7766 # check for various punctuation variables
7767 elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
7768 $identifier .= $tok;
7771 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
7773 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7775 $identifier .= $tok;
7778 elsif ( $identifier eq '$#' ) {
7780 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7782 # perl seems to allow just these: $#: $#- $#+
7783 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7785 $identifier .= $tok;
7789 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7792 elsif ( $identifier eq '$$' ) {
7794 # perl does not allow references to punctuation
7795 # variables without braces. For example, this
7799 # You would have to use
7802 # '$$' alone is punctuation variable for PID
7804 if ( $tok eq '{' ) { $type = 't' }
7805 else { $type = 'i' }
7807 elsif ( $identifier eq '->' ) {
7812 if ( length($identifier) == 1 ) {
7813 $identifier = EMPTY_STRING;
7816 $id_scan_state = EMPTY_STRING;
7819 } ## end sub do_id_scan_state_dollar
7821 sub do_id_scan_state_alpha {
7823 # looking for alphanumeric after ::
7824 $tok_is_blank = $tok =~ /^\s*$/;
7826 if ( $tok =~ /^\w/ ) { # found it
7827 $identifier .= $tok;
7828 $id_scan_state = $scan_state_COLON; # now need ::
7831 elsif ( $tok eq "'" && $allow_tick ) {
7832 $identifier .= $tok;
7833 $id_scan_state = $scan_state_COLON; # now need ::
7836 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7837 $id_scan_state = $scan_state_LPAREN;
7838 $identifier .= $tok;
7840 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7841 $id_scan_state = $scan_state_RPAREN;
7842 $identifier .= $tok;
7845 $id_scan_state = EMPTY_STRING;
7849 } ## end sub do_id_scan_state_alpha
7851 sub do_id_scan_state_colon {
7853 # looking for possible :: after alphanumeric
7855 $tok_is_blank = $tok =~ /^\s*$/;
7857 if ( $tok eq '::' ) { # got it
7858 $identifier .= $tok;
7859 $id_scan_state = $scan_state_ALPHA; # now require alpha
7861 elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
7862 $identifier .= $tok;
7863 $id_scan_state = $scan_state_COLON; # now need ::
7866 elsif ( $tok eq "'" && $allow_tick ) { # tick
7868 if ( $is_keyword{$identifier} ) {
7869 $id_scan_state = EMPTY_STRING; # that's all
7873 $identifier .= $tok;
7876 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7877 $id_scan_state = $scan_state_LPAREN;
7878 $identifier .= $tok;
7880 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7881 $id_scan_state = $scan_state_RPAREN;
7882 $identifier .= $tok;
7885 $id_scan_state = EMPTY_STRING; # that's all
7889 } ## end sub do_id_scan_state_colon
7891 sub do_id_scan_state_left_paren {
7893 # looking for possible '(' of a prototype
7895 if ( $tok eq '(' ) { # got it
7896 $identifier .= $tok;
7897 $id_scan_state = $scan_state_RPAREN; # now find the end of it
7899 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
7900 $identifier .= $tok;
7904 $id_scan_state = EMPTY_STRING; # that's all - no prototype
7908 } ## end sub do_id_scan_state_left_paren
7910 sub do_id_scan_state_right_paren {
7912 # looking for a ')' of prototype to close a '('
7914 $tok_is_blank = $tok =~ /^\s*$/;
7916 if ( $tok eq ')' ) { # got it
7917 $identifier .= $tok;
7918 $id_scan_state = EMPTY_STRING; # all done
7920 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7921 $identifier .= $tok;
7923 else { # probable error in script, but keep going
7924 warning("Unexpected '$tok' while seeking end of prototype\n");
7925 $identifier .= $tok;
7928 } ## end sub do_id_scan_state_right_paren
7930 sub do_id_scan_state_ampersand {
7932 # Starting sub call after seeing an '&'
7934 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
7935 $id_scan_state = $scan_state_COLON; # now need ::
7937 $identifier .= $tok;
7939 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7940 $id_scan_state = $scan_state_COLON; # now need ::
7942 $identifier .= $tok;
7944 elsif ( $tok =~ /^\s*$/ ) { # allow space
7947 # fix c139: trim line-ending type 't'
7948 if ( length($identifier) == 1 && $i == $max_token_index ) {
7953 elsif ( $tok eq '::' ) { # leading ::
7954 $id_scan_state = $scan_state_ALPHA; # accept alpha next
7955 $identifier .= $tok;
7957 elsif ( $tok eq '{' ) {
7958 if ( $identifier eq '&' || $i == 0 ) {
7959 $identifier = EMPTY_STRING;
7962 $id_scan_state = EMPTY_STRING;
7964 elsif ( $tok eq '^' ) {
7965 if ( $identifier eq '&' ) {
7967 # Special variable (c066)
7968 $identifier .= $tok;
7971 # There may be one more character, not a space, after the ^
7972 my $next1 = $rtokens->[ $i + 1 ];
7973 my $chr = substr( $next1, 0, 1 );
7974 if ( $is_special_variable_char{$chr} ) {
7976 # It is something like &^O
7978 $identifier .= $next1;
7980 # If pretoken $next1 is more than one character long,
7981 # set a flag indicating that it needs to be split.
7983 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7988 $id_scan_state = EMPTY_STRING;
7992 $identifier = EMPTY_STRING;
7998 # punctuation variable?
7999 # testfile: cunningham4.pl
8001 # We have to be careful here. If we are in an unknown state,
8002 # we will reject the punctuation variable. In the following
8003 # example the '&' is a binary operator but we are in an unknown
8004 # state because there is no sigil on 'Prima', so we don't
8005 # know what it is. But it is a bad guess that
8006 # '&~' is a function variable.
8007 # $self->{text}->{colorMap}->[
8008 # Prima::PodView::COLOR_CODE_FOREGROUND
8009 # & ~tb::COLOR_INDEX ] =
8012 # Fix for case c033: a '#' here starts a side comment
8013 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
8014 $identifier .= $tok;
8017 $identifier = EMPTY_STRING;
8021 $id_scan_state = EMPTY_STRING;
8024 } ## end sub do_id_scan_state_ampersand
8026 #-------------------
8027 # hash of scanner subs
8028 #-------------------
8029 my $scan_identifier_code = {
8030 $scan_state_SIGIL => \&do_id_scan_state_dollar,
8031 $scan_state_ALPHA => \&do_id_scan_state_alpha,
8032 $scan_state_COLON => \&do_id_scan_state_colon,
8033 $scan_state_LPAREN => \&do_id_scan_state_left_paren,
8034 $scan_state_RPAREN => \&do_id_scan_state_right_paren,
8035 $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8038 sub scan_complex_identifier {
8040 # This routine assembles tokens into identifiers. It maintains a
8041 # scan state, id_scan_state. It updates id_scan_state based upon
8042 # current id_scan_state and token, and returns an updated
8043 # id_scan_state and the next index after the identifier.
8045 # This routine now serves a a backup for sub scan_simple_identifier
8046 # which handles most identifiers.
8049 $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
8050 $expecting, $container_type
8053 # return flag telling caller to split the pretoken
8054 my $split_pretoken_flag;
8056 ####################
8057 # Initialize my vars
8058 ####################
8060 initialize_my_scan_id_vars();
8062 #########################################################
8063 # get started by defining a type and a state if necessary
8064 #########################################################
8066 if ( !$id_scan_state ) {
8067 $context = UNKNOWN_CONTEXT;
8070 if ( $tok eq '>' ) {
8076 if ( $tok eq '$' || $tok eq '*' ) {
8077 $id_scan_state = $scan_state_SIGIL;
8078 $context = SCALAR_CONTEXT;
8080 elsif ( $tok eq '%' || $tok eq '@' ) {
8081 $id_scan_state = $scan_state_SIGIL;
8082 $context = LIST_CONTEXT;
8084 elsif ( $tok eq '&' ) {
8085 $id_scan_state = $scan_state_AMPERSAND;
8087 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8088 $saw_alpha = 0; # 'sub' is considered type info here
8089 $id_scan_state = $scan_state_SIGIL;
8091 SPACE; # need a space to separate sub from sub name
8093 elsif ( $tok eq '::' ) {
8094 $id_scan_state = $scan_state_ALPHA;
8096 elsif ( $tok =~ /^\w/ ) {
8097 $id_scan_state = $scan_state_COLON;
8100 elsif ( $tok eq '->' ) {
8101 $id_scan_state = $scan_state_SIGIL;
8105 # shouldn't happen: bad call parameter
8107 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
8108 if (DEVEL_MODE) { Fault($msg) }
8109 if ( !$tokenizer_self->[_in_error_] ) {
8111 $tokenizer_self->[_in_error_] = 1;
8113 $id_scan_state = EMPTY_STRING;
8116 $saw_type = !$saw_alpha;
8120 $saw_alpha = ( $tok =~ /^\w/ );
8121 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8123 # check for a valid starting state
8124 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8126 Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8131 ###############################
8132 # loop to gather the identifier
8133 ###############################
8137 while ( $i < $max_token_index && $id_scan_state ) {
8139 # Be sure we have code to handle this state before we proceed
8140 my $code = $scan_identifier_code->{$id_scan_state};
8143 if ( $id_scan_state eq $scan_state_SPLIT ) {
8144 ## OK: this is the signal to exit and split the pretoken
8147 # unknown state - should not happen
8151 Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8152 Scan state at sub entry was '$id_scan_state_begin'
8155 $id_scan_state = EMPTY_STRING;
8161 # Remember the starting index for progress check below
8162 my $i_start_loop = $i;
8164 $last_tok_is_blank = $tok_is_blank;
8165 if ($tok_is_blank) { $tok_is_blank = undef }
8166 else { $i_save = $i }
8168 $tok = $rtokens->[ ++$i ];
8170 # patch to make digraph :: if necessary
8171 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8178 # check for forward progress: a decrease in the index $i
8179 # implies that scanning has finished
8180 last if ( $i <= $i_start_loop );
8182 } ## end of main loop
8188 # Be sure a valid state is returned
8189 if ($id_scan_state) {
8191 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8193 if ( $id_scan_state eq $scan_state_SPLIT ) {
8194 $split_pretoken_flag = 1;
8197 if ( $id_scan_state eq $scan_state_RPAREN ) {
8199 "Hit end of line while seeking ) to end prototype\n");
8202 $id_scan_state = EMPTY_STRING;
8205 # Patch: the deprecated variable $# does not combine with anything
8207 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
8210 # Be sure the token index is valid
8211 if ( $i < 0 ) { $i = 0 }
8213 # Be sure a token type is defined
8220 # The type without the -> should be the same as with the -> so
8221 # that if they get separated we get the same bond strengths,
8223 if ( $identifier =~ /^->/
8224 && $last_nonblank_type eq 'w'
8225 && substr( $identifier, 2, 1 ) =~ /^\w/ )
8229 else { $type = 'i' }
8231 elsif ( $identifier eq '->' ) {
8235 ( length($identifier) > 1 )
8237 # In something like '@$=' we have an identifier '@$'
8238 # In something like '$${' we have type '$$' (and only
8239 # part of an identifier)
8240 && !( $identifier =~ /\$$/ && $tok eq '{' )
8242 ## && ( $identifier !~ /^(sub |package )$/ )
8243 && $identifier ne 'sub '
8244 && $identifier ne 'package '
8249 else { $type = 't' }
8251 elsif ($saw_alpha) {
8253 # type 'w' includes anything without leading type info
8254 # ($,%,@,*) including something like abc::def::ghi
8257 # Fix for b1337, if restarting scan after line break between
8258 # '->' or sigil and identifier name, use type 'i'
8259 if ( $id_scan_state_begin
8260 && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8266 $type = EMPTY_STRING;
8267 } # this can happen on a restart
8270 # See if we formed an identifier...
8273 if ($message) { write_logfile_entry($message) }
8276 # did not find an identifier, back up
8284 DEBUG_SCAN_ID && do {
8285 my ( $a, $b, $c ) = caller;
8287 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8289 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8291 return ( $i, $tok, $type, $id_scan_state, $identifier,
8292 $split_pretoken_flag );
8293 } ## end sub scan_complex_identifier
8294 } ## end closure for sub scan_complex_identifier
8296 { ## closure for sub do_scan_sub
8298 my %warn_if_lexical;
8302 # lexical subs with these names can cause parsing errors in this version
8303 my @q = qw( m q qq qr qw qx s tr y );
8304 @{warn_if_lexical}{@q} = (1) x scalar(@q);
8307 # saved package and subnames in case prototype is on separate line
8308 my ( $package_saved, $subname_saved );
8310 # initialize subname each time a new 'sub' keyword is encountered
8311 sub initialize_subname {
8312 $package_saved = EMPTY_STRING;
8313 $subname_saved = EMPTY_STRING;
8320 PROTOTYPE_CALL => 3,
8325 # do_scan_sub parses a sub name and prototype.
8327 # At present there are three basic CALL TYPES which are
8328 # distinguished by the starting value of '$tok':
8329 # 1. $tok='sub', id_scan_state='sub'
8330 # it is called with $i_beg equal to the index of the first nonblank
8331 # token following a 'sub' token.
8332 # 2. $tok='(', id_scan_state='sub',
8333 # it is called with $i_beg equal to the index of a '(' which may
8334 # start a prototype.
8335 # 3. $tok='prototype', id_scan_state='prototype'
8336 # it is called with $i_beg equal to the index of a '(' which is
8337 # preceded by ': prototype' and has $id_scan_state eq 'prototype'
8341 # A single type 1 call will get both the sub and prototype
8342 # sub foo1 ( $$ ) { }
8345 # The subname will be obtained with a 'sub' call
8346 # The prototype on line 2 will be obtained with a '(' call
8352 # The subname will be obtained with a 'sub' call
8353 # The prototype will be obtained with a 'prototype' call
8354 # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8355 # ^ <---type 1 ^ <---type 3
8357 # TODO: add future error checks to be sure we have a valid
8358 # sub name. For example, 'sub &doit' is wrong. Also, be sure
8359 # a name is given if and only if a non-anonymous sub is
8361 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8362 # $in_attribute_list, %saw_function_definition,
8365 my ($rinput_hash) = @_;
8367 my $input_line = $rinput_hash->{input_line};
8368 my $i = $rinput_hash->{i};
8369 my $i_beg = $rinput_hash->{i_beg};
8370 my $tok = $rinput_hash->{tok};
8371 my $type = $rinput_hash->{type};
8372 my $rtokens = $rinput_hash->{rtokens};
8373 my $rtoken_map = $rinput_hash->{rtoken_map};
8374 my $id_scan_state = $rinput_hash->{id_scan_state};
8375 my $max_token_index = $rinput_hash->{max_token_index};
8379 # Determine the CALL TYPE
8384 $tok eq 'prototype' ? PROTOTYPE_CALL
8385 : $tok eq '(' ? PAREN_CALL
8388 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
8389 my $subname = $subname_saved;
8390 my $package = $package_saved;
8395 my $pos_beg = $rtoken_map->[$i_beg];
8396 pos($input_line) = $pos_beg;
8398 # Look for the sub NAME if this is a SUB call
8400 $call_type == SUB_CALL
8401 && $input_line =~ m/\G\s*
8402 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
8403 (\w+) # NAME - required
8410 my $is_lexical_sub =
8411 $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8412 if ( $is_lexical_sub && $1 ) {
8413 warning("'my' sub $subname cannot be in package '$1'\n");
8414 $is_lexical_sub = 0;
8417 if ($is_lexical_sub) {
8419 # lexical subs use the block sequence number as a package name
8421 $current_sequence_number[BRACE][ $current_depth[BRACE] ];
8422 $seqno = 1 unless ( defined($seqno) );
8424 if ( $warn_if_lexical{$subname} ) {
8426 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8431 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8432 $package =~ s/\'/::/g;
8433 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
8434 $package =~ s/::$//;
8437 my $pos = pos($input_line);
8438 my $numc = $pos - $pos_beg;
8439 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
8442 # remember the sub name in case another call is needed to
8444 $package_saved = $package;
8445 $subname_saved = $subname;
8448 # Now look for PROTO ATTRS for all call types
8449 # Look for prototype/attributes which are usually on the same
8450 # line as the sub name but which might be on a separate line.
8451 # For example, we might have an anonymous sub with attributes,
8452 # or a prototype on a separate line from its sub name
8454 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8455 # does not look like a prototype, we assume it is a SIGNATURE and we
8456 # will stop and let the the standard tokenizer handle it. In
8457 # particular, we stop if we see any nested parens, braces, or commas.
8458 # Also note, a valid prototype cannot contain any alphabetic character
8459 # -- see https://perldoc.perl.org/perlsub
8460 # But it appears that an underscore is valid in a prototype, so the
8461 # regex below uses [A-Za-z] rather than \w
8462 # This is the old regex which has been replaced:
8463 # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
8464 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8466 $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
8467 (\s*:)? # ATTRS leading ':'
8475 # Append the prototype to the starting token if it is 'sub' or
8476 # 'prototype'. This is not necessary but for compatibility with
8477 # previous versions when the -csc flag is used:
8478 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
8482 # If we just entered the sub at an opening paren on this call, not
8483 # a following :prototype, label it with the previous token. This is
8484 # necessary to propagate the sub name to its opening block.
8485 elsif ( $call_type == PAREN_CALL ) {
8486 $tok = $last_nonblank_token;
8491 # Patch part #1 to fixes cases b994 and b1053:
8492 # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8493 # 'sub : lvalue { ...'
8495 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
8500 # ATTRS: if there are attributes, back up and let the ':' be
8501 # found later by the scanner.
8502 my $pos = pos($input_line);
8504 $pos -= length($attrs);
8507 my $next_nonblank_token = $tok;
8509 # catch case of line with leading ATTR ':' after anonymous sub
8510 if ( $pos == $pos_beg && $tok eq ':' ) {
8512 $in_attribute_list = 1;
8515 # Otherwise, if we found a match we must convert back from
8516 # string position to the pre_token index for continued parsing.
8519 # I don't think an error flag can occur here ..but ?
8521 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8523 if ($error) { warning("Possibly invalid sub\n") }
8525 # Patch part #2 to fixes cases b994 and b1053:
8526 # Do not let spaces be part of the token of an anonymous sub
8527 # keyword which we marked as type 'k' above...i.e. for
8529 # 'sub : lvalue { ...'
8530 # Back up and let it be parsed as a blank
8534 && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8539 # check for multiple definitions of a sub
8540 ( $next_nonblank_token, my $i_next ) =
8541 find_next_nonblank_token_on_this_line( $i, $rtokens,
8545 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8546 { # skip blank or side comment
8547 my ( $rpre_tokens, $rpre_types ) =
8548 peek_ahead_for_n_nonblank_pre_tokens(1);
8549 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8550 $next_nonblank_token = $rpre_tokens->[0];
8553 $next_nonblank_token = '}';
8557 # See what's next...
8558 if ( $next_nonblank_token eq '{' ) {
8561 # Check for multiple definitions of a sub, but
8562 # it is ok to have multiple sub BEGIN, etc,
8563 # so we do not complain if name is all caps
8564 if ( $saw_function_definition{$subname}{$package}
8565 && $subname !~ /^[A-Z]+$/ )
8567 my $lno = $saw_function_definition{$subname}{$package};
8568 if ( $package =~ /^\d/ ) {
8570 "already saw definition of lexical 'sub $subname' at line $lno\n"
8576 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8577 ) unless (DEVEL_MODE);
8580 $saw_function_definition{$subname}{$package} =
8581 $tokenizer_self->[_last_line_number_];
8584 elsif ( $next_nonblank_token eq ';' ) {
8586 elsif ( $next_nonblank_token eq '}' ) {
8589 # ATTRS - if an attribute list follows, remember the name
8590 # of the sub so the next opening brace can be labeled.
8591 # Setting 'statement_type' causes any ':'s to introduce
8593 elsif ( $next_nonblank_token eq ':' ) {
8594 if ( $call_type == SUB_CALL ) {
8596 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8600 # if we stopped before an open paren ...
8601 elsif ( $next_nonblank_token eq '(' ) {
8603 # If we DID NOT see this paren above then it must be on the
8604 # next line so we will set a flag to come back here and see if
8607 # Otherwise, we assume it is a SIGNATURE rather than a
8608 # PROTOTYPE and let the normal tokenizer handle it as a list
8609 if ( !$saw_opening_paren ) {
8610 $id_scan_state = 'sub'; # we must come back to get proto
8612 if ( $call_type == SUB_CALL ) {
8614 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8617 elsif ($next_nonblank_token) { # EOF technically ok
8618 $subname = EMPTY_STRING unless defined($subname);
8620 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8623 check_prototype( $proto, $package, $subname );
8626 # no match to either sub name or prototype, but line not blank
8630 return ( $i, $tok, $type, $id_scan_state );
8631 } ## end sub do_scan_sub
8634 #########i###############################################################
8635 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8636 #########################################################################
8638 sub find_next_nonblank_token {
8639 my ( $i, $rtokens, $max_token_index ) = @_;
8641 # Returns the next nonblank token after the token at index $i
8642 # To skip past a side comment, and any subsequent block comments
8643 # and blank lines, call with i=$max_token_index
8645 if ( $i >= $max_token_index ) {
8646 if ( !peeked_ahead() ) {
8648 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8652 my $next_nonblank_token = $rtokens->[ ++$i ];
8653 return ( SPACE, $i ) unless defined($next_nonblank_token);
8655 if ( $next_nonblank_token =~ /^\s*$/ ) {
8656 $next_nonblank_token = $rtokens->[ ++$i ];
8657 return ( SPACE, $i ) unless defined($next_nonblank_token);
8659 return ( $next_nonblank_token, $i );
8660 } ## end sub find_next_nonblank_token
8662 sub find_next_noncomment_type {
8663 my ( $i, $rtokens, $max_token_index ) = @_;
8665 # Given the current character position, look ahead past any comments
8666 # and blank lines and return the next token, including digraphs and
8669 my ( $next_nonblank_token, $i_next ) =
8670 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8672 # skip past any side comment
8673 if ( $next_nonblank_token eq '#' ) {
8674 ( $next_nonblank_token, $i_next ) =
8675 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8678 goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq SPACE );
8680 # check for possible a digraph
8681 goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8682 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8683 goto RETURN if ( !$is_digraph{$test2} );
8684 $next_nonblank_token = $test2;
8685 $i_next = $i_next + 1;
8687 # check for possible a trigraph
8688 goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8689 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8690 goto RETURN if ( !$is_trigraph{$test3} );
8691 $next_nonblank_token = $test3;
8692 $i_next = $i_next + 1;
8695 return ( $next_nonblank_token, $i_next );
8696 } ## end sub find_next_noncomment_type
8698 sub is_possible_numerator {
8700 # Look at the next non-comment character and decide if it could be a
8706 my ( $i, $rtokens, $max_token_index ) = @_;
8707 my $is_possible_numerator = 0;
8709 my $next_token = $rtokens->[ $i + 1 ];
8710 if ( $next_token eq '=' ) { $i++; } # handle /=
8711 my ( $next_nonblank_token, $i_next ) =
8712 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8714 if ( $next_nonblank_token eq '#' ) {
8715 ( $next_nonblank_token, $i_next ) =
8716 find_next_nonblank_token( $max_token_index, $rtokens,
8720 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8721 $is_possible_numerator = 1;
8723 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8724 $is_possible_numerator = 0;
8727 $is_possible_numerator = -1;
8730 return $is_possible_numerator;
8731 } ## end sub is_possible_numerator
8733 { ## closure for sub pattern_expected
8738 # List of tokens which may follow a pattern. Note that we will not
8739 # have formed digraphs at this point, so we will see '&' instead of
8740 # '&&' and '|' instead of '||'
8742 # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8743 my @q = qw( & && | || ? : + - * and or while if unless);
8744 push @q, ')', '}', ']', '>', ',', ';';
8745 @{pattern_test}{@q} = (1) x scalar(@q);
8748 sub pattern_expected {
8750 # This a filter for a possible pattern.
8751 # It looks at the token after a possible pattern and tries to
8752 # determine if that token could end a pattern.
8757 my ( $i, $rtokens, $max_token_index ) = @_;
8760 my $next_token = $rtokens->[ $i + 1 ];
8761 if ( $next_token =~ /^[msixpodualgc]/ ) {
8763 } # skip possible modifier
8764 my ( $next_nonblank_token, $i_next ) =
8765 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8767 if ( $pattern_test{$next_nonblank_token} ) {
8772 # Added '#' to fix issue c044
8773 if ( $next_nonblank_token =~ /^\s*$/
8774 || $next_nonblank_token eq '#' )
8783 } ## end sub pattern_expected
8786 sub find_next_nonblank_token_on_this_line {
8787 my ( $i, $rtokens, $max_token_index ) = @_;
8788 my $next_nonblank_token;
8790 if ( $i < $max_token_index ) {
8791 $next_nonblank_token = $rtokens->[ ++$i ];
8793 if ( $next_nonblank_token =~ /^\s*$/ ) {
8795 if ( $i < $max_token_index ) {
8796 $next_nonblank_token = $rtokens->[ ++$i ];
8801 $next_nonblank_token = EMPTY_STRING;
8803 return ( $next_nonblank_token, $i );
8804 } ## end sub find_next_nonblank_token_on_this_line
8806 sub find_angle_operator_termination {
8808 # We are looking at a '<' and want to know if it is an angle operator.
8810 # $i = pretoken index of ending '>' if found, current $i otherwise
8811 # $type = 'Q' if found, '>' otherwise
8812 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
8815 pos($input_line) = 1 + $rtoken_map->[$i];
8819 # we just have to find the next '>' if a term is expected
8820 if ( $expecting == TERM ) { $filter = '[\>]' }
8822 # we have to guess if we don't know what is expected
8823 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
8825 # shouldn't happen - we shouldn't be here if operator is expected
8829 Bad call to find_angle_operator_termination
8832 return ( $i, $type );
8835 # To illustrate what we might be looking at, in case we are
8836 # guessing, here are some examples of valid angle operators
8843 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
8844 # <${PREFIX}*img*.$IMAGE_TYPE>
8845 # <img*.$IMAGE_TYPE>
8846 # <Timg*.$IMAGE_TYPE>
8847 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
8849 # Here are some examples of lines which do not have angle operators:
8850 # return unless $self->[2]++ < $#{$self->[1]};
8853 # the following line from dlister.pl caused trouble:
8854 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
8856 # If the '<' starts an angle operator, it must end on this line and
8857 # it must not have certain characters like ';' and '=' in it. I use
8858 # this to limit the testing. This filter should be improved if
8861 if ( $input_line =~ /($filter)/g ) {
8865 # We MAY have found an angle operator termination if we get
8866 # here, but we need to do more to be sure we haven't been
8868 my $pos = pos($input_line);
8870 my $pos_beg = $rtoken_map->[$i];
8871 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
8873 # Test for '<' after possible filehandle, issue c103
8874 # print $fh <>; # syntax error
8875 # print $fh <DATA>; # ok
8876 # print $fh < DATA>; # syntax error at '>'
8877 # print STDERR < DATA>; # ok, prints word 'DATA'
8878 # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
8879 if ( $last_nonblank_type eq 'Z' ) {
8881 # $str includes brackets; something like '<DATA>'
8882 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
8883 && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
8885 return ( $i, $type );
8889 # Reject if the closing '>' follows a '-' as in:
8890 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
8891 if ( $expecting eq UNKNOWN ) {
8892 my $check = substr( $input_line, $pos - 2, 1 );
8893 if ( $check eq '-' ) {
8894 return ( $i, $type );
8898 ######################################debug#####
8899 #write_diagnostics( "ANGLE? :$str\n");
8900 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
8901 ######################################debug#####
8905 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8907 # It may be possible that a quote ends midway in a pretoken.
8908 # If this happens, it may be necessary to split the pretoken.
8912 unexpected error condition returned by inverse_pretoken_map
8916 "Possible tokinization error..please check this line\n");
8919 # count blanks on inside of brackets
8920 my $blank_count = 0;
8921 $blank_count++ if ( $str =~ /<\s+/ );
8922 $blank_count++ if ( $str =~ /\s+>/ );
8924 # Now let's see where we stand....
8925 # OK if math op not possible
8926 if ( $expecting == TERM ) {
8929 # OK if there are no more than 2 non-blank pre-tokens inside
8930 # (not possible to write 2 token math between < and >)
8931 # This catches most common cases
8932 elsif ( $i <= $i_beg + 3 + $blank_count ) {
8934 # No longer any need to document this common case
8935 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
8938 # OK if there is some kind of identifier inside
8939 # print $fh <tvg::INPUT>;
8940 elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
8941 write_diagnostics("ANGLE (contains identifier): $str\n");
8947 # Let's try a Brace Test: any braces inside must balance
8949 while ( $str =~ /\{/g ) { $br++ }
8950 while ( $str =~ /\}/g ) { $br-- }
8952 while ( $str =~ /\[/g ) { $sb++ }
8953 while ( $str =~ /\]/g ) { $sb-- }
8955 while ( $str =~ /\(/g ) { $pr++ }
8956 while ( $str =~ /\)/g ) { $pr-- }
8958 # if braces do not balance - not angle operator
8959 if ( $br || $sb || $pr ) {
8963 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
8966 # we should keep doing more checks here...to be continued
8967 # Tentatively accepting this as a valid angle operator.
8968 # There are lots more things that can be checked.
8971 "ANGLE-Guessing yes: $str expecting=$expecting\n");
8972 write_logfile_entry("Guessing angle operator here: $str\n");
8977 # didn't find ending >
8979 if ( $expecting == TERM ) {
8980 warning("No ending > for angle operator\n");
8984 return ( $i, $type );
8985 } ## end sub find_angle_operator_termination
8987 sub scan_number_do {
8989 # scan a number in any of the formats that Perl accepts
8990 # Underbars (_) are allowed in decimal numbers.
8991 # input parameters -
8992 # $input_line - the string to scan
8993 # $i - pre_token index to start scanning
8994 # $rtoken_map - reference to the pre_token map giving starting
8995 # character position in $input_line of token $i
8996 # output parameters -
8997 # $i - last pre_token index of the number just scanned
8998 # number - the number (characters); or undef if not a number
9000 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
9001 my $pos_beg = $rtoken_map->[$i];
9005 my $type = $input_type;
9007 my $first_char = substr( $input_line, $pos_beg, 1 );
9009 # Look for bad starting characters; Shouldn't happen..
9010 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9013 Program bug - scan_number given bad first character = '$first_char'
9016 return ( $i, $type, $number );
9019 # handle v-string without leading 'v' character ('Two Dot' rule)
9021 # Here is the format prior to including underscores:
9022 ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9023 pos($input_line) = $pos_beg;
9024 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9025 $pos = pos($input_line);
9026 my $numc = $pos - $pos_beg;
9027 $number = substr( $input_line, $pos_beg, $numc );
9029 report_v_string($number);
9032 # handle octal, hex, binary
9033 if ( !defined($number) ) {
9034 pos($input_line) = $pos_beg;
9036 # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9037 # For reference, the format prior to hex floating point is:
9038 # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9039 # (hex) (octal) (binary)
9043 /\G[+-]?0( # leading [signed] 0
9045 # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9046 ([xX][0-9a-fA-F_]* # X and optional leading digits
9047 (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9048 [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9049 [0-9a-fA-F_]*) # optional Additional exponent digits
9052 |([xX][0-9a-fA-F_]+)
9055 |([oO]?[0-7_]+ # string of octal digits
9056 (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9057 [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9058 [0-7_]*) # Additional exponent digits with underscores
9061 |([oO]?[0-7_]+) # string of octal digits
9064 |([bB][01_]* # 'b' with string of binary digits
9065 (\.([01][01_]*)?)? # optional decimal and fraction
9066 [Pp][+-]?[01] # Required exponent indicator, no underscore
9067 [01_]*) # additional exponent bits
9070 |([bB][01_]+) # 'b' with string of binary digits
9075 $pos = pos($input_line);
9076 my $numc = $pos - $pos_beg;
9077 $number = substr( $input_line, $pos_beg, $numc );
9083 if ( !defined($number) ) {
9084 pos($input_line) = $pos_beg;
9086 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9087 $pos = pos($input_line);
9089 # watch out for things like 0..40 which would give 0. by this;
9090 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9091 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9095 my $numc = $pos - $pos_beg;
9096 $number = substr( $input_line, $pos_beg, $numc );
9101 # filter out non-numbers like e + - . e2 .e3 +e6
9102 # the rule: at least one digit, and any 'e' must be preceded by a digit
9104 $number !~ /\d/ # no digits
9105 || ( $number =~ /^(.*)[eE]/
9106 && $1 !~ /\d/ ) # or no digits before the 'e'
9110 $type = $input_type;
9111 return ( $i, $type, $number );
9114 # Found a number; now we must convert back from character position
9115 # to pre_token index. An error here implies user syntax error.
9116 # An example would be an invalid octal number like '009'.
9119 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9120 if ($error) { warning("Possibly invalid number\n") }
9122 return ( $i, $type, $number );
9123 } ## end sub scan_number_do
9125 sub inverse_pretoken_map {
9127 # Starting with the current pre_token index $i, scan forward until
9128 # finding the index of the next pre_token whose position is $pos.
9129 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9132 while ( ++$i <= $max_token_index ) {
9134 if ( $pos <= $rtoken_map->[$i] ) {
9136 # Let the calling routine handle errors in which we do not
9137 # land on a pre-token boundary. It can happen by running
9138 # perltidy on some non-perl scripts, for example.
9139 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
9144 return ( $i, $error );
9145 } ## end sub inverse_pretoken_map
9149 # find the target of a here document, if any
9151 # $i - token index of the second < of <<
9152 # ($i must be less than the last token index if this is called)
9153 # output parameters:
9154 # $found_target = 0 didn't find target; =1 found target
9155 # HERE_TARGET - the target string (may be empty string)
9156 # $i - unchanged if not here doc,
9157 # or index of the last token of the here target
9158 # $saw_error - flag noting unbalanced quote on here target
9159 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9161 my $found_target = 0;
9162 my $here_doc_target = EMPTY_STRING;
9163 my $here_quote_character = EMPTY_STRING;
9165 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9166 $next_token = $rtokens->[ $i + 1 ];
9168 # perl allows a backslash before the target string (heredoc.t)
9170 if ( $next_token eq '\\' ) {
9172 $next_token = $rtokens->[ $i + 2 ];
9175 ( $next_nonblank_token, $i_next_nonblank ) =
9176 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9178 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
9181 my $quote_depth = 0;
9186 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9189 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9190 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9192 if ($in_quote) { # didn't find end of quote, so no target found
9194 if ( $expecting == TERM ) {
9196 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9201 else { # found ending quote
9205 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9206 $tokj = $rtokens->[$j];
9208 # we have to remove any backslash before the quote character
9209 # so that the here-doc-target exactly matches this string
9213 && $rtokens->[ $j + 1 ] eq $here_quote_character );
9214 $here_doc_target .= $tokj;
9219 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9221 write_logfile_entry(
9222 "found blank here-target after <<; suggest using \"\"\n");
9225 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
9227 my $here_doc_expected;
9228 if ( $expecting == UNKNOWN ) {
9229 $here_doc_expected = guess_if_here_doc($next_token);
9232 $here_doc_expected = 1;
9235 if ($here_doc_expected) {
9237 $here_doc_target = $next_token;
9244 if ( $expecting == TERM ) {
9246 write_logfile_entry("Note: bare here-doc operator <<\n");
9253 # patch to neglect any prepended backslash
9254 if ( $found_target && $backslash ) { $i++ }
9256 return ( $found_target, $here_doc_target, $here_quote_character, $i,
9258 } ## end sub find_here_doc
9262 # follow (or continue following) quoted string(s)
9263 # $in_quote return code:
9265 # 1 - still must find end of quote whose target is $quote_character
9266 # 2 - still looking for end of first of two quotes
9268 # Returns updated strings:
9269 # $quoted_string_1 = quoted string seen while in_quote=1
9270 # $quoted_string_2 = quoted string seen while in_quote=2
9272 $i, $in_quote, $quote_character,
9273 $quote_pos, $quote_depth, $quoted_string_1,
9274 $quoted_string_2, $rtokens, $rtoken_map,
9278 my $in_quote_starting = $in_quote;
9281 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
9284 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9287 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
9288 $quote_pos, $quote_depth, $max_token_index );
9289 $quoted_string_2 .= $quoted_string;
9290 if ( $in_quote == 1 ) {
9291 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
9292 $quote_character = EMPTY_STRING;
9295 $quoted_string_2 .= "\n";
9299 if ( $in_quote == 1 ) { # one (more) quote to follow
9302 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9305 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9306 $quote_pos, $quote_depth, $max_token_index );
9307 $quoted_string_1 .= $quoted_string;
9308 if ( $in_quote == 1 ) {
9309 $quoted_string_1 .= "\n";
9312 return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9313 $quoted_string_1, $quoted_string_2 );
9314 } ## end sub do_quote
9316 sub follow_quoted_string {
9318 # scan for a specific token, skipping escaped characters
9319 # if the quote character is blank, use the first non-blank character
9321 # $rtokens = reference to the array of tokens
9322 # $i = the token index of the first character to search
9323 # $in_quote = number of quoted strings being followed
9324 # $beginning_tok = the starting quote character
9325 # $quote_pos = index to check next for alphanumeric delimiter
9326 # output parameters:
9327 # $i = the token index of the ending quote character
9328 # $in_quote = decremented if found end, unchanged if not
9329 # $beginning_tok = the starting quote character
9330 # $quote_pos = index to check next for alphanumeric delimiter
9331 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9332 # $quoted_string = the text of the quote (without quotation tokens)
9333 my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
9336 my ( $tok, $end_tok );
9338 my $quoted_string = EMPTY_STRING;
9342 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9345 # get the corresponding end token
9346 if ( $beginning_tok !~ /^\s*$/ ) {
9347 $end_tok = matching_end_token($beginning_tok);
9350 # a blank token means we must find and use the first non-blank one
9352 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9354 while ( $i < $max_token_index ) {
9355 $tok = $rtokens->[ ++$i ];
9357 if ( $tok !~ /^\s*$/ ) {
9359 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9360 $i = $max_token_index;
9364 if ( length($tok) > 1 ) {
9365 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
9366 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9369 $beginning_tok = $tok;
9372 $end_tok = matching_end_token($beginning_tok);
9378 $allow_quote_comments = 1;
9383 # There are two different loops which search for the ending quote
9384 # character. In the rare case of an alphanumeric quote delimiter, we
9385 # have to look through alphanumeric tokens character-by-character, since
9386 # the pre-tokenization process combines multiple alphanumeric
9387 # characters, whereas for a non-alphanumeric delimiter, only tokens of
9388 # length 1 can match.
9390 ###################################################################
9391 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9392 # "quote_pos" is the position the current word to begin searching
9393 ###################################################################
9394 if ( $beginning_tok =~ /\w/ ) {
9396 # Note this because it is not recommended practice except
9397 # for obfuscated perl contests
9398 if ( $in_quote == 1 ) {
9399 write_logfile_entry(
9400 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9403 # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9404 while ( $i <= $max_token_index ) {
9406 if ( $quote_pos == 0 || ( $i < 0 ) ) {
9407 $tok = $rtokens->[ ++$i ];
9409 if ( $tok eq '\\' ) {
9411 # retain backslash unless it hides the end token
9412 $quoted_string .= $tok
9413 unless $rtokens->[ $i + 1 ] eq $end_tok;
9415 last if ( $i >= $max_token_index );
9416 $tok = $rtokens->[ ++$i ];
9419 my $old_pos = $quote_pos;
9421 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
9425 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9427 if ( $quote_pos > 0 ) {
9430 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9432 # NOTE: any quote modifiers will be at the end of '$tok'. If we
9433 # wanted to check them, this is the place to get them. But
9434 # this quote form is rarely used in practice, so it isn't
9439 if ( $quote_depth == 0 ) {
9445 if ( $old_pos <= length($tok) ) {
9446 $quoted_string .= substr( $tok, $old_pos );
9452 ########################################################################
9453 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9454 ########################################################################
9457 while ( $i < $max_token_index ) {
9458 $tok = $rtokens->[ ++$i ];
9460 if ( $tok eq $end_tok ) {
9463 if ( $quote_depth == 0 ) {
9468 elsif ( $tok eq $beginning_tok ) {
9471 elsif ( $tok eq '\\' ) {
9473 # retain backslash unless it hides the beginning or end token
9474 $tok = $rtokens->[ ++$i ];
9475 $quoted_string .= '\\'
9476 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
9478 $quoted_string .= $tok;
9481 if ( $i > $max_token_index ) { $i = $max_token_index }
9482 return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
9484 } ## end sub follow_quoted_string
9486 sub indicate_error {
9487 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9488 interrupt_logfile();
9490 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
9495 sub write_error_indicator_pair {
9496 my ( $line_number, $input_line, $pos, $carrat ) = @_;
9497 my ( $offset, $numbered_line, $underline ) =
9498 make_numbered_line( $line_number, $input_line, $pos );
9499 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9500 warning( $numbered_line . "\n" );
9501 $underline =~ s/\s*$//;
9502 warning( $underline . "\n" );
9504 } ## end sub write_error_indicator_pair
9506 sub make_numbered_line {
9508 # Given an input line, its line number, and a character position of
9509 # interest, create a string not longer than 80 characters of the form
9510 # $lineno: sub_string
9511 # such that the sub_string of $str contains the position of interest
9513 # Here is an example of what we want, in this case we add trailing
9514 # '...' because the line is long.
9516 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9518 # Here is another example, this time in which we used leading '...'
9519 # because of excessive length:
9521 # 2: ... er of the World Wide Web Consortium's
9523 # input parameters are:
9524 # $lineno = line number
9525 # $str = the text of the line
9526 # $pos = position of interest (the error) : 0 = first character
9529 # - $offset = an offset which corrects the position in case we only
9530 # display part of a line, such that $pos-$offset is the effective
9531 # position from the start of the displayed line.
9532 # - $numbered_line = the numbered line as above,
9533 # - $underline = a blank 'underline' which is all spaces with the same
9534 # number of characters as the numbered line.
9536 my ( $lineno, $str, $pos ) = @_;
9537 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9538 my $excess = length($str) - $offset - 68;
9539 my $numc = ( $excess > 0 ) ? 68 : undef;
9541 if ( defined($numc) ) {
9542 if ( $offset == 0 ) {
9543 $str = substr( $str, $offset, $numc - 4 ) . " ...";
9546 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9551 if ( $offset == 0 ) {
9554 $str = "... " . substr( $str, $offset + 4 );
9558 my $numbered_line = sprintf( "%d: ", $lineno );
9559 $offset -= length($numbered_line);
9560 $numbered_line .= $str;
9561 my $underline = SPACE x length($numbered_line);
9562 return ( $offset, $numbered_line, $underline );
9563 } ## end sub make_numbered_line
9565 sub write_on_underline {
9567 # The "underline" is a string that shows where an error is; it starts
9568 # out as a string of blanks with the same length as the numbered line of
9569 # code above it, and we have to add marking to show where an error is.
9570 # In the example below, we want to write the string '--^' just below
9571 # the line of bad code:
9573 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9575 # We are given the current underline string, plus a position and a
9576 # string to write on it.
9578 # In the above example, there will be 2 calls to do this:
9579 # First call: $pos=19, pos_chr=^
9580 # Second call: $pos=16, pos_chr=---
9582 # This is a trivial thing to do with substr, but there is some
9585 my ( $underline, $pos, $pos_chr ) = @_;
9587 # check for error..shouldn't happen
9588 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9591 my $excess = length($pos_chr) + $pos - length($underline);
9592 if ( $excess > 0 ) {
9593 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9595 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9596 return ($underline);
9597 } ## end sub write_on_underline
9601 # Break a string, $str, into a sequence of preliminary tokens. We
9602 # are interested in these types of tokens:
9603 # words (type='w'), example: 'max_tokens_wanted'
9604 # digits (type = 'd'), example: '0755'
9605 # whitespace (type = 'b'), example: ' '
9606 # any other single character (i.e. punct; type = the character itself).
9607 # We cannot do better than this yet because we might be in a quoted
9608 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
9611 # An advantage of doing this pre-tokenization step is that it keeps almost
9612 # all of the regex work highly localized. A disadvantage is that in some
9613 # very rare instances we will have to go back and split a pre-token.
9614 my ( $str, $max_tokens_wanted ) = @_;
9616 # we return references to these 3 arrays:
9617 my @tokens = (); # array of the tokens themselves
9618 my @token_map = (0); # string position of start of each token
9619 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9624 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9627 # note that this must come before words!
9628 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9631 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9633 # single-character punctuation
9634 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9638 return ( \@tokens, \@token_map, \@type );
9642 push @token_map, pos($str);
9644 } while ( --$max_tokens_wanted != 0 );
9646 return ( \@tokens, \@token_map, \@type );
9647 } ## end sub pre_tokenize
9651 # this is an old debug routine
9652 # not called, but saved for reference
9653 my ( $rtokens, $rtoken_map ) = @_;
9654 my $num = scalar( @{$rtokens} );
9656 foreach my $i ( 0 .. $num - 1 ) {
9657 my $len = length( $rtokens->[$i] );
9658 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9661 } ## end sub show_tokens
9663 { ## closure for sub matching end token
9664 my %matching_end_token;
9667 %matching_end_token = (
9675 sub matching_end_token {
9677 # return closing character for a pattern
9678 my $beginning_token = shift;
9679 if ( $matching_end_token{$beginning_token} ) {
9680 return $matching_end_token{$beginning_token};
9682 return ($beginning_token);
9686 sub dump_token_types {
9687 my ( $class, $fh ) = @_;
9689 # This should be the latest list of token types in use
9690 # adding NEW_TOKENS: add a comment here
9691 $fh->print(<<'END_OF_LIST');
9693 Here is a list of the token types currently used for lines of type 'CODE'.
9694 For the following tokens, the "type" of a token is just the token itself.
9696 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9697 ( ) <= >= == =~ !~ != ++ -- /= x=
9698 ... **= <<= >>= &&= ||= //= <=>
9699 , + - / * | % ! x ~ = \ ? : . < > ^ &
9701 The following additional token types are defined:
9704 b blank (white space)
9705 { indent: opening structural curly brace or square bracket or paren
9706 (code block, anonymous hash reference, or anonymous array reference)
9707 } outdent: right structural curly brace or square bracket or paren
9708 [ left non-structural square bracket (enclosing an array index)
9709 ] right non-structural square bracket
9710 ( left non-structural paren (all but a list right of an =)
9711 ) right non-structural paren
9712 L left non-structural curly brace (enclosing a key)
9713 R right non-structural curly brace
9714 ; terminal semicolon
9715 f indicates a semicolon in a "for" statement
9716 h here_doc operator <<
9718 Q indicates a quote or pattern
9719 q indicates a qw quote block
9721 C user-defined constant or constant function (with void prototype = ())
9722 U user-defined function taking parameters
9723 G user-defined function taking block parameter (like grep/map/eval)
9724 M (unused, but reserved for subroutine definition name)
9725 P (unused, but -html uses it to label pod text)
9726 t type indicater such as %,$,@,*,&,sub
9727 w bare word (perhaps a subroutine call)
9728 i identifier of some type (with leading %, $, @, *, &, sub, -> )
9731 F a file test operator (like -e)
9733 Z identifier in indirect object slot: may be file handle, object
9734 J LABEL: code block label
9735 j LABEL after next, last, redo, goto
9738 pp pre-increment operator ++
9739 mm pre-decrement operator --
9740 A : used as attribute separator
9742 Here are the '_line_type' codes used internally:
9743 SYSTEM - system-specific code before hash-bang line
9744 CODE - line of perl code (including comments)
9745 POD_START - line starting pod, such as '=head'
9746 POD - pod documentation text
9747 POD_END - last line of pod section, '=cut'
9748 HERE - text of here-document
9749 HERE_END - last line of here-doc (target word)
9750 FORMAT - format section
9751 FORMAT_END - last line of format section, '.'
9752 SKIP - code skipping section
9753 SKIP_END - last line of code skipping section, '#>>V'
9754 DATA_START - __DATA__ line
9755 DATA - unidentified text following __DATA__
9756 END_START - __END__ line
9757 END - unidentified text following __END__
9758 ERROR - we are in big trouble, probably not a perl script
9762 } ## end sub dump_token_types
9766 # These names are used in error messages
9767 @opening_brace_names = qw# '{' '[' '(' '?' #;
9768 @closing_brace_names = qw# '}' ']' ')' ':' #;
9773 .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
9774 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
9776 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
9779 . : < > * & | / - = + - % ^ ! x ~
9781 @can_start_digraph{@q} = (1) x scalar(@q);
9783 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
9784 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
9786 my @tetragraphs = qw( <<>> );
9787 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
9789 # make a hash of all valid token types for self-checking the tokenizer
9790 # (adding NEW_TOKENS : select a new character and add to this list)
9791 my @valid_token_types = qw#
9792 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
9793 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
9795 push( @valid_token_types, @digraphs );
9796 push( @valid_token_types, @trigraphs );
9797 push( @valid_token_types, @tetragraphs );
9798 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
9799 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
9801 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
9802 my @file_test_operators =
9803 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
9804 @is_file_test_operator{@file_test_operators} =
9805 (1) x scalar(@file_test_operators);
9807 # these functions have prototypes of the form (&), so when they are
9808 # followed by a block, that block MAY BE followed by an operator.
9809 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
9811 @is_block_operator{@q} = (1) x scalar(@q);
9813 # these functions allow an identifier in the indirect object slot
9814 @q = qw( print printf sort exec system say);
9815 @is_indirect_object_taker{@q} = (1) x scalar(@q);
9817 # These tokens may precede a code block
9818 # patched for SWITCH/CASE/CATCH. Actually these could be removed
9819 # now and we could let the extended-syntax coding handle them.
9820 # Added 'default' for Switch::Plain.
9822 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
9823 unless do while until eval for foreach map grep sort
9824 switch case given when default catch try finally);
9825 @is_code_block_token{@q} = (1) x scalar(@q);
9827 # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
9828 # to contrast it with the block types in '%is_zero_continuation_block_type'
9829 @q = qw( sort map grep eval do );
9830 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
9832 @q = qw( sort map grep );
9833 @is_sort_map_grep{@q} = (1) x scalar(@q);
9835 %is_grep_alias = ();
9837 # I'll build the list of keywords incrementally
9840 # keywords and tokens after which a value or pattern is expected,
9841 # but not an operator. In other words, these should consume terms
9842 # to their right, or at least they are not expected to be followed
9843 # immediately by operators.
9844 my @value_requestor = qw(
10072 # patched above for SWITCH/CASE given/when err say
10073 # 'err' is a fairly safe addition.
10074 # Added 'default' for Switch::Plain. Note that we could also have
10075 # a separate set of keywords to include if we see 'use Switch::Plain'
10076 push( @Keywords, @value_requestor );
10078 # These are treated the same but are not keywords:
10083 push( @value_requestor, @extra_vr );
10085 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10087 # this list contains keywords which do not look for arguments,
10088 # so that they might be followed by an operator, or at least
10090 my @operator_requestor = qw(
10114 push( @Keywords, @operator_requestor );
10116 # These are treated the same but are not considered keywords:
10123 push( @operator_requestor, @extra_or );
10125 @expecting_operator_token{@operator_requestor} =
10126 (1) x scalar(@operator_requestor);
10128 # these token TYPES expect trailing operator but not a term
10129 # note: ++ and -- are post-increment and decrement, 'C' = constant
10130 my @operator_requestor_types = qw( ++ -- C <> q );
10131 @expecting_operator_types{@operator_requestor_types} =
10132 (1) x scalar(@operator_requestor_types);
10134 # these token TYPES consume values (terms)
10135 # note: pp and mm are pre-increment and decrement
10136 # f=semicolon in for, F=file test operator
10137 my @value_requestor_type = qw#
10138 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10139 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10140 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10141 f F pp mm Y p m U J G j >> << ^ t
10142 ~. ^. |. &. ^.= |.= &.=
10144 push( @value_requestor_type, ',' )
10145 ; # (perl doesn't like a ',' in a qw block)
10146 @expecting_term_types{@value_requestor_type} =
10147 (1) x scalar(@value_requestor_type);
10149 # Note: the following valid token types are not assigned here to
10150 # hashes requesting to be followed by values or terms, but are
10151 # instead currently hard-coded into sub operator_expected:
10152 # ) -> :: Q R Z ] b h i k n v w } #
10154 # For simple syntax checking, it is nice to have a list of operators which
10155 # will really be unhappy if not followed by a term. This includes most
10157 %really_want_term = %expecting_term_types;
10159 # with these exceptions...
10160 delete $really_want_term{'U'}; # user sub, depends on prototype
10161 delete $really_want_term{'F'}; # file test works on $_ if no following term
10162 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10164 @q = qw(q qq qx qr s y tr m);
10165 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10167 # Note added 'qw' here
10168 @q = qw(q qq qw qx qr s y tr m);
10169 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10172 @is_package{@q} = (1) x scalar(@q);
10176 @is_comma_question_colon{@q} = (1) x scalar(@q);
10178 @q = qw( if elsif unless );
10179 @is_if_elsif_unless{@q} = (1) x scalar(@q);
10182 @is_semicolon_or_t{@q} = (1) x scalar(@q);
10184 @q = qw( if elsif unless case when );
10185 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10187 # Hash of other possible line endings which may occur.
10188 # Keep these coordinated with the regex where this is used.
10189 # Note: chr(13) = chr(015)="\r".
10190 @q = ( chr(13), chr(29), chr(26) );
10191 @other_line_endings{@q} = (1) x scalar(@q);
10193 # These keywords are handled specially in the tokenizer code:
10194 my @special_keywords = qw(
10210 push( @Keywords, @special_keywords );
10212 # Keywords after which list formatting may be used
10213 # WARNING: do not include |map|grep|eval or perl may die on
10214 # syntax errors (map1.t).
10215 my @keyword_taking_list = qw(
10290 @is_keyword_taking_list{@keyword_taking_list} =
10291 (1) x scalar(@keyword_taking_list);
10293 # perl functions which may be unary operators.
10295 # This list is used to decide if a pattern delimited by slashes, /pattern/,
10296 # can follow one of these keywords.
10298 chomp eof eval fc lc pop shift uc undef
10301 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10304 # These are keywords for which an arg may optionally be omitted. They are
10305 # currently only used to disambiguate a ? used as a ternary from one used
10306 # as a (deprecated) pattern delimiter. In the future, they might be used
10307 # to give a warning about ambiguous syntax before a /.
10308 # Note: split has been omitted (see not below).
10309 my @keywords_taking_optional_arg = qw(
10378 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10379 (1) x scalar(@keywords_taking_optional_arg);
10381 # This list is used to decide if a pattern delimited by question marks,
10382 # ?pattern?, can follow one of these keywords. Note that from perl 5.22
10383 # on, a ?pattern? is not recognized, so we can be much more strict than
10384 # with a /pattern/. Note that 'split' is not in this list. In current
10385 # versions of perl a question following split must be a ternary, but
10386 # in older versions it could be a pattern. The guessing algorithm will
10387 # decide. We are combining two lists here to simplify the test.
10388 @q = ( @keywords_taking_optional_arg, @operator_requestor );
10389 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10392 # These are not used in any way yet
10393 # my @unused_keywords = qw(
10399 # The list of keywords was originally extracted from function 'keyword' in
10400 # perl file toke.c version 5.005.03, using this utility, plus a
10401 # little editing: (file getkwd.pl):
10402 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10403 # Add 'get' prefix where necessary, then split into the above lists.
10404 # This list should be updated as necessary.
10405 # The list should not contain these special variables:
10406 # ARGV DATA ENV SIG STDERR STDIN STDOUT
10409 @is_keyword{@Keywords} = (1) x scalar(@Keywords);