1 #####################################################################
3 # The Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 use English qw( -no_match_vars );
26 our $VERSION = '20230309';
28 use Perl::Tidy::LineBuffer;
31 use constant DEVEL_MODE => 0;
32 use constant EMPTY_STRING => q{};
33 use constant SPACE => q{ };
35 # Decimal values of some ascii characters for quick checks
36 use constant ORD_TAB => 9;
37 use constant ORD_SPACE => 32;
38 use constant ORD_PRINTABLE_MIN => 33;
39 use constant ORD_PRINTABLE_MAX => 126;
41 # PACKAGE VARIABLES for processing an entire FILE.
42 # These must be package variables because most may get localized during
43 # processing. Most are initialized in sub prepare_for_a_new_file.
49 $last_nonblank_block_type
57 %user_function_prototype
59 %is_block_list_function
60 %saw_function_definition
71 @nesting_sequence_number
72 @current_sequence_number
74 @paren_semicolon_count
75 @paren_structural_type
77 @brace_structural_type
81 @square_bracket_structural_type
84 @nested_statement_type
85 @starting_line_of_current_depth
88 # GLOBAL CONSTANTS for routines in this package,
89 # Initialized in a BEGIN block.
91 %is_indirect_object_taker
93 %expecting_operator_token
94 %expecting_operator_types
99 %is_file_test_operator
106 %is_sort_map_grep_eval_do
112 %is_keyword_taking_list
113 %is_keyword_taking_optional_arg
114 %is_keyword_rejecting_slash_as_pattern_delimiter
115 %is_keyword_rejecting_question_as_pattern_delimiter
116 %is_q_qq_qx_qr_s_y_tr_m
117 %is_q_qq_qw_qx_qr_s_y_tr_m
120 %is_comma_question_colon
122 %is_if_elsif_unless_case_when
124 %is_END_DATA_format_sub
126 $code_skipping_pattern_begin
127 $code_skipping_pattern_end
130 # GLOBAL VARIABLES which are constant after being configured by user-supplied
131 # parameters. They remain constant as a file is being processed.
134 $rOpts_code_skipping,
135 $code_skipping_pattern_begin,
136 $code_skipping_pattern_end,
139 # possible values of operator_expected()
140 use constant TERM => -1;
141 use constant UNKNOWN => 0;
142 use constant OPERATOR => 1;
144 # possible values of context
145 use constant SCALAR_CONTEXT => -1;
146 use constant UNKNOWN_CONTEXT => 0;
147 use constant LIST_CONTEXT => 1;
149 # Maximum number of little messages; probably need not be changed.
150 use constant MAX_NAG_MESSAGES => 6;
154 # Array index names for $self.
155 # Do not combine with other BEGIN blocks (c101).
158 _rhere_target_list_ => $i++,
159 _in_here_doc_ => $i++,
160 _here_doc_target_ => $i++,
161 _here_quote_character_ => $i++,
167 _in_skipped_ => $i++,
168 _in_attribute_list_ => $i++,
170 _quote_target_ => $i++,
171 _line_start_quote_ => $i++,
172 _starting_level_ => $i++,
173 _know_starting_level_ => $i++,
175 _indent_columns_ => $i++,
176 _look_for_hash_bang_ => $i++,
178 _continuation_indentation_ => $i++,
179 _outdent_labels_ => $i++,
180 _last_line_number_ => $i++,
181 _saw_perl_dash_P_ => $i++,
182 _saw_perl_dash_w_ => $i++,
183 _saw_use_strict_ => $i++,
184 _saw_v_string_ => $i++,
186 _look_for_autoloader_ => $i++,
187 _look_for_selfloader_ => $i++,
188 _saw_autoloader_ => $i++,
189 _saw_selfloader_ => $i++,
190 _saw_hash_bang_ => $i++,
193 _saw_negative_indentation_ => $i++,
194 _started_tokenizing_ => $i++,
195 _line_buffer_object_ => $i++,
196 _debugger_object_ => $i++,
197 _diagnostics_object_ => $i++,
198 _logger_object_ => $i++,
199 _unexpected_error_count_ => $i++,
200 _started_looking_for_here_target_at_ => $i++,
201 _nearly_matched_here_target_at_ => $i++,
202 _line_of_text_ => $i++,
203 _rlower_case_labels_at_ => $i++,
204 _extended_syntax_ => $i++,
205 _maximum_level_ => $i++,
206 _true_brace_error_count_ => $i++,
207 _rOpts_maximum_level_errors_ => $i++,
208 _rOpts_maximum_unexpected_errors_ => $i++,
209 _rOpts_logfile_ => $i++,
214 { ## closure for subs to count instances
216 # methods to count instances
218 sub get_count { return $_count; }
219 sub _increment_count { return ++$_count }
220 sub _decrement_count { return --$_count }
225 $self->_decrement_count();
231 # Catch any undefined sub calls so that we are sure to get
232 # some diagnostic information. This sub should never be called
233 # except for a programming error.
235 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
236 my ( $pkg, $fname, $lno ) = caller();
237 my $my_package = __PACKAGE__;
239 ======================================================================
240 Error detected in package '$my_package', version $VERSION
241 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
242 Called from package: '$pkg'
243 Called from File '$fname' at line '$lno'
244 This error is probably due to a recent programming change
245 ======================================================================
248 } ## end sub AUTOLOAD
252 Perl::Tidy::Die($msg);
253 croak "unexpected return from Perl::Tidy::Die";
259 # This routine is called for errors that really should not occur
260 # except if there has been a bug introduced by a recent program change.
261 # Please add comments at calls to Fault to explain why the call
262 # should not occur, and where to look to fix it.
263 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
264 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
265 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
266 my $pkg = __PACKAGE__;
268 my $input_stream_name = get_input_stream_name();
271 ==============================================================================
272 While operating on input stream with name: '$input_stream_name'
273 A fault was detected at line $line0 of sub '$subroutine1'
275 which was called from line $line1 of sub '$subroutine2'
277 This is probably an error introduced by a recent programming change.
278 $pkg reports VERSION='$VERSION'.
279 ==============================================================================
282 # We shouldn't get here, but this return is to keep Perl-Critic from
289 # See if a pattern will compile. We have to use a string eval here,
290 # but it should be safe because the pattern has been constructed
293 my $ok = eval "'##'=~/$pattern/";
294 return !defined($ok) || $EVAL_ERROR;
295 } ## end sub bad_pattern
297 sub make_code_skipping_pattern {
298 my ( $rOpts, $opt_name, $default ) = @_;
299 my $param = $rOpts->{$opt_name};
300 unless ($param) { $param = $default }
301 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
302 if ( $param !~ /^#/ ) {
303 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
305 my $pattern = '^\s*' . $param . '\b';
306 if ( bad_pattern($pattern) ) {
308 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
312 } ## end sub make_code_skipping_pattern
316 # Check Tokenizer parameters
322 %is_END_DATA_format_sub = (
329 # Install any aliases to 'sub'
330 if ( $rOpts->{'sub-alias-list'} ) {
332 # Note that any 'sub-alias-list' has been preprocessed to
333 # be a trimmed, space-separated list which includes 'sub'
334 # for example, it might be 'sub method fun'
335 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
336 foreach my $word (@sub_alias_list) {
338 $is_END_DATA_format_sub{$word} = 1;
342 #------------------------------------------------
343 # Update hash values for any -use-feature options
344 #------------------------------------------------
345 my $use_feature_class = $rOpts->{'use-feature'} =~ /\bclass\b/;
347 # These are the main updates for this option. There are additional
348 # changes elsewhere, usually indicated with a comment 'rt145706'
350 # Update hash values for use_feature=class, added for rt145706
351 # see 'perlclass.pod'
353 # IMPORTANT: We are changing global hash values initially set in a BEGIN
354 # block. Values must be defined (true or false) for each of these new
355 # words whether true or false. Otherwise, programs using the module which
356 # change options between runs (such as test code) will have
357 # incorrect settings and fail.
359 # There are 4 new keywords:
361 # 'class' - treated specially as generalization of 'package'
362 # Note: we must not set 'class' to be a keyword to avoid problems
364 $is_package{'class'} = $use_feature_class;
366 # 'method' - treated like sub using the sub-alias-list option
367 # Note: we must not set 'method' to be a keyword to avoid problems
370 # 'field' - added as a keyword, and works like 'my'
371 $is_keyword{'field'} = $use_feature_class;
372 $is_my_our_state{'field'} = $use_feature_class;
374 # 'ADJUST' - added as a keyword and works like 'BEGIN'
375 # TODO: if ADJUST gets a paren list, this will need to be updated
376 $is_keyword{'ADJUST'} = $use_feature_class;
377 $is_code_block_token{'ADJUST'} = $use_feature_class;
380 if ( $rOpts->{'grep-alias-list'} ) {
382 # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
383 # space-separated list
384 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
385 @{is_grep_alias}{@q} = (1) x scalar(@q);
388 $rOpts_code_skipping = $rOpts->{'code-skipping'};
389 $code_skipping_pattern_begin =
390 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
391 $code_skipping_pattern_end =
392 make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
395 } ## end sub check_options
399 my ( $class, @args ) = @_;
401 # Note: 'tabs' and 'indent_columns' are temporary and should be
404 source_object => undef,
405 debugger_object => undef,
406 diagnostics_object => undef,
407 logger_object => undef,
408 starting_level => undef,
411 look_for_hash_bang => 0,
413 look_for_autoloader => 1,
414 look_for_selfloader => 1,
415 starting_line_number => 1,
416 extended_syntax => 0,
419 my %args = ( %defaults, @args );
421 # we are given an object with a get_line() method to supply source lines
422 my $source_object = $args{source_object};
423 my $rOpts = $args{rOpts};
425 # we create another object with a get_line() and peek_ahead() method
426 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
428 # Tokenizer state data is as follows:
429 # _rhere_target_list_ reference to list of here-doc targets
430 # _here_doc_target_ the target string for a here document
431 # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
432 # to determine if interpolation is done
433 # _quote_target_ character we seek if chasing a quote
434 # _line_start_quote_ line where we started looking for a long quote
435 # _in_here_doc_ flag indicating if we are in a here-doc
436 # _in_pod_ flag set if we are in pod documentation
437 # _in_skipped_ flag set if we are in a skipped section
438 # _in_error_ flag set if we saw severe error (binary in script)
439 # _in_data_ flag set if we are in __DATA__ section
440 # _in_end_ flag set if we are in __END__ section
441 # _in_format_ flag set if we are in a format description
442 # _in_attribute_list_ flag telling if we are looking for attributes
443 # _in_quote_ flag telling if we are chasing a quote
444 # _starting_level_ indentation level of first line
445 # _line_buffer_object_ object with get_line() method to supply source code
446 # _diagnostics_object_ place to write debugging information
447 # _unexpected_error_count_ error count used to limit output
448 # _lower_case_labels_at_ line numbers where lower case labels seen
449 # _hit_bug_ program bug detected
452 $self->[_rhere_target_list_] = [];
453 $self->[_in_here_doc_] = 0;
454 $self->[_here_doc_target_] = EMPTY_STRING;
455 $self->[_here_quote_character_] = EMPTY_STRING;
456 $self->[_in_data_] = 0;
457 $self->[_in_end_] = 0;
458 $self->[_in_format_] = 0;
459 $self->[_in_error_] = 0;
460 $self->[_in_pod_] = 0;
461 $self->[_in_skipped_] = 0;
462 $self->[_in_attribute_list_] = 0;
463 $self->[_in_quote_] = 0;
464 $self->[_quote_target_] = EMPTY_STRING;
465 $self->[_line_start_quote_] = -1;
466 $self->[_starting_level_] = $args{starting_level};
467 $self->[_know_starting_level_] = defined( $args{starting_level} );
468 $self->[_tabsize_] = $args{tabsize};
469 $self->[_indent_columns_] = $args{indent_columns};
470 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
471 $self->[_trim_qw_] = $args{trim_qw};
472 $self->[_continuation_indentation_] = $args{continuation_indentation};
473 $self->[_outdent_labels_] = $args{outdent_labels};
474 $self->[_last_line_number_] = $args{starting_line_number} - 1;
475 $self->[_saw_perl_dash_P_] = 0;
476 $self->[_saw_perl_dash_w_] = 0;
477 $self->[_saw_use_strict_] = 0;
478 $self->[_saw_v_string_] = 0;
479 $self->[_hit_bug_] = 0;
480 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
481 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
482 $self->[_saw_autoloader_] = 0;
483 $self->[_saw_selfloader_] = 0;
484 $self->[_saw_hash_bang_] = 0;
485 $self->[_saw_end_] = 0;
486 $self->[_saw_data_] = 0;
487 $self->[_saw_negative_indentation_] = 0;
488 $self->[_started_tokenizing_] = 0;
489 $self->[_line_buffer_object_] = $line_buffer_object;
490 $self->[_debugger_object_] = $args{debugger_object};
491 $self->[_diagnostics_object_] = $args{diagnostics_object};
492 $self->[_logger_object_] = $args{logger_object};
493 $self->[_unexpected_error_count_] = 0;
494 $self->[_started_looking_for_here_target_at_] = 0;
495 $self->[_nearly_matched_here_target_at_] = undef;
496 $self->[_line_of_text_] = EMPTY_STRING;
497 $self->[_rlower_case_labels_at_] = undef;
498 $self->[_extended_syntax_] = $args{extended_syntax};
499 $self->[_maximum_level_] = 0;
500 $self->[_true_brace_error_count_] = 0;
501 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
502 $self->[_rOpts_maximum_unexpected_errors_] =
503 $rOpts->{'maximum-unexpected-errors'};
504 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
505 $self->[_rOpts_] = $rOpts;
507 # These vars are used for guessing indentation and must be positive
508 $self->[_tabsize_] = 8 if ( !$self->[_tabsize_] );
509 $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
513 $tokenizer_self = $self;
515 prepare_for_a_new_file();
516 $self->find_starting_indentation_level();
518 # This is not a full class yet, so die if an attempt is made to
519 # create more than one object.
521 if ( _increment_count() > 1 ) {
523 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
530 # interface to Perl::Tidy::Logger routines
533 my $logger_object = $tokenizer_self->[_logger_object_];
534 if ($logger_object) {
535 $logger_object->warning($msg);
540 sub get_input_stream_name {
541 my $input_stream_name = EMPTY_STRING;
542 my $logger_object = $tokenizer_self->[_logger_object_];
543 if ($logger_object) {
544 $input_stream_name = $logger_object->get_input_stream_name();
546 return $input_stream_name;
547 } ## end sub get_input_stream_name
551 my $logger_object = $tokenizer_self->[_logger_object_];
552 if ($logger_object) {
553 my $input_line_number = $tokenizer_self->[_last_line_number_] + 1;
554 $msg = "Line $input_line_number: $msg";
555 $logger_object->complain($msg);
558 } ## end sub complain
560 sub write_logfile_entry {
562 my $logger_object = $tokenizer_self->[_logger_object_];
563 if ($logger_object) {
564 $logger_object->write_logfile_entry($msg);
567 } ## end sub write_logfile_entry
569 sub interrupt_logfile {
570 my $logger_object = $tokenizer_self->[_logger_object_];
571 if ($logger_object) {
572 $logger_object->interrupt_logfile();
575 } ## end sub interrupt_logfile
578 my $logger_object = $tokenizer_self->[_logger_object_];
579 if ($logger_object) {
580 $logger_object->resume_logfile();
583 } ## end sub resume_logfile
585 sub increment_brace_error {
586 my $logger_object = $tokenizer_self->[_logger_object_];
587 if ($logger_object) {
588 $logger_object->increment_brace_error();
591 } ## end sub increment_brace_error
593 sub report_definite_bug {
594 $tokenizer_self->[_hit_bug_] = 1;
595 my $logger_object = $tokenizer_self->[_logger_object_];
596 if ($logger_object) {
597 $logger_object->report_definite_bug();
600 } ## end sub report_definite_bug
604 my $logger_object = $tokenizer_self->[_logger_object_];
605 if ($logger_object) {
606 $logger_object->brace_warning($msg);
609 } ## end sub brace_warning
611 sub get_saw_brace_error {
612 my $logger_object = $tokenizer_self->[_logger_object_];
613 if ($logger_object) {
614 return $logger_object->get_saw_brace_error();
619 } ## end sub get_saw_brace_error
621 sub get_unexpected_error_count {
623 return $self->[_unexpected_error_count_];
626 # interface to Perl::Tidy::Diagnostics routines
627 sub write_diagnostics {
629 if ( $tokenizer_self->[_diagnostics_object_] ) {
630 $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
633 } ## end sub write_diagnostics
635 sub get_maximum_level {
636 return $tokenizer_self->[_maximum_level_];
639 sub report_tokenization_errors {
643 # Report any tokenization errors and return a flag '$severe_error'.
644 # Set $severe_error = 1 if the tokenization errors are so severe that
645 # the formatter should not attempt to format the file. Instead, it will
646 # just output the file verbatim.
648 # set severe error flag if tokenizer has encountered file reading problems
649 # (i.e. unexpected binary characters)
650 my $severe_error = $self->[_in_error_];
652 my $maxle = $self->[_rOpts_maximum_level_errors_];
653 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
654 $maxle = 1 unless defined($maxle);
655 $maxue = 0 unless defined($maxue);
657 my $level = get_indentation_level();
658 if ( $level != $tokenizer_self->[_starting_level_] ) {
659 warning("final indentation level: $level\n");
660 my $level_diff = $tokenizer_self->[_starting_level_] - $level;
661 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
663 # Set severe error flag if the level error is greater than 1.
664 # The formatter can function for any level error but it is probably
665 # best not to attempt formatting for a high level error.
666 if ( $maxle >= 0 && $level_diff > $maxle ) {
669 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
674 check_final_nesting_depths();
676 # Likewise, large numbers of brace errors usually indicate non-perl
677 # scripts, so set the severe error flag at a low number. This is similar
678 # to the level check, but different because braces may balance but be
679 # incorrectly interlaced.
680 if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
684 if ( $tokenizer_self->[_look_for_hash_bang_]
685 && !$tokenizer_self->[_saw_hash_bang_] )
688 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
691 if ( $tokenizer_self->[_in_format_] ) {
692 warning("hit EOF while in format description\n");
695 if ( $tokenizer_self->[_in_skipped_] ) {
697 "hit EOF while in lines skipped with --code-skipping\n");
700 if ( $tokenizer_self->[_in_pod_] ) {
702 # Just write log entry if this is after __END__ or __DATA__
703 # because this happens to often, and it is not likely to be
705 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
707 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
713 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
719 if ( $tokenizer_self->[_in_here_doc_] ) {
721 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
722 my $started_looking_for_here_target_at =
723 $tokenizer_self->[_started_looking_for_here_target_at_];
724 if ($here_doc_target) {
726 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
731 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
732 (Perl will match to the end of file but this may not be intended).
735 my $nearly_matched_here_target_at =
736 $tokenizer_self->[_nearly_matched_here_target_at_];
737 if ($nearly_matched_here_target_at) {
739 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
744 # Something is seriously wrong if we ended inside a quote
745 if ( $tokenizer_self->[_in_quote_] ) {
747 my $line_start_quote = $tokenizer_self->[_line_start_quote_];
748 my $quote_target = $tokenizer_self->[_quote_target_];
750 ( $tokenizer_self->[_in_attribute_list_] )
754 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
758 if ( $tokenizer_self->[_hit_bug_] ) {
762 # Multiple "unexpected" type tokenization errors usually indicate parsing
763 # non-perl scripts, or that something is seriously wrong, so we should
764 # avoid formatting them. This can happen for example if we run perltidy on
765 # a shell script or an html file. But unfortunately this check can
766 # interfere with some extended syntaxes, such as RPerl, so it has to be off
768 my $ue_count = $tokenizer_self->[_unexpected_error_count_];
769 if ( $maxue > 0 && $ue_count > $maxue ) {
771 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
776 unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
778 write_logfile_entry("Suggest including '-w parameter'\n");
781 write_logfile_entry("Suggest including 'use warnings;'\n");
785 if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
786 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
789 unless ( $tokenizer_self->[_saw_use_strict_] ) {
790 write_logfile_entry("Suggest including 'use strict;'\n");
793 # it is suggested that labels have at least one upper case character
794 # for legibility and to avoid code breakage as new keywords are introduced
795 if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
796 my @lower_case_labels_at =
797 @{ $tokenizer_self->[_rlower_case_labels_at_] };
799 "Suggest using upper case characters in label(s)\n");
800 local $LIST_SEPARATOR = ')(';
801 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
803 return $severe_error;
804 } ## end sub report_tokenization_errors
806 sub report_v_string {
808 # warn if this version can't handle v-strings
810 unless ( $tokenizer_self->[_saw_v_string_] ) {
811 $tokenizer_self->[_saw_v_string_] =
812 $tokenizer_self->[_last_line_number_];
816 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
820 } ## end sub report_v_string
822 sub is_valid_token_type {
824 return $is_valid_token_type{$type};
827 sub get_input_line_number {
828 return $tokenizer_self->[_last_line_number_];
831 sub log_numbered_msg {
832 my ( $self, $msg ) = @_;
834 # write input line number + message to logfile
835 my $input_line_number = $self->[_last_line_number_];
836 write_logfile_entry("Line $input_line_number: $msg");
838 } ## end sub log_numbered_msg
840 # returns the next tokenized line
845 # USES GLOBAL VARIABLES:
846 # $brace_depth, $square_bracket_depth, $paren_depth
848 my $input_line = $self->[_line_buffer_object_]->get_line();
849 $self->[_line_of_text_] = $input_line;
851 return unless ($input_line);
853 my $input_line_number = ++$self->[_last_line_number_];
855 # Find and remove what characters terminate this line, including any
857 my $input_line_separator = EMPTY_STRING;
858 if ( chomp($input_line) ) {
859 $input_line_separator = $INPUT_RECORD_SEPARATOR;
862 # The first test here very significantly speeds things up, but be sure to
863 # keep the regex and hash %other_line_endings the same.
864 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
865 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
866 $input_line_separator = $2 . $input_line_separator;
870 # for backwards compatibility we keep the line text terminated with
871 # a newline character
873 $self->[_line_of_text_] = $input_line;
875 # create a data structure describing this line which will be
876 # returned to the caller.
878 # _line_type codes are:
879 # SYSTEM - system-specific code before hash-bang line
880 # CODE - line of perl code (including comments)
881 # POD_START - line starting pod, such as '=head'
882 # POD - pod documentation text
883 # POD_END - last line of pod section, '=cut'
884 # HERE - text of here-document
885 # HERE_END - last line of here-doc (target word)
886 # FORMAT - format section
887 # FORMAT_END - last line of format section, '.'
888 # SKIP - code skipping section
889 # SKIP_END - last line of code skipping section, '#>>V'
890 # DATA_START - __DATA__ line
891 # DATA - unidentified text following __DATA__
892 # END_START - __END__ line
893 # END - unidentified text following __END__
894 # ERROR - we are in big trouble, probably not a perl script
897 # _curly_brace_depth - depth of curly braces at start of line
898 # _square_bracket_depth - depth of square brackets at start of line
899 # _paren_depth - depth of parens at start of line
900 # _starting_in_quote - this line continues a multi-line quote
901 # (so don't trim leading blanks!)
902 # _ending_in_quote - this line ends in a multi-line quote
903 # (so don't trim trailing blanks!)
904 my $line_of_tokens = {
906 _line_text => $input_line,
907 _line_number => $input_line_number,
908 _guessed_indentation_level => 0,
909 _curly_brace_depth => $brace_depth,
910 _square_bracket_depth => $square_bracket_depth,
911 _paren_depth => $paren_depth,
912 _quote_character => EMPTY_STRING,
913 ## Skip these needless initializations for efficiency:
914 ## _rtoken_type => undef,
915 ## _rtokens => undef,
916 ## _rlevels => undef,
917 ## _rblock_type => undef,
918 ## _rtype_sequence => undef,
919 ## _rci_levels => undef,
920 ## _starting_in_quote => 0,
921 ## _ending_in_quote => 0,
924 # must print line unchanged if we are in a here document
925 if ( $self->[_in_here_doc_] ) {
927 $line_of_tokens->{_line_type} = 'HERE';
928 my $here_doc_target = $self->[_here_doc_target_];
929 my $here_quote_character = $self->[_here_quote_character_];
930 my $candidate_target = $input_line;
931 chomp $candidate_target;
933 # Handle <<~ targets, which are indicated here by a leading space on
934 # the here quote character
935 if ( $here_quote_character =~ /^\s/ ) {
936 $candidate_target =~ s/^\s*//;
938 if ( $candidate_target eq $here_doc_target ) {
939 $self->[_nearly_matched_here_target_at_] = undef;
940 $line_of_tokens->{_line_type} = 'HERE_END';
941 $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
943 my $rhere_target_list = $self->[_rhere_target_list_];
944 if ( @{$rhere_target_list} ) { # there can be multiple here targets
945 ( $here_doc_target, $here_quote_character ) =
946 @{ shift @{$rhere_target_list} };
947 $self->[_here_doc_target_] = $here_doc_target;
948 $self->[_here_quote_character_] = $here_quote_character;
949 $self->log_numbered_msg(
950 "Entering HERE document $here_doc_target\n");
951 $self->[_nearly_matched_here_target_at_] = undef;
952 $self->[_started_looking_for_here_target_at_] =
956 $self->[_in_here_doc_] = 0;
957 $self->[_here_doc_target_] = EMPTY_STRING;
958 $self->[_here_quote_character_] = EMPTY_STRING;
962 # check for error of extra whitespace
963 # note for PERL6: leading whitespace is allowed
965 $candidate_target =~ s/\s*$//;
966 $candidate_target =~ s/^\s*//;
967 if ( $candidate_target eq $here_doc_target ) {
968 $self->[_nearly_matched_here_target_at_] = $input_line_number;
971 return $line_of_tokens;
974 # Print line unchanged if we are in a format section
975 elsif ( $self->[_in_format_] ) {
977 if ( $input_line =~ /^\.[\s#]*$/ ) {
979 # Decrement format depth count at a '.' after a 'format'
980 $self->[_in_format_]--;
982 # This is the end when count reaches 0
983 if ( !$self->[_in_format_] ) {
984 $self->log_numbered_msg("Exiting format section\n");
985 $line_of_tokens->{_line_type} = 'FORMAT_END';
989 $line_of_tokens->{_line_type} = 'FORMAT';
990 if ( $input_line =~ /^\s*format\s+\w+/ ) {
992 # Increment format depth count at a 'format' within a 'format'
993 # This is a simple way to handle nested formats (issue c019).
994 $self->[_in_format_]++;
997 return $line_of_tokens;
1000 # must print line unchanged if we are in pod documentation
1001 elsif ( $self->[_in_pod_] ) {
1003 $line_of_tokens->{_line_type} = 'POD';
1004 if ( $input_line =~ /^=cut/ ) {
1005 $line_of_tokens->{_line_type} = 'POD_END';
1006 $self->log_numbered_msg("Exiting POD section\n");
1007 $self->[_in_pod_] = 0;
1009 if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
1011 "Hash-bang in pod can cause older versions of perl to fail! \n"
1015 return $line_of_tokens;
1018 # print line unchanged if in skipped section
1019 elsif ( $self->[_in_skipped_] ) {
1021 $line_of_tokens->{_line_type} = 'SKIP';
1022 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
1023 $line_of_tokens->{_line_type} = 'SKIP_END';
1024 $self->log_numbered_msg("Exiting code-skipping section\n");
1025 $self->[_in_skipped_] = 0;
1027 return $line_of_tokens;
1030 # must print line unchanged if we have seen a severe error (i.e., we
1031 # are seeing illegal tokens and cannot continue. Syntax errors do
1032 # not pass this route). Calling routine can decide what to do, but
1033 # the default can be to just pass all lines as if they were after __END__
1034 elsif ( $self->[_in_error_] ) {
1035 $line_of_tokens->{_line_type} = 'ERROR';
1036 return $line_of_tokens;
1039 # print line unchanged if we are __DATA__ section
1040 elsif ( $self->[_in_data_] ) {
1042 # ...but look for POD
1043 # Note that the _in_data and _in_end flags remain set
1044 # so that we return to that state after seeing the
1045 # end of a pod section
1046 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1047 $line_of_tokens->{_line_type} = 'POD_START';
1048 $self->log_numbered_msg("Entering POD section\n");
1049 $self->[_in_pod_] = 1;
1050 return $line_of_tokens;
1053 $line_of_tokens->{_line_type} = 'DATA';
1054 return $line_of_tokens;
1058 # print line unchanged if we are in __END__ section
1059 elsif ( $self->[_in_end_] ) {
1061 # ...but look for POD
1062 # Note that the _in_data and _in_end flags remain set
1063 # so that we return to that state after seeing the
1064 # end of a pod section
1065 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1066 $line_of_tokens->{_line_type} = 'POD_START';
1067 $self->log_numbered_msg("Entering POD section\n");
1068 $self->[_in_pod_] = 1;
1069 return $line_of_tokens;
1072 $line_of_tokens->{_line_type} = 'END';
1073 return $line_of_tokens;
1077 # check for a hash-bang line if we haven't seen one
1078 if ( !$self->[_saw_hash_bang_] ) {
1079 if ( $input_line =~ /^\#\!.*perl\b/ ) {
1080 $self->[_saw_hash_bang_] = $input_line_number;
1082 # check for -w and -P flags
1083 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1084 $self->[_saw_perl_dash_P_] = 1;
1087 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1088 $self->[_saw_perl_dash_w_] = 1;
1092 $input_line_number > 1
1094 # leave any hash bang in a BEGIN block alone
1095 # i.e. see 'debugger-duck_type.t'
1097 $last_nonblank_block_type
1098 && $last_nonblank_block_type eq 'BEGIN'
1100 && !$self->[_look_for_hash_bang_]
1102 # Try to avoid giving a false alarm at a simple comment.
1103 # These look like valid hash-bang lines:
1107 #!c:\perl\bin\perl.exe
1109 # These are comments:
1111 #! sunos does not yet provide a /usr/bin/perl
1113 # Comments typically have multiple spaces, which suggests
1115 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1119 # this is helpful for VMS systems; we may have accidentally
1120 # tokenized some DCL commands
1121 if ( $self->[_started_tokenizing_] ) {
1123 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1127 complain("Useless hash-bang after line 1\n");
1131 # Report the leading hash-bang as a system line
1132 # This will prevent -dac from deleting it
1134 $line_of_tokens->{_line_type} = 'SYSTEM';
1135 return $line_of_tokens;
1140 # wait for a hash-bang before parsing if the user invoked us with -x
1141 if ( $self->[_look_for_hash_bang_]
1142 && !$self->[_saw_hash_bang_] )
1144 $line_of_tokens->{_line_type} = 'SYSTEM';
1145 return $line_of_tokens;
1148 # a first line of the form ': #' will be marked as SYSTEM
1149 # since lines of this form may be used by tcsh
1150 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1151 $line_of_tokens->{_line_type} = 'SYSTEM';
1152 return $line_of_tokens;
1155 # now we know that it is ok to tokenize the line...
1156 # the line tokenizer will modify any of these private variables:
1157 # _rhere_target_list_
1165 $self->tokenize_this_line($line_of_tokens);
1167 # Now finish defining the return structure and return it
1168 $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1170 # handle severe error (binary data in script)
1171 if ( $self->[_in_error_] ) {
1172 $self->[_in_quote_] = 0; # to avoid any more messages
1173 warning("Giving up after error\n");
1174 $line_of_tokens->{_line_type} = 'ERROR';
1175 reset_indentation_level(0); # avoid error messages
1176 return $line_of_tokens;
1179 # handle start of pod documentation
1180 if ( $self->[_in_pod_] ) {
1182 # This gets tricky..above a __DATA__ or __END__ section, perl
1183 # accepts '=cut' as the start of pod section. But afterwards,
1184 # only pod utilities see it and they may ignore an =cut without
1185 # leading =head. In any case, this isn't good.
1186 if ( $input_line =~ /^=cut\b/ ) {
1187 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1188 complain("=cut while not in pod ignored\n");
1189 $self->[_in_pod_] = 0;
1190 $line_of_tokens->{_line_type} = 'POD_END';
1193 $line_of_tokens->{_line_type} = 'POD_START';
1195 "=cut starts a pod section .. this can fool pod utilities.\n"
1196 ) unless (DEVEL_MODE);
1197 $self->log_numbered_msg("Entering POD section\n");
1202 $line_of_tokens->{_line_type} = 'POD_START';
1203 $self->log_numbered_msg("Entering POD section\n");
1206 return $line_of_tokens;
1209 # handle start of skipped section
1210 if ( $self->[_in_skipped_] ) {
1212 $line_of_tokens->{_line_type} = 'SKIP';
1213 $self->log_numbered_msg("Entering code-skipping section\n");
1214 return $line_of_tokens;
1217 # see if this line contains here doc targets
1218 my $rhere_target_list = $self->[_rhere_target_list_];
1219 if ( @{$rhere_target_list} ) {
1221 my ( $here_doc_target, $here_quote_character ) =
1222 @{ shift @{$rhere_target_list} };
1223 $self->[_in_here_doc_] = 1;
1224 $self->[_here_doc_target_] = $here_doc_target;
1225 $self->[_here_quote_character_] = $here_quote_character;
1226 $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1227 $self->[_started_looking_for_here_target_at_] = $input_line_number;
1230 # NOTE: __END__ and __DATA__ statements are written unformatted
1231 # because they can theoretically contain additional characters
1232 # which are not tokenized (and cannot be read with <DATA> either!).
1233 if ( $self->[_in_data_] ) {
1234 $line_of_tokens->{_line_type} = 'DATA_START';
1235 $self->log_numbered_msg("Starting __DATA__ section\n");
1236 $self->[_saw_data_] = 1;
1238 # keep parsing after __DATA__ if use SelfLoader was seen
1239 if ( $self->[_saw_selfloader_] ) {
1240 $self->[_in_data_] = 0;
1241 $self->log_numbered_msg(
1242 "SelfLoader seen, continuing; -nlsl deactivates\n");
1245 return $line_of_tokens;
1248 elsif ( $self->[_in_end_] ) {
1249 $line_of_tokens->{_line_type} = 'END_START';
1250 $self->log_numbered_msg("Starting __END__ section\n");
1251 $self->[_saw_end_] = 1;
1253 # keep parsing after __END__ if use AutoLoader was seen
1254 if ( $self->[_saw_autoloader_] ) {
1255 $self->[_in_end_] = 0;
1256 $self->log_numbered_msg(
1257 "AutoLoader seen, continuing; -nlal deactivates\n");
1259 return $line_of_tokens;
1262 # now, finally, we know that this line is type 'CODE'
1263 $line_of_tokens->{_line_type} = 'CODE';
1265 # remember if we have seen any real code
1266 if ( !$self->[_started_tokenizing_]
1267 && $input_line !~ /^\s*$/
1268 && $input_line !~ /^\s*#/ )
1270 $self->[_started_tokenizing_] = 1;
1273 if ( $self->[_debugger_object_] ) {
1274 $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1277 # Note: if keyword 'format' occurs in this line code, it is still CODE
1278 # (keyword 'format' need not start a line)
1279 if ( $self->[_in_format_] ) {
1280 $self->log_numbered_msg("Entering format section\n");
1283 if ( $self->[_in_quote_]
1284 and ( $self->[_line_start_quote_] < 0 ) )
1287 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1288 if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1289 $self->[_line_start_quote_] = $input_line_number;
1290 $self->log_numbered_msg(
1291 "Start multi-line quote or pattern ending in $quote_target\n");
1294 elsif ( ( $self->[_line_start_quote_] >= 0 )
1295 && !$self->[_in_quote_] )
1297 $self->[_line_start_quote_] = -1;
1298 $self->log_numbered_msg("End of multi-line quote or pattern\n");
1301 # we are returning a line of CODE
1302 return $line_of_tokens;
1303 } ## end sub get_line
1305 sub find_starting_indentation_level {
1307 # We need to find the indentation level of the first line of the
1308 # script being formatted. Often it will be zero for an entire file,
1309 # but if we are formatting a local block of code (within an editor for
1310 # example) it may not be zero. The user may specify this with the
1311 # -sil=n parameter but normally doesn't so we have to guess.
1314 my $starting_level = 0;
1316 # use value if given as parameter
1317 if ( $self->[_know_starting_level_] ) {
1318 $starting_level = $self->[_starting_level_];
1321 # if we know there is a hash_bang line, the level must be zero
1322 elsif ( $self->[_look_for_hash_bang_] ) {
1323 $self->[_know_starting_level_] = 1;
1326 # otherwise figure it out from the input file
1331 # keep looking at lines until we find a hash bang or piece of code
1332 my $msg = EMPTY_STRING;
1333 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
1335 # if first line is #! then assume starting level is zero
1336 if ( $i == 1 && $line =~ /^\#\!/ ) {
1337 $starting_level = 0;
1340 next if ( $line =~ /^\s*#/ ); # skip past comments
1341 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1342 $starting_level = guess_old_indentation_level($line);
1345 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1346 write_logfile_entry("$msg");
1348 $self->[_starting_level_] = $starting_level;
1349 reset_indentation_level($starting_level);
1351 } ## end sub find_starting_indentation_level
1353 sub guess_old_indentation_level {
1356 # Guess the indentation level of an input line.
1358 # For the first line of code this result will define the starting
1359 # indentation level. It will mainly be non-zero when perltidy is applied
1360 # within an editor to a local block of code.
1362 # This is an impossible task in general because we can't know what tabs
1363 # meant for the old script and how many spaces were used for one
1364 # indentation level in the given input script. For example it may have
1365 # been previously formatted with -i=7 -et=3. But we can at least try to
1366 # make sure that perltidy guesses correctly if it is applied repeatedly to
1367 # a block of code within an editor, so that the block stays at the same
1368 # level when perltidy is applied repeatedly.
1370 # USES GLOBAL VARIABLES: $tokenizer_self
1373 # find leading tabs, spaces, and any statement label
1375 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1377 # If there are leading tabs, we use the tab scheme for this run, if
1378 # any, so that the code will remain stable when editing.
1379 if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1381 if ($2) { $spaces += length($2) }
1383 # correct for outdented labels
1384 if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1385 $spaces += $tokenizer_self->[_continuation_indentation_];
1389 # compute indentation using the value of -i for this run.
1390 # If -i=0 is used for this run (which is possible) it doesn't matter
1391 # what we do here but we'll guess that the old run used 4 spaces per level.
1392 my $indent_columns = $tokenizer_self->[_indent_columns_];
1393 $indent_columns = 4 if ( !$indent_columns );
1394 $level = int( $spaces / $indent_columns );
1396 } ## end sub guess_old_indentation_level
1398 # This is a currently unused debug routine
1399 sub dump_functions {
1402 foreach my $pkg ( keys %is_user_function ) {
1403 $fh->print("\nnon-constant subs in package $pkg\n");
1405 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1406 my $msg = EMPTY_STRING;
1407 if ( $is_block_list_function{$pkg}{$sub} ) {
1408 $msg = 'block_list';
1411 if ( $is_block_function{$pkg}{$sub} ) {
1414 $fh->print("$sub $msg\n");
1418 foreach my $pkg ( keys %is_constant ) {
1419 $fh->print("\nconstants and constant subs in package $pkg\n");
1421 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1422 $fh->print("$sub\n");
1426 } ## end sub dump_functions
1428 sub prepare_for_a_new_file {
1430 # previous tokens needed to determine what to expect next
1431 $last_nonblank_token = ';'; # the only possible starting state which
1432 $last_nonblank_type = ';'; # will make a leading brace a code block
1433 $last_nonblank_block_type = EMPTY_STRING;
1435 # scalars for remembering statement types across multiple lines
1436 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1437 $in_attribute_list = 0;
1439 # scalars for remembering where we are in the file
1440 $current_package = "main";
1441 $context = UNKNOWN_CONTEXT;
1443 # hashes used to remember function information
1444 %is_constant = (); # user-defined constants
1445 %is_user_function = (); # user-defined functions
1446 %user_function_prototype = (); # their prototypes
1447 %is_block_function = ();
1448 %is_block_list_function = ();
1449 %saw_function_definition = ();
1450 %saw_use_module = ();
1452 # variables used to track depths of various containers
1453 # and report nesting errors
1456 $square_bracket_depth = 0;
1457 @current_depth = (0) x scalar @closing_brace_names;
1460 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1461 @current_sequence_number = ();
1462 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1465 @paren_semicolon_count = ();
1466 @paren_structural_type = ();
1468 @brace_structural_type = ();
1469 @brace_context = ();
1470 @brace_package = ();
1471 @square_bracket_type = ();
1472 @square_bracket_structural_type = ();
1474 @nested_ternary_flag = ();
1475 @nested_statement_type = ();
1476 @starting_line_of_current_depth = ();
1478 $paren_type[$paren_depth] = EMPTY_STRING;
1479 $paren_semicolon_count[$paren_depth] = 0;
1480 $paren_structural_type[$brace_depth] = EMPTY_STRING;
1481 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1482 $brace_structural_type[$brace_depth] = EMPTY_STRING;
1483 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1484 $brace_package[$paren_depth] = $current_package;
1485 $square_bracket_type[$square_bracket_depth] = EMPTY_STRING;
1486 $square_bracket_structural_type[$square_bracket_depth] = EMPTY_STRING;
1488 initialize_tokenizer_state();
1490 } ## end sub prepare_for_a_new_file
1492 { ## closure for sub tokenize_this_line
1494 use constant BRACE => 0;
1495 use constant SQUARE_BRACKET => 1;
1496 use constant PAREN => 2;
1497 use constant QUESTION_COLON => 3;
1499 # TV1: scalars for processing one LINE.
1500 # Re-initialized on each entry to sub tokenize_this_line.
1502 $block_type, $container_type, $expecting,
1503 $i, $i_tok, $input_line,
1504 $input_line_number, $last_nonblank_i, $max_token_index,
1505 $next_tok, $next_type, $peeked_ahead,
1506 $prototype, $rhere_target_list, $rtoken_map,
1507 $rtoken_type, $rtokens, $tok,
1508 $type, $type_sequence, $indent_flag,
1511 # TV2: refs to ARRAYS for processing one LINE
1512 # Re-initialized on each call.
1513 my $routput_token_list = []; # stack of output token indexes
1514 my $routput_token_type = []; # token types
1515 my $routput_block_type = []; # types of code block
1516 my $routput_container_type = []; # paren types, such as if, elsif, ..
1517 my $routput_type_sequence = []; # nesting sequential number
1518 my $routput_indent_flag = []; #
1520 # TV3: SCALARS for quote variables. These are initialized with a
1521 # subroutine call and continually updated as lines are processed.
1522 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1523 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1525 # TV4: SCALARS for multi-line identifiers and
1526 # statements. These are initialized with a subroutine call
1527 # and continually updated as lines are processed.
1528 my ( $id_scan_state, $identifier, $want_paren );
1530 # TV5: SCALARS for tracking indentation level.
1531 # Initialized once and continually updated as lines are
1534 $nesting_token_string, $nesting_type_string,
1535 $nesting_block_string, $nesting_block_flag,
1536 $nesting_list_string, $nesting_list_flag,
1537 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1538 $in_statement_continuation, $level_in_tokenizer,
1539 $slevel_in_tokenizer, $rslevel_stack,
1542 # TV6: SCALARS for remembering several previous
1543 # tokens. Initialized once and continually updated as
1544 # lines are processed.
1546 $last_nonblank_container_type, $last_nonblank_type_sequence,
1547 $last_last_nonblank_token, $last_last_nonblank_type,
1548 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1549 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1552 # ----------------------------------------------------------------
1553 # beginning of tokenizer variable access and manipulation routines
1554 # ----------------------------------------------------------------
1556 sub initialize_tokenizer_state {
1558 # TV1: initialized on each call
1559 # TV2: initialized on each call
1563 $quote_character = EMPTY_STRING;
1566 $quoted_string_1 = EMPTY_STRING;
1567 $quoted_string_2 = EMPTY_STRING;
1568 $allowed_quote_modifiers = EMPTY_STRING;
1571 $id_scan_state = EMPTY_STRING;
1572 $identifier = EMPTY_STRING;
1573 $want_paren = EMPTY_STRING;
1576 $nesting_token_string = EMPTY_STRING;
1577 $nesting_type_string = EMPTY_STRING;
1578 $nesting_block_string = '1'; # initially in a block
1579 $nesting_block_flag = 1;
1580 $nesting_list_string = '0'; # initially not in a list
1581 $nesting_list_flag = 0; # initially not in a list
1582 $ci_string_in_tokenizer = EMPTY_STRING;
1583 $continuation_string_in_tokenizer = "0";
1584 $in_statement_continuation = 0;
1585 $level_in_tokenizer = 0;
1586 $slevel_in_tokenizer = 0;
1587 $rslevel_stack = [];
1590 $last_nonblank_container_type = EMPTY_STRING;
1591 $last_nonblank_type_sequence = EMPTY_STRING;
1592 $last_last_nonblank_token = ';';
1593 $last_last_nonblank_type = ';';
1594 $last_last_nonblank_block_type = EMPTY_STRING;
1595 $last_last_nonblank_container_type = EMPTY_STRING;
1596 $last_last_nonblank_type_sequence = EMPTY_STRING;
1597 $last_nonblank_prototype = EMPTY_STRING;
1599 } ## end sub initialize_tokenizer_state
1601 sub save_tokenizer_state {
1604 $block_type, $container_type, $expecting,
1605 $i, $i_tok, $input_line,
1606 $input_line_number, $last_nonblank_i, $max_token_index,
1607 $next_tok, $next_type, $peeked_ahead,
1608 $prototype, $rhere_target_list, $rtoken_map,
1609 $rtoken_type, $rtokens, $tok,
1610 $type, $type_sequence, $indent_flag,
1614 $routput_token_list, $routput_token_type,
1615 $routput_block_type, $routput_container_type,
1616 $routput_type_sequence, $routput_indent_flag,
1620 $in_quote, $quote_type,
1621 $quote_character, $quote_pos,
1622 $quote_depth, $quoted_string_1,
1623 $quoted_string_2, $allowed_quote_modifiers,
1626 my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1629 $nesting_token_string, $nesting_type_string,
1630 $nesting_block_string, $nesting_block_flag,
1631 $nesting_list_string, $nesting_list_flag,
1632 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1633 $in_statement_continuation, $level_in_tokenizer,
1634 $slevel_in_tokenizer, $rslevel_stack,
1638 $last_nonblank_container_type,
1639 $last_nonblank_type_sequence,
1640 $last_last_nonblank_token,
1641 $last_last_nonblank_type,
1642 $last_last_nonblank_block_type,
1643 $last_last_nonblank_container_type,
1644 $last_last_nonblank_type_sequence,
1645 $last_nonblank_prototype,
1647 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1648 } ## end sub save_tokenizer_state
1650 sub restore_tokenizer_state {
1652 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1654 $block_type, $container_type, $expecting,
1655 $i, $i_tok, $input_line,
1656 $input_line_number, $last_nonblank_i, $max_token_index,
1657 $next_tok, $next_type, $peeked_ahead,
1658 $prototype, $rhere_target_list, $rtoken_map,
1659 $rtoken_type, $rtokens, $tok,
1660 $type, $type_sequence, $indent_flag,
1664 $routput_token_list, $routput_token_type,
1665 $routput_block_type, $routput_container_type,
1666 $routput_type_sequence, $routput_indent_flag,
1670 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1671 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1674 ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
1677 $nesting_token_string, $nesting_type_string,
1678 $nesting_block_string, $nesting_block_flag,
1679 $nesting_list_string, $nesting_list_flag,
1680 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1681 $in_statement_continuation, $level_in_tokenizer,
1682 $slevel_in_tokenizer, $rslevel_stack,
1686 $last_nonblank_container_type,
1687 $last_nonblank_type_sequence,
1688 $last_last_nonblank_token,
1689 $last_last_nonblank_type,
1690 $last_last_nonblank_block_type,
1691 $last_last_nonblank_container_type,
1692 $last_last_nonblank_type_sequence,
1693 $last_nonblank_prototype,
1696 } ## end sub restore_tokenizer_state
1698 sub split_pretoken {
1702 # Split the leading $numc characters from the current token (at index=$i)
1703 # which is pre-type 'w' and insert the remainder back into the pretoken
1704 # stream with appropriate settings. Since we are splitting a pre-type 'w',
1705 # there are three cases, depending on if the remainder starts with a digit:
1706 # Case 1: remainder is type 'd', all digits
1707 # Case 2: remainder is type 'd' and type 'w': digits and other characters
1708 # Case 3: remainder is type 'w'
1710 # Examples, for $numc=1:
1711 # $tok => $tok_0 $tok_1 $tok_2
1712 # 'x10' => 'x' '10' # case 1
1713 # 'x10if' => 'x' '10' 'if' # case 2
1714 # '0ne => 'O' 'ne' # case 3
1717 # $tok_1 is a possible string of digits (pre-type 'd')
1718 # $tok_2 is a possible word (pre-type 'w')
1720 # return 1 if successful
1721 # return undef if error (shouldn't happen)
1723 # Calling routine should update '$type' and '$tok' if successful.
1725 my $pretoken = $rtokens->[$i];
1727 && length($pretoken) > $numc
1728 && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1731 # Split $tok into up to 3 tokens:
1732 my $tok_0 = substr( $pretoken, 0, $numc );
1733 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1734 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1736 my $len_0 = length($tok_0);
1737 my $len_1 = length($tok_1);
1738 my $len_2 = length($tok_2);
1740 my $pre_type_0 = 'w';
1741 my $pre_type_1 = 'd';
1742 my $pre_type_2 = 'w';
1744 my $pos_0 = $rtoken_map->[$i];
1745 my $pos_1 = $pos_0 + $len_0;
1746 my $pos_2 = $pos_1 + $len_1;
1748 my $isplice = $i + 1;
1750 # Splice in any digits
1752 splice @{$rtoken_map}, $isplice, 0, $pos_1;
1753 splice @{$rtokens}, $isplice, 0, $tok_1;
1754 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1759 # Splice in any trailing word
1761 splice @{$rtoken_map}, $isplice, 0, $pos_2;
1762 splice @{$rtokens}, $isplice, 0, $tok_2;
1763 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1767 $rtokens->[$i] = $tok_0;
1772 # Shouldn't get here
1775 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1780 } ## end sub split_pretoken
1782 sub get_indentation_level {
1783 return $level_in_tokenizer;
1786 sub reset_indentation_level {
1787 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1788 push @{$rslevel_stack}, $slevel_in_tokenizer;
1794 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1795 return $peeked_ahead;
1798 # ------------------------------------------------------------
1799 # end of tokenizer variable access and manipulation routines
1800 # ------------------------------------------------------------
1802 #------------------------------
1803 # beginning of tokenizer hashes
1804 #------------------------------
1806 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1808 # These block types terminate statements and do not need a trailing
1810 # patched for SWITCH/CASE/
1811 my %is_zero_continuation_block_type;
1813 @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1814 if elsif else unless while until for foreach switch case given when);
1815 @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1817 my %is_logical_container;
1818 @q = qw(if elsif unless while and or err not && ! || for foreach);
1819 @is_logical_container{@q} = (1) x scalar(@q);
1823 @is_binary_type{@q} = (1) x scalar(@q);
1825 my %is_binary_keyword;
1826 @q = qw(and or err eq ne cmp);
1827 @is_binary_keyword{@q} = (1) x scalar(@q);
1829 # 'L' is token for opening { at hash key
1830 my %is_opening_type;
1832 @is_opening_type{@q} = (1) x scalar(@q);
1834 # 'R' is token for closing } at hash key
1835 my %is_closing_type;
1837 @is_closing_type{@q} = (1) x scalar(@q);
1839 my %is_redo_last_next_goto;
1840 @q = qw(redo last next goto);
1841 @is_redo_last_next_goto{@q} = (1) x scalar(@q);
1844 @q = qw(use require);
1845 @is_use_require{@q} = (1) x scalar(@q);
1847 # This hash holds the array index in $tokenizer_self for these keywords:
1848 # Fix for issue c035: removed 'format' from this hash
1850 '__END__' => _in_end_,
1851 '__DATA__' => _in_data_,
1854 my %is_list_end_type;
1857 @is_list_end_type{@q} = (1) x scalar(@q);
1859 # original ref: camel 3 p 147,
1860 # but perl may accept undocumented flags
1861 # perl 5.10 adds 'p' (preserve)
1862 # Perl version 5.22 added 'n'
1863 # From http://perldoc.perl.org/perlop.html we have
1864 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
1865 # s/PATTERN/REPLACEMENT/msixpodualngcer
1866 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
1867 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
1868 # qr/STRING/msixpodualn
1869 my %quote_modifiers = (
1870 's' => '[msixpodualngcer]',
1873 'm' => '[msixpodualngc]',
1874 'qr' => '[msixpodualn]',
1875 'q' => EMPTY_STRING,
1876 'qq' => EMPTY_STRING,
1877 'qw' => EMPTY_STRING,
1878 'qx' => EMPTY_STRING,
1881 # table showing how many quoted things to look for after quote operator..
1882 # s, y, tr have 2 (pattern and replacement)
1883 # others have 1 (pattern only)
1897 @q = qw(for foreach);
1898 @is_for_foreach{@q} = (1) x scalar(@q);
1900 # These keywords may introduce blocks after parenthesized expressions,
1902 # keyword ( .... ) { BLOCK }
1903 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1904 # NOTE for --use-feature=class: if ADJUST blocks eventually take a
1905 # parameter list, then ADJUST might need to be added to this list (see
1907 my %is_blocktype_with_paren;
1909 qw(if elsif unless while until for foreach switch case given when catch);
1910 @is_blocktype_with_paren{@q} = (1) x scalar(@q);
1912 my %is_case_default;
1913 @q = qw(case default);
1914 @is_case_default{@q} = (1) x scalar(@q);
1916 #------------------------
1917 # end of tokenizer hashes
1918 #------------------------
1920 # ------------------------------------------------------------
1921 # beginning of various scanner interface routines
1922 # ------------------------------------------------------------
1923 sub scan_replacement_text {
1925 # check for here-docs in replacement text invoked by
1926 # a substitution operator with executable modifier 'e'.
1931 # $rht = reference to any here-doc targets
1932 my ($replacement_text) = @_;
1935 return unless ( $replacement_text =~ /<</ );
1937 write_logfile_entry("scanning replacement text for here-doc targets\n");
1939 # save the logger object for error messages
1940 my $logger_object = $tokenizer_self->[_logger_object_];
1942 # localize all package variables
1944 $tokenizer_self, $last_nonblank_token,
1945 $last_nonblank_type, $last_nonblank_block_type,
1946 $statement_type, $in_attribute_list,
1947 $current_package, $context,
1948 %is_constant, %is_user_function,
1949 %user_function_prototype, %is_block_function,
1950 %is_block_list_function, %saw_function_definition,
1951 $brace_depth, $paren_depth,
1952 $square_bracket_depth, @current_depth,
1953 @total_depth, $total_depth,
1954 @nesting_sequence_number, @current_sequence_number,
1955 @paren_type, @paren_semicolon_count,
1956 @paren_structural_type, @brace_type,
1957 @brace_structural_type, @brace_context,
1958 @brace_package, @square_bracket_type,
1959 @square_bracket_structural_type, @depth_array,
1960 @starting_line_of_current_depth, @nested_ternary_flag,
1961 @nested_statement_type, $next_sequence_number,
1964 # save all lexical variables
1965 my $rstate = save_tokenizer_state();
1966 _decrement_count(); # avoid error check for multiple tokenizers
1968 # make a new tokenizer
1970 my $source_object = Perl::Tidy::LineSource->new(
1971 input_file => \$replacement_text,
1974 my $tokenizer = Perl::Tidy::Tokenizer->new(
1975 source_object => $source_object,
1976 logger_object => $logger_object,
1977 starting_line_number => $input_line_number,
1980 # scan the replacement text
1981 1 while ( $tokenizer->get_line() );
1983 # remove any here doc targets
1985 if ( $tokenizer_self->[_in_here_doc_] ) {
1989 $tokenizer_self->[_here_doc_target_],
1990 $tokenizer_self->[_here_quote_character_]
1992 if ( $tokenizer_self->[_rhere_target_list_] ) {
1993 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1994 $tokenizer_self->[_rhere_target_list_] = undef;
1996 $tokenizer_self->[_in_here_doc_] = undef;
1999 # now its safe to report errors
2000 my $severe_error = $tokenizer->report_tokenization_errors();
2002 # TODO: Could propagate a severe error up
2004 # restore all tokenizer lexical variables
2005 restore_tokenizer_state($rstate);
2007 # return the here doc targets
2009 } ## end sub scan_replacement_text
2011 sub scan_bare_identifier {
2012 ( $i, $tok, $type, $prototype ) =
2013 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
2014 $rtoken_map, $max_token_index );
2016 } ## end sub scan_bare_identifier
2018 sub scan_identifier {
2020 $i, $tok, $type, $id_scan_state, $identifier,
2021 my $split_pretoken_flag
2023 = scan_complex_identifier( $i, $id_scan_state, $identifier, $rtokens,
2024 $max_token_index, $expecting, $paren_type[$paren_depth] );
2026 # Check for signal to fix a special variable adjacent to a keyword,
2027 # such as '$^One$0'.
2028 if ($split_pretoken_flag) {
2030 # Try to fix it by splitting the pretoken
2032 && $rtokens->[ $i - 1 ] eq '^'
2033 && split_pretoken(1) )
2035 $identifier = substr( $identifier, 0, 3 );
2040 # This shouldn't happen ...
2041 my $var = substr( $tok, 0, 3 );
2042 my $excess = substr( $tok, 3 );
2043 interrupt_logfile();
2045 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2046 A space may be needed after '$var'.
2052 } ## end sub scan_identifier
2054 use constant VERIFY_FASTSCAN => 0;
2055 my %fast_scan_context;
2058 %fast_scan_context = (
2059 '$' => SCALAR_CONTEXT,
2060 '*' => SCALAR_CONTEXT,
2061 '@' => LIST_CONTEXT,
2062 '%' => LIST_CONTEXT,
2063 '&' => UNKNOWN_CONTEXT,
2067 sub scan_simple_identifier {
2069 # This is a wrapper for sub scan_identifier. It does a fast preliminary
2070 # scan for certain common identifiers:
2071 # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2072 # If it does not find one of these, or this is a restart, it calls the
2073 # original scanner directly.
2075 # This gives the same results as the full scanner in about 1/4 the
2076 # total runtime for a typical input stream.
2082 # || ---- $i_next [= next nonblank pretoken ]
2083 # |----$i_plus_1 [= a bareword ]
2084 # ---$i_begin [= a sigil]
2087 my $tok_begin = $tok;
2088 my $i_plus_1 = $i + 1;
2091 #-------------------------------------------------------
2092 # Do full scan for anything following a pointer, such as
2093 # $cref->&*; # a postderef
2094 #-------------------------------------------------------
2095 if ( $last_nonblank_token eq '->' ) {
2099 #------------------------------
2100 # quick scan with leading sigil
2101 #------------------------------
2102 elsif ( !$id_scan_state
2103 && $i_plus_1 <= $max_token_index
2104 && $fast_scan_context{$tok} )
2106 $context = $fast_scan_context{$tok};
2108 # look for $var, @var, ...
2109 if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
2110 my $pretype_next = EMPTY_STRING;
2111 if ( $i_plus_1 < $max_token_index ) {
2112 my $i_next = $i_plus_1 + 1;
2113 if ( $rtoken_type->[$i_next] eq 'b'
2114 && $i_next < $max_token_index )
2118 $pretype_next = $rtoken_type->[$i_next];
2120 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2122 # Found type 'i' like '$var', '@var', or '%var'
2123 $identifier = $tok . $rtokens->[$i_plus_1];
2127 $fast_scan_type = $type;
2131 # Look for @{ or %{ .
2132 # But we must let the full scanner handle things ${ because it may
2133 # keep going to get a complete identifier like '${#}' .
2135 $rtoken_type->[$i_plus_1] eq '{'
2136 && ( $tok_begin eq '@'
2137 || $tok_begin eq '%' )
2143 $fast_scan_type = $type;
2147 #---------------------------
2148 # Quick scan with leading ->
2149 # Look for ->[ and ->{
2150 #---------------------------
2153 && $i < $max_token_index
2154 && ( $rtokens->[$i_plus_1] eq '{'
2155 || $rtokens->[$i_plus_1] eq '[' )
2159 $fast_scan_type = $type;
2161 $context = UNKNOWN_CONTEXT;
2164 #--------------------------------------
2165 # Verify correctness during development
2166 #--------------------------------------
2167 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2169 # We will call the full method
2170 my $identifier_simple = $identifier;
2171 my $tok_simple = $tok;
2173 my $context_simple = $context;
2179 if ( $tok ne $tok_simple
2180 || $type ne $fast_scan_type
2182 || $identifier ne $identifier_simple
2184 || $context ne $context_simple )
2187 scan_simple_identifier differs from scan_identifier:
2188 simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2189 full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2194 #-------------------------------------------------
2195 # call full scanner if fast method did not succeed
2196 #-------------------------------------------------
2197 if ( !$fast_scan_type ) {
2201 } ## end sub scan_simple_identifier
2203 sub method_ok_here {
2206 # false if this is definitely an invalid method declaration
2207 # true otherwise (even if not sure)
2209 # We are trying to avoid problems with old uses of 'method'
2210 # when --use-feature=class is set (rt145706).
2211 # For example, this should cause a return of 'false':
2213 # method paint => sub {
2219 my $pos_beg = $rtoken_map->[$i_beg];
2220 pos($input_line) = $pos_beg;
2222 # TEST 1: look a valid sub NAME
2224 $input_line =~ m/\G\s*
2225 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2226 (\w+) # NAME - required
2230 # For possible future use..
2232 my $package = $1 ? $1 : EMPTY_STRING;
2238 # TEST 2: look for invalid characters after name, such as here:
2239 # method paint => sub {
2242 my $next_char = EMPTY_STRING;
2243 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
2244 if ( !$next_char || $next_char eq '#' ) {
2245 ( $next_char, my $i_next ) =
2246 find_next_nonblank_token( $max_token_index,
2247 $rtokens, $max_token_index );
2250 if ( !$next_char ) {
2252 # out of characters - give up
2256 # Possibly valid next token types:
2257 # '(' could start prototype or signature
2258 # ':' could start ATTRIBUTE
2259 # '{' cold start BLOCK
2260 # ';' or '}' could end a statement
2261 if ( $next_char !~ /^[\(\:\{\;\}]/ ) {
2263 # This does not match use feature 'class' syntax
2267 # We will stop here and assume that this is valid syntax for
2268 # use feature 'class'.
2270 } ## end sub method_ok_here
2275 # false if this is definitely an invalid class declaration
2276 # true otherwise (even if not sure)
2278 # We are trying to avoid problems with old uses of 'class'
2279 # when --use-feature=class is set (rt145706). We look ahead
2280 # see if this use of 'class' is obviously inconsistent with
2281 # the syntax of use feature 'class'. This allows the default
2282 # setting --use-feature=class to work for old syntax too.
2284 # Valid class declarations look like
2285 # class NAME ?ATTRS ?VERSION ?BLOCK
2286 # where ATTRS VERSION and BLOCK are optional
2288 # For example, this should produce a return of 'false':
2290 # class ExtendsBasicAttributes is BasicAttributes{
2292 # TEST 1: class stmt can only go where a new statment can start
2293 if ( !new_statement_ok() ) { return }
2296 my $pos_beg = $rtoken_map->[$i_beg];
2297 pos($input_line) = $pos_beg;
2299 # TEST 2: look for a valid NAME
2301 $input_line =~ m/\G\s*
2302 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
2303 (\w+) # NAME - required
2307 # For possible future use..
2309 my $package = $1 ? $1 : EMPTY_STRING;
2315 # TEST 3: look for valid characters after NAME
2316 my $next_char = EMPTY_STRING;
2317 if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 }
2318 if ( !$next_char || $next_char eq '#' ) {
2319 ( $next_char, my $i_next ) =
2320 find_next_nonblank_token( $max_token_index,
2321 $rtokens, $max_token_index );
2323 if ( !$next_char ) {
2325 # out of characters - give up
2329 # Must see one of: ATTRIBUTE, VERSION, BLOCK, or end stmt
2331 # Possibly valid next token types:
2332 # ':' could start ATTRIBUTE
2333 # '\d' could start VERSION
2334 # '{' cold start BLOCK
2335 # ';' could end a statement
2336 # '}' could end statement but would be strange
2338 if ( $next_char !~ /^[\:\d\{\;\}]/ ) {
2340 # This does not match use feature 'class' syntax
2344 # We will stop here and assume that this is valid syntax for
2345 # use feature 'class'.
2347 } ## end sub class_ok_here
2350 ( $i, $tok, $type, $id_scan_state ) =
2351 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2352 $id_scan_state, $max_token_index );
2354 } ## end sub scan_id
2358 ( $i, $type, $number ) =
2359 scan_number_do( $input_line, $i, $rtoken_map, $type,
2362 } ## end sub scan_number
2364 use constant VERIFY_FASTNUM => 0;
2366 sub scan_number_fast {
2368 # This is a wrapper for sub scan_number. It does a fast preliminary
2369 # scan for a simple integer. It calls the original scan_number if it
2370 # does not find one.
2373 my $tok_begin = $tok;
2376 #---------------------------------
2377 # Quick check for (signed) integer
2378 #---------------------------------
2380 # This will be the string of digits:
2383 my $typ_d = $rtoken_type->[$i_d];
2385 # check for signed integer
2386 my $sign = EMPTY_STRING;
2388 && ( $typ_d eq '+' || $typ_d eq '-' )
2389 && $i_d < $max_token_index )
2393 $tok_d = $rtokens->[$i_d];
2394 $typ_d = $rtoken_type->[$i_d];
2401 $i_d == $max_token_index
2402 || ( $i_d < $max_token_index
2403 && $rtoken_type->[ $i_d + 1 ] ne '.'
2404 && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2408 # Let let full scanner handle multi-digit integers beginning with
2409 # '0' because there could be error messages. For example, '009' is
2410 # not a valid number.
2412 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2413 $number = $sign . $tok_d;
2419 #--------------------------------------
2420 # Verify correctness during development
2421 #--------------------------------------
2422 if ( VERIFY_FASTNUM && defined($number) ) {
2424 # We will call the full method
2425 my $type_simple = $type;
2427 my $number_simple = $number;
2431 $number = scan_number();
2433 if ( $type ne $type_simple
2434 || ( $i != $i_simple && $i <= $max_token_index )
2435 || $number ne $number_simple )
2438 scan_number_fast differs from scan_number:
2439 simple: i=$i_simple, type=$type_simple, number=$number_simple
2440 full: i=$i, type=$type, number=$number
2445 #----------------------------------------
2446 # call full scanner if may not be integer
2447 #----------------------------------------
2448 if ( !defined($number) ) {
2449 $number = scan_number();
2452 } ## end sub scan_number_fast
2454 # a sub to warn if token found where term expected
2455 sub error_if_expecting_TERM {
2456 if ( $expecting == TERM ) {
2457 if ( $really_want_term{$last_nonblank_type} ) {
2458 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2459 $rtoken_map, $rtoken_type, $input_line );
2464 } ## end sub error_if_expecting_TERM
2466 # a sub to warn if token found where operator expected
2467 sub error_if_expecting_OPERATOR {
2469 if ( $expecting == OPERATOR ) {
2470 if ( !defined($thing) ) { $thing = $tok }
2471 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2472 $rtoken_map, $rtoken_type, $input_line );
2473 if ( $i_tok == 0 ) {
2474 interrupt_logfile();
2475 warning("Missing ';' or ',' above?\n");
2481 } ## end sub error_if_expecting_OPERATOR
2483 # ------------------------------------------------------------
2484 # end scanner interfaces
2485 # ------------------------------------------------------------
2490 sub do_GREATER_THAN_SIGN {
2493 error_if_expecting_TERM()
2494 if ( $expecting == TERM );
2496 } ## end sub do_GREATER_THAN_SIGN
2498 sub do_VERTICAL_LINE {
2501 error_if_expecting_TERM()
2502 if ( $expecting == TERM );
2504 } ## end sub do_VERTICAL_LINE
2506 sub do_DOLLAR_SIGN {
2509 # start looking for a scalar
2510 error_if_expecting_OPERATOR("Scalar")
2511 if ( $expecting == OPERATOR );
2512 scan_simple_identifier();
2514 if ( $identifier eq '$^W' ) {
2515 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2518 # Check for identifier in indirect object slot
2519 # (vorboard.pl, sort.t). Something like:
2520 # /^(print|printf|sort|exec|system)$/
2522 $is_indirect_object_taker{$last_nonblank_token}
2523 && $last_nonblank_type eq 'k'
2524 || ( ( $last_nonblank_token eq '(' )
2525 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2526 || ( $last_nonblank_type eq 'w'
2527 || $last_nonblank_type eq 'U' ) # possible object
2531 # An identifier followed by '->' is not indirect object;
2532 # fixes b1175, b1176
2533 my ( $next_nonblank_type, $i_next ) =
2534 find_next_noncomment_type( $i, $rtokens, $max_token_index );
2535 $type = 'Z' if ( $next_nonblank_type ne '->' );
2538 } ## end sub do_DOLLAR_SIGN
2540 sub do_LEFT_PARENTHESIS {
2544 $paren_semicolon_count[$paren_depth] = 0;
2546 $container_type = $want_paren;
2547 $want_paren = EMPTY_STRING;
2549 elsif ( $statement_type =~ /^sub\b/ ) {
2550 $container_type = $statement_type;
2553 $container_type = $last_nonblank_token;
2555 # We can check for a syntax error here of unexpected '(',
2556 # but this is going to get messy...
2558 $expecting == OPERATOR
2560 # Be sure this is not a method call of the form
2561 # &method(...), $method->(..), &{method}(...),
2562 # $ref[2](list) is ok & short for $ref[2]->(list)
2563 # NOTE: at present, braces in something like &{ xxx }
2564 # are not marked as a block, we might have a method call.
2565 # Added ')' to fix case c017, something like ()()()
2566 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2570 # ref: camel 3 p 703.
2571 if ( $last_last_nonblank_token eq 'do' ) {
2573 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2578 # if this is an empty list, (), then it is not an
2579 # error; for example, we might have a constant pi and
2580 # invoke it with pi() or just pi;
2581 my ( $next_nonblank_token, $i_next ) =
2582 find_next_nonblank_token( $i, $rtokens,
2585 # Patch for c029: give up error check if
2586 # a side comment follows
2587 if ( $next_nonblank_token ne ')'
2588 && $next_nonblank_token ne '#' )
2592 error_if_expecting_OPERATOR('(');
2594 if ( $last_nonblank_type eq 'C' ) {
2596 "$last_nonblank_token has a void prototype\n";
2598 elsif ( $last_nonblank_type eq 'i' ) {
2600 && $last_nonblank_token =~ /^\$/ )
2603 "Do you mean '$last_nonblank_token->(' ?\n";
2607 interrupt_logfile();
2611 } ## end if ( $next_nonblank_token...
2612 } ## end else [ if ( $last_last_nonblank_token...
2613 } ## end if ( $expecting == OPERATOR...
2616 # Do not update container type at ') ('; fix for git #105. This will
2617 # propagate the container type onward so that any subsequent brace gets
2618 # correctly marked. I have implemented this as a general rule, which
2619 # should be safe, but if necessary it could be restricted to certain
2620 # container statement types such as 'for'.
2621 $paren_type[$paren_depth] = $container_type
2622 if ( $last_nonblank_token ne ')' );
2624 ( $type_sequence, $indent_flag ) =
2625 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2627 # propagate types down through nested parens
2628 # for example: the second paren in 'if ((' would be structural
2629 # since the first is.
2631 if ( $last_nonblank_token eq '(' ) {
2632 $type = $last_nonblank_type;
2635 # We exclude parens as structural after a ',' because it
2636 # causes subtle problems with continuation indentation for
2637 # something like this, where the first 'or' will not get
2642 # ( not defined $check )
2644 # or $check eq "new"
2645 # or $check eq "old",
2648 # Likewise, we exclude parens where a statement can start
2649 # because of problems with continuation indentation, like
2652 # ($firstline =~ /^#\!.*perl/)
2653 # and (print $File::Find::name, "\n")
2656 # (ref($usage_fref) =~ /CODE/)
2658 # : (&blast_usage, &blast_params, &blast_general_params);
2664 if ( $last_nonblank_type eq ')' ) {
2666 "Syntax error? found token '$last_nonblank_type' then '('\n");
2668 $paren_structural_type[$paren_depth] = $type;
2671 } ## end sub do_LEFT_PARENTHESIS
2673 sub do_RIGHT_PARENTHESIS {
2676 ( $type_sequence, $indent_flag ) =
2677 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2679 if ( $paren_structural_type[$paren_depth] eq '{' ) {
2683 $container_type = $paren_type[$paren_depth];
2685 # restore statement type as 'sub' at closing paren of a signature
2686 # so that a subsequent ':' is identified as an attribute
2687 if ( $container_type =~ /^sub\b/ ) {
2688 $statement_type = $container_type;
2692 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2693 my $num_sc = $paren_semicolon_count[$paren_depth];
2694 if ( $num_sc > 0 && $num_sc != 2 ) {
2695 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2699 if ( $paren_depth > 0 ) { $paren_depth-- }
2701 } ## end sub do_RIGHT_PARENTHESIS
2706 if ( $last_nonblank_type eq ',' ) {
2707 complain("Repeated ','s \n");
2710 # Note that we have to check both token and type here because a
2711 # comma following a qw list can have last token='(' but type = 'q'
2712 elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2713 warning("Unexpected leading ',' after a '('\n");
2716 # patch for operator_expected: note if we are in the list (use.t)
2717 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2720 } ## end sub do_COMMA
2725 $context = UNKNOWN_CONTEXT;
2726 $statement_type = EMPTY_STRING;
2727 $want_paren = EMPTY_STRING;
2730 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2731 { # mark ; in for loop
2733 # Be careful: we do not want a semicolon such as the
2734 # following to be included:
2736 # for (sort {strcoll($a,$b);} keys %investments) {
2738 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2739 && $square_bracket_depth ==
2740 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2744 $paren_semicolon_count[$paren_depth]++;
2748 } ## end sub do_SEMICOLON
2750 sub do_QUOTATION_MARK {
2753 error_if_expecting_OPERATOR("String")
2754 if ( $expecting == OPERATOR );
2757 $allowed_quote_modifiers = EMPTY_STRING;
2759 } ## end sub do_QUOTATION_MARK
2764 error_if_expecting_OPERATOR("String")
2765 if ( $expecting == OPERATOR );
2768 $allowed_quote_modifiers = EMPTY_STRING;
2770 } ## end sub do_APOSTROPHE
2775 error_if_expecting_OPERATOR("String")
2776 if ( $expecting == OPERATOR );
2779 $allowed_quote_modifiers = EMPTY_STRING;
2781 } ## end sub do_BACKTICK
2788 # a pattern cannot follow certain keywords which take optional
2789 # arguments, like 'shift' and 'pop'. See also '?'.
2791 $last_nonblank_type eq 'k'
2792 && $is_keyword_rejecting_slash_as_pattern_delimiter{
2793 $last_nonblank_token}
2798 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2800 ( $is_pattern, $msg ) =
2801 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2805 write_diagnostics("DIVIDE:$msg\n");
2806 write_logfile_entry($msg);
2809 else { $is_pattern = ( $expecting == TERM ) }
2814 $allowed_quote_modifiers = '[msixpodualngc]';
2816 else { # not a pattern; check for a /= token
2818 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2824 #DEBUG - collecting info on what tokens follow a divide
2825 # for development of guessing algorithm
2826 #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2827 # #write_diagnostics( "DIVIDE? $input_line\n" );
2831 } ## end sub do_SLASH
2833 sub do_LEFT_CURLY_BRACKET {
2836 # if we just saw a ')', we will label this block with
2837 # its type. We need to do this to allow sub
2838 # code_block_type to determine if this brace starts a
2839 # code block or anonymous hash. (The type of a paren
2840 # pair is the preceding token, such as 'if', 'else',
2842 $container_type = EMPTY_STRING;
2844 # ATTRS: for a '{' following an attribute list, reset
2845 # things to look like we just saw the sub name
2846 # Added 'package' (can be 'class') for --use-feature=class (rt145706)
2847 if ( $statement_type =~ /^(sub|package)\b/ ) {
2848 $last_nonblank_token = $statement_type;
2849 $last_nonblank_type = 'i';
2850 $statement_type = EMPTY_STRING;
2853 # patch for SWITCH/CASE: hide these keywords from an immediately
2854 # following opening brace
2855 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2856 && $statement_type eq $last_nonblank_token )
2858 $last_nonblank_token = ";";
2861 elsif ( $last_nonblank_token eq ')' ) {
2862 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2864 # defensive move in case of a nesting error (pbug.t)
2865 # in which this ')' had no previous '('
2866 # this nesting error will have been caught
2867 if ( !defined($last_nonblank_token) ) {
2868 $last_nonblank_token = 'if';
2871 # check for syntax error here;
2872 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2873 if ( $tokenizer_self->[_extended_syntax_] ) {
2875 # we append a trailing () to mark this as an unknown
2876 # block type. This allows perltidy to format some
2877 # common extensions of perl syntax.
2878 # This is used by sub code_block_type
2879 $last_nonblank_token .= '()';
2883 join( SPACE, sort keys %is_blocktype_with_paren );
2885 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2891 # patch for paren-less for/foreach glitch, part 2.
2892 # see note below under 'qw'
2893 elsif ($last_nonblank_token eq 'qw'
2894 && $is_for_foreach{$want_paren} )
2896 $last_nonblank_token = $want_paren;
2897 if ( $last_last_nonblank_token eq $want_paren ) {
2899 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2903 $want_paren = EMPTY_STRING;
2906 # now identify which of the three possible types of
2907 # curly braces we have: hash index container, anonymous
2908 # hash reference, or code block.
2910 # non-structural (hash index) curly brace pair
2911 # get marked 'L' and 'R'
2912 if ( is_non_structural_brace() ) {
2915 # patch for SWITCH/CASE:
2916 # allow paren-less identifier after 'when'
2917 # if the brace is preceded by a space
2918 if ( $statement_type eq 'when'
2919 && $last_nonblank_type eq 'i'
2920 && $last_last_nonblank_type eq 'k'
2921 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2924 $block_type = $statement_type;
2928 # code and anonymous hash have the same type, '{', but are
2929 # distinguished by 'block_type',
2930 # which will be blank for an anonymous hash
2933 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2936 # patch to promote bareword type to function taking block
2938 && $last_nonblank_type eq 'w'
2939 && $last_nonblank_i >= 0 )
2941 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2942 $routput_token_type->[$last_nonblank_i] =
2943 $is_grep_alias{$block_type} ? 'k' : 'G';
2947 # patch for SWITCH/CASE: if we find a stray opening block brace
2948 # where we might accept a 'case' or 'when' block, then take it
2949 if ( $statement_type eq 'case'
2950 || $statement_type eq 'when' )
2952 if ( !$block_type || $block_type eq '}' ) {
2953 $block_type = $statement_type;
2958 $brace_type[ ++$brace_depth ] = $block_type;
2960 # Patch for CLASS BLOCK definitions: do not update the package for the
2961 # current depth if this is a BLOCK type definition.
2962 # TODO: should make 'class' separate from 'package' and only do
2964 $brace_package[$brace_depth] = $current_package
2965 if ( substr( $block_type, 0, 8 ) ne 'package ' );
2967 $brace_structural_type[$brace_depth] = $type;
2968 $brace_context[$brace_depth] = $context;
2969 ( $type_sequence, $indent_flag ) =
2970 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2972 } ## end sub do_LEFT_CURLY_BRACKET
2974 sub do_RIGHT_CURLY_BRACKET {
2977 $block_type = $brace_type[$brace_depth];
2978 if ($block_type) { $statement_type = EMPTY_STRING }
2979 if ( defined( $brace_package[$brace_depth] ) ) {
2980 $current_package = $brace_package[$brace_depth];
2983 # can happen on brace error (caught elsewhere)
2986 ( $type_sequence, $indent_flag ) =
2987 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2989 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2993 # propagate type information for 'do' and 'eval' blocks, and also
2994 # for smartmatch operator. This is necessary to enable us to know
2995 # if an operator or term is expected next.
2996 if ( $is_block_operator{$block_type} ) {
3000 $context = $brace_context[$brace_depth];
3001 if ( $brace_depth > 0 ) { $brace_depth--; }
3003 } ## end sub do_RIGHT_CURLY_BRACKET
3007 # '&' = maybe sub call? start looking
3008 # We have to check for sub call unless we are sure we
3009 # are expecting an operator. This example from s2p
3010 # got mistaken as a q operator in an early version:
3011 # print BODY &q(<<'EOT');
3012 if ( $expecting != OPERATOR ) {
3014 # But only look for a sub call if we are expecting a term or
3015 # if there is no existing space after the &.
3016 # For example we probably don't want & as sub call here:
3017 # Fcntl::S_IRUSR & $mode;
3018 if ( $expecting == TERM || $next_type ne 'b' ) {
3019 scan_simple_identifier();
3025 } ## end sub do_AMPERSAND
3027 sub do_LESS_THAN_SIGN {
3029 # '<' - angle operator or less than?
3030 if ( $expecting != OPERATOR ) {
3032 find_angle_operator_termination( $input_line, $i, $rtoken_map,
3033 $expecting, $max_token_index );
3035 ## This message is not very helpful and quite confusing if the above
3036 ## routine decided not to write a message with the line number.
3037 ## if ( $type eq '<' && $expecting == TERM ) {
3038 ## error_if_expecting_TERM();
3039 ## interrupt_logfile();
3040 ## warning("Unterminated <> operator?\n");
3041 ## resume_logfile();
3048 } ## end sub do_LESS_THAN_SIGN
3050 sub do_QUESTION_MARK {
3052 # '?' = conditional or starting pattern?
3055 # Patch for rt #126965
3056 # a pattern cannot follow certain keywords which take optional
3057 # arguments, like 'shift' and 'pop'. See also '/'.
3059 $last_nonblank_type eq 'k'
3060 && $is_keyword_rejecting_question_as_pattern_delimiter{
3061 $last_nonblank_token}
3067 # patch for RT#131288, user constant function without prototype
3068 # last type is 'U' followed by ?.
3069 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
3072 elsif ( $expecting == UNKNOWN ) {
3074 # In older versions of Perl, a bare ? can be a pattern
3075 # delimiter. In perl version 5.22 this was
3076 # dropped, but we have to support it in order to format
3077 # older programs. See:
3078 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
3079 # For example, the following line worked
3081 # ?(.*)? && (print $1,"\n");
3082 # In current versions it would have to be written with slashes:
3083 # /(.*)/ && (print $1,"\n");
3085 ( $is_pattern, $msg ) =
3086 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
3089 if ($msg) { write_logfile_entry($msg) }
3091 else { $is_pattern = ( $expecting == TERM ) }
3096 $allowed_quote_modifiers = '[msixpodualngc]';
3099 ( $type_sequence, $indent_flag ) =
3100 increase_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3103 } ## end sub do_QUESTION_MARK
3107 # '*' = typeglob, or multiply?
3108 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
3109 if ( $next_type ne 'b'
3110 && $next_type ne '('
3111 && $next_type ne '#' ) # Fix c036
3116 if ( $expecting == TERM ) {
3117 scan_simple_identifier();
3121 if ( $rtokens->[ $i + 1 ] eq '=' ) {
3126 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
3130 if ( $rtokens->[ $i + 1 ] eq '=' ) {
3138 } ## end sub do_STAR
3142 # '.' = what kind of . ?
3143 if ( $expecting != OPERATOR ) {
3145 if ( $type eq '.' ) {
3146 error_if_expecting_TERM()
3147 if ( $expecting == TERM );
3157 # ':' = label, ternary, attribute, ?
3159 # if this is the first nonblank character, call it a label
3160 # since perl seems to just swallow it
3161 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
3165 # ATTRS: check for a ':' which introduces an attribute list
3166 # either after a 'sub' keyword or within a paren list
3167 # Added 'package' (can be 'class') for --use-feature=class (rt145706)
3168 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
3170 $in_attribute_list = 1;
3173 # Within a signature, unless we are in a ternary. For example,
3174 # from 't/filter_example.t':
3175 # method foo4 ( $class: $bar ) { $class->bar($bar) }
3176 elsif ( $paren_type[$paren_depth] =~ /^sub\b/
3177 && !is_balanced_closing_container(QUESTION_COLON) )
3180 $in_attribute_list = 1;
3183 # check for scalar attribute, such as
3184 # my $foo : shared = 1;
3185 elsif ($is_my_our_state{$statement_type}
3186 && $current_depth[QUESTION_COLON] == 0 )
3189 $in_attribute_list = 1;
3192 # Look for Switch::Plain syntax if an error would otherwise occur
3193 # here. Note that we do not need to check if the extended syntax
3194 # flag is set because otherwise an error would occur, and we would
3195 # then have to output a message telling the user to set the
3196 # extended syntax flag to avoid the error.
3200 # Note that the line 'default:' will be parsed as a label elsewhere.
3201 elsif ( $is_case_default{$statement_type}
3202 && !is_balanced_closing_container(QUESTION_COLON) )
3204 # mark it as a perltidy label type
3208 # otherwise, it should be part of a ?/: operator
3210 ( $type_sequence, $indent_flag ) =
3211 decrease_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3212 if ( $last_nonblank_token eq '?' ) {
3213 warning("Syntax error near ? :\n");
3217 } ## end sub do_COLON
3221 # '+' = what kind of plus?
3222 if ( $expecting == TERM ) {
3223 my $number = scan_number_fast();
3225 # unary plus is safest assumption if not a number
3226 if ( !defined($number) ) { $type = 'p'; }
3228 elsif ( $expecting == OPERATOR ) {
3231 if ( $next_type eq 'w' ) { $type = 'p' }
3234 } ## end sub do_PLUS_SIGN
3238 # '@' = sigil for array?
3239 error_if_expecting_OPERATOR("Array")
3240 if ( $expecting == OPERATOR );
3241 scan_simple_identifier();
3243 } ## end sub do_AT_SIGN
3245 sub do_PERCENT_SIGN {
3247 # '%' = hash or modulo?
3248 # first guess is hash if no following blank or paren
3249 if ( $expecting == UNKNOWN ) {
3250 if ( $next_type ne 'b' && $next_type ne '(' ) {
3254 if ( $expecting == TERM ) {
3255 scan_simple_identifier();
3258 } ## end sub do_PERCENT_SIGN
3260 sub do_LEFT_SQUARE_BRACKET {
3263 $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token;
3264 ( $type_sequence, $indent_flag ) =
3265 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3267 # It may seem odd, but structural square brackets have
3268 # type '{' and '}'. This simplifies the indentation logic.
3269 if ( !is_non_structural_brace() ) {
3272 $square_bracket_structural_type[$square_bracket_depth] = $type;
3274 } ## end sub do_LEFT_SQUARE_BRACKET
3276 sub do_RIGHT_SQUARE_BRACKET {
3279 ( $type_sequence, $indent_flag ) =
3280 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3282 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) {
3286 # propagate type information for smartmatch operator. This is
3287 # necessary to enable us to know if an operator or term is expected
3289 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
3290 $tok = $square_bracket_type[$square_bracket_depth];
3293 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
3295 } ## end sub do_RIGHT_SQUARE_BRACKET
3299 # '-' = what kind of minus?
3300 if ( ( $expecting != OPERATOR )
3301 && $is_file_test_operator{$next_tok} )
3303 my ( $next_nonblank_token, $i_next ) =
3304 find_next_nonblank_token( $i + 1, $rtokens, $max_token_index );
3306 # check for a quoted word like "-w=>xx";
3307 # it is sufficient to just check for a following '='
3308 if ( $next_nonblank_token eq '=' ) {
3317 elsif ( $expecting == TERM ) {
3318 my $number = scan_number_fast();
3320 # maybe part of bareword token? unary is safest
3321 if ( !defined($number) ) { $type = 'm'; }
3324 elsif ( $expecting == OPERATOR ) {
3328 if ( $next_type eq 'w' ) {
3333 } ## end sub do_MINUS_SIGN
3338 # check for special variables like ${^WARNING_BITS}
3339 if ( $expecting == TERM ) {
3341 if ( $last_nonblank_token eq '{'
3342 && ( $next_tok !~ /^\d/ )
3343 && ( $next_tok =~ /^\w/ ) )
3346 if ( $next_tok eq 'W' ) {
3347 $tokenizer_self->[_saw_perl_dash_w_] = 1;
3349 $tok = $tok . $next_tok;
3353 # Optional coding to try to catch syntax errors. This can
3354 # be removed if it ever causes incorrect warning messages.
3355 # The '{^' should be preceded by either by a type or '$#'
3358 # *${^LAST_FH}{NAME} ok
3360 # $hash{^HOWDY} error
3362 # Note that a type sigil '$' may be tokenized as 'Z'
3363 # after something like 'print', so allow type 'Z'
3364 if ( $last_last_nonblank_type ne 't'
3365 && $last_last_nonblank_type ne 'Z'
3366 && $last_last_nonblank_token ne '$#' )
3368 warning("Possible syntax error near '{^'\n");
3373 unless ( error_if_expecting_TERM() ) {
3375 # Something like this is valid but strange:
3377 complain("The '^' seems unusual here\n");
3382 } ## end sub do_CARAT_SIGN
3384 sub do_DOUBLE_COLON {
3386 # '::' = probably a sub call
3387 scan_bare_identifier();
3389 } ## end sub do_DOUBLE_COLON
3393 # '<<' = maybe a here-doc?
3395 ## This check removed because it could be a deprecated here-doc with
3396 ## no specified target. See example in log 16 Sep 2020.
3398 ## unless ( $i < $max_token_index )
3399 ## ; # here-doc not possible if end of line
3401 if ( $expecting != OPERATOR ) {
3402 my ( $found_target, $here_doc_target, $here_quote_character,
3405 $found_target, $here_doc_target, $here_quote_character, $i,
3408 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3411 if ($found_target) {
3412 push @{$rhere_target_list},
3413 [ $here_doc_target, $here_quote_character ];
3415 if ( length($here_doc_target) > 80 ) {
3416 my $truncated = substr( $here_doc_target, 0, 80 );
3417 complain("Long here-target: '$truncated' ...\n");
3419 elsif ( !$here_doc_target ) {
3421 'Use of bare << to mean <<"" is deprecated' . "\n" )
3422 unless ($here_quote_character);
3424 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3426 "Unconventional here-target: '$here_doc_target'\n");
3429 elsif ( $expecting == TERM ) {
3430 unless ($saw_error) {
3432 # shouldn't happen..arriving here implies an error in
3433 # the logic in sub 'find_here_doc'
3436 Program bug; didn't find here doc target
3440 "Possible program error: didn't find here doc target\n"
3442 report_definite_bug();
3449 } ## end sub do_LEFT_SHIFT
3451 sub do_NEW_HERE_DOC {
3453 # '<<~' = a here-doc, new type added in v26
3455 unless ( $i < $max_token_index )
3456 ; # here-doc not possible if end of line
3457 if ( $expecting != OPERATOR ) {
3458 my ( $found_target, $here_doc_target, $here_quote_character,
3461 $found_target, $here_doc_target, $here_quote_character, $i,
3464 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3467 if ($found_target) {
3469 if ( length($here_doc_target) > 80 ) {
3470 my $truncated = substr( $here_doc_target, 0, 80 );
3471 complain("Long here-target: '$truncated' ...\n");
3473 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3475 "Unconventional here-target: '$here_doc_target'\n");
3478 # Note that we put a leading space on the here quote
3479 # character indicate that it may be preceded by spaces
3480 $here_quote_character = SPACE . $here_quote_character;
3481 push @{$rhere_target_list},
3482 [ $here_doc_target, $here_quote_character ];
3485 elsif ( $expecting == TERM ) {
3486 unless ($saw_error) {
3488 # shouldn't happen..arriving here implies an error in
3489 # the logic in sub 'find_here_doc'
3492 Program bug; didn't find here doc target
3496 "Possible program error: didn't find here doc target\n"
3498 report_definite_bug();
3503 error_if_expecting_OPERATOR();
3506 } ## end sub do_NEW_HERE_DOC
3517 # type = 'pp' for pre-increment, '++' for post-increment
3518 if ( $expecting == TERM ) { $type = 'pp' }
3519 elsif ( $expecting == UNKNOWN ) {
3521 my ( $next_nonblank_token, $i_next ) =
3522 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3524 # Fix for c042: look past a side comment
3525 if ( $next_nonblank_token eq '#' ) {
3526 ( $next_nonblank_token, $i_next ) =
3527 find_next_nonblank_token( $max_token_index,
3528 $rtokens, $max_token_index );
3531 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3534 } ## end sub do_PLUS_PLUS
3539 if ( $last_nonblank_type eq $tok ) {
3540 complain("Repeated '=>'s \n");
3543 # patch for operator_expected: note if we are in the list (use.t)
3544 # TODO: make version numbers a new token type
3545 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3547 } ## end sub do_FAT_COMMA
3549 sub do_MINUS_MINUS {
3552 # type = 'mm' for pre-decrement, '--' for post-decrement
3554 if ( $expecting == TERM ) { $type = 'mm' }
3555 elsif ( $expecting == UNKNOWN ) {
3556 my ( $next_nonblank_token, $i_next ) =
3557 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3559 # Fix for c042: look past a side comment
3560 if ( $next_nonblank_token eq '#' ) {
3561 ( $next_nonblank_token, $i_next ) =
3562 find_next_nonblank_token( $max_token_index,
3563 $rtokens, $max_token_index );
3566 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3569 } ## end sub do_MINUS_MINUS
3571 sub do_LOGICAL_AND {
3574 error_if_expecting_TERM()
3575 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3577 } ## end sub do_LOGICAL_AND
3582 error_if_expecting_TERM()
3583 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3585 } ## end sub do_LOGICAL_OR
3587 sub do_SLASH_SLASH {
3590 error_if_expecting_TERM()
3591 if ( $expecting == TERM );
3593 } ## end sub do_SLASH_SLASH
3597 # 'd' = string of digits
3598 error_if_expecting_OPERATOR("Number")
3599 if ( $expecting == OPERATOR );
3601 my $number = scan_number_fast();
3602 if ( !defined($number) ) {
3604 # shouldn't happen - we should always get a number
3607 non-number beginning with digit--program bug
3611 "Unexpected error condition: non-number beginning with digit\n"
3613 report_definite_bug();
3616 } ## end sub do_DIGITS
3618 sub do_ATTRIBUTE_LIST {
3620 my ($next_nonblank_token) = @_;
3622 # Called at a bareword encountered while in an attribute list
3623 # returns 'is_attribute':
3624 # true if attribute found
3625 # false if an attribute (continue parsing bareword)
3627 # treat bare word followed by open paren like qw(
3628 if ( $next_nonblank_token eq '(' ) {
3630 # For something like:
3632 # we should let do_scan_sub see it so that it can see
3633 # the prototype. All other attributes get parsed as a
3635 if ( $tok eq 'prototype' ) {
3636 $id_scan_state = 'prototype';
3638 # start just after the word 'prototype'
3640 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3642 input_line => $input_line,
3647 rtokens => $rtokens,
3648 rtoken_map => $rtoken_map,
3649 id_scan_state => $id_scan_state,
3650 max_token_index => $max_token_index,
3654 # If successful, mark as type 'q' to be consistent
3655 # with other attributes. Type 'w' would also work.
3656 if ( $i > $i_beg ) {
3661 # If not successful, continue and parse as a quote.
3664 # All other attribute lists must be parsed as quotes
3665 # (see 'signatures.t' for good examples)
3666 $in_quote = $quote_items{'q'};
3667 $allowed_quote_modifiers = $quote_modifiers{'q'};
3673 # handle bareword not followed by open paren
3679 # attribute not found
3681 } ## end sub do_ATTRIBUTE_LIST
3683 sub do_QUOTED_BAREWORD {
3685 # find type of a bareword followed by a '=>'
3686 if ( $is_constant{$current_package}{$tok} ) {
3689 elsif ( $is_user_function{$current_package}{$tok} ) {
3691 $prototype = $user_function_prototype{$current_package}{$tok};
3693 elsif ( $tok =~ /^v\d+$/ ) {
3695 report_v_string($tok);
3699 # Bareword followed by a fat comma - see 'git18.in'
3700 # If tok is something like 'x17' then it could
3701 # actually be operator x followed by number 17.
3702 # For example, here:
3703 # 123x17 => [ 792, 1224 ],
3704 # (a key of 123 repeated 17 times, perhaps not
3705 # what was intended). We will mark x17 as type
3706 # 'n' and it will be split. If the previous token
3707 # was also a bareword then it is not very clear is
3708 # going on. In this case we will not be sure that
3709 # an operator is expected, so we just mark it as a
3710 # bareword. Perl is a little murky in what it does
3711 # with stuff like this, and its behavior can change
3712 # over time. Something like
3713 # a x18 => [792, 1224], will compile as
3714 # a key with 18 a's. But something like
3715 # push @array, a x18;
3716 # is a syntax error.
3718 $expecting == OPERATOR
3719 && substr( $tok, 0, 1 ) eq 'x'
3720 && ( length($tok) == 1
3721 || substr( $tok, 1, 1 ) =~ /^\d/ )
3725 if ( split_pretoken(1) ) {
3734 error_if_expecting_OPERATOR();
3738 } ## end sub do_QUOTED_BAREWORD
3742 if ( $tok eq 'x' ) {
3743 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3754 # Split a pretoken like 'x10' into 'x' and '10'.
3755 # Note: In previous versions of perltidy it was marked
3756 # as a number, $type = 'n', and fixed downstream by the
3759 if ( split_pretoken(1) ) {
3765 } ## end sub do_X_OPERATOR
3767 sub do_USE_CONSTANT {
3768 scan_bare_identifier();
3769 my ( $next_nonblank_tok2, $i_next2 ) =
3770 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3772 if ($next_nonblank_tok2) {
3774 if ( $is_keyword{$next_nonblank_tok2} ) {
3776 # Assume qw is used as a quote and okay, as in:
3777 # use constant qw{ DEBUG 0 };
3778 # Not worth trying to parse for just a warning
3780 # NOTE: This warning is deactivated because recent
3781 # versions of perl do not complain here, but
3782 # the coding is retained for reference.
3783 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
3785 "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
3791 $is_constant{$current_package}{$next_nonblank_tok2} = 1;
3795 } ## end sub do_USE_CONSTANT
3799 # found a keyword - set any associated flags
3802 # Since for and foreach may not be followed immediately
3803 # by an opening paren, we have to remember which keyword
3804 # is associated with the next '('
3805 if ( $is_for_foreach{$tok} ) {
3806 if ( new_statement_ok() ) {
3811 # recognize 'use' statements, which are special
3812 elsif ( $is_use_require{$tok} ) {
3813 $statement_type = $tok;
3814 error_if_expecting_OPERATOR()
3815 if ( $expecting == OPERATOR );
3818 # remember my and our to check for trailing ": shared"
3819 elsif ( $is_my_our_state{$tok} ) {
3820 $statement_type = $tok;
3823 # Check for misplaced 'elsif' and 'else', but allow isolated
3824 # else or elsif blocks to be formatted. This is indicated
3825 # by a last noblank token of ';'
3826 elsif ( $tok eq 'elsif' ) {
3828 $last_nonblank_token ne ';'
3830 ## !~ /^(if|elsif|unless)$/
3831 && !$is_if_elsif_unless{$last_nonblank_block_type}
3835 "expecting '$tok' to follow one of 'if|elsif|unless'\n");
3838 elsif ( $tok eq 'else' ) {
3840 # patched for SWITCH/CASE
3842 $last_nonblank_token ne ';'
3844 ## !~ /^(if|elsif|unless|case|when)$/
3845 && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
3847 # patch to avoid an unwanted error message for
3848 # the case of a parenless 'case' (RT 105484):
3849 # switch ( 1 ) { case x { 2 } else { } }
3850 ## !~ /^(if|elsif|unless|case|when)$/
3851 && !$is_if_elsif_unless_case_when{$statement_type}
3855 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3860 # patch for SWITCH/CASE if 'case' and 'when are
3861 # treated as keywords. Also 'default' for Switch::Plain
3862 elsif ($tok eq 'when'
3864 || $tok eq 'default' )
3866 $statement_type = $tok; # next '{' is block
3869 # feature 'err' was removed in Perl 5.10. So mark this as
3870 # a bareword unless an operator is expected (see c158).
3871 elsif ( $tok eq 'err' ) {
3872 if ( $expecting != OPERATOR ) { $type = 'w' }
3876 } ## end sub do_KEYWORD
3878 sub do_QUOTE_OPERATOR {
3880 if ( $expecting == OPERATOR ) {
3882 # Be careful not to call an error for a qw quote
3883 # where a parenthesized list is allowed. For example,
3884 # it could also be a for/foreach construct such as
3886 # foreach my $key qw\Uno Due Tres Quadro\ {
3887 # print "Set $key\n";
3891 # Or it could be a function call.
3892 # NOTE: Braces in something like &{ xxx } are not
3893 # marked as a block, we might have a method call.
3894 # &method(...), $method->(..), &{method}(...),
3895 # $ref[2](list) is ok & short for $ref[2]->(list)
3897 # See notes in 'sub code_block_type' and
3898 # 'sub is_non_structural_brace'
3902 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3903 || $is_for_foreach{$want_paren} )
3906 error_if_expecting_OPERATOR();
3909 $in_quote = $quote_items{$tok};
3910 $allowed_quote_modifiers = $quote_modifiers{$tok};
3912 # All quote types are 'Q' except possibly qw quotes.
3913 # qw quotes are special in that they may generally be trimmed
3914 # of leading and trailing whitespace. So they are given a
3915 # separate type, 'q', unless requested otherwise.
3917 ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3920 $quote_type = $type;
3922 } ## end sub do_QUOTE_OPERATOR
3924 sub do_UNKNOWN_BAREWORD {
3926 my ($next_nonblank_token) = @_;
3928 scan_bare_identifier();
3930 if ( $statement_type eq 'use'
3931 && $last_nonblank_token eq 'use' )
3933 $saw_use_module{$current_package}->{$tok} = 1;
3936 if ( $type eq 'w' ) {
3938 if ( $expecting == OPERATOR ) {
3940 # Patch to avoid error message for RPerl overloaded
3941 # operator functions: use overload
3946 # TODO: this could eventually be generalized
3947 if ( $saw_use_module{$current_package}->{'RPerl'}
3948 && $tok =~ /^sse_(mul|div|add|sub)$/ )
3953 # Fix part 1 for git #63 in which a comment falls
3954 # between an -> and the following word. An
3955 # alternate fix would be to change operator_expected
3956 # to return an UNKNOWN for this type.
3957 elsif ( $last_nonblank_type eq '->' ) {
3961 # don't complain about possible indirect object
3965 # sub new($) { ... }
3966 # $b = new A::; # calls A::new
3967 # $c = new A; # same thing but suspicious
3968 # This will call A::new but we have a 'new' in
3969 # main:: which looks like a constant.
3971 elsif ( $last_nonblank_type eq 'C' ) {
3972 if ( $tok !~ /::$/ ) {
3974 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3975 Maybe indirectet object notation?
3980 error_if_expecting_OPERATOR("bareword");
3984 # mark bare words immediately followed by a paren as
3986 $next_tok = $rtokens->[ $i + 1 ];
3987 if ( $next_tok eq '(' ) {
3989 # Patch for issue c151, where we are processing a snippet and
3990 # have not seen that SPACE is a constant. In this case 'x' is
3991 # probably an operator. The only disadvantage with an incorrect
3992 # guess is that the space after it may be incorrect. For example
3993 # $str .= SPACE x ( 16 - length($str) ); See also b1410.
3994 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
3996 # Fix part 2 for git #63. Leave type as 'w' to keep
3997 # the type the same as if the -> were not separated
3998 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
4002 # underscore after file test operator is file handle
4003 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4007 # patch for SWITCH/CASE if 'case' and 'when are
4008 # not treated as keywords:
4010 ( $tok eq 'case' && $brace_type[$brace_depth] eq 'switch' )
4012 && $brace_type[$brace_depth] eq 'given' )
4015 $statement_type = $tok; # next '{' is block
4016 $type = 'k'; # for keyword syntax coloring
4018 if ( $next_nonblank_token eq '(' ) {
4020 # patch for SWITCH/CASE if switch and given not keywords
4021 # Switch is not a perl 5 keyword, but we will gamble
4022 # and mark switch followed by paren as a keyword. This
4023 # is only necessary to get html syntax coloring nice,
4024 # and does not commit this as being a switch/case.
4025 if ( $tok eq 'switch' || $tok eq 'given' ) {
4026 $type = 'k'; # for keyword syntax coloring
4029 # mark 'x' as operator for something like this (see b1410)
4030 # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
4031 elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
4037 } ## end sub do_UNKNOWN_BAREWORD
4039 sub sub_attribute_ok_here {
4041 my ( $tok_kw, $next_nonblank_token, $i_next ) = @_;
4043 # Decide if 'sub :' can be the start of a sub attribute list.
4044 # We will decide based on if the colon is followed by a
4045 # bareword which is not a keyword.
4046 # Changed inext+1 to inext to fixed case b1190.
4047 my $sub_attribute_ok_here;
4048 if ( $is_sub{$tok_kw}
4049 && $expecting != OPERATOR
4050 && $next_nonblank_token eq ':' )
4052 my ( $nn_nonblank_token, $i_nn ) =
4053 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
4054 $sub_attribute_ok_here =
4055 $nn_nonblank_token =~ /^\w/
4056 && $nn_nonblank_token !~ /^\d/
4057 && !$is_keyword{$nn_nonblank_token};
4059 return $sub_attribute_ok_here;
4060 } ## end sub sub_attribute_ok_here
4064 my ($is_END_or_DATA) = @_;
4066 # handle a bareword token:
4068 # true if this token ends the current line
4071 my ( $next_nonblank_token, $i_next ) =
4072 find_next_nonblank_token( $i, $rtokens, $max_token_index );
4074 # a bare word immediately followed by :: is not a keyword;
4075 # use $tok_kw when testing for keywords to avoid a mistake
4077 if ( $rtokens->[ $i + 1 ] eq ':'
4078 && $rtokens->[ $i + 2 ] eq ':' )
4083 if ($in_attribute_list) {
4084 my $is_attribute = do_ATTRIBUTE_LIST($next_nonblank_token);
4085 return if ($is_attribute);
4088 #----------------------------------------
4089 # Starting final if-elsif- chain of tests
4090 #----------------------------------------
4092 # This is the return flag:
4093 # true => this is the last token on the line
4094 # false => keep tokenizing the line
4097 # The following blocks of code must update these vars:
4098 # $type - the final token type, must always be set
4100 # In addition, if additional pretokens are added:
4101 # $tok - the final token
4102 # $i - the index of the last pretoken
4104 # They may also need to check and set various flags
4106 # Scan a bare word following a -> as an identifier; it could
4107 # have a long package name. Fixes c037, c041.
4108 if ( $last_nonblank_token eq '->' ) {
4109 scan_bare_identifier();
4111 # a bareward after '->' gets type 'i'
4115 # Quote a word followed by => operator
4116 # unless the word __END__ or __DATA__ and the only word on
4118 elsif ( !$is_END_or_DATA
4119 && $next_nonblank_token eq '='
4120 && $rtokens->[ $i_next + 1 ] eq '>' )
4122 do_QUOTED_BAREWORD();
4125 # quote a bare word within braces..like xxx->{s}; note that we
4126 # must be sure this is not a structural brace, to avoid
4127 # mistaking {s} in the following for a quoted bare word:
4128 # for(@[){s}bla}BLA}
4129 # Also treat q in something like var{-q} as a bare word, not
4132 $next_nonblank_token eq '}'
4134 $last_nonblank_type eq 'L'
4135 || ( $last_nonblank_type eq 'm'
4136 && $last_last_nonblank_type eq 'L' )
4143 # handle operator x (now we know it isn't $x=)
4145 $expecting == OPERATOR
4146 && substr( $tok, 0, 1 ) eq 'x'
4147 && ( length($tok) == 1
4148 || substr( $tok, 1, 1 ) =~ /^\d/ )
4153 elsif ( $tok_kw eq 'CORE::' ) {
4154 $type = $tok = $tok_kw;
4157 elsif ( ( $tok eq 'strict' )
4158 and ( $last_nonblank_token eq 'use' ) )
4160 $tokenizer_self->[_saw_use_strict_] = 1;
4161 scan_bare_identifier();
4164 elsif ( ( $tok eq 'warnings' )
4165 and ( $last_nonblank_token eq 'use' ) )
4167 $tokenizer_self->[_saw_perl_dash_w_] = 1;
4169 # scan as identifier, so that we pick up something like:
4170 # use warnings::register
4171 scan_bare_identifier();
4175 $tok eq 'AutoLoader'
4176 && $tokenizer_self->[_look_for_autoloader_]
4178 $last_nonblank_token eq 'use'
4180 # these regexes are from AutoSplit.pm, which we want
4182 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4183 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4187 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4188 $tokenizer_self->[_saw_autoloader_] = 1;
4189 $tokenizer_self->[_look_for_autoloader_] = 0;
4190 scan_bare_identifier();
4194 $tok eq 'SelfLoader'
4195 && $tokenizer_self->[_look_for_selfloader_]
4196 && ( $last_nonblank_token eq 'use'
4197 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4198 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4201 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4202 $tokenizer_self->[_saw_selfloader_] = 1;
4203 $tokenizer_self->[_look_for_selfloader_] = 0;
4204 scan_bare_identifier();
4207 elsif ( ( $tok eq 'constant' )
4208 and ( $last_nonblank_token eq 'use' ) )
4213 # various quote operators
4214 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4215 do_QUOTE_OPERATOR();
4218 # check for a statement label
4220 ( $next_nonblank_token eq ':' )
4221 && ( $rtokens->[ $i_next + 1 ] ne ':' )
4222 && ( $i_next <= $max_token_index ) # colon on same line
4224 # like 'sub : lvalue' ?
4225 && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
4229 if ( $tok !~ /[A-Z]/ ) {
4230 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4238 # 'sub' or other sub alias
4239 elsif ( $is_sub{$tok_kw} ) {
4241 # Update for --use-feature=class (rt145706):
4242 # We have to be extra careful to avoid misparsing other uses of
4243 # 'method' in older scripts.
4244 if ( $tok_kw eq 'method' ) {
4245 if ( $expecting == OPERATOR
4246 || $next_nonblank_token !~ /^(\w|\:)/
4247 || !method_ok_here() )
4249 do_UNKNOWN_BAREWORD($next_nonblank_token);
4252 initialize_subname();
4257 error_if_expecting_OPERATOR()
4258 if ( $expecting == OPERATOR );
4259 initialize_subname();
4265 elsif ( $is_package{$tok_kw} ) {
4267 # Update for --use-feature=class (rt145706):
4268 # We have to be extra careful because 'class' may be used for other
4269 # purposes on older code; i.e.
4270 # class($x) - valid sub call
4271 # package($x) - error
4272 if ( $tok_kw eq 'class' ) {
4273 if ( $expecting == OPERATOR
4274 || $next_nonblank_token !~ /^(\w|\:)/
4275 || !class_ok_here() )
4277 do_UNKNOWN_BAREWORD($next_nonblank_token);
4282 error_if_expecting_OPERATOR()
4283 if ( $expecting == OPERATOR );
4288 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4289 # more restrictive. Require a new statement to be ok here.
4290 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4291 $type = ';'; # make tokenizer look for TERM next
4292 $tokenizer_self->[_in_format_] = 1;
4293 $is_last = 1; ## is last token on this line
4296 # Note on token types for format, __DATA__, __END__:
4297 # It simplifies things to give these type ';', so that when we
4298 # start rescanning we will be expecting a token of type TERM.
4299 # We will switch to type 'k' before outputting the tokens.
4300 elsif ( $is_END_DATA{$tok_kw} ) {
4301 $type = ';'; # make tokenizer look for TERM next
4303 # Remember that we are in one of these three sections
4304 $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4305 $is_last = 1; ## is last token on this line
4308 elsif ( $is_keyword{$tok_kw} ) {
4312 # check for inline label following
4313 # /^(redo|last|next|goto)$/
4314 elsif (( $last_nonblank_type eq 'k' )
4315 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4322 do_UNKNOWN_BAREWORD($next_nonblank_token);
4327 } ## end sub do_BAREWORD
4329 sub do_FOLLOW_QUOTE {
4331 # Continue following a quote on a new line
4332 $type = $quote_type;
4334 unless ( @{$routput_token_list} ) { # initialize if continuation line
4335 push( @{$routput_token_list}, $i );
4336 $routput_token_type->[$i] = $type;
4340 # scan for the end of the quote or pattern
4365 # all done if we didn't find it
4366 if ($in_quote) { return }
4368 # save pattern and replacement text for rescanning
4369 my $qs1 = $quoted_string_1;
4371 # re-initialize for next search
4372 $quote_character = EMPTY_STRING;
4375 $quoted_string_1 = EMPTY_STRING;
4376 $quoted_string_2 = EMPTY_STRING;
4377 if ( ++$i > $max_token_index ) { return }
4379 # look for any modifiers
4380 if ($allowed_quote_modifiers) {
4382 # check for exact quote modifiers
4383 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4384 my $str = $rtokens->[$i];
4386 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4387 my $pos = pos($str);
4388 my $char = substr( $str, $pos - 1, 1 );
4389 $saw_modifier_e ||= ( $char eq 'e' );
4392 # For an 'e' quote modifier we must scan the replacement
4393 # text for here-doc targets...
4394 # but if the modifier starts a new line we can skip
4395 # this because either the here doc will be fully
4396 # contained in the replacement text (so we can
4397 # ignore it) or Perl will not find it.
4398 # See test 'here2.in'.
4399 if ( $saw_modifier_e && $i_tok >= 0 ) {
4401 my $rht = scan_replacement_text($qs1);
4403 # Change type from 'Q' to 'h' for quotes with
4404 # here-doc targets so that the formatter (see sub
4405 # process_line_of_CODE) will not make any line
4406 # breaks after this point.
4408 push @{$rhere_target_list}, @{$rht};
4411 my $ilast = $routput_token_list->[-1];
4412 $routput_token_type->[$ilast] = $type;
4417 if ( defined( pos($str) ) ) {
4420 if ( pos($str) == length($str) ) {
4421 if ( ++$i > $max_token_index ) { return }
4424 # Looks like a joined quote modifier
4425 # and keyword, maybe something like
4426 # s/xxx/yyy/gefor @k=...
4427 # Example is "galgen.pl". Would have to split
4428 # the word and insert a new token in the
4429 # pre-token list. This is so rare that I haven't
4430 # done it. Will just issue a warning citation.
4432 # This error might also be triggered if my quote
4433 # modifier characters are incomplete
4437 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4438 Please put a space between quote modifiers and trailing keywords.
4441 # print "token $rtokens->[$i]\n";
4442 # my $num = length($str) - pos($str);
4443 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4444 # print "continuing with new token $rtokens->[$i]\n";
4446 # skipping past this token does least damage
4447 if ( ++$i > $max_token_index ) { return }
4452 # example file: rokicki4.pl
4453 # This error might also be triggered if my quote
4454 # modifier characters are incomplete
4455 write_logfile_entry(
4456 "Note: found word $str at quote modifier location\n");
4461 $allowed_quote_modifiers = EMPTY_STRING;
4464 } ## end sub do_FOLLOW_QUOTE
4466 # ------------------------------------------------------------
4467 # begin hash of code for handling most token types
4468 # ------------------------------------------------------------
4469 my $tokenization_code = {
4471 '>' => \&do_GREATER_THAN_SIGN,
4472 '|' => \&do_VERTICAL_LINE,
4473 '$' => \&do_DOLLAR_SIGN,
4474 '(' => \&do_LEFT_PARENTHESIS,
4475 ')' => \&do_RIGHT_PARENTHESIS,
4477 ';' => \&do_SEMICOLON,
4478 '"' => \&do_QUOTATION_MARK,
4479 "'" => \&do_APOSTROPHE,
4480 '`' => \&do_BACKTICK,
4482 '{' => \&do_LEFT_CURLY_BRACKET,
4483 '}' => \&do_RIGHT_CURLY_BRACKET,
4484 '&' => \&do_AMPERSAND,
4485 '<' => \&do_LESS_THAN_SIGN,
4486 '?' => \&do_QUESTION_MARK,
4490 '+' => \&do_PLUS_SIGN,
4491 '@' => \&do_AT_SIGN,
4492 '%' => \&do_PERCENT_SIGN,
4493 '[' => \&do_LEFT_SQUARE_BRACKET,
4494 ']' => \&do_RIGHT_SQUARE_BRACKET,
4495 '-' => \&do_MINUS_SIGN,
4496 '^' => \&do_CARAT_SIGN,
4497 '::' => \&do_DOUBLE_COLON,
4498 '<<' => \&do_LEFT_SHIFT,
4499 '<<~' => \&do_NEW_HERE_DOC,
4500 '->' => \&do_POINTER,
4501 '++' => \&do_PLUS_PLUS,
4502 '=>' => \&do_FAT_COMMA,
4503 '--' => \&do_MINUS_MINUS,
4504 '&&' => \&do_LOGICAL_AND,
4505 '||' => \&do_LOGICAL_OR,
4506 '//' => \&do_SLASH_SLASH,
4508 # No special code for these types yet, but syntax checks
4543 # ------------------------------------------------------------
4544 # end hash of code for handling individual token types
4545 # ------------------------------------------------------------
4547 use constant DEBUG_TOKENIZE => 0;
4549 sub tokenize_this_line {
4551 # This routine breaks a line of perl code into tokens which are of use in
4552 # indentation and reformatting. One of my goals has been to define tokens
4553 # such that a newline may be inserted between any pair of tokens without
4554 # changing or invalidating the program. This version comes close to this,
4555 # although there are necessarily a few exceptions which must be caught by
4556 # the formatter. Many of these involve the treatment of bare words.
4558 # The tokens and their types are returned in arrays. See previous
4559 # routine for their names.
4561 # See also the array "valid_token_types" in the BEGIN section for an
4564 # To simplify things, token types are either a single character, or they
4565 # are identical to the tokens themselves.
4567 # As a debugging aid, the -D flag creates a file containing a side-by-side
4568 # comparison of the input string and its tokenization for each line of a file.
4569 # This is an invaluable debugging aid.
4571 # In addition to tokens, and some associated quantities, the tokenizer
4572 # also returns flags indication any special line types. These include
4573 # quotes, here_docs, formats.
4575 # -----------------------------------------------------------------------
4577 # How to add NEW_TOKENS:
4579 # New token types will undoubtedly be needed in the future both to keep up
4580 # with changes in perl and to help adapt the tokenizer to other applications.
4582 # Here are some notes on the minimal steps. I wrote these notes while
4583 # adding the 'v' token type for v-strings, which are things like version
4584 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
4585 # can use your editor to search for the string "NEW_TOKENS" to find the
4586 # appropriate sections to change):
4588 # *. Try to talk somebody else into doing it! If not, ..
4590 # *. Make a backup of your current version in case things don't work out!
4592 # *. Think of a new, unused character for the token type, and add to
4593 # the array @valid_token_types in the BEGIN section of this package.
4594 # For example, I used 'v' for v-strings.
4596 # *. Implement coding to recognize the $type of the token in this routine.
4597 # This is the hardest part, and is best done by imitating or modifying
4598 # some of the existing coding. For example, to recognize v-strings, I
4599 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4600 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4602 # *. Update sub operator_expected. This update is critically important but
4603 # the coding is trivial. Look at the comments in that routine for help.
4604 # For v-strings, which should behave like numbers, I just added 'v' to the
4605 # regex used to handle numbers and strings (types 'n' and 'Q').
4607 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4608 # Perl::Tidy::Formatter for breaking lines around this token type. You can
4609 # skip this step and take the default at first, then adjust later to get
4610 # desired results. For adding type 'v', I looked at sub bond_strength and
4611 # saw that number type 'n' was using default strengths, so I didn't do
4612 # anything. I may tune it up someday if I don't like the way line
4613 # breaks with v-strings look.
4615 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4616 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
4617 # and saw that type 'n' used spaces on both sides, so I just added 'v'
4618 # to the array @spaces_both_sides.
4620 # *. Update HtmlWriter package so that users can colorize the token as
4621 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
4622 # that package. For v-strings, I initially chose to use a default color
4623 # equal to the default for numbers, but it might be nice to change that
4626 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4628 # *. Run lots and lots of debug tests. Start with special files designed
4629 # to test the new token type. Run with the -D flag to create a .DEBUG
4630 # file which shows the tokenization. When these work ok, test as many old
4631 # scripts as possible. Start with all of the '.t' files in the 'test'
4632 # directory of the distribution file. Compare .tdy output with previous
4633 # version and updated version to see the differences. Then include as
4634 # many more files as possible. My own technique has been to collect a huge
4635 # number of perl scripts (thousands!) into one directory and run perltidy
4636 # *, then run diff between the output of the previous version and the
4639 # *. For another example, search for the smartmatch operator '~~'
4640 # with your editor to see where updates were made for it.
4642 # -----------------------------------------------------------------------
4644 my ( $self, $line_of_tokens ) = @_;
4645 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4647 # Extract line number for use in error messages
4648 $input_line_number = $line_of_tokens->{_line_number};
4650 # Check for pod documentation
4651 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4652 && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4655 # Must not be in multi-line quote
4656 # and must not be in an equation
4658 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4660 $self->[_in_pod_] = 1;
4665 $input_line = $untrimmed_input_line;
4669 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4670 # This will be used below to avoid quoting a bare word followed by
4674 # Reinitialize the multi-line quote flag
4675 if ( $in_quote && $quote_type eq 'Q' ) {
4676 $line_of_tokens->{_starting_in_quote} = 1;
4679 $line_of_tokens->{_starting_in_quote} = 0;
4681 # Trim start of this line unless we are continuing a quoted line.
4682 # Do not trim end because we might end in a quote (test: deken4.pl)
4683 # Perl::Tidy::Formatter will delete needless trailing blanks
4684 $input_line =~ s/^(\s+)//;
4686 # Calculate a guessed level for nonblank lines to avoid calls to
4687 # sub guess_old_indentation_level()
4688 if ( length($input_line) && $1 ) {
4689 my $leading_spaces = $1;
4690 my $spaces = length($leading_spaces);
4692 # handle leading tabs
4693 if ( ord( substr( $leading_spaces, 0, 1 ) ) == ORD_TAB
4694 && $leading_spaces =~ /^(\t+)/ )
4696 my $tabsize = $self->[_tabsize_];
4697 $spaces += length($1) * ( $tabsize - 1 );
4700 my $indent_columns = $self->[_indent_columns_];
4701 $line_of_tokens->{_guessed_indentation_level} =
4702 int( $spaces / $indent_columns );
4705 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4706 && $input_line =~ /^__(END|DATA)__\s*$/;
4709 # Optimize for a full-line comment.
4711 if ( substr( $input_line, 0, 1 ) eq '#' ) {
4713 # and check for skipped section
4714 if ( $rOpts_code_skipping
4715 && $input_line =~ /$code_skipping_pattern_begin/ )
4717 $self->[_in_skipped_] = 1;
4721 # Optional fast processing of a block comment
4723 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4724 my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4725 $line_of_tokens->{_line_type} = 'CODE';
4726 $line_of_tokens->{_rtokens} = [$input_line];
4727 $line_of_tokens->{_rtoken_type} = ['#'];
4728 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
4729 $line_of_tokens->{_rci_levels} = [$ci_string_i];
4730 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
4731 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4732 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4736 # Optimize handling of a blank line
4737 if ( !length($input_line) ) {
4738 $line_of_tokens->{_line_type} = 'CODE';
4739 $line_of_tokens->{_rtokens} = [];
4740 $line_of_tokens->{_rtoken_type} = [];
4741 $line_of_tokens->{_rlevels} = [];
4742 $line_of_tokens->{_rci_levels} = [];
4743 $line_of_tokens->{_rblock_type} = [];
4744 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4745 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4750 # update the copy of the line for use in error messages
4751 # This must be exactly what we give the pre_tokenizer
4752 $self->[_line_of_text_] = $input_line;
4754 # re-initialize for the main loop
4755 $routput_token_list = []; # stack of output token indexes
4756 $routput_token_type = []; # token types
4757 $routput_block_type = []; # types of code block
4758 $routput_container_type = []; # paren types, such as if, elsif, ..
4759 $routput_type_sequence = []; # nesting sequential number
4761 $rhere_target_list = [];
4763 $tok = $last_nonblank_token;
4764 $type = $last_nonblank_type;
4765 $prototype = $last_nonblank_prototype;
4766 $last_nonblank_i = -1;
4767 $block_type = $last_nonblank_block_type;
4768 $container_type = $last_nonblank_container_type;
4769 $type_sequence = $last_nonblank_type_sequence;
4773 $self->tokenizer_main_loop($is_END_or_DATA);
4775 #-----------------------------------------------
4776 # all done tokenizing this line ...
4777 # now prepare the final list of tokens and types
4778 #-----------------------------------------------
4780 $self->tokenizer_wrapup_line($line_of_tokens);
4783 } ## end sub tokenize_this_line
4785 sub tokenizer_main_loop {
4787 my ( $self, $is_END_or_DATA ) = @_;
4789 #---------------------------------
4790 # Break one input line into tokens
4791 #---------------------------------
4794 # $is_END_or_DATA is true for a __END__ or __DATA__ line
4796 # start by breaking the line into pre-tokens
4797 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
4798 ( $rtokens, $rtoken_map, $rtoken_type ) =
4799 pre_tokenize( $input_line, $max_tokens_wanted );
4801 $max_token_index = scalar( @{$rtokens} ) - 1;
4802 push( @{$rtokens}, SPACE, SPACE, SPACE )
4803 ; # extra whitespace simplifies logic
4804 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
4805 push( @{$rtoken_type}, 'b', 'b', 'b' );
4807 # initialize for main loop
4808 if (0) { #<<< this is not necessary
4809 foreach my $ii ( 0 .. $max_token_index + 3 ) {
4810 $routput_token_type->[$ii] = EMPTY_STRING;
4811 $routput_block_type->[$ii] = EMPTY_STRING;
4812 $routput_container_type->[$ii] = EMPTY_STRING;
4813 $routput_type_sequence->[$ii] = EMPTY_STRING;
4814 $routput_indent_flag->[$ii] = 0;
4821 #-----------------------------
4822 # begin main tokenization loop
4823 #-----------------------------
4825 # we are looking at each pre-token of one line and combining them
4827 while ( ++$i <= $max_token_index ) {
4829 # continue looking for the end of a quote
4832 last if ( $in_quote || $i > $max_token_index );
4835 if ( $type ne 'b' && $tok ne 'CORE::' ) {
4837 # try to catch some common errors
4838 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
4840 if ( $last_nonblank_token eq 'eq' ) {
4841 complain("Should 'eq' be '==' here ?\n");
4843 elsif ( $last_nonblank_token eq 'ne' ) {
4844 complain("Should 'ne' be '!=' here ?\n");
4848 # fix c090, only rotate vars if a new token will be stored
4849 if ( $i_tok >= 0 ) {
4850 $last_last_nonblank_token = $last_nonblank_token;
4851 $last_last_nonblank_type = $last_nonblank_type;
4852 $last_last_nonblank_block_type = $last_nonblank_block_type;
4853 $last_last_nonblank_container_type =
4854 $last_nonblank_container_type;
4855 $last_last_nonblank_type_sequence =
4856 $last_nonblank_type_sequence;
4858 # Fix part #3 for git82: propagate type 'Z' though L-R pair
4859 unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
4860 $last_nonblank_token = $tok;
4861 $last_nonblank_type = $type;
4863 $last_nonblank_prototype = $prototype;
4864 $last_nonblank_block_type = $block_type;
4865 $last_nonblank_container_type = $container_type;
4866 $last_nonblank_type_sequence = $type_sequence;
4867 $last_nonblank_i = $i_tok;
4870 # Patch for c030: Fix things in case a '->' got separated from
4871 # the subsequent identifier by a side comment. We need the
4872 # last_nonblank_token to have a leading -> to avoid triggering
4873 # an operator expected error message at the next '('. See also
4875 if ( $last_last_nonblank_token eq '->' ) {
4876 if ( $last_nonblank_type eq 'w'
4877 || $last_nonblank_type eq 'i' )
4879 $last_nonblank_token = '->' . $last_nonblank_token;
4880 $last_nonblank_type = 'i';
4885 # store previous token type
4886 if ( $i_tok >= 0 ) {
4887 $routput_token_type->[$i_tok] = $type;
4888 $routput_block_type->[$i_tok] = $block_type;
4889 $routput_container_type->[$i_tok] = $container_type;
4890 $routput_type_sequence->[$i_tok] = $type_sequence;
4891 $routput_indent_flag->[$i_tok] = $indent_flag;
4894 # get the next pre-token and type
4895 # $tok and $type will be modified to make the output token
4896 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
4897 my $pre_type = $type = $rtoken_type->[$i]; # and type
4899 # remember the starting index of this token; we will be updating $i
4902 # re-initialize various flags for the next output token
4903 $block_type &&= EMPTY_STRING;
4904 $container_type &&= EMPTY_STRING;
4905 $type_sequence &&= EMPTY_STRING;
4907 $prototype &&= EMPTY_STRING;
4909 # this pre-token will start an output token
4910 push( @{$routput_token_list}, $i_tok );
4912 #--------------------------
4913 # handle a whitespace token
4914 #--------------------------
4915 next if ( $pre_type eq 'b' );
4920 last if ( $pre_type eq '#' );
4922 # continue gathering identifier if necessary
4923 if ($id_scan_state) {
4925 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
4932 if ($id_scan_state) {
4934 # Still scanning ...
4935 # Check for side comment between sub and prototype (c061)
4937 # done if nothing left to scan on this line
4938 last if ( $i > $max_token_index );
4940 my ( $next_nonblank_token, $i_next ) =
4941 find_next_nonblank_token_on_this_line( $i, $rtokens,
4944 # done if it was just some trailing space
4945 last if ( $i_next > $max_token_index );
4947 # something remains on the line ... must be a side comment
4951 next if ( ( $i > 0 ) || $type );
4953 # didn't find any token; start over
4958 ## my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE;
4959 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
4961 #-----------------------------------------------------------
4962 # Combine pre-tokens into digraphs and trigraphs if possible
4963 #-----------------------------------------------------------
4965 # See if we can make a digraph...
4966 # The following tokens are excluded and handled specially:
4967 # '/=' is excluded because the / might start a pattern.
4968 # 'x=' is excluded since it might be $x=, with $ on previous line
4969 # '**' and *= might be typeglobs of punctuation variables
4970 # I have allowed tokens starting with <, such as <=,
4971 # because I don't think these could be valid angle operators.
4972 # test file: storrs4.pl
4973 if ( $can_start_digraph{$tok}
4974 && $i < $max_token_index
4975 && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
4979 my $test_tok = $tok . $rtokens->[ $i + 1 ];
4981 # check for special cases which cannot be combined
4983 # '//' must be defined_or operator if an operator is expected.
4984 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
4985 # could be migrated here for clarity
4987 # Patch for RT#102371, misparsing a // in the following snippet:
4988 # state $b //= ccc();
4989 # The solution is to always accept the digraph (or trigraph)
4990 # after type 'Z' (possible file handle). The reason is that
4991 # sub operator_expected gives TERM expected here, which is
4992 # wrong in this case.
4993 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
4995 # note that here $tok = '/' and the next tok and type is '/'
4996 $expecting = operator_expected( [ $prev_type, $tok, '/' ] );
4998 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
4999 $combine_ok = 0 if ( $expecting == TERM );
5002 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
5003 # Accept the digraphs '**' only after type 'Z'
5004 # Otherwise postpone the decision.
5005 if ( $test_tok eq '**' ) {
5006 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
5011 # still ok to combine?
5014 && ( $test_tok ne '/=' ) # might be pattern
5015 && ( $test_tok ne 'x=' ) # might be $x
5016 && ( $test_tok ne '*=' ) # typeglob?
5018 # Moved above as part of fix for
5019 # RT #114359: Missparsing of "print $x ** 0.5;
5020 # && ( $test_tok ne '**' ) # typeglob?
5026 # Now try to assemble trigraphs. Note that all possible
5027 # perl trigraphs can be constructed by appending a character
5029 $test_tok = $tok . $rtokens->[ $i + 1 ];
5031 if ( $is_trigraph{$test_tok} ) {
5036 # The only current tetragraph is the double diamond operator
5037 # and its first three characters are not a trigraph, so
5038 # we do can do a special test for it
5039 elsif ( $test_tok eq '<<>' ) {
5040 $test_tok .= $rtokens->[ $i + 2 ];
5041 if ( $is_tetragraph{$test_tok} ) {
5050 $next_tok = $rtokens->[ $i + 1 ];
5051 $next_type = $rtoken_type->[ $i + 1 ];
5053 DEBUG_TOKENIZE && do {
5054 local $LIST_SEPARATOR = ')(';
5056 $last_nonblank_token, $tok,
5057 $next_tok, $brace_depth,
5058 $brace_type[$brace_depth], $paren_depth,
5059 $paren_type[$paren_depth],
5061 print STDOUT "TOKENIZE:(@debug_list)\n";
5064 # Turn off attribute list on first non-blank, non-bareword.
5065 # Added '#' to fix c038 (later moved above).
5066 if ( $in_attribute_list && $pre_type ne 'w' ) {
5067 $in_attribute_list = 0;
5070 #--------------------------------------------------------
5071 # We have the next token, $tok.
5072 # Now we have to examine this token and decide what it is
5073 # and define its $type
5075 # section 1: bare words
5076 #--------------------------------------------------------
5078 if ( $pre_type eq 'w' ) {
5080 operator_expected( [ $prev_type, $tok, $next_type ] );
5081 my $is_last = do_BAREWORD($is_END_or_DATA);
5085 #-----------------------------
5086 # section 2: strings of digits
5087 #-----------------------------
5088 elsif ( $pre_type eq 'd' ) {
5090 operator_expected( [ $prev_type, $tok, $next_type ] );
5094 #----------------------------
5095 # section 3: all other tokens
5096 #----------------------------
5098 my $code = $tokenization_code->{$tok};
5101 operator_expected( [ $prev_type, $tok, $next_type ] );
5108 # -----------------------------
5109 # end of main tokenization loop
5110 # -----------------------------
5112 # Store the final token
5113 if ( $i_tok >= 0 ) {
5114 $routput_token_type->[$i_tok] = $type;
5115 $routput_block_type->[$i_tok] = $block_type;
5116 $routput_container_type->[$i_tok] = $container_type;
5117 $routput_type_sequence->[$i_tok] = $type_sequence;
5118 $routput_indent_flag->[$i_tok] = $indent_flag;
5121 # Remember last nonblank values
5122 if ( $type ne 'b' && $type ne '#' ) {
5123 $last_last_nonblank_token = $last_nonblank_token;
5124 $last_last_nonblank_type = $last_nonblank_type;
5125 $last_last_nonblank_block_type = $last_nonblank_block_type;
5126 $last_last_nonblank_container_type = $last_nonblank_container_type;
5127 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
5128 $last_nonblank_token = $tok;
5129 $last_nonblank_type = $type;
5130 $last_nonblank_block_type = $block_type;
5131 $last_nonblank_container_type = $container_type;
5132 $last_nonblank_type_sequence = $type_sequence;
5133 $last_nonblank_prototype = $prototype;
5136 # reset indentation level if necessary at a sub or package
5137 # in an attempt to recover from a nesting error
5138 if ( $level_in_tokenizer < 0 ) {
5139 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
5140 reset_indentation_level(0);
5141 brace_warning("resetting level to 0 at $1 $2\n");
5145 $self->[_in_attribute_list_] = $in_attribute_list;
5146 $self->[_in_quote_] = $in_quote;
5147 $self->[_quote_target_] =
5148 $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
5149 $self->[_rhere_target_list_] = $rhere_target_list;
5152 } ## end sub tokenizer_main_loop
5154 sub tokenizer_wrapup_line {
5155 my ( $self, $line_of_tokens ) = @_;
5157 #---------------------------------------------------------
5158 # Package a line of tokens for shipping back to the caller
5159 #---------------------------------------------------------
5161 # Most of the remaining work involves defining the two indentation
5162 # parameters that the formatter needs for each token:
5163 # - $level = structural indentation level and
5164 # - $ci_level = continuation indentation level
5166 # The method for setting the indentation level is straightforward.
5167 # But the method used to define the continuation indentation is
5168 # complicated because it has evolved over a long time by trial and
5169 # error. It could undoubtedly be simplified but it works okay as is.
5171 # Here is a brief description of how indentation is computed.
5172 # Perl::Tidy computes indentation as the sum of 2 terms:
5174 # (1) structural indentation, such as if/else/elsif blocks
5175 # (2) continuation indentation, such as long parameter call lists.
5177 # These are occasionally called primary and secondary indentation.
5179 # Structural indentation is introduced by tokens of type '{',
5180 # although the actual tokens might be '{', '(', or '['. Structural
5181 # indentation is of two types: BLOCK and non-BLOCK. Default
5182 # structural indentation is 4 characters if the standard indentation
5185 # Continuation indentation is introduced whenever a line at BLOCK
5186 # level is broken before its termination. Default continuation
5187 # indentation is 2 characters in the standard indentation scheme.
5189 # Both types of indentation may be nested arbitrarily deep and
5190 # interlaced. The distinction between the two is somewhat arbitrary.
5192 # For each token, we will define two variables which would apply if
5193 # the current statement were broken just before that token, so that
5194 # that token started a new line:
5196 # $level = the structural indentation level,
5197 # $ci_level = the continuation indentation level
5199 # The total indentation will be $level * (4 spaces) + $ci_level * (2
5200 # spaces), assuming defaults. However, in some special cases it is
5201 # customary to modify $ci_level from this strict value.
5203 # The total structural indentation is easy to compute by adding and
5204 # subtracting 1 from a saved value as types '{' and '}' are seen.
5205 # The running value of this variable is $level_in_tokenizer.
5207 # The total continuation is much more difficult to compute, and
5208 # requires several variables. These variables are:
5210 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
5211 # each indentation level, if there are intervening open secondary
5212 # structures just prior to that level.
5213 # $continuation_string_in_tokenizer = a string of 1's and 0's
5214 # indicating if the last token at that level is "continued", meaning
5215 # that it is not the first token of an expression.
5216 # $nesting_block_string = a string of 1's and 0's indicating, for each
5217 # indentation level, if the level is of type BLOCK or not.
5218 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
5219 # $nesting_list_string = a string of 1's and 0's indicating, for each
5220 # indentation level, if it is appropriate for list formatting.
5221 # If so, continuation indentation is used to indent long list items.
5222 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
5223 # @{$rslevel_stack} = a stack of total nesting depths at each
5224 # structural indentation level, where "total nesting depth" means
5225 # the nesting depth that would occur if every nesting token
5226 # -- '{', '[', # and '(' -- , regardless of context, is used to
5227 # compute a nesting depth.
5229 # Notes on the Continuation Indentation
5231 # There is a sort of chicken-and-egg problem with continuation
5232 # indentation. The formatter can't make decisions on line breaks
5233 # without knowing what 'ci' will be at arbitrary locations.
5235 # But a problem with setting the continuation indentation (ci) here
5236 # in the tokenizer is that we do not know where line breaks will
5237 # actually be. As a result, we don't know if we should propagate
5238 # continuation indentation to higher levels of structure.
5240 # For nesting of only structural indentation, we never need to do
5241 # this. For example, in a long if statement, like this
5243 # if ( !$output_block_type[$i]
5244 # && ($in_statement_continuation) )
5249 # the second line has ci but we do normally give the lines within
5250 # the BLOCK any ci. This would be true if we had blocks nested
5251 # arbitrarily deeply.
5253 # But consider something like this, where we have created a break
5254 # after an opening paren on line 1, and the paren is not (currently)
5255 # a structural indentation token:
5257 # my $file = $menubar->Menubutton(
5258 # qw/-text File -underline 0 -menuitems/ => [
5260 # Cascade => '~View',
5264 # The second line has ci, so it would seem reasonable to propagate
5265 # it down, giving the third line 1 ci + 1 indentation. This
5266 # suggests the following rule, which is currently used to
5267 # propagating ci down: if there are any non-structural opening
5268 # parens (or brackets, or braces), before an opening structural
5269 # brace, then ci is propagated down, and otherwise
5270 # not. The variable $intervening_secondary_structure contains this
5271 # information for the current token, and the string
5272 # "$ci_string_in_tokenizer" is a stack of previous values of this
5275 my @token_type = (); # stack of output token types
5276 my @block_type = (); # stack of output code block types
5277 my @type_sequence = (); # stack of output type sequence numbers
5278 my @tokens = (); # output tokens
5279 my @levels = (); # structural brace levels of output tokens
5280 my @ci_string = (); # string needed to compute continuation indentation
5282 # Count the number of '1's in the string (previously sub ones_count)
5283 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5285 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5287 my ( $ci_string_i, $level_i );
5293 foreach my $i ( @{$routput_token_list} ) {
5295 my $type_i = $routput_token_type->[$i];
5296 $level_i = $level_in_tokenizer;
5298 # Quick handling of indentation levels for blanks and comments
5299 if ( $type_i eq 'b' || $type_i eq '#' ) {
5300 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5306 # $tok_i is the PRE-token. It only equals the token for symbols
5307 my $tok_i = $rtokens->[$i];
5309 # Check for an invalid token type..
5310 # This can happen by running perltidy on non-scripts although
5311 # it could also be bug introduced by programming change. Perl
5312 # silently accepts a 032 (^Z) and takes it as the end
5313 if ( !$is_valid_token_type{$type_i} ) {
5314 my $val = ord($type_i);
5316 "unexpected character decimal $val ($type_i) in script\n"
5318 $self->[_in_error_] = 1;
5321 # $ternary_indentation_flag indicates that we need a change
5322 # in level at a nested ternary, as follows
5323 # 1 => at a nested ternary ?
5324 # -1 => at a nested ternary :
5326 my $ternary_indentation_flag = $routput_indent_flag->[$i];
5328 #-------------------------------------------
5329 # Section 1: handle a level-increasing token
5330 #-------------------------------------------
5331 # set primary indentation levels based on structural braces
5332 # Note: these are set so that the leading braces have a HIGHER
5333 # level than their CONTENTS, which is convenient for indentation
5334 # Also, define continuation indentation for each token.
5337 || $ternary_indentation_flag > 0 )
5340 # if the difference between total nesting levels is not 1,
5341 # there are intervening non-structural nesting types between
5342 # this '{' and the previous unclosed '{'
5343 my $intervening_secondary_structure = 0;
5344 if ( @{$rslevel_stack} ) {
5345 $intervening_secondary_structure =
5346 $slevel_in_tokenizer - $rslevel_stack->[-1];
5349 # save the current states
5350 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
5351 $level_in_tokenizer++;
5353 if ( $level_in_tokenizer > $self->[_maximum_level_] ) {
5354 $self->[_maximum_level_] = $level_in_tokenizer;
5357 if ($ternary_indentation_flag) {
5359 # break BEFORE '?' in a nested ternary
5360 if ( $type_i eq '?' ) {
5361 $level_i = $level_in_tokenizer;
5364 $nesting_block_string .= "$nesting_block_flag";
5365 } ## end if ($ternary_indentation_flag)
5368 if ( $routput_block_type->[$i] ) {
5369 $nesting_block_flag = 1;
5370 $nesting_block_string .= '1';
5373 $nesting_block_flag = 0;
5374 $nesting_block_string .= '0';
5378 # we will use continuation indentation within containers
5379 # which are not blocks and not logical expressions
5381 if ( !$routput_block_type->[$i] ) {
5383 # propagate flag down at nested open parens
5384 if ( $routput_container_type->[$i] eq '(' ) {
5385 $bit = 1 if $nesting_list_flag;
5388 # use list continuation if not a logical grouping
5389 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5393 $is_logical_container{ $routput_container_type
5397 $nesting_list_string .= $bit;
5398 $nesting_list_flag = $bit;
5400 $ci_string_in_tokenizer .=
5401 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5403 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5404 $continuation_string_in_tokenizer .=
5405 ( $in_statement_continuation > 0 ) ? '1' : '0';
5407 # Sometimes we want to give an opening brace
5408 # continuation indentation, and sometimes not. For code
5409 # blocks, we don't do it, so that the leading '{' gets
5410 # outdented, like this:
5412 # if ( !$output_block_type[$i]
5413 # && ($in_statement_continuation) )
5416 # For other types, we will give them continuation
5417 # indentation. For example, here is how a list looks
5418 # with the opening paren indented:
5421 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5422 # [ "homer", "marge", "bart" ], );
5424 # This looks best when 'ci' is one-half of the
5425 # indentation (i.e., 2 and 4)
5427 my $total_ci = $ci_string_sum;
5429 !$routput_block_type->[$i] # patch: skip for BLOCK
5430 && ($in_statement_continuation)
5431 && !( $ternary_indentation_flag && $type_i eq ':' )
5434 $total_ci += $in_statement_continuation
5436 substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5439 $ci_string_i = $total_ci;
5440 $in_statement_continuation = 0;
5441 } ## end if ( $type_i eq '{' ||...})
5443 #-------------------------------------------
5444 # Section 2: handle a level-decreasing token
5445 #-------------------------------------------
5446 elsif ($type_i eq '}'
5448 || $ternary_indentation_flag < 0 )
5451 # only a nesting error in the script would prevent
5453 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
5455 $level_i = --$level_in_tokenizer;
5457 if ( $level_in_tokenizer < 0 ) {
5458 unless ( $self->[_saw_negative_indentation_] ) {
5459 $self->[_saw_negative_indentation_] = 1;
5460 warning("Starting negative indentation\n");
5464 # restore previous level values
5465 if ( length($nesting_block_string) > 1 )
5466 { # true for valid script
5467 chop $nesting_block_string;
5468 $nesting_block_flag =
5469 substr( $nesting_block_string, -1 ) eq '1';
5470 chop $nesting_list_string;
5471 $nesting_list_flag =
5472 substr( $nesting_list_string, -1 ) eq '1';
5474 chop $ci_string_in_tokenizer;
5476 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5478 $in_statement_continuation =
5479 chop $continuation_string_in_tokenizer;
5481 # zero continuation flag at terminal BLOCK '}' which
5483 my $block_type_i = $routput_block_type->[$i];
5484 if ($block_type_i) {
5486 # ...These include non-anonymous subs
5487 # note: could be sub ::abc { or sub 'abc
5488 if ( substr( $block_type_i, 0, 3 ) eq 'sub'
5489 && $block_type_i =~ m/^sub\s*/gc )
5492 # note: older versions of perl require the /gc
5493 # modifier here or else the \G does not work.
5494 $in_statement_continuation = 0
5495 if ( $block_type_i =~ /\G('|::|\w)/gc );
5498 # ...and include all block types except user subs
5499 # with block prototypes and these:
5500 # (sort|grep|map|do|eval)
5502 $is_zero_continuation_block_type{$block_type_i}
5505 $in_statement_continuation = 0;
5508 # ..but these are not terminal types:
5509 # /^(sort|grep|map|do|eval)$/ )
5510 elsif ($is_sort_map_grep_eval_do{$block_type_i}
5511 || $is_grep_alias{$block_type_i} )
5515 # ..and a block introduced by a label
5517 elsif ( $block_type_i =~ /:$/ ) {
5518 $in_statement_continuation = 0;
5521 # user function with block prototype
5523 $in_statement_continuation = 0;
5525 } ## end if ($block_type_i)
5527 # If we are in a list, then
5528 # we must set continuation indentation at the closing
5529 # paren of something like this (paren after $check):
5532 # ( not defined $check )
5534 # or $check eq "new"
5535 # or $check eq "old",
5537 elsif ( $tok_i eq ')' ) {
5538 $in_statement_continuation = 1
5541 $routput_container_type->[$i]
5544 ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5546 } ## end if ( length($nesting_block_string...))
5548 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5549 } ## end elsif ( $type_i eq '}' ||...{)
5551 #-----------------------------------------
5552 # Section 3: handle a constant level token
5553 #-----------------------------------------
5556 # zero the continuation indentation at certain tokens so
5557 # that they will be at the same level as its container. For
5558 # commas, this simplifies the -lp indentation logic, which
5559 # counts commas. For ?: it makes them stand out.
5562 ## $type_i =~ /^[,\?\:]$/
5563 && $is_comma_question_colon{$type_i}
5566 $in_statement_continuation = 0;
5569 # Be sure binary operators get continuation indentation.
5570 # Note: the check on $nesting_block_flag is only needed
5571 # to add ci to binary operators following a 'try' block,
5572 # or similar extended syntax block operator (see c158).
5574 !$in_statement_continuation
5575 && ( $nesting_block_flag || $nesting_list_flag )
5576 && ( $type_i eq 'k' && $is_binary_keyword{$tok_i}
5577 || $is_binary_type{$type_i} )
5580 $in_statement_continuation = 1;
5583 # continuation indentation is sum of any open ci from
5584 # previous levels plus the current level
5585 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5587 # update continuation flag ...
5589 # if we are in a BLOCK
5590 if ($nesting_block_flag) {
5592 # the next token after a ';' and label starts a new stmt
5593 if ( $type_i eq ';' || $type_i eq 'J' ) {
5594 $in_statement_continuation = 0;
5597 # otherwise, we are continuing the current statement
5599 $in_statement_continuation = 1;
5603 # if we are not in a BLOCK..
5606 # do not use continuation indentation if not list
5607 # environment (could be within if/elsif clause)
5608 if ( !$nesting_list_flag ) {
5609 $in_statement_continuation = 0;
5612 # otherwise, the token after a ',' starts a new term
5614 # Patch FOR RT#99961; no continuation after a ';'
5615 # This is needed because perltidy currently marks
5616 # a block preceded by a type character like % or @
5617 # as a non block, to simplify formatting. But these
5618 # are actually blocks and can have semicolons.
5619 # See code_block_type() and is_non_structural_brace().
5620 elsif ( $type_i eq ',' || $type_i eq ';' ) {
5621 $in_statement_continuation = 0;
5624 # otherwise, we are continuing the current term
5626 $in_statement_continuation = 1;
5628 } ## end else [ if ($nesting_block_flag)]
5630 } ## end else [ if ( $type_i eq '{' ||...})]
5632 #-------------------------------------------
5633 # Section 4: operations common to all levels
5634 #-------------------------------------------
5636 # set secondary nesting levels based on all containment token
5637 # types Note: these are set so that the nesting depth is the
5638 # depth of the PREVIOUS TOKEN, which is convenient for setting
5639 # the strength of token bonds
5642 if ( $is_opening_type{$type_i} ) {
5643 $slevel_in_tokenizer++;
5644 $nesting_token_string .= $tok_i;
5645 $nesting_type_string .= $type_i;
5649 elsif ( $is_closing_type{$type_i} ) {
5650 $slevel_in_tokenizer--;
5651 my $char = chop $nesting_token_string;
5653 if ( $char ne $matching_start_token{$tok_i} ) {
5654 $nesting_token_string .= $char . $tok_i;
5655 $nesting_type_string .= $type_i;
5658 chop $nesting_type_string;
5662 # apply token type patch:
5663 # - output anonymous 'sub' as keyword (type 'k')
5664 # - output __END__, __DATA__, and format as type 'k' instead
5665 # of ';' to make html colors correct, etc.
5666 # The following hash tests are equivalent to these older tests:
5667 # if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5668 # if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5669 if ( $is_END_DATA_format_sub{$tok_i}
5670 && $is_semicolon_or_t{$type_i} )
5674 } ## end else [ if ( $type_i eq 'b' ||...)]
5676 #--------------------------------
5677 # Store the values for this token
5678 #--------------------------------
5679 push( @ci_string, $ci_string_i );
5680 push( @levels, $level_i );
5681 push( @block_type, $routput_block_type->[$i] );
5682 push( @type_sequence, $routput_type_sequence->[$i] );
5683 push( @token_type, $type_i );
5685 # Form and store the PREVIOUS token
5686 if ( defined($rtoken_map_im) ) {
5688 $rtoken_map->[$i] - $rtoken_map_im; # how many characters
5692 substr( $input_line, $rtoken_map_im, $numc ) );
5696 # Should not happen unless @{$rtoken_map} is corrupted
5699 "number of characters is '$numc' but should be >0\n");
5703 # or grab some values for the leading token (needed for log output)
5705 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5708 $rtoken_map_im = $rtoken_map->[$i];
5709 } ## end foreach my $i ( @{$routput_token_list...})
5711 #------------------------
5712 # End loop to over tokens
5713 #------------------------
5715 # Form and store the final token of this line
5716 if ( defined($rtoken_map_im) ) {
5717 my $numc = length($input_line) - $rtoken_map_im;
5719 push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) );
5723 # Should not happen unless @{$rtoken_map} is corrupted
5726 "Number of Characters is '$numc' but should be >0\n");
5730 #----------------------------------------------------------
5731 # Wrap up this line of tokens for shipping to the Formatter
5732 #----------------------------------------------------------
5733 $line_of_tokens->{_rtoken_type} = \@token_type;
5734 $line_of_tokens->{_rtokens} = \@tokens;
5735 $line_of_tokens->{_rblock_type} = \@block_type;
5736 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5737 $line_of_tokens->{_rlevels} = \@levels;
5738 $line_of_tokens->{_rci_levels} = \@ci_string;
5741 } ## end sub tokenizer_wrapup_line
5742 } ## end tokenize_this_line
5744 #######################################################################
5745 # Tokenizer routines which assist in identifying token types
5746 #######################################################################
5748 # hash lookup table of operator expected values
5749 my %op_expected_table;
5751 # exceptions to perl's weird parsing rules after type 'Z'
5752 my %is_weird_parsing_rule_exception;
5754 my %is_paren_dollar;
5760 # Always expecting TERM following these types:
5761 # note: this is identical to '@value_requestor_type' defined later.
5763 ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5764 || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5765 &= // >> ~. &. |. ^.
5766 ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5769 push @q, '('; # for completeness, not currently a token type
5770 push @q, '->'; # was previously in UNKNOWN
5771 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5773 # Always UNKNOWN following these types;
5774 # previously had '->' in this list for c030
5776 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5778 # Always expecting OPERATOR ...
5779 # 'n' and 'v' are currently excluded because they might be VERSION numbers
5780 # 'i' is currently excluded because it might be a package
5781 # 'q' is currently excluded because it might be a prototype
5782 # Fix for c030: removed '->' from this list:
5783 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
5785 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5787 # Fix for git #62: added '*' and '%'
5789 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5792 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5795 @{is_n_v}{@q} = (1) x scalar(@q);
5799 use constant DEBUG_OPERATOR_EXPECTED => 0;
5801 sub operator_expected {
5803 # Returns a parameter indicating what types of tokens can occur next
5806 # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5808 # $prev_type is the type of the previous token (blank or not)
5809 # $tok is the current token
5810 # $next_type is the type of the next token (blank or not)
5812 # Many perl symbols have two or more meanings. For example, '<<'
5813 # can be a shift operator or a here-doc operator. The
5814 # interpretation of these symbols depends on the current state of
5815 # the tokenizer, which may either be expecting a term or an
5816 # operator. For this example, a << would be a shift if an OPERATOR
5817 # is expected, and a here-doc if a TERM is expected. This routine
5818 # is called to make this decision for any current token. It returns
5819 # one of three possible values:
5821 # OPERATOR - operator expected (or at least, not a term)
5822 # UNKNOWN - can't tell
5823 # TERM - a term is expected (or at least, not an operator)
5825 # The decision is based on what has been seen so far. This
5826 # information is stored in the "$last_nonblank_type" and
5827 # "$last_nonblank_token" variables. For example, if the
5828 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5829 # if $last_nonblank_type is 'n' (numeric), we are expecting an
5832 # If a UNKNOWN is returned, the calling routine must guess. A major
5833 # goal of this tokenizer is to minimize the possibility of returning
5834 # UNKNOWN, because a wrong guess can spoil the formatting of a
5837 # Adding NEW_TOKENS: it is critically important that this routine be
5838 # updated to allow it to determine if an operator or term is to be
5839 # expected after the new token. Doing this simply involves adding
5840 # the new token character to one of the regexes in this routine or
5841 # to one of the hash lists
5842 # that it uses, which are initialized in the BEGIN section.
5843 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5846 # When possible, token types should be selected such that we can determine
5847 # the 'operator_expected' value by a simple hash lookup. If there are
5848 # exceptions, that is an indication that a new type is needed.
5856 # Many types are can be obtained by a table lookup given the previous type.
5857 # This typically handles half or more of the calls.
5858 my $op_expected = $op_expected_table{$last_nonblank_type};
5859 if ( defined($op_expected) ) {
5860 DEBUG_OPERATOR_EXPECTED
5862 "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5863 return $op_expected;
5866 #---------------------
5867 # Handle special cases
5868 #---------------------
5870 $op_expected = UNKNOWN;
5871 my ( $prev_type, $tok, $next_type ) = @{$rarg};
5873 # Types 'k', '}' and 'Z' depend on context
5874 # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context.
5877 if ( $last_nonblank_type eq 'i' ) {
5878 $op_expected = OPERATOR;
5880 # TODO: it would be cleaner to make this a special type
5881 # expecting VERSION or {} after package NAMESPACE;
5882 # maybe mark these words as type 'Y'?
5883 if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
5884 && $statement_type =~ /^package\b/
5885 && $last_nonblank_token =~ /^package\b/ )
5887 $op_expected = TERM;
5892 elsif ( $last_nonblank_type eq 'k' ) {
5893 $op_expected = TERM;
5894 if ( $expecting_operator_token{$last_nonblank_token} ) {
5895 $op_expected = OPERATOR;
5897 elsif ( $expecting_term_token{$last_nonblank_token} ) {
5899 # Exceptions from TERM:
5901 # // may follow perl functions which may be unary operators
5902 # see test file dor.t (defined or);
5905 && $next_type eq '/'
5906 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5907 $last_nonblank_token}
5910 $op_expected = OPERATOR;
5913 # Patch to allow a ? following 'split' to be a deprecated pattern
5914 # delimiter. This patch is coordinated with the omission of split
5916 # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5917 # will force perltidy to guess.
5919 && $last_nonblank_token eq 'split' )
5921 $op_expected = UNKNOWN;
5926 # closing container token...
5928 # Note that the actual token for type '}' may also be a ')'.
5930 # Also note that $last_nonblank_token is not the token corresponding to
5931 # $last_nonblank_type when the type is a closing container. In that
5932 # case it is the token before the corresponding opening container token.
5933 # So for example, for this snippet
5934 # $a = do { BLOCK } / 2;
5935 # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5937 elsif ( $last_nonblank_type eq '}' ) {
5938 $op_expected = UNKNOWN;
5940 # handle something after 'do' and 'eval'
5941 if ( $is_block_operator{$last_nonblank_token} ) {
5943 # something like $a = do { BLOCK } / 2;
5944 $op_expected = OPERATOR; # block mode following }
5947 # $last_nonblank_token =~ /^(\)|\$|\-\>)/
5948 elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5949 || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5951 $op_expected = OPERATOR;
5952 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5955 # Check for smartmatch operator before preceding brace or square
5956 # bracket. For example, at the ? after the ] in the following
5957 # expressions we are expecting an operator:
5959 # qr/3/ ~~ ['1234'] ? 1 : 0;
5960 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5961 elsif ( $last_nonblank_token eq '~~' ) {
5962 $op_expected = OPERATOR;
5965 # A right brace here indicates the end of a simple block. All
5966 # non-structural right braces have type 'R' all braces associated with
5967 # block operator keywords have been given those keywords as
5968 # "last_nonblank_token" and caught above. (This statement is order
5969 # dependent, and must come after checking $last_nonblank_token).
5972 # patch for dor.t (defined or).
5974 && $next_type eq '/'
5975 && $last_nonblank_token eq ']' )
5977 $op_expected = OPERATOR;
5980 # Patch for RT #116344: misparse a ternary operator after an
5981 # anonymous hash, like this:
5982 # return ref {} ? 1 : 0;
5983 # The right brace should really be marked type 'R' in this case,
5984 # and it is safest to return an UNKNOWN here. Expecting a TERM will
5985 # cause the '?' to always be interpreted as a pattern delimiter
5986 # rather than introducing a ternary operator.
5987 elsif ( $tok eq '?' ) {
5988 $op_expected = UNKNOWN;
5991 $op_expected = TERM;
5996 # number or v-string...
5997 # An exception is for VERSION numbers a 'use' statement. It has the format
5998 # use Module VERSION LIST
5999 # We could avoid this exception by writing a special sub to parse 'use'
6000 # statements and perhaps mark these numbers with a new type V (for VERSION)
6001 ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
6002 elsif ( $is_n_v{$last_nonblank_type} ) {
6003 $op_expected = OPERATOR;
6004 if ( $statement_type eq 'use' ) {
6005 $op_expected = UNKNOWN;
6010 # TODO: labeled prototype words would better be given type 'A' or maybe
6011 # 'J'; not 'q'; or maybe mark as type 'Y'?
6012 elsif ( $last_nonblank_type eq 'q' ) {
6013 $op_expected = OPERATOR;
6014 if ( $last_nonblank_token eq 'prototype' ) {
6015 $op_expected = TERM;
6018 # update for --use-feature=class (rt145706):
6019 # Look for class VERSION after possible attribute, as in
6020 # class Example::Subclass : isa(Example::Base) 1.345 { ... }
6021 elsif ( $statement_type =~ /^package\b/ ) {
6022 $op_expected = TERM;
6026 # file handle or similar
6027 elsif ( $last_nonblank_type eq 'Z' ) {
6029 $op_expected = UNKNOWN;
6032 if ( $last_nonblank_token =~ /^\w/ ) {
6033 $op_expected = UNKNOWN;
6036 # Exception to weird parsing rules for 'x(' ... see case b1205:
6037 # In something like 'print $vv x(...' the x is an operator;
6038 # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
6039 # otherwise x follows the weird parsing rules.
6040 elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
6041 $op_expected = OPERATOR;
6044 # The 'weird parsing rules' of next section do not work for '<' and '?'
6045 # It is best to mark them as unknown. Test case:
6047 elsif ( $is_weird_parsing_rule_exception{$tok} ) {
6048 $op_expected = UNKNOWN;
6051 # For possible file handle like "$a", Perl uses weird parsing rules.
6053 # print $a/2,"/hi"; - division
6054 # print $a / 2,"/hi"; - division
6055 # print $a/ 2,"/hi"; - division
6056 # print $a /2,"/hi"; - pattern (and error)!
6057 # Some examples where this logic works okay, for '&','*','+':
6058 # print $fh &xsi_protos(@mods);
6059 # my $x = new $CompressClass *FH;
6060 # print $OUT +( $count % 15 ? ", " : "\n\t" );
6061 elsif ($prev_type eq 'b'
6062 && $next_type ne 'b' )
6064 $op_expected = TERM;
6067 # Note that '?' and '<' have been moved above
6068 # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
6069 elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
6071 # Do not complain in 'use' statements, which have special syntax.
6072 # For example, from RT#130344:
6073 # use lib $FindBin::Bin . '/lib';
6074 if ( $statement_type ne 'use' ) {
6076 "operator in possible indirect object location not recommended\n"
6079 $op_expected = OPERATOR;
6085 $op_expected = UNKNOWN;
6088 DEBUG_OPERATOR_EXPECTED
6090 "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
6092 return $op_expected;
6094 } ## end sub operator_expected
6096 sub new_statement_ok {
6098 # return true if the current token can start a new statement
6099 # USES GLOBAL VARIABLES: $last_nonblank_type
6101 return label_ok() # a label would be ok here
6103 || $last_nonblank_type eq 'J'; # or we follow a label
6105 } ## end sub new_statement_ok
6109 # Decide if a bare word followed by a colon here is a label
6110 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6111 # $brace_depth, @brace_type
6113 # if it follows an opening or closing code block curly brace..
6114 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
6115 && $last_nonblank_type eq $last_nonblank_token )
6118 # it is a label if and only if the curly encloses a code block
6119 return $brace_type[$brace_depth];
6122 # otherwise, it is a label if and only if it follows a ';' (real or fake)
6125 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
6127 } ## end sub label_ok
6129 sub code_block_type {
6131 # Decide if this is a block of code, and its type.
6132 # Must be called only when $type = $token = '{'
6133 # The problem is to distinguish between the start of a block of code
6134 # and the start of an anonymous hash reference
6135 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
6136 # to indicate the type of code block. (For example, 'last_nonblank_token'
6137 # might be 'if' for an if block, 'else' for an else block, etc).
6138 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
6139 # $last_nonblank_block_type, $brace_depth, @brace_type
6141 # handle case of multiple '{'s
6143 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
6145 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6146 if ( $last_nonblank_token eq '{'
6147 && $last_nonblank_type eq $last_nonblank_token )
6150 # opening brace where a statement may appear is probably
6151 # a code block but might be and anonymous hash reference
6152 if ( $brace_type[$brace_depth] ) {
6153 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6157 # cannot start a code block within an anonymous hash
6159 return EMPTY_STRING;
6163 elsif ( $last_nonblank_token eq ';' ) {
6165 # an opening brace where a statement may appear is probably
6166 # a code block but might be and anonymous hash reference
6167 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6171 # handle case of '}{'
6172 elsif ($last_nonblank_token eq '}'
6173 && $last_nonblank_type eq $last_nonblank_token )
6176 # a } { situation ...
6177 # could be hash reference after code block..(blktype1.t)
6178 if ($last_nonblank_block_type) {
6179 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6183 # must be a block if it follows a closing hash reference
6185 return $last_nonblank_token;
6189 #--------------------------------------------------------------
6190 # NOTE: braces after type characters start code blocks, but for
6191 # simplicity these are not identified as such. See also
6192 # sub is_non_structural_brace.
6193 #--------------------------------------------------------------
6195 ## elsif ( $last_nonblank_type eq 't' ) {
6196 ## return $last_nonblank_token;
6199 # brace after label:
6200 elsif ( $last_nonblank_type eq 'J' ) {
6201 return $last_nonblank_token;
6204 # otherwise, look at previous token. This must be a code block if
6205 # it follows any of these:
6206 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6207 elsif ($is_code_block_token{$last_nonblank_token}
6208 || $is_grep_alias{$last_nonblank_token} )
6211 # Bug Patch: Note that the opening brace after the 'if' in the following
6212 # snippet is an anonymous hash ref and not a code block!
6213 # print 'hi' if { x => 1, }->{x};
6214 # We can identify this situation because the last nonblank type
6215 # will be a keyword (instead of a closing paren)
6217 $last_nonblank_type eq 'k'
6218 && ( $last_nonblank_token eq 'if'
6219 || $last_nonblank_token eq 'unless' )
6222 return EMPTY_STRING;
6225 return $last_nonblank_token;
6229 # or a sub or package BLOCK
6230 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6231 && $last_nonblank_token =~ /^(sub|package)\b/ )
6233 return $last_nonblank_token;
6237 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6238 && ( $is_sub{$last_nonblank_token} ) )
6243 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6244 return $statement_type;
6247 # user-defined subs with block parameters (like grep/map/eval)
6248 elsif ( $last_nonblank_type eq 'G' ) {
6249 return $last_nonblank_token;
6253 elsif ( $last_nonblank_type eq 'w' ) {
6255 # check for syntax 'use MODULE LIST'
6256 # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6257 return EMPTY_STRING if ( $statement_type eq 'use' );
6259 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6263 # Patch for bug # RT #94338 reported by Daniel Trizen
6264 # for-loop in a parenthesized block-map triggering an error message:
6265 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6266 # Check for a code block within a parenthesized function call
6267 elsif ( $last_nonblank_token eq '(' ) {
6268 my $paren_type = $paren_type[$paren_depth];
6270 # /^(map|grep|sort)$/
6271 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6273 # We will mark this as a code block but use type 't' instead
6274 # of the name of the containing function. This will allow for
6275 # correct parsing but will usually produce better formatting.
6276 # Braces with block type 't' are not broken open automatically
6277 # in the formatter as are other code block types, and this usually
6279 return 't'; # (Not $paren_type)
6282 return EMPTY_STRING;
6286 # handle unknown syntax ') {'
6287 # we previously appended a '()' to mark this case
6288 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6289 return $last_nonblank_token;
6292 # anything else must be anonymous hash reference
6294 return EMPTY_STRING;
6296 } ## end sub code_block_type
6298 sub decide_if_code_block {
6300 # USES GLOBAL VARIABLES: $last_nonblank_token
6301 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6303 my ( $next_nonblank_token, $i_next ) =
6304 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6306 # we are at a '{' where a statement may appear.
6307 # We must decide if this brace starts an anonymous hash or a code
6309 # return "" if anonymous hash, and $last_nonblank_token otherwise
6311 # initialize to be code BLOCK
6312 my $code_block_type = $last_nonblank_token;
6314 # Check for the common case of an empty anonymous hash reference:
6315 # Maybe something like sub { { } }
6316 if ( $next_nonblank_token eq '}' ) {
6317 $code_block_type = EMPTY_STRING;
6322 # To guess if this '{' is an anonymous hash reference, look ahead
6323 # and test as follows:
6325 # it is a hash reference if next come:
6326 # - a string or digit followed by a comma or =>
6327 # - bareword followed by =>
6328 # otherwise it is a code block
6330 # Examples of anonymous hash ref:
6334 # Examples of code blocks:
6335 # {1; print "hello\n", 1;}
6338 # We are only going to look ahead one more (nonblank/comment) line.
6339 # Strange formatting could cause a bad guess, but that's unlikely.
6343 # Ignore the rest of this line if it is a side comment
6344 if ( $next_nonblank_token ne '#' ) {
6345 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
6346 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
6348 my ( $rpre_tokens, $rpre_types ) =
6349 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
6350 # generous, and prevents
6352 # time in mangled files
6353 if ( defined($rpre_types) && @{$rpre_types} ) {
6354 push @pre_types, @{$rpre_types};
6355 push @pre_tokens, @{$rpre_tokens};
6358 # put a sentinel token to simplify stopping the search
6359 push @pre_types, '}';
6360 push @pre_types, '}';
6363 $jbeg = 1 if $pre_types[0] eq 'b';
6365 # first look for one of these
6367 # - bareword with leading -
6371 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
6373 # find the closing quote; don't worry about escapes
6374 my $quote_mark = $pre_types[$j];
6375 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6376 if ( $pre_types[$k] eq $quote_mark ) {
6378 ##my $next = $pre_types[$j];
6383 elsif ( $pre_types[$j] eq 'd' ) {
6386 elsif ( $pre_types[$j] eq 'w' ) {
6389 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6394 $j++ if $pre_types[$j] eq 'b';
6396 # Patched for RT #95708
6399 # it is a comma which is not a pattern delimiter except for qw
6401 $pre_types[$j] eq ','
6402 ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6403 && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6407 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6410 $code_block_type = EMPTY_STRING;
6414 if ($code_block_type) {
6416 # Patch for cases b1085 b1128: It is uncertain if this is a block.
6417 # If this brace follows a bareword, then append a space as a signal
6418 # to the formatter that this may not be a block brace. To find the
6419 # corresponding code in Formatter.pm search for 'b1085'.
6420 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6424 return $code_block_type;
6425 } ## end sub decide_if_code_block
6427 sub report_unexpected {
6429 # report unexpected token type and show where it is
6430 # USES GLOBAL VARIABLES: $tokenizer_self
6431 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6432 $rpretoken_type, $input_line )
6435 if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6436 my $msg = "found $found where $expecting expected";
6437 my $pos = $rpretoken_map->[$i_tok];
6438 interrupt_logfile();
6439 my $input_line_number = $tokenizer_self->[_last_line_number_];
6440 my ( $offset, $numbered_line, $underline ) =
6441 make_numbered_line( $input_line_number, $input_line, $pos );
6442 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6444 my $trailer = EMPTY_STRING;
6445 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6446 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6448 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6449 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6452 $num = $pos - $pos_prev;
6454 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
6457 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6458 $trailer = " (previous token underlined)";
6460 $underline =~ s/\s+$//;
6461 warning( $numbered_line . "\n" );
6462 warning( $underline . "\n" );
6463 warning( $msg . $trailer . "\n" );
6467 } ## end sub report_unexpected
6469 my %is_sigil_or_paren;
6470 my %is_R_closing_sb;
6474 my @q = qw< $ & % * @ ) >;
6475 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6478 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6481 sub is_non_structural_brace {
6483 # Decide if a brace or bracket is structural or non-structural
6484 # by looking at the previous token and type
6485 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6487 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6488 # Tentatively deactivated because it caused the wrong operator expectation
6490 # $user = @vars[1] / 100;
6491 # Must update sub operator_expected before re-implementing.
6492 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6496 #--------------------------------------------------------------
6497 # NOTE: braces after type characters start code blocks, but for
6498 # simplicity these are not identified as such. See also
6499 # sub code_block_type
6500 #--------------------------------------------------------------
6502 ##if ($last_nonblank_type eq 't') {return 0}
6504 # otherwise, it is non-structural if it is decorated
6505 # by type information.
6506 # For example, the '{' here is non-structural: ${xxx}
6507 # Removed '::' to fix c074
6508 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6510 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6511 $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6512 || substr( $last_nonblank_token, 0, 2 ) eq '->'
6514 # or if we follow a hash or array closing curly brace or bracket
6515 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6516 # because the first '}' would have been given type 'R'
6517 ##|| $last_nonblank_type =~ /^([R\]])$/
6518 || $is_R_closing_sb{$last_nonblank_type}
6520 } ## end sub is_non_structural_brace
6522 #######################################################################
6523 # Tokenizer routines for tracking container nesting depths
6524 #######################################################################
6526 # The following routines keep track of nesting depths of the nesting
6527 # types, ( [ { and ?. This is necessary for determining the indentation
6528 # level, and also for debugging programs. Not only do they keep track of
6529 # nesting depths of the individual brace types, but they check that each
6530 # of the other brace types is balanced within matching pairs. For
6531 # example, if the program sees this sequence:
6535 # then it can determine that there is an extra left paren somewhere
6536 # between the { and the }. And so on with every other possible
6537 # combination of outer and inner brace types. For another
6542 # which has an extra ] within the parens.
6544 # The brace types have indexes 0 .. 3 which are indexes into
6547 # The pair ? : are treated as just another nesting type, with ? acting
6548 # as the opening brace and : acting as the closing brace.
6552 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6554 # saves the nesting depth of brace type $b (where $b is either of the other
6555 # nesting types) when brace type $a enters a new depth. When this depth
6556 # decreases, a check is made that the current depth of brace types $b is
6557 # unchanged, or otherwise there must have been an error. This can
6558 # be very useful for localizing errors, particularly when perl runs to
6559 # the end of a large file (such as this one) and announces that there
6560 # is a problem somewhere.
6562 # A numerical sequence number is maintained for every nesting type,
6563 # so that each matching pair can be uniquely identified in a simple
6566 sub increase_nesting_depth {
6567 my ( $aa, $pos ) = @_;
6569 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6570 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6572 $current_depth[$aa]++;
6574 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6575 my $input_line_number = $tokenizer_self->[_last_line_number_];
6576 my $input_line = $tokenizer_self->[_line_of_text_];
6578 # Sequence numbers increment by number of items. This keeps
6579 # a unique set of numbers but still allows the relative location
6580 # of any type to be determined.
6582 # make a new unique sequence number
6583 my $seqno = $next_sequence_number++;
6585 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6587 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6588 [ $input_line_number, $input_line, $pos ];
6590 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6591 next if ( $bb == $aa );
6592 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6595 # set a flag for indenting a nested ternary statement
6597 if ( $aa == QUESTION_COLON ) {
6598 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6599 if ( $current_depth[$aa] > 1 ) {
6600 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6601 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6602 if ( $pdepth == $total_depth - 1 ) {
6604 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6610 # Fix part #1 for git82: save last token type for propagation of type 'Z'
6611 $nested_statement_type[$aa][ $current_depth[$aa] ] =
6612 [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6613 $statement_type = EMPTY_STRING;
6614 return ( $seqno, $indent );
6615 } ## end sub increase_nesting_depth
6617 sub is_balanced_closing_container {
6619 # Return true if a closing container can go here without error
6620 # Return false if not
6623 # cannot close if there was no opening
6624 return unless ( $current_depth[$aa] > 0 );
6626 # check that any other brace types $bb contained within would be balanced
6627 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6628 next if ( $bb == $aa );
6630 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6631 $current_depth[$bb] );
6634 # OK, everything will be balanced
6636 } ## end sub is_balanced_closing_container
6638 sub decrease_nesting_depth {
6640 my ( $aa, $pos ) = @_;
6642 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6643 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6646 my $input_line_number = $tokenizer_self->[_last_line_number_];
6647 my $input_line = $tokenizer_self->[_line_of_text_];
6651 if ( $current_depth[$aa] > 0 ) {
6653 # set a flag for un-indenting after seeing a nested ternary statement
6654 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6655 if ( $aa == QUESTION_COLON ) {
6656 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6659 # Fix part #2 for git82: use saved type for propagation of type 'Z'
6660 # through type L-R braces. Perl seems to allow ${bareword}
6661 # as an indirect object, but nothing much more complex than that.
6662 ( $statement_type, my $saved_type, my $saved_token ) =
6663 @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6665 && $saved_type eq 'Z'
6666 && $last_nonblank_type eq 'w'
6667 && $brace_structural_type[$brace_depth] eq 'L' )
6669 $last_nonblank_type = $saved_type;
6672 # check that any brace types $bb contained within are balanced
6673 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6674 next if ( $bb == $aa );
6676 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6677 $current_depth[$bb] )
6680 $current_depth[$bb] -
6681 $depth_array[$aa][$bb][ $current_depth[$aa] ];
6683 # don't whine too many times
6684 my $saw_brace_error = get_saw_brace_error();
6686 $saw_brace_error <= MAX_NAG_MESSAGES
6688 # if too many closing types have occurred, we probably
6689 # already caught this error
6690 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6693 interrupt_logfile();
6695 $starting_line_of_current_depth[$aa]
6696 [ $current_depth[$aa] ];
6698 my $rel = [ $input_line_number, $input_line, $pos ];
6702 if ( $diff == 1 || $diff == -1 ) {
6703 $ess = EMPTY_STRING;
6710 ? $opening_brace_names[$bb]
6711 : $closing_brace_names[$bb];
6712 write_error_indicator_pair( @{$rsl}, '^' );
6714 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6719 $starting_line_of_current_depth[$bb]
6720 [ $current_depth[$bb] ];
6723 " The most recent un-matched $bname is on line $ml\n";
6724 write_error_indicator_pair( @{$rml}, '^' );
6726 write_error_indicator_pair( @{$rel}, '^' );
6730 increment_brace_error();
6733 $current_depth[$aa]--;
6737 my $saw_brace_error = get_saw_brace_error();
6738 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6740 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6742 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6744 increment_brace_error();
6746 # keep track of errors in braces alone (ignoring ternary nesting errors)
6747 $tokenizer_self->[_true_brace_error_count_]++
6748 if ( $closing_brace_names[$aa] ne "':'" );
6750 return ( $seqno, $outdent );
6751 } ## end sub decrease_nesting_depth
6753 sub check_final_nesting_depths {
6755 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6757 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6759 if ( $current_depth[$aa] ) {
6761 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6764 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6765 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6767 indicate_error( $msg, @{$rsl}, '^' );
6768 increment_brace_error();
6772 } ## end sub check_final_nesting_depths
6774 #######################################################################
6775 # Tokenizer routines for looking ahead in input stream
6776 #######################################################################
6778 sub peek_ahead_for_n_nonblank_pre_tokens {
6780 # returns next n pretokens if they exist
6781 # returns undef's if hits eof without seeing any pretokens
6782 # USES GLOBAL VARIABLES: $tokenizer_self
6783 my $max_pretokens = shift;
6786 my ( $rpre_tokens, $rmap, $rpre_types );
6789 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6791 $line =~ s/^\s*//; # trim leading blanks
6792 next if ( length($line) <= 0 ); # skip blank
6793 next if ( $line =~ /^#/ ); # skip comment
6794 ( $rpre_tokens, $rmap, $rpre_types ) =
6795 pre_tokenize( $line, $max_pretokens );
6798 return ( $rpre_tokens, $rpre_types );
6799 } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6801 # look ahead for next non-blank, non-comment line of code
6802 sub peek_ahead_for_nonblank_token {
6804 # USES GLOBAL VARIABLES: $tokenizer_self
6805 my ( $rtokens, $max_token_index ) = @_;
6810 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6812 $line =~ s/^\s*//; # trim leading blanks
6813 next if ( length($line) <= 0 ); # skip blank
6814 next if ( $line =~ /^#/ ); # skip comment
6816 # Updated from 2 to 3 to get trigraphs, added for case b1175
6817 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6818 my $j = $max_token_index + 1;
6820 foreach my $tok ( @{$rtok} ) {
6821 last if ( $tok =~ "\n" );
6822 $rtokens->[ ++$j ] = $tok;
6827 } ## end sub peek_ahead_for_nonblank_token
6829 #######################################################################
6830 # Tokenizer guessing routines for ambiguous situations
6831 #######################################################################
6833 sub guess_if_pattern_or_conditional {
6835 # this routine is called when we have encountered a ? following an
6836 # unknown bareword, and we must decide if it starts a pattern or not
6838 # $i - token index of the ? starting possible pattern
6839 # output parameters:
6840 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6841 # msg = a warning or diagnostic message
6842 # USES GLOBAL VARIABLES: $last_nonblank_token
6844 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6846 my $msg = "guessing that ? after $last_nonblank_token starts a ";
6848 if ( $i >= $max_token_index ) {
6849 $msg .= "conditional (no end to pattern found on the line)\n";
6854 my $next_token = $rtokens->[$i]; # first token after ?
6856 # look for a possible ending ? on this line..
6858 my $quote_depth = 0;
6859 my $quote_character = EMPTY_STRING;
6871 ) = follow_quoted_string(
6885 # we didn't find an ending ? on this line,
6886 # so we bias towards conditional
6888 $msg .= "conditional (no ending ? on this line)\n";
6890 # we found an ending ?, so we bias towards a pattern
6894 # Watch out for an ending ? in quotes, like this
6895 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6899 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6900 my $tok = $rtokens->[$ii];
6901 if ( $tok eq ":" ) { $colons++ }
6902 if ( $tok eq "'" ) { $s_quote++ }
6903 if ( $tok eq '"' ) { $d_quote++ }
6905 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6907 $msg .= "found ending ? but unbalanced quote chars\n";
6909 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6911 $msg .= "pattern (found ending ? and pattern expected)\n";
6914 $msg .= "pattern (uncertain, but found ending ?)\n";
6918 return ( $is_pattern, $msg );
6919 } ## end sub guess_if_pattern_or_conditional
6921 my %is_known_constant;
6922 my %is_known_function;
6926 # Constants like 'pi' in Trig.pm are common
6927 my @q = qw(pi pi2 pi4 pip2 pip4);
6928 @{is_known_constant}{@q} = (1) x scalar(@q);
6930 # parenless calls of 'ok' are common
6932 @{is_known_function}{@q} = (1) x scalar(@q);
6935 sub guess_if_pattern_or_division {
6937 # this routine is called when we have encountered a / following an
6938 # unknown bareword, and we must decide if it starts a pattern or is a
6941 # $i - token index of the / starting possible pattern
6942 # output parameters:
6943 # $is_pattern = 0 if probably division, =1 if probably a pattern
6944 # msg = a warning or diagnostic message
6945 # USES GLOBAL VARIABLES: $last_nonblank_token
6946 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6948 my $msg = "guessing that / after $last_nonblank_token starts a ";
6950 if ( $i >= $max_token_index ) {
6951 $msg .= "division (no end to pattern found on the line)\n";
6955 my $divide_possible =
6956 is_possible_numerator( $i, $rtokens, $max_token_index );
6958 if ( $divide_possible < 0 ) {
6959 $msg = "pattern (division not possible here)\n";
6961 return ( $is_pattern, $msg );
6965 my $next_token = $rtokens->[$i]; # first token after slash
6967 # One of the things we can look at is the spacing around the slash.
6968 # There # are four possible spacings around the first slash:
6970 # return pi/two;#/; -/-
6971 # return pi/ two;#/; -/+
6972 # return pi / two;#/; +/+
6973 # return pi /two;#/; +/- <-- possible pattern
6975 # Spacing rule: a space before the slash but not after the slash
6976 # usually indicates a pattern. We can use this to break ties.
6978 my $is_pattern_by_spacing =
6979 ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6981 # look for a possible ending / on this line..
6983 my $quote_depth = 0;
6984 my $quote_character = EMPTY_STRING;
6988 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6991 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6992 $quote_pos, $quote_depth, $max_token_index );
6996 # we didn't find an ending / on this line, so we bias towards
6998 if ( $divide_possible >= 0 ) {
7000 $msg .= "division (no ending / on this line)\n";
7004 # assuming a multi-line pattern ... this is risky, but division
7005 # does not seem possible. If this fails, it would either be due
7006 # to a syntax error in the code, or the division_expected logic
7007 # needs to be fixed.
7008 $msg = "multi-line pattern (division not possible)\n";
7013 # we found an ending /, so we bias slightly towards a pattern
7016 my $pattern_expected =
7017 pattern_expected( $i, $rtokens, $max_token_index );
7019 if ( $pattern_expected >= 0 ) {
7021 # pattern looks possible...
7022 if ( $divide_possible >= 0 ) {
7024 # Both pattern and divide can work here...
7026 # Increase weight of divide if a pure number follows
7027 $divide_possible += $next_token =~ /^\d+$/;
7029 # Check for known constants in the numerator, like 'pi'
7030 if ( $is_known_constant{$last_nonblank_token} ) {
7032 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
7036 # A very common bare word in pattern expressions is 'ok'
7037 elsif ( $is_known_function{$last_nonblank_token} ) {
7039 "pattern (division works too but saw '$last_nonblank_token')\n";
7043 # If one rule is more definite, use it
7044 elsif ( $divide_possible > $pattern_expected ) {
7046 "division (more likely based on following tokens)\n";
7050 # otherwise, use the spacing rule
7051 elsif ($is_pattern_by_spacing) {
7053 "pattern (guess on spacing, but division possible too)\n";
7058 "division (guess on spacing, but pattern is possible too)\n";
7063 # divide_possible < 0 means divide can not work here
7066 $msg .= "pattern (division not possible)\n";
7070 # pattern does not look possible...
7073 if ( $divide_possible >= 0 ) {
7075 $msg .= "division (pattern not possible)\n";
7078 # Neither pattern nor divide look possible...go by spacing
7080 if ($is_pattern_by_spacing) {
7081 $msg .= "pattern (guess on spacing)\n";
7085 $msg .= "division (guess on spacing)\n";
7092 return ( $is_pattern, $msg );
7093 } ## end sub guess_if_pattern_or_division
7095 # try to resolve here-doc vs. shift by looking ahead for
7096 # non-code or the end token (currently only looks for end token)
7097 # returns 1 if it is probably a here doc, 0 if not
7098 sub guess_if_here_doc {
7100 # This is how many lines we will search for a target as part of the
7101 # guessing strategy. It is a constant because there is probably
7102 # little reason to change it.
7103 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
7105 my $HERE_DOC_WINDOW = 40;
7107 my $next_token = shift;
7108 my $here_doc_expected = 0;
7111 my $msg = "checking <<";
7114 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
7118 if ( $line =~ /^$next_token$/ ) {
7119 $msg .= " -- found target $next_token ahead $k lines\n";
7120 $here_doc_expected = 1; # got it
7123 last if ( $k >= $HERE_DOC_WINDOW );
7126 unless ($here_doc_expected) {
7128 if ( !defined($line) ) {
7129 $here_doc_expected = -1; # hit eof without seeing target
7130 $msg .= " -- must be shift; target $next_token not in file\n";
7133 else { # still unsure..taking a wild guess
7135 if ( !$is_constant{$current_package}{$next_token} ) {
7136 $here_doc_expected = 1;
7138 " -- guessing it's a here-doc ($next_token not a constant)\n";
7142 " -- guessing it's a shift ($next_token is a constant)\n";
7146 write_logfile_entry($msg);
7147 return $here_doc_expected;
7148 } ## end sub guess_if_here_doc
7150 #######################################################################
7151 # Tokenizer Routines for scanning identifiers and related items
7152 #######################################################################
7154 sub scan_bare_identifier_do {
7156 # this routine is called to scan a token starting with an alphanumeric
7157 # variable or package separator, :: or '.
7158 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
7159 # $last_nonblank_type,@paren_type, $paren_depth
7161 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7165 my $package = undef;
7169 # we have to back up one pretoken at a :: since each : is one pretoken
7170 if ( $tok eq '::' ) { $i_beg-- }
7171 if ( $tok eq '->' ) { $i_beg-- }
7172 my $pos_beg = $rtoken_map->[$i_beg];
7173 pos($input_line) = $pos_beg;
7180 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7182 my $pos = pos($input_line);
7183 my $numc = $pos - $pos_beg;
7184 $tok = substr( $input_line, $pos_beg, $numc );
7186 # type 'w' includes anything without leading type info
7187 # ($,%,@,*) including something like abc::def::ghi
7190 my $sub_name = EMPTY_STRING;
7191 if ( defined($2) ) { $sub_name = $2; }
7192 if ( defined($1) ) {
7195 # patch: don't allow isolated package name which just ends
7196 # in the old style package separator (single quote). Example:
7198 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7202 $package =~ s/\'/::/g;
7203 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7204 $package =~ s/::$//;
7207 $package = $current_package;
7209 # patched for c043, part 1: keyword does not follow '->'
7210 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7215 # if it is a bareword.. patched for c043, part 2: not following '->'
7216 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7218 # check for v-string with leading 'v' type character
7219 # (This seems to have precedence over filehandle, type 'Y')
7220 if ( $tok =~ /^v\d[_\d]*$/ ) {
7222 # we only have the first part - something like 'v101' -
7224 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7225 $pos = pos($input_line);
7226 $numc = $pos - $pos_beg;
7227 $tok = substr( $input_line, $pos_beg, $numc );
7231 # warn if this version can't handle v-strings
7232 report_v_string($tok);
7235 elsif ( $is_constant{$package}{$sub_name} ) {
7239 # bareword after sort has implied empty prototype; for example:
7240 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7241 # This has priority over whatever the user has specified.
7242 elsif ($last_nonblank_token eq 'sort'
7243 && $last_nonblank_type eq 'k' )
7248 # Note: strangely, perl does not seem to really let you create
7249 # functions which act like eval and do, in the sense that eval
7250 # and do may have operators following the final }, but any operators
7251 # that you create with prototype (&) apparently do not allow
7252 # trailing operators, only terms. This seems strange.
7253 # If this ever changes, here is the update
7254 # to make perltidy behave accordingly:
7256 # elsif ( $is_block_function{$package}{$tok} ) {
7257 # $tok='eval'; # patch to do braces like eval - doesn't work
7260 # TODO: This could become a separate type to allow for different
7262 elsif ( $is_block_function{$package}{$sub_name} ) {
7265 elsif ( $is_block_list_function{$package}{$sub_name} ) {
7268 elsif ( $is_user_function{$package}{$sub_name} ) {
7270 $prototype = $user_function_prototype{$package}{$sub_name};
7273 # check for indirect object
7276 # added 2001-03-27: must not be followed immediately by '('
7278 ( $input_line !~ m/\G\(/gc )
7283 # preceded by keyword like 'print', 'printf' and friends
7284 $is_indirect_object_taker{$last_nonblank_token}
7286 # or preceded by something like 'print(' or 'printf('
7288 ( $last_nonblank_token eq '(' )
7289 && $is_indirect_object_taker{ $paren_type[$paren_depth]
7297 # may not be indirect object unless followed by a space;
7298 # updated 2021-01-16 to consider newline to be a space.
7299 # updated for case b990 to look for either ';' or space
7300 if ( pos($input_line) == length($input_line)
7301 || $input_line =~ m/\G[;\s]/gc )
7306 # Perl's indirect object notation is a very bad
7307 # thing and can cause subtle bugs, especially for
7308 # beginning programmers. And I haven't even been
7309 # able to figure out a sane warning scheme which
7310 # doesn't get in the way of good scripts.
7312 # Complain if a filehandle has any lower case
7313 # letters. This is suggested good practice.
7314 # Use 'sub_name' because something like
7315 # main::MYHANDLE is ok for filehandle
7316 if ( $sub_name =~ /[a-z]/ ) {
7318 # could be bug caused by older perltidy if
7320 if ( $input_line =~ m/\G\s*\(/gc ) {
7322 "Caution: unknown word '$tok' in indirect object slot\n"
7328 # bareword not followed by a space -- may not be filehandle
7329 # (may be function call defined in a 'use' statement)
7336 # Now we must convert back from character position
7337 # to pre_token index.
7338 # I don't think an error flag can occur here ..but who knows
7341 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7343 warning("scan_bare_identifier: Possibly invalid tokenization\n");
7347 # no match but line not blank - could be syntax error
7348 # perl will take '::' alone without complaint
7352 # change this warning to log message if it becomes annoying
7353 warning("didn't find identifier after leading ::\n");
7355 return ( $i, $tok, $type, $prototype );
7356 } ## end sub scan_bare_identifier_do
7360 # This is the new scanner and will eventually replace scan_identifier.
7361 # Only type 'sub' and 'package' are implemented.
7362 # Token types $ * % @ & -> are not yet implemented.
7364 # Scan identifier following a type token.
7365 # The type of call depends on $id_scan_state: $id_scan_state = ''
7366 # for starting call, in which case $tok must be the token defining
7369 # If the type token is the last nonblank token on the line, a value
7370 # of $id_scan_state = $tok is returned, indicating that further
7371 # calls must be made to get the identifier. If the type token is
7372 # not the last nonblank token on the line, the identifier is
7373 # scanned and handled and a value of '' is returned.
7374 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
7375 # $statement_type, $tokenizer_self
7377 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7380 use constant DEBUG_NSCAN => 0;
7381 my $type = EMPTY_STRING;
7382 my ( $i_beg, $pos_beg );
7384 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7385 #my ($a,$b,$c) = caller;
7386 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7388 # on re-entry, start scanning at first token on the line
7389 if ($id_scan_state) {
7391 $type = EMPTY_STRING;
7394 # on initial entry, start scanning just after type token
7397 $id_scan_state = $tok;
7401 # find $i_beg = index of next nonblank token,
7402 # and handle empty lines
7404 my $next_nonblank_token = $rtokens->[$i_beg];
7405 if ( $i_beg > $max_token_index ) {
7410 # only a '#' immediately after a '$' is not a comment
7411 if ( $next_nonblank_token eq '#' ) {
7412 unless ( $tok eq '$' ) {
7417 if ( $next_nonblank_token =~ /^\s/ ) {
7418 ( $next_nonblank_token, $i_beg ) =
7419 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7421 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7427 # handle non-blank line; identifier, if any, must follow
7428 unless ($blank_line) {
7430 if ( $is_sub{$id_scan_state} ) {
7431 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
7433 input_line => $input_line,
7438 rtokens => $rtokens,
7439 rtoken_map => $rtoken_map,
7440 id_scan_state => $id_scan_state,
7441 max_token_index => $max_token_index,
7446 elsif ( $is_package{$id_scan_state} ) {
7447 ( $i, $tok, $type ) =
7448 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
7449 $rtoken_map, $max_token_index );
7450 $id_scan_state = EMPTY_STRING;
7454 warning("invalid token in scan_id: $tok\n");
7455 $id_scan_state = EMPTY_STRING;
7459 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
7464 Program bug in scan_id: undefined type but scan_state=$id_scan_state
7468 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7470 report_definite_bug();
7475 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7477 return ( $i, $tok, $type, $id_scan_state );
7478 } ## end sub scan_id_do
7480 sub check_prototype {
7481 my ( $proto, $package, $subname ) = @_;
7482 return unless ( defined($package) && defined($subname) );
7483 if ( defined($proto) ) {
7484 $proto =~ s/^\s*\(\s*//;
7485 $proto =~ s/\s*\)$//;
7487 $is_user_function{$package}{$subname} = 1;
7488 $user_function_prototype{$package}{$subname} = "($proto)";
7490 # prototypes containing '&' must be treated specially..
7491 if ( $proto =~ /\&/ ) {
7493 # right curly braces of prototypes ending in
7494 # '&' may be followed by an operator
7495 if ( $proto =~ /\&$/ ) {
7496 $is_block_function{$package}{$subname} = 1;
7499 # right curly braces of prototypes NOT ending in
7500 # '&' may NOT be followed by an operator
7501 elsif ( $proto !~ /\&$/ ) {
7502 $is_block_list_function{$package}{$subname} = 1;
7507 $is_constant{$package}{$subname} = 1;
7511 $is_user_function{$package}{$subname} = 1;
7514 } ## end sub check_prototype
7516 sub do_scan_package {
7518 # do_scan_package parses a package name
7519 # it is called with $i_beg equal to the index of the first nonblank
7520 # token following a 'package' token.
7521 # USES GLOBAL VARIABLES: $current_package,
7524 # package NAMESPACE VERSION
7525 # package NAMESPACE BLOCK
7526 # package NAMESPACE VERSION BLOCK
7528 # If VERSION is provided, package sets the $VERSION variable in the given
7529 # namespace to a version object with the VERSION provided. VERSION must be
7530 # a "strict" style version number as defined by the version module: a
7531 # positive decimal number (integer or decimal-fraction) without
7532 # exponentiation or else a dotted-decimal v-string with a leading 'v'
7533 # character and at least three components.
7534 # reference http://perldoc.perl.org/functions/package.html
7536 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7539 my $package = undef;
7540 my $pos_beg = $rtoken_map->[$i_beg];
7541 pos($input_line) = $pos_beg;
7543 # handle non-blank line; package name, if any, must follow
7544 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7546 $package = ( defined($1) && $1 ) ? $1 : 'main';
7547 $package =~ s/\'/::/g;
7548 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7549 $package =~ s/::$//;
7550 my $pos = pos($input_line);
7551 my $numc = $pos - $pos_beg;
7552 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7555 # Now we must convert back from character position
7556 # to pre_token index.
7557 # I don't think an error flag can occur here ..but ?
7560 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7561 if ($error) { warning("Possibly invalid package\n") }
7562 $current_package = $package;
7564 # we should now have package NAMESPACE
7565 # now expecting VERSION, BLOCK, or ; to follow ...
7566 # package NAMESPACE VERSION
7567 # package NAMESPACE BLOCK
7568 # package NAMESPACE VERSION BLOCK
7569 my ( $next_nonblank_token, $i_next ) =
7570 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7572 # check that something recognizable follows, but do not parse.
7573 # A VERSION number will be parsed later as a number or v-string in the
7574 # normal way. What is important is to set the statement type if
7575 # everything looks okay so that the operator_expected() routine
7576 # knows that the number is in a package statement.
7577 # Examples of valid primitive tokens that might follow are:
7579 # FIX: added a '#' since a side comment may also follow
7580 # Added ':' for class attributes (for --use-feature=class, rt145706)
7581 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#\:])|v\d|\d+$/ ) {
7582 $statement_type = $tok;
7586 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7591 # no match but line not blank --
7592 # could be a label with name package, like package: , for example.
7597 return ( $i, $tok, $type );
7598 } ## end sub do_scan_package
7600 my %is_special_variable_char;
7604 # These are the only characters which can (currently) form special
7605 # variables, like $^W: (issue c066).
7607 qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7608 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7611 { ## begin closure for sub scan_complex_identifier
7613 use constant DEBUG_SCAN_ID => 0;
7615 # These are the possible states for this scanner:
7616 my $scan_state_SIGIL = '$';
7617 my $scan_state_ALPHA = 'A';
7618 my $scan_state_COLON = ':';
7619 my $scan_state_LPAREN = '(';
7620 my $scan_state_RPAREN = ')';
7621 my $scan_state_AMPERSAND = '&';
7622 my $scan_state_SPLIT = '^';
7624 # Only these non-blank states may be returned to caller:
7625 my %is_returnable_scan_state = (
7626 $scan_state_SIGIL => 1,
7627 $scan_state_AMPERSAND => 1,
7630 # USES GLOBAL VARIABLES:
7631 # $context, $last_nonblank_token, $last_nonblank_type
7636 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7637 $expecting, $container_type );
7639 #-------------------------------------------
7640 # my variables, re-initialized on each call:
7641 #-------------------------------------------
7642 my $i_begin; # starting index $i
7643 my $type; # returned identifier type
7644 my $tok_begin; # starting token
7645 my $tok; # returned token
7646 my $id_scan_state_begin; # starting scan state
7647 my $identifier_begin; # starting identifier
7648 my $i_save; # a last good index, in case of error
7649 my $message; # hold error message for log file
7651 my $last_tok_is_blank;
7652 my $in_prototype_or_signature;
7657 sub initialize_my_scan_id_vars {
7659 # Initialize all 'my' vars on entry
7661 $type = EMPTY_STRING;
7662 $tok_begin = $rtokens->[$i_begin];
7664 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7665 $id_scan_state_begin = $id_scan_state;
7666 $identifier_begin = $identifier;
7669 $message = EMPTY_STRING;
7670 $tok_is_blank = undef; # a flag to speed things up
7671 $last_tok_is_blank = undef;
7673 $in_prototype_or_signature =
7674 $container_type && $container_type =~ /^sub\b/;
7676 # these flags will be used to help figure out the type:
7680 # allow old package separator (') except in 'use' statement
7681 $allow_tick = ( $last_nonblank_token ne 'use' );
7683 } ## end sub initialize_my_scan_id_vars
7685 #----------------------------------
7686 # Routines for handling scan states
7687 #----------------------------------
7688 sub do_id_scan_state_dollar {
7690 # We saw a sigil, now looking to start a variable name
7691 if ( $tok eq '$' ) {
7693 $identifier .= $tok;
7695 # we've got a punctuation variable if end of line (punct.t)
7696 if ( $i == $max_token_index ) {
7698 $id_scan_state = EMPTY_STRING;
7701 elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7703 $id_scan_state = $scan_state_COLON; # now need ::
7704 $identifier .= $tok;
7706 elsif ( $tok eq '::' ) {
7707 $id_scan_state = $scan_state_ALPHA;
7708 $identifier .= $tok;
7711 # POSTDEFREF ->@ ->% ->& ->*
7712 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7713 $identifier .= $tok;
7715 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7717 $id_scan_state = $scan_state_COLON; # now need ::
7718 $identifier .= $tok;
7720 # Perl will accept leading digits in identifiers,
7721 # although they may not always produce useful results.
7722 # Something like $main::0 is ok. But this also works:
7724 # sub howdy::123::bubba{ print "bubba $54321!\n" }
7725 # howdy::123::bubba();
7728 elsif ( $tok eq '#' ) {
7730 my $is_punct_var = $identifier eq '$$';
7732 # side comment or identifier?
7735 # A '#' starts a comment if it follows a space. For example,
7736 # the following is equivalent to $ans=40.
7741 # a # inside a prototype or signature can only start a
7743 && !$in_prototype_or_signature
7745 # these are valid punctuation vars: *# %# @# $#
7746 # May also be '$#array' or POSTDEFREF ->$#
7747 && ( $identifier =~ /^[\%\@\$\*]$/
7748 || $identifier =~ /\$$/ )
7750 # but a '#' after '$$' is a side comment; see c147
7755 $identifier .= $tok; # keep same state, a $ could follow
7759 # otherwise it is a side comment
7760 if ( $identifier eq '->' ) { }
7761 elsif ($is_punct_var) { $type = 'i' }
7762 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7763 else { $type = 'i' }
7765 $id_scan_state = EMPTY_STRING;
7769 elsif ( $tok eq '{' ) {
7771 # check for something like ${#} or ${?}, where ? is a special char
7775 || $identifier eq '@'
7776 || $identifier eq '$#'
7778 && $i + 2 <= $max_token_index
7779 && $rtokens->[ $i + 2 ] eq '}'
7780 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7783 my $next2 = $rtokens->[ $i + 2 ];
7784 my $next1 = $rtokens->[ $i + 1 ];
7785 $identifier .= $tok . $next1 . $next2;
7787 $id_scan_state = EMPTY_STRING;
7791 # skip something like ${xxx} or ->{
7792 $id_scan_state = EMPTY_STRING;
7794 # if this is the first token of a line, any tokens for this
7795 # identifier have already been accumulated
7796 if ( $identifier eq '$' || $i == 0 ) {
7797 $identifier = EMPTY_STRING;
7803 # space ok after leading $ % * & @
7804 elsif ( $tok =~ /^\s*$/ ) {
7808 # note: an id with a leading '&' does not actually come this way
7809 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7811 if ( length($identifier) > 1 ) {
7812 $id_scan_state = EMPTY_STRING;
7814 $type = 'i'; # probably punctuation variable
7818 # fix c139: trim line-ending type 't'
7819 if ( $i == $max_token_index ) {
7824 # spaces after $'s are common, and space after @
7825 # is harmless, so only complain about space
7826 # after other type characters. Space after $ and
7827 # @ will be removed in formatting. Report space
7828 # after % and * because they might indicate a
7829 # parsing error. In other words '% ' might be a
7830 # modulo operator. Delete this warning if it
7832 elsif ( $identifier !~ /^[\@\$]$/ ) {
7834 "Space in identifier, following $identifier\n";
7837 ## ok: silently accept space after '$' and '@' sigils
7842 elsif ( $identifier eq '->' ) {
7844 # space after '->' is ok except at line end ..
7845 # so trim line-ending in type '->' (fixes c139)
7846 if ( $i == $max_token_index ) {
7852 # stop at space after something other than -> or sigil
7853 # Example of what can arrive here:
7854 # eval { $MyClass->$$ };
7856 $id_scan_state = EMPTY_STRING;
7861 elsif ( $tok eq '^' ) {
7863 # check for some special variables like $^ $^W
7864 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7865 $identifier .= $tok;
7868 # There may be one more character, not a space, after the ^
7869 my $next1 = $rtokens->[ $i + 1 ];
7870 my $chr = substr( $next1, 0, 1 );
7871 if ( $is_special_variable_char{$chr} ) {
7873 # It is something like $^W
7874 # Test case (c066) : $^Oeq'linux'
7876 $identifier .= $next1;
7878 # If pretoken $next1 is more than one character long,
7879 # set a flag indicating that it needs to be split.
7881 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7886 # Simple test case (c065): '$aa=$^if($bb)';
7887 $id_scan_state = EMPTY_STRING;
7891 $id_scan_state = EMPTY_STRING;
7895 else { # something else
7897 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7899 # We might be in an extrusion of
7900 # sub foo2 ( $first, $, $third ) {
7901 # looking at a line starting with a comma, like
7904 # in this case the comma ends the signature variable
7905 # '$' which will have been previously marked type 't'
7907 if ( $i == $i_begin ) {
7908 $identifier = EMPTY_STRING;
7909 $type = EMPTY_STRING;
7912 # at a # we have to mark as type 't' because more may
7913 # follow, otherwise, in a signature we can let '$' be an
7914 # identifier here for better formatting.
7915 # See 'mangle4.in' for a test case.
7918 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
7923 $id_scan_state = EMPTY_STRING;
7926 # check for various punctuation variables
7927 elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
7928 $identifier .= $tok;
7931 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
7933 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7935 $identifier .= $tok;
7938 elsif ( $identifier eq '$#' ) {
7940 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7942 # perl seems to allow just these: $#: $#- $#+
7943 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7945 $identifier .= $tok;
7949 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7952 elsif ( $identifier eq '$$' ) {
7954 # perl does not allow references to punctuation
7955 # variables without braces. For example, this
7959 # You would have to use
7962 # '$$' alone is punctuation variable for PID
7964 if ( $tok eq '{' ) { $type = 't' }
7965 else { $type = 'i' }
7967 elsif ( $identifier eq '->' ) {
7972 if ( length($identifier) == 1 ) {
7973 $identifier = EMPTY_STRING;
7976 $id_scan_state = EMPTY_STRING;
7979 } ## end sub do_id_scan_state_dollar
7981 sub do_id_scan_state_alpha {
7983 # looking for alphanumeric after ::
7984 $tok_is_blank = $tok =~ /^\s*$/;
7986 if ( $tok =~ /^\w/ ) { # found it
7987 $identifier .= $tok;
7988 $id_scan_state = $scan_state_COLON; # now need ::
7991 elsif ( $tok eq "'" && $allow_tick ) {
7992 $identifier .= $tok;
7993 $id_scan_state = $scan_state_COLON; # now need ::
7996 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7997 $id_scan_state = $scan_state_LPAREN;
7998 $identifier .= $tok;
8000 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8001 $id_scan_state = $scan_state_RPAREN;
8002 $identifier .= $tok;
8005 $id_scan_state = EMPTY_STRING;
8009 } ## end sub do_id_scan_state_alpha
8011 sub do_id_scan_state_colon {
8013 # looking for possible :: after alphanumeric
8015 $tok_is_blank = $tok =~ /^\s*$/;
8017 if ( $tok eq '::' ) { # got it
8018 $identifier .= $tok;
8019 $id_scan_state = $scan_state_ALPHA; # now require alpha
8021 elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
8022 $identifier .= $tok;
8023 $id_scan_state = $scan_state_COLON; # now need ::
8026 elsif ( $tok eq "'" && $allow_tick ) { # tick
8028 if ( $is_keyword{$identifier} ) {
8029 $id_scan_state = EMPTY_STRING; # that's all
8033 $identifier .= $tok;
8036 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
8037 $id_scan_state = $scan_state_LPAREN;
8038 $identifier .= $tok;
8040 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
8041 $id_scan_state = $scan_state_RPAREN;
8042 $identifier .= $tok;
8045 $id_scan_state = EMPTY_STRING; # that's all
8049 } ## end sub do_id_scan_state_colon
8051 sub do_id_scan_state_left_paren {
8053 # looking for possible '(' of a prototype
8055 if ( $tok eq '(' ) { # got it
8056 $identifier .= $tok;
8057 $id_scan_state = $scan_state_RPAREN; # now find the end of it
8059 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
8060 $identifier .= $tok;
8064 $id_scan_state = EMPTY_STRING; # that's all - no prototype
8068 } ## end sub do_id_scan_state_left_paren
8070 sub do_id_scan_state_right_paren {
8072 # looking for a ')' of prototype to close a '('
8074 $tok_is_blank = $tok =~ /^\s*$/;
8076 if ( $tok eq ')' ) { # got it
8077 $identifier .= $tok;
8078 $id_scan_state = EMPTY_STRING; # all done
8080 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
8081 $identifier .= $tok;
8083 else { # probable error in script, but keep going
8084 warning("Unexpected '$tok' while seeking end of prototype\n");
8085 $identifier .= $tok;
8088 } ## end sub do_id_scan_state_right_paren
8090 sub do_id_scan_state_ampersand {
8092 # Starting sub call after seeing an '&'
8094 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
8095 $id_scan_state = $scan_state_COLON; # now need ::
8097 $identifier .= $tok;
8099 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
8100 $id_scan_state = $scan_state_COLON; # now need ::
8102 $identifier .= $tok;
8104 elsif ( $tok =~ /^\s*$/ ) { # allow space
8107 # fix c139: trim line-ending type 't'
8108 if ( length($identifier) == 1 && $i == $max_token_index ) {
8113 elsif ( $tok eq '::' ) { # leading ::
8114 $id_scan_state = $scan_state_ALPHA; # accept alpha next
8115 $identifier .= $tok;
8117 elsif ( $tok eq '{' ) {
8118 if ( $identifier eq '&' || $i == 0 ) {
8119 $identifier = EMPTY_STRING;
8122 $id_scan_state = EMPTY_STRING;
8124 elsif ( $tok eq '^' ) {
8125 if ( $identifier eq '&' ) {
8127 # Special variable (c066)
8128 $identifier .= $tok;
8131 # There may be one more character, not a space, after the ^
8132 my $next1 = $rtokens->[ $i + 1 ];
8133 my $chr = substr( $next1, 0, 1 );
8134 if ( $is_special_variable_char{$chr} ) {
8136 # It is something like &^O
8138 $identifier .= $next1;
8140 # If pretoken $next1 is more than one character long,
8141 # set a flag indicating that it needs to be split.
8143 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
8148 $id_scan_state = EMPTY_STRING;
8152 $identifier = EMPTY_STRING;
8158 # punctuation variable?
8159 # testfile: cunningham4.pl
8161 # We have to be careful here. If we are in an unknown state,
8162 # we will reject the punctuation variable. In the following
8163 # example the '&' is a binary operator but we are in an unknown
8164 # state because there is no sigil on 'Prima', so we don't
8165 # know what it is. But it is a bad guess that
8166 # '&~' is a function variable.
8167 # $self->{text}->{colorMap}->[
8168 # Prima::PodView::COLOR_CODE_FOREGROUND
8169 # & ~tb::COLOR_INDEX ] =
8172 # Fix for case c033: a '#' here starts a side comment
8173 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
8174 $identifier .= $tok;
8177 $identifier = EMPTY_STRING;
8181 $id_scan_state = EMPTY_STRING;
8184 } ## end sub do_id_scan_state_ampersand
8186 #-------------------
8187 # hash of scanner subs
8188 #-------------------
8189 my $scan_identifier_code = {
8190 $scan_state_SIGIL => \&do_id_scan_state_dollar,
8191 $scan_state_ALPHA => \&do_id_scan_state_alpha,
8192 $scan_state_COLON => \&do_id_scan_state_colon,
8193 $scan_state_LPAREN => \&do_id_scan_state_left_paren,
8194 $scan_state_RPAREN => \&do_id_scan_state_right_paren,
8195 $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8198 sub scan_complex_identifier {
8200 # This routine assembles tokens into identifiers. It maintains a
8201 # scan state, id_scan_state. It updates id_scan_state based upon
8202 # current id_scan_state and token, and returns an updated
8203 # id_scan_state and the next index after the identifier.
8205 # This routine now serves a a backup for sub scan_simple_identifier
8206 # which handles most identifiers.
8209 $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
8210 $expecting, $container_type
8213 # return flag telling caller to split the pretoken
8214 my $split_pretoken_flag;
8216 #-------------------
8217 # Initialize my vars
8218 #-------------------
8220 initialize_my_scan_id_vars();
8222 #--------------------------------------------------------
8223 # get started by defining a type and a state if necessary
8224 #--------------------------------------------------------
8226 if ( !$id_scan_state ) {
8227 $context = UNKNOWN_CONTEXT;
8230 if ( $tok eq '>' ) {
8236 if ( $last_nonblank_token eq '->' ) {
8237 $identifier = '->' . $identifier;
8238 $id_scan_state = $scan_state_SIGIL;
8240 elsif ( $tok eq '$' || $tok eq '*' ) {
8241 $id_scan_state = $scan_state_SIGIL;
8242 $context = SCALAR_CONTEXT;
8244 elsif ( $tok eq '%' || $tok eq '@' ) {
8245 $id_scan_state = $scan_state_SIGIL;
8246 $context = LIST_CONTEXT;
8248 elsif ( $tok eq '&' ) {
8249 $id_scan_state = $scan_state_AMPERSAND;
8251 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8252 $saw_alpha = 0; # 'sub' is considered type info here
8253 $id_scan_state = $scan_state_SIGIL;
8255 SPACE; # need a space to separate sub from sub name
8257 elsif ( $tok eq '::' ) {
8258 $id_scan_state = $scan_state_ALPHA;
8260 elsif ( $tok =~ /^\w/ ) {
8261 $id_scan_state = $scan_state_COLON;
8264 elsif ( $tok eq '->' ) {
8265 $id_scan_state = $scan_state_SIGIL;
8269 # shouldn't happen: bad call parameter
8271 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
8272 if (DEVEL_MODE) { Fault($msg) }
8273 if ( !$tokenizer_self->[_in_error_] ) {
8275 $tokenizer_self->[_in_error_] = 1;
8277 $id_scan_state = EMPTY_STRING;
8282 $saw_type = !$saw_alpha;
8286 $saw_alpha = ( $tok =~ /^\w/ );
8287 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8289 # check for a valid starting state
8290 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8292 Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8297 #------------------------------
8298 # loop to gather the identifier
8299 #------------------------------
8303 while ( $i < $max_token_index && $id_scan_state ) {
8305 # Be sure we have code to handle this state before we proceed
8306 my $code = $scan_identifier_code->{$id_scan_state};
8309 if ( $id_scan_state eq $scan_state_SPLIT ) {
8310 ## OK: this is the signal to exit and split the pretoken
8313 # unknown state - should not happen
8317 Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8318 Scan state at sub entry was '$id_scan_state_begin'
8321 $id_scan_state = EMPTY_STRING;
8327 # Remember the starting index for progress check below
8328 my $i_start_loop = $i;
8330 $last_tok_is_blank = $tok_is_blank;
8331 if ($tok_is_blank) { $tok_is_blank = undef }
8332 else { $i_save = $i }
8334 $tok = $rtokens->[ ++$i ];
8336 # patch to make digraph :: if necessary
8337 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8344 # check for forward progress: a decrease in the index $i
8345 # implies that scanning has finished
8346 last if ( $i <= $i_start_loop );
8348 } ## end of main loop
8354 # Be sure a valid state is returned
8355 if ($id_scan_state) {
8357 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8359 if ( $id_scan_state eq $scan_state_SPLIT ) {
8360 $split_pretoken_flag = 1;
8363 if ( $id_scan_state eq $scan_state_RPAREN ) {
8365 "Hit end of line while seeking ) to end prototype\n");
8368 $id_scan_state = EMPTY_STRING;
8371 # Patch: the deprecated variable $# does not combine with anything
8373 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
8376 # Be sure the token index is valid
8377 if ( $i < 0 ) { $i = 0 }
8379 # Be sure a token type is defined
8386 # The type without the -> should be the same as with the -> so
8387 # that if they get separated we get the same bond strengths,
8389 if ( $identifier =~ /^->/
8390 && $last_nonblank_type eq 'w'
8391 && substr( $identifier, 2, 1 ) =~ /^\w/ )
8395 else { $type = 'i' }
8397 elsif ( $identifier eq '->' ) {
8401 ( length($identifier) > 1 )
8403 # In something like '@$=' we have an identifier '@$'
8404 # In something like '$${' we have type '$$' (and only
8405 # part of an identifier)
8406 && !( $identifier =~ /\$$/ && $tok eq '{' )
8408 ## && ( $identifier !~ /^(sub |package )$/ )
8409 && $identifier ne 'sub '
8410 && $identifier ne 'package '
8415 else { $type = 't' }
8417 elsif ($saw_alpha) {
8419 # type 'w' includes anything without leading type info
8420 # ($,%,@,*) including something like abc::def::ghi
8423 # Fix for b1337, if restarting scan after line break between
8424 # '->' or sigil and identifier name, use type 'i'
8425 if ( $id_scan_state_begin
8426 && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8432 $type = EMPTY_STRING;
8433 } # this can happen on a restart
8436 # See if we formed an identifier...
8439 if ($message) { write_logfile_entry($message) }
8442 # did not find an identifier, back up
8450 DEBUG_SCAN_ID && do {
8451 my ( $a, $b, $c ) = caller;
8453 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8455 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8457 return ( $i, $tok, $type, $id_scan_state, $identifier,
8458 $split_pretoken_flag );
8459 } ## end sub scan_complex_identifier
8460 } ## end closure for sub scan_complex_identifier
8462 { ## closure for sub do_scan_sub
8464 my %warn_if_lexical;
8468 # lexical subs with these names can cause parsing errors in this version
8469 my @q = qw( m q qq qr qw qx s tr y );
8470 @{warn_if_lexical}{@q} = (1) x scalar(@q);
8473 # saved package and subnames in case prototype is on separate line
8474 my ( $package_saved, $subname_saved );
8476 # initialize subname each time a new 'sub' keyword is encountered
8477 sub initialize_subname {
8478 $package_saved = EMPTY_STRING;
8479 $subname_saved = EMPTY_STRING;
8486 PROTOTYPE_CALL => 3,
8491 # do_scan_sub parses a sub name and prototype.
8493 # At present there are three basic CALL TYPES which are
8494 # distinguished by the starting value of '$tok':
8495 # 1. $tok='sub', id_scan_state='sub'
8496 # it is called with $i_beg equal to the index of the first nonblank
8497 # token following a 'sub' token.
8498 # 2. $tok='(', id_scan_state='sub',
8499 # it is called with $i_beg equal to the index of a '(' which may
8500 # start a prototype.
8501 # 3. $tok='prototype', id_scan_state='prototype'
8502 # it is called with $i_beg equal to the index of a '(' which is
8503 # preceded by ': prototype' and has $id_scan_state eq 'prototype'
8507 # A single type 1 call will get both the sub and prototype
8508 # sub foo1 ( $$ ) { }
8511 # The subname will be obtained with a 'sub' call
8512 # The prototype on line 2 will be obtained with a '(' call
8518 # The subname will be obtained with a 'sub' call
8519 # The prototype will be obtained with a 'prototype' call
8520 # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8521 # ^ <---type 1 ^ <---type 3
8523 # TODO: add future error checks to be sure we have a valid
8524 # sub name. For example, 'sub &doit' is wrong. Also, be sure
8525 # a name is given if and only if a non-anonymous sub is
8527 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8528 # $in_attribute_list, %saw_function_definition,
8531 my ($rinput_hash) = @_;
8533 my $input_line = $rinput_hash->{input_line};
8534 my $i = $rinput_hash->{i};
8535 my $i_beg = $rinput_hash->{i_beg};
8536 my $tok = $rinput_hash->{tok};
8537 my $type = $rinput_hash->{type};
8538 my $rtokens = $rinput_hash->{rtokens};
8539 my $rtoken_map = $rinput_hash->{rtoken_map};
8540 my $id_scan_state = $rinput_hash->{id_scan_state};
8541 my $max_token_index = $rinput_hash->{max_token_index};
8545 # Determine the CALL TYPE
8550 $tok eq 'prototype' ? PROTOTYPE_CALL
8551 : $tok eq '(' ? PAREN_CALL
8554 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
8555 my $subname = $subname_saved;
8556 my $package = $package_saved;
8561 my $pos_beg = $rtoken_map->[$i_beg];
8562 pos($input_line) = $pos_beg;
8564 # Look for the sub NAME if this is a SUB call
8566 $call_type == SUB_CALL
8567 && $input_line =~ m/\G\s*
8568 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
8569 (\w+) # NAME - required
8576 my $is_lexical_sub =
8577 $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8578 if ( $is_lexical_sub && $1 ) {
8579 warning("'my' sub $subname cannot be in package '$1'\n");
8580 $is_lexical_sub = 0;
8583 if ($is_lexical_sub) {
8585 # lexical subs use the block sequence number as a package name
8587 $current_sequence_number[BRACE][ $current_depth[BRACE] ];
8588 $seqno = 1 unless ( defined($seqno) );
8590 if ( $warn_if_lexical{$subname} ) {
8592 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8597 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8598 $package =~ s/\'/::/g;
8599 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
8600 $package =~ s/::$//;
8603 my $pos = pos($input_line);
8604 my $numc = $pos - $pos_beg;
8605 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
8608 # remember the sub name in case another call is needed to
8610 $package_saved = $package;
8611 $subname_saved = $subname;
8614 # Now look for PROTO ATTRS for all call types
8615 # Look for prototype/attributes which are usually on the same
8616 # line as the sub name but which might be on a separate line.
8617 # For example, we might have an anonymous sub with attributes,
8618 # or a prototype on a separate line from its sub name
8620 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8621 # does not look like a prototype, we assume it is a SIGNATURE and we
8622 # will stop and let the the standard tokenizer handle it. In
8623 # particular, we stop if we see any nested parens, braces, or commas.
8624 # Also note, a valid prototype cannot contain any alphabetic character
8625 # -- see https://perldoc.perl.org/perlsub
8626 # But it appears that an underscore is valid in a prototype, so the
8627 # regex below uses [A-Za-z] rather than \w
8628 # This is the old regex which has been replaced:
8629 # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
8630 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8632 $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
8633 (\s*:)? # ATTRS leading ':'
8641 # Append the prototype to the starting token if it is 'sub' or
8642 # 'prototype'. This is not necessary but for compatibility with
8643 # previous versions when the -csc flag is used:
8644 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
8648 # If we just entered the sub at an opening paren on this call, not
8649 # a following :prototype, label it with the previous token. This is
8650 # necessary to propagate the sub name to its opening block.
8651 elsif ( $call_type == PAREN_CALL ) {
8652 $tok = $last_nonblank_token;
8657 # Patch part #1 to fixes cases b994 and b1053:
8658 # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8659 # 'sub : lvalue { ...'
8661 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
8666 # ATTRS: if there are attributes, back up and let the ':' be
8667 # found later by the scanner.
8668 my $pos = pos($input_line);
8670 $pos -= length($attrs);
8673 my $next_nonblank_token = $tok;
8675 # catch case of line with leading ATTR ':' after anonymous sub
8676 if ( $pos == $pos_beg && $tok eq ':' ) {
8678 $in_attribute_list = 1;
8681 # Otherwise, if we found a match we must convert back from
8682 # string position to the pre_token index for continued parsing.
8685 # I don't think an error flag can occur here ..but ?
8687 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8689 if ($error) { warning("Possibly invalid sub\n") }
8691 # Patch part #2 to fixes cases b994 and b1053:
8692 # Do not let spaces be part of the token of an anonymous sub
8693 # keyword which we marked as type 'k' above...i.e. for
8695 # 'sub : lvalue { ...'
8696 # Back up and let it be parsed as a blank
8700 && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8705 # check for multiple definitions of a sub
8706 ( $next_nonblank_token, my $i_next ) =
8707 find_next_nonblank_token_on_this_line( $i, $rtokens,
8711 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8712 { # skip blank or side comment
8713 my ( $rpre_tokens, $rpre_types ) =
8714 peek_ahead_for_n_nonblank_pre_tokens(1);
8715 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8716 $next_nonblank_token = $rpre_tokens->[0];
8719 $next_nonblank_token = '}';
8723 # See what's next...
8724 if ( $next_nonblank_token eq '{' ) {
8727 # Check for multiple definitions of a sub, but
8728 # it is ok to have multiple sub BEGIN, etc,
8729 # so we do not complain if name is all caps
8730 if ( $saw_function_definition{$subname}{$package}
8731 && $subname !~ /^[A-Z]+$/ )
8733 my $lno = $saw_function_definition{$subname}{$package};
8734 if ( $package =~ /^\d/ ) {
8736 "already saw definition of lexical 'sub $subname' at line $lno\n"
8742 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8743 ) unless (DEVEL_MODE);
8746 $saw_function_definition{$subname}{$package} =
8747 $tokenizer_self->[_last_line_number_];
8750 elsif ( $next_nonblank_token eq ';' ) {
8752 elsif ( $next_nonblank_token eq '}' ) {
8755 # ATTRS - if an attribute list follows, remember the name
8756 # of the sub so the next opening brace can be labeled.
8757 # Setting 'statement_type' causes any ':'s to introduce
8759 elsif ( $next_nonblank_token eq ':' ) {
8760 if ( $call_type == SUB_CALL ) {
8762 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8766 # if we stopped before an open paren ...
8767 elsif ( $next_nonblank_token eq '(' ) {
8769 # If we DID NOT see this paren above then it must be on the
8770 # next line so we will set a flag to come back here and see if
8773 # Otherwise, we assume it is a SIGNATURE rather than a
8774 # PROTOTYPE and let the normal tokenizer handle it as a list
8775 if ( !$saw_opening_paren ) {
8776 $id_scan_state = 'sub'; # we must come back to get proto
8778 if ( $call_type == SUB_CALL ) {
8780 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8783 elsif ($next_nonblank_token) { # EOF technically ok
8785 if ( $rinput_hash->{tok} eq 'method' && $call_type == SUB_CALL )
8787 # For a method call, silently ignore this error (rt145706)
8788 # to avoid needless warnings. Example which can produce it:
8789 # test(method Pack (), "method");
8791 # TODO: scan for use feature 'class' and:
8792 # - if we saw 'use feature 'class' then issue the warning.
8793 # - if we did not see use feature 'class' then issue the
8794 # warning and suggest turning off --use-feature=class
8797 $subname = EMPTY_STRING unless defined($subname);
8799 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8803 check_prototype( $proto, $package, $subname );
8806 # no match to either sub name or prototype, but line not blank
8810 return ( $i, $tok, $type, $id_scan_state );
8811 } ## end sub do_scan_sub
8814 #########################################################################
8815 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8816 #########################################################################
8818 sub find_next_nonblank_token {
8819 my ( $i, $rtokens, $max_token_index ) = @_;
8821 # Returns the next nonblank token after the token at index $i
8822 # To skip past a side comment, and any subsequent block comments
8823 # and blank lines, call with i=$max_token_index
8825 if ( $i >= $max_token_index ) {
8826 if ( !peeked_ahead() ) {
8828 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8832 my $next_nonblank_token = $rtokens->[ ++$i ];
8833 return ( SPACE, $i )
8834 unless ( defined($next_nonblank_token) && length($next_nonblank_token) );
8836 # Quick test for nonblank ascii char. Note that we just have to
8837 # examine the first character here.
8838 my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
8839 if ( $ord >= ORD_PRINTABLE_MIN
8840 && $ord <= ORD_PRINTABLE_MAX )
8842 return ( $next_nonblank_token, $i );
8845 # Quick test to skip over an ascii space or tab
8846 elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
8847 $next_nonblank_token = $rtokens->[ ++$i ];
8848 return ( SPACE, $i ) unless defined($next_nonblank_token);
8851 # Slow test to skip over something else identified as whitespace
8852 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8853 $next_nonblank_token = $rtokens->[ ++$i ];
8854 return ( SPACE, $i ) unless defined($next_nonblank_token);
8857 # We should be at a nonblank now
8858 return ( $next_nonblank_token, $i );
8859 } ## end sub find_next_nonblank_token
8861 sub find_next_noncomment_type {
8862 my ( $i, $rtokens, $max_token_index ) = @_;
8864 # Given the current character position, look ahead past any comments
8865 # and blank lines and return the next token, including digraphs and
8868 my ( $next_nonblank_token, $i_next ) =
8869 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8871 # skip past any side comment
8872 if ( $next_nonblank_token eq '#' ) {
8873 ( $next_nonblank_token, $i_next ) =
8874 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8877 # check for a digraph
8878 if ( $next_nonblank_token
8879 && $next_nonblank_token ne SPACE
8880 && defined( $rtokens->[ $i_next + 1 ] ) )
8882 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8883 if ( $is_digraph{$test2} ) {
8884 $next_nonblank_token = $test2;
8885 $i_next = $i_next + 1;
8887 # check for a trigraph
8888 if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
8889 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8890 if ( $is_trigraph{$test3} ) {
8891 $next_nonblank_token = $test3;
8892 $i_next = $i_next + 1;
8898 return ( $next_nonblank_token, $i_next );
8899 } ## end sub find_next_noncomment_type
8901 sub is_possible_numerator {
8903 # Look at the next non-comment character and decide if it could be a
8909 my ( $i, $rtokens, $max_token_index ) = @_;
8910 my $is_possible_numerator = 0;
8912 my $next_token = $rtokens->[ $i + 1 ];
8913 if ( $next_token eq '=' ) { $i++; } # handle /=
8914 my ( $next_nonblank_token, $i_next ) =
8915 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8917 if ( $next_nonblank_token eq '#' ) {
8918 ( $next_nonblank_token, $i_next ) =
8919 find_next_nonblank_token( $max_token_index, $rtokens,
8923 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8924 $is_possible_numerator = 1;
8926 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8927 $is_possible_numerator = 0;
8930 $is_possible_numerator = -1;
8933 return $is_possible_numerator;
8934 } ## end sub is_possible_numerator
8936 { ## closure for sub pattern_expected
8941 # List of tokens which may follow a pattern. Note that we will not
8942 # have formed digraphs at this point, so we will see '&' instead of
8943 # '&&' and '|' instead of '||'
8945 # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8946 my @q = qw( & && | || ? : + - * and or while if unless);
8947 push @q, ')', '}', ']', '>', ',', ';';
8948 @{pattern_test}{@q} = (1) x scalar(@q);
8951 sub pattern_expected {
8953 # This a filter for a possible pattern.
8954 # It looks at the token after a possible pattern and tries to
8955 # determine if that token could end a pattern.
8960 my ( $i, $rtokens, $max_token_index ) = @_;
8963 my $next_token = $rtokens->[ $i + 1 ];
8964 if ( $next_token =~ /^[msixpodualgc]/ ) {
8966 } # skip possible modifier
8967 my ( $next_nonblank_token, $i_next ) =
8968 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8970 if ( $pattern_test{$next_nonblank_token} ) {
8975 # Added '#' to fix issue c044
8976 if ( $next_nonblank_token =~ /^\s*$/
8977 || $next_nonblank_token eq '#' )
8986 } ## end sub pattern_expected
8989 sub find_next_nonblank_token_on_this_line {
8990 my ( $i, $rtokens, $max_token_index ) = @_;
8991 my $next_nonblank_token;
8993 if ( $i < $max_token_index ) {
8994 $next_nonblank_token = $rtokens->[ ++$i ];
8996 if ( $next_nonblank_token =~ /^\s*$/ ) {
8998 if ( $i < $max_token_index ) {
8999 $next_nonblank_token = $rtokens->[ ++$i ];
9004 $next_nonblank_token = EMPTY_STRING;
9006 return ( $next_nonblank_token, $i );
9007 } ## end sub find_next_nonblank_token_on_this_line
9009 sub find_angle_operator_termination {
9011 # We are looking at a '<' and want to know if it is an angle operator.
9013 # $i = pretoken index of ending '>' if found, current $i otherwise
9014 # $type = 'Q' if found, '>' otherwise
9015 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
9018 pos($input_line) = 1 + $rtoken_map->[$i];
9022 # we just have to find the next '>' if a term is expected
9023 if ( $expecting == TERM ) { $filter = '[\>]' }
9025 # we have to guess if we don't know what is expected
9026 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
9028 # shouldn't happen - we shouldn't be here if operator is expected
9032 Bad call to find_angle_operator_termination
9035 return ( $i, $type );
9038 # To illustrate what we might be looking at, in case we are
9039 # guessing, here are some examples of valid angle operators
9046 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
9047 # <${PREFIX}*img*.$IMAGE_TYPE>
9048 # <img*.$IMAGE_TYPE>
9049 # <Timg*.$IMAGE_TYPE>
9050 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
9052 # Here are some examples of lines which do not have angle operators:
9053 # return unless $self->[2]++ < $#{$self->[1]};
9056 # the following line from dlister.pl caused trouble:
9057 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
9059 # If the '<' starts an angle operator, it must end on this line and
9060 # it must not have certain characters like ';' and '=' in it. I use
9061 # this to limit the testing. This filter should be improved if
9064 if ( $input_line =~ /($filter)/g ) {
9068 # We MAY have found an angle operator termination if we get
9069 # here, but we need to do more to be sure we haven't been
9071 my $pos = pos($input_line);
9073 my $pos_beg = $rtoken_map->[$i];
9074 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
9076 # Test for '<' after possible filehandle, issue c103
9077 # print $fh <>; # syntax error
9078 # print $fh <DATA>; # ok
9079 # print $fh < DATA>; # syntax error at '>'
9080 # print STDERR < DATA>; # ok, prints word 'DATA'
9081 # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
9082 if ( $last_nonblank_type eq 'Z' ) {
9084 # $str includes brackets; something like '<DATA>'
9085 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
9086 && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
9088 return ( $i, $type );
9092 # Reject if the closing '>' follows a '-' as in:
9093 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
9094 if ( $expecting eq UNKNOWN ) {
9095 my $check = substr( $input_line, $pos - 2, 1 );
9096 if ( $check eq '-' ) {
9097 return ( $i, $type );
9101 ######################################debug#####
9102 #write_diagnostics( "ANGLE? :$str\n");
9103 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
9104 ######################################debug#####
9108 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9110 # It may be possible that a quote ends midway in a pretoken.
9111 # If this happens, it may be necessary to split the pretoken.
9115 unexpected error condition returned by inverse_pretoken_map
9119 "Possible tokinization error..please check this line\n");
9122 # count blanks on inside of brackets
9123 my $blank_count = 0;
9124 $blank_count++ if ( $str =~ /<\s+/ );
9125 $blank_count++ if ( $str =~ /\s+>/ );
9127 # Now let's see where we stand....
9128 # OK if math op not possible
9129 if ( $expecting == TERM ) {
9132 # OK if there are no more than 2 non-blank pre-tokens inside
9133 # (not possible to write 2 token math between < and >)
9134 # This catches most common cases
9135 elsif ( $i <= $i_beg + 3 + $blank_count ) {
9137 # No longer any need to document this common case
9138 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
9141 # OK if there is some kind of identifier inside
9142 # print $fh <tvg::INPUT>;
9143 elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
9144 write_diagnostics("ANGLE (contains identifier): $str\n");
9150 # Let's try a Brace Test: any braces inside must balance
9152 while ( $str =~ /\{/g ) { $br++ }
9153 while ( $str =~ /\}/g ) { $br-- }
9155 while ( $str =~ /\[/g ) { $sb++ }
9156 while ( $str =~ /\]/g ) { $sb-- }
9158 while ( $str =~ /\(/g ) { $pr++ }
9159 while ( $str =~ /\)/g ) { $pr-- }
9161 # if braces do not balance - not angle operator
9162 if ( $br || $sb || $pr ) {
9166 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
9169 # we should keep doing more checks here...to be continued
9170 # Tentatively accepting this as a valid angle operator.
9171 # There are lots more things that can be checked.
9174 "ANGLE-Guessing yes: $str expecting=$expecting\n");
9175 write_logfile_entry("Guessing angle operator here: $str\n");
9180 # didn't find ending >
9182 if ( $expecting == TERM ) {
9183 warning("No ending > for angle operator\n");
9187 return ( $i, $type );
9188 } ## end sub find_angle_operator_termination
9190 sub scan_number_do {
9192 # scan a number in any of the formats that Perl accepts
9193 # Underbars (_) are allowed in decimal numbers.
9194 # input parameters -
9195 # $input_line - the string to scan
9196 # $i - pre_token index to start scanning
9197 # $rtoken_map - reference to the pre_token map giving starting
9198 # character position in $input_line of token $i
9199 # output parameters -
9200 # $i - last pre_token index of the number just scanned
9201 # number - the number (characters); or undef if not a number
9203 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
9204 my $pos_beg = $rtoken_map->[$i];
9208 my $type = $input_type;
9210 my $first_char = substr( $input_line, $pos_beg, 1 );
9212 # Look for bad starting characters; Shouldn't happen..
9213 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9216 Program bug - scan_number given bad first character = '$first_char'
9219 return ( $i, $type, $number );
9222 # handle v-string without leading 'v' character ('Two Dot' rule)
9224 # Here is the format prior to including underscores:
9225 ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9226 pos($input_line) = $pos_beg;
9227 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9228 $pos = pos($input_line);
9229 my $numc = $pos - $pos_beg;
9230 $number = substr( $input_line, $pos_beg, $numc );
9232 report_v_string($number);
9235 # handle octal, hex, binary
9236 if ( !defined($number) ) {
9237 pos($input_line) = $pos_beg;
9239 # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9240 # For reference, the format prior to hex floating point is:
9241 # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9242 # (hex) (octal) (binary)
9246 /\G[+-]?0( # leading [signed] 0
9248 # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9249 ([xX][0-9a-fA-F_]* # X and optional leading digits
9250 (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9251 [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9252 [0-9a-fA-F_]*) # optional Additional exponent digits
9255 |([xX][0-9a-fA-F_]+)
9258 |([oO]?[0-7_]+ # string of octal digits
9259 (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9260 [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9261 [0-7_]*) # Additional exponent digits with underscores
9264 |([oO]?[0-7_]+) # string of octal digits
9267 |([bB][01_]* # 'b' with string of binary digits
9268 (\.([01][01_]*)?)? # optional decimal and fraction
9269 [Pp][+-]?[01] # Required exponent indicator, no underscore
9270 [01_]*) # additional exponent bits
9273 |([bB][01_]+) # 'b' with string of binary digits
9278 $pos = pos($input_line);
9279 my $numc = $pos - $pos_beg;
9280 $number = substr( $input_line, $pos_beg, $numc );
9286 if ( !defined($number) ) {
9287 pos($input_line) = $pos_beg;
9289 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9290 $pos = pos($input_line);
9292 # watch out for things like 0..40 which would give 0. by this;
9293 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9294 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9298 my $numc = $pos - $pos_beg;
9299 $number = substr( $input_line, $pos_beg, $numc );
9304 # filter out non-numbers like e + - . e2 .e3 +e6
9305 # the rule: at least one digit, and any 'e' must be preceded by a digit
9307 $number !~ /\d/ # no digits
9308 || ( $number =~ /^(.*)[eE]/
9309 && $1 !~ /\d/ ) # or no digits before the 'e'
9313 $type = $input_type;
9314 return ( $i, $type, $number );
9317 # Found a number; now we must convert back from character position
9318 # to pre_token index. An error here implies user syntax error.
9319 # An example would be an invalid octal number like '009'.
9322 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9323 if ($error) { warning("Possibly invalid number\n") }
9325 return ( $i, $type, $number );
9326 } ## end sub scan_number_do
9328 sub inverse_pretoken_map {
9330 # Starting with the current pre_token index $i, scan forward until
9331 # finding the index of the next pre_token whose position is $pos.
9332 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9335 while ( ++$i <= $max_token_index ) {
9337 if ( $pos <= $rtoken_map->[$i] ) {
9339 # Let the calling routine handle errors in which we do not
9340 # land on a pre-token boundary. It can happen by running
9341 # perltidy on some non-perl scripts, for example.
9342 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
9347 return ( $i, $error );
9348 } ## end sub inverse_pretoken_map
9352 # find the target of a here document, if any
9354 # $i - token index of the second < of <<
9355 # ($i must be less than the last token index if this is called)
9356 # output parameters:
9357 # $found_target = 0 didn't find target; =1 found target
9358 # HERE_TARGET - the target string (may be empty string)
9359 # $i - unchanged if not here doc,
9360 # or index of the last token of the here target
9361 # $saw_error - flag noting unbalanced quote on here target
9362 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9364 my $found_target = 0;
9365 my $here_doc_target = EMPTY_STRING;
9366 my $here_quote_character = EMPTY_STRING;
9368 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9369 $next_token = $rtokens->[ $i + 1 ];
9371 # perl allows a backslash before the target string (heredoc.t)
9373 if ( $next_token eq '\\' ) {
9375 $next_token = $rtokens->[ $i + 2 ];
9378 ( $next_nonblank_token, $i_next_nonblank ) =
9379 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9381 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
9384 my $quote_depth = 0;
9389 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9392 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9393 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9395 if ($in_quote) { # didn't find end of quote, so no target found
9397 if ( $expecting == TERM ) {
9399 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9404 else { # found ending quote
9408 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9409 $tokj = $rtokens->[$j];
9411 # we have to remove any backslash before the quote character
9412 # so that the here-doc-target exactly matches this string
9416 && $rtokens->[ $j + 1 ] eq $here_quote_character );
9417 $here_doc_target .= $tokj;
9422 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9424 write_logfile_entry(
9425 "found blank here-target after <<; suggest using \"\"\n");
9428 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
9430 my $here_doc_expected;
9431 if ( $expecting == UNKNOWN ) {
9432 $here_doc_expected = guess_if_here_doc($next_token);
9435 $here_doc_expected = 1;
9438 if ($here_doc_expected) {
9440 $here_doc_target = $next_token;
9447 if ( $expecting == TERM ) {
9449 write_logfile_entry("Note: bare here-doc operator <<\n");
9456 # patch to neglect any prepended backslash
9457 if ( $found_target && $backslash ) { $i++ }
9459 return ( $found_target, $here_doc_target, $here_quote_character, $i,
9461 } ## end sub find_here_doc
9465 # follow (or continue following) quoted string(s)
9466 # $in_quote return code:
9468 # 1 - still must find end of quote whose target is $quote_character
9469 # 2 - still looking for end of first of two quotes
9471 # Returns updated strings:
9472 # $quoted_string_1 = quoted string seen while in_quote=1
9473 # $quoted_string_2 = quoted string seen while in_quote=2
9490 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
9493 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9496 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9497 $quote_pos, $quote_depth, $max_token_index );
9498 $quoted_string_2 .= $quoted_string;
9499 if ( $in_quote == 1 ) {
9500 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
9501 $quote_character = EMPTY_STRING;
9504 $quoted_string_2 .= "\n";
9508 if ( $in_quote == 1 ) { # one (more) quote to follow
9511 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9514 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9515 $quote_pos, $quote_depth, $max_token_index );
9516 $quoted_string_1 .= $quoted_string;
9517 if ( $in_quote == 1 ) {
9518 $quoted_string_1 .= "\n";
9532 } ## end sub do_quote
9534 sub follow_quoted_string {
9536 # scan for a specific token, skipping escaped characters
9537 # if the quote character is blank, use the first non-blank character
9539 # $rtokens = reference to the array of tokens
9540 # $i = the token index of the first character to search
9541 # $in_quote = number of quoted strings being followed
9542 # $beginning_tok = the starting quote character
9543 # $quote_pos = index to check next for alphanumeric delimiter
9544 # output parameters:
9545 # $i = the token index of the ending quote character
9546 # $in_quote = decremented if found end, unchanged if not
9547 # $beginning_tok = the starting quote character
9548 # $quote_pos = index to check next for alphanumeric delimiter
9549 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9550 # $quoted_string = the text of the quote (without quotation tokens)
9563 my ( $tok, $end_tok );
9565 my $quoted_string = EMPTY_STRING;
9569 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9572 # get the corresponding end token
9573 if ( $beginning_tok !~ /^\s*$/ ) {
9574 $end_tok = matching_end_token($beginning_tok);
9577 # a blank token means we must find and use the first non-blank one
9579 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9581 while ( $i < $max_token_index ) {
9582 $tok = $rtokens->[ ++$i ];
9584 if ( $tok !~ /^\s*$/ ) {
9586 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9587 $i = $max_token_index;
9591 if ( length($tok) > 1 ) {
9592 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
9593 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9596 $beginning_tok = $tok;
9599 $end_tok = matching_end_token($beginning_tok);
9605 $allow_quote_comments = 1;
9610 # There are two different loops which search for the ending quote
9611 # character. In the rare case of an alphanumeric quote delimiter, we
9612 # have to look through alphanumeric tokens character-by-character, since
9613 # the pre-tokenization process combines multiple alphanumeric
9614 # characters, whereas for a non-alphanumeric delimiter, only tokens of
9615 # length 1 can match.
9617 #----------------------------------------------------------------
9618 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9619 # "quote_pos" is the position the current word to begin searching
9620 #----------------------------------------------------------------
9621 if ( $beginning_tok =~ /\w/ ) {
9623 # Note this because it is not recommended practice except
9624 # for obfuscated perl contests
9625 if ( $in_quote == 1 ) {
9626 write_logfile_entry(
9627 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9630 # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9631 while ( $i <= $max_token_index ) {
9633 if ( $quote_pos == 0 || ( $i < 0 ) ) {
9634 $tok = $rtokens->[ ++$i ];
9636 if ( $tok eq '\\' ) {
9638 # retain backslash unless it hides the end token
9639 $quoted_string .= $tok
9640 unless $rtokens->[ $i + 1 ] eq $end_tok;
9642 last if ( $i >= $max_token_index );
9643 $tok = $rtokens->[ ++$i ];
9646 my $old_pos = $quote_pos;
9648 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
9652 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9654 if ( $quote_pos > 0 ) {
9657 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9659 # NOTE: any quote modifiers will be at the end of '$tok'. If we
9660 # wanted to check them, this is the place to get them. But
9661 # this quote form is rarely used in practice, so it isn't
9666 if ( $quote_depth == 0 ) {
9672 if ( $old_pos <= length($tok) ) {
9673 $quoted_string .= substr( $tok, $old_pos );
9679 #-----------------------------------------------------------------------
9680 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9681 #-----------------------------------------------------------------------
9684 while ( $i < $max_token_index ) {
9685 $tok = $rtokens->[ ++$i ];
9687 if ( $tok eq $end_tok ) {
9690 if ( $quote_depth == 0 ) {
9695 elsif ( $tok eq $beginning_tok ) {
9698 elsif ( $tok eq '\\' ) {
9700 # retain backslash unless it hides the beginning or end token
9701 $tok = $rtokens->[ ++$i ];
9702 $quoted_string .= '\\'
9703 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
9705 $quoted_string .= $tok;
9708 if ( $i > $max_token_index ) { $i = $max_token_index }
9719 } ## end sub follow_quoted_string
9721 sub indicate_error {
9722 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9723 interrupt_logfile();
9725 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
9728 } ## end sub indicate_error
9730 sub write_error_indicator_pair {
9731 my ( $line_number, $input_line, $pos, $carrat ) = @_;
9732 my ( $offset, $numbered_line, $underline ) =
9733 make_numbered_line( $line_number, $input_line, $pos );
9734 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9735 warning( $numbered_line . "\n" );
9736 $underline =~ s/\s*$//;
9737 warning( $underline . "\n" );
9739 } ## end sub write_error_indicator_pair
9741 sub make_numbered_line {
9743 # Given an input line, its line number, and a character position of
9744 # interest, create a string not longer than 80 characters of the form
9745 # $lineno: sub_string
9746 # such that the sub_string of $str contains the position of interest
9748 # Here is an example of what we want, in this case we add trailing
9749 # '...' because the line is long.
9751 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9753 # Here is another example, this time in which we used leading '...'
9754 # because of excessive length:
9756 # 2: ... er of the World Wide Web Consortium's
9758 # input parameters are:
9759 # $lineno = line number
9760 # $str = the text of the line
9761 # $pos = position of interest (the error) : 0 = first character
9764 # - $offset = an offset which corrects the position in case we only
9765 # display part of a line, such that $pos-$offset is the effective
9766 # position from the start of the displayed line.
9767 # - $numbered_line = the numbered line as above,
9768 # - $underline = a blank 'underline' which is all spaces with the same
9769 # number of characters as the numbered line.
9771 my ( $lineno, $str, $pos ) = @_;
9772 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9773 my $excess = length($str) - $offset - 68;
9774 my $numc = ( $excess > 0 ) ? 68 : undef;
9776 if ( defined($numc) ) {
9777 if ( $offset == 0 ) {
9778 $str = substr( $str, $offset, $numc - 4 ) . " ...";
9781 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9786 if ( $offset == 0 ) {
9789 $str = "... " . substr( $str, $offset + 4 );
9793 my $numbered_line = sprintf( "%d: ", $lineno );
9794 $offset -= length($numbered_line);
9795 $numbered_line .= $str;
9796 my $underline = SPACE x length($numbered_line);
9797 return ( $offset, $numbered_line, $underline );
9798 } ## end sub make_numbered_line
9800 sub write_on_underline {
9802 # The "underline" is a string that shows where an error is; it starts
9803 # out as a string of blanks with the same length as the numbered line of
9804 # code above it, and we have to add marking to show where an error is.
9805 # In the example below, we want to write the string '--^' just below
9806 # the line of bad code:
9808 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9810 # We are given the current underline string, plus a position and a
9811 # string to write on it.
9813 # In the above example, there will be 2 calls to do this:
9814 # First call: $pos=19, pos_chr=^
9815 # Second call: $pos=16, pos_chr=---
9817 # This is a trivial thing to do with substr, but there is some
9820 my ( $underline, $pos, $pos_chr ) = @_;
9822 # check for error..shouldn't happen
9823 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9826 my $excess = length($pos_chr) + $pos - length($underline);
9827 if ( $excess > 0 ) {
9828 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9830 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9831 return ($underline);
9832 } ## end sub write_on_underline
9836 my ( $str, $max_tokens_wanted ) = @_;
9839 # $max_tokens_wanted > 0 to stop on reaching this many tokens.
9840 # = 0 means get all tokens
9842 # Break a string, $str, into a sequence of preliminary tokens. We
9843 # are interested in these types of tokens:
9844 # words (type='w'), example: 'max_tokens_wanted'
9845 # digits (type = 'd'), example: '0755'
9846 # whitespace (type = 'b'), example: ' '
9847 # any other single character (i.e. punct; type = the character itself).
9848 # We cannot do better than this yet because we might be in a quoted
9849 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
9852 # An advantage of doing this pre-tokenization step is that it keeps almost
9853 # all of the regex work highly localized. A disadvantage is that in some
9854 # very rare instances we will have to go back and split a pre-token.
9856 # Return parameters:
9857 my @tokens = (); # array of the tokens themselves
9858 my @token_map = (0); # string position of start of each token
9859 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9864 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9867 # note that this must come before words!
9868 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9871 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9873 # single-character punctuation
9874 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9878 return ( \@tokens, \@token_map, \@type );
9882 push @token_map, pos($str);
9884 } while ( --$max_tokens_wanted != 0 );
9886 return ( \@tokens, \@token_map, \@type );
9887 } ## end sub pre_tokenize
9891 # this is an old debug routine
9892 # not called, but saved for reference
9893 my ( $rtokens, $rtoken_map ) = @_;
9894 my $num = scalar( @{$rtokens} );
9896 foreach my $i ( 0 .. $num - 1 ) {
9897 my $len = length( $rtokens->[$i] );
9898 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9901 } ## end sub show_tokens
9903 { ## closure for sub matching end token
9904 my %matching_end_token;
9907 %matching_end_token = (
9915 sub matching_end_token {
9917 # return closing character for a pattern
9918 my $beginning_token = shift;
9919 if ( $matching_end_token{$beginning_token} ) {
9920 return $matching_end_token{$beginning_token};
9922 return ($beginning_token);
9923 } ## end sub matching_end_token
9926 sub dump_token_types {
9927 my ( $class, $fh ) = @_;
9929 # This should be the latest list of token types in use
9930 # adding NEW_TOKENS: add a comment here
9931 $fh->print(<<'END_OF_LIST');
9933 Here is a list of the token types currently used for lines of type 'CODE'.
9934 For the following tokens, the "type" of a token is just the token itself.
9936 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9937 ( ) <= >= == =~ !~ != ++ -- /= x=
9938 ... **= <<= >>= &&= ||= //= <=>
9939 , + - / * | % ! x ~ = \ ? : . < > ^ &
9941 The following additional token types are defined:
9944 b blank (white space)
9945 { indent: opening structural curly brace or square bracket or paren
9946 (code block, anonymous hash reference, or anonymous array reference)
9947 } outdent: right structural curly brace or square bracket or paren
9948 [ left non-structural square bracket (enclosing an array index)
9949 ] right non-structural square bracket
9950 ( left non-structural paren (all but a list right of an =)
9951 ) right non-structural paren
9952 L left non-structural curly brace (enclosing a key)
9953 R right non-structural curly brace
9954 ; terminal semicolon
9955 f indicates a semicolon in a "for" statement
9956 h here_doc operator <<
9958 Q indicates a quote or pattern
9959 q indicates a qw quote block
9961 C user-defined constant or constant function (with void prototype = ())
9962 U user-defined function taking parameters
9963 G user-defined function taking block parameter (like grep/map/eval)
9964 M (unused, but reserved for subroutine definition name)
9965 P (unused, but -html uses it to label pod text)
9966 t type indicater such as %,$,@,*,&,sub
9967 w bare word (perhaps a subroutine call)
9968 i identifier of some type (with leading %, $, @, *, &, sub, -> )
9971 F a file test operator (like -e)
9973 Z identifier in indirect object slot: may be file handle, object
9974 J LABEL: code block label
9975 j LABEL after next, last, redo, goto
9978 pp pre-increment operator ++
9979 mm pre-decrement operator --
9980 A : used as attribute separator
9982 Here are the '_line_type' codes used internally:
9983 SYSTEM - system-specific code before hash-bang line
9984 CODE - line of perl code (including comments)
9985 POD_START - line starting pod, such as '=head'
9986 POD - pod documentation text
9987 POD_END - last line of pod section, '=cut'
9988 HERE - text of here-document
9989 HERE_END - last line of here-doc (target word)
9990 FORMAT - format section
9991 FORMAT_END - last line of format section, '.'
9992 SKIP - code skipping section
9993 SKIP_END - last line of code skipping section, '#>>V'
9994 DATA_START - __DATA__ line
9995 DATA - unidentified text following __DATA__
9996 END_START - __END__ line
9997 END - unidentified text following __END__
9998 ERROR - we are in big trouble, probably not a perl script
10002 } ## end sub dump_token_types
10006 # These names are used in error messages
10007 @opening_brace_names = qw# '{' '[' '(' '?' #;
10008 @closing_brace_names = qw# '}' ']' ')' ':' #;
10013 .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
10014 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
10016 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
10019 . : < > * & | / - = + - % ^ ! x ~
10021 @can_start_digraph{@q} = (1) x scalar(@q);
10023 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
10024 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
10026 my @tetragraphs = qw( <<>> );
10027 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
10029 # make a hash of all valid token types for self-checking the tokenizer
10030 # (adding NEW_TOKENS : select a new character and add to this list)
10031 my @valid_token_types = qw#
10032 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
10033 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
10035 push( @valid_token_types, @digraphs );
10036 push( @valid_token_types, @trigraphs );
10037 push( @valid_token_types, @tetragraphs );
10038 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
10039 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
10041 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
10042 my @file_test_operators =
10043 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
10044 @is_file_test_operator{@file_test_operators} =
10045 (1) x scalar(@file_test_operators);
10047 # these functions have prototypes of the form (&), so when they are
10048 # followed by a block, that block MAY BE followed by an operator.
10049 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
10050 @q = qw( do eval );
10051 @is_block_operator{@q} = (1) x scalar(@q);
10053 # these functions allow an identifier in the indirect object slot
10054 @q = qw( print printf sort exec system say);
10055 @is_indirect_object_taker{@q} = (1) x scalar(@q);
10057 # Note: 'field' will be added by sub check_options if --use-feature=class
10058 @q = qw(my our state);
10059 @is_my_our_state{@q} = (1) x scalar(@q);
10061 # These tokens may precede a code block
10062 # patched for SWITCH/CASE/CATCH. Actually these could be removed
10063 # now and we could let the extended-syntax coding handle them.
10064 # Added 'default' for Switch::Plain.
10065 # Note: 'ADJUST' will be added by sub check_options if --use-feature=class
10067 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
10068 unless do while until eval for foreach map grep sort
10069 switch case given when default catch try finally);
10070 @is_code_block_token{@q} = (1) x scalar(@q);
10072 # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
10073 # to contrast it with the block types in '%is_zero_continuation_block_type'
10074 @q = qw( sort map grep eval do );
10075 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
10077 @q = qw( sort map grep );
10078 @is_sort_map_grep{@q} = (1) x scalar(@q);
10080 %is_grep_alias = ();
10082 # I'll build the list of keywords incrementally
10085 # keywords and tokens after which a value or pattern is expected,
10086 # but not an operator. In other words, these should consume terms
10087 # to their right, or at least they are not expected to be followed
10088 # immediately by operators.
10089 my @value_requestor = qw(
10318 # Note: 'ADJUST', 'field' are added by sub check_options
10319 # if --use-feature=class
10321 # patched above for SWITCH/CASE given/when err say
10322 # 'err' is a fairly safe addition.
10323 # Added 'default' for Switch::Plain. Note that we could also have
10324 # a separate set of keywords to include if we see 'use Switch::Plain'
10325 push( @Keywords, @value_requestor );
10327 # These are treated the same but are not keywords:
10332 push( @value_requestor, @extra_vr );
10334 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10336 # this list contains keywords which do not look for arguments,
10337 # so that they might be followed by an operator, or at least
10339 my @operator_requestor = qw(
10363 push( @Keywords, @operator_requestor );
10365 # These are treated the same but are not considered keywords:
10372 push( @operator_requestor, @extra_or );
10374 @expecting_operator_token{@operator_requestor} =
10375 (1) x scalar(@operator_requestor);
10377 # these token TYPES expect trailing operator but not a term
10378 # note: ++ and -- are post-increment and decrement, 'C' = constant
10379 my @operator_requestor_types = qw( ++ -- C <> q );
10380 @expecting_operator_types{@operator_requestor_types} =
10381 (1) x scalar(@operator_requestor_types);
10383 # these token TYPES consume values (terms)
10384 # note: pp and mm are pre-increment and decrement
10385 # f=semicolon in for, F=file test operator
10386 my @value_requestor_type = qw#
10387 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10388 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10389 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10390 f F pp mm Y p m U J G j >> << ^ t
10391 ~. ^. |. &. ^.= |.= &.=
10393 push( @value_requestor_type, ',' )
10394 ; # (perl doesn't like a ',' in a qw block)
10395 @expecting_term_types{@value_requestor_type} =
10396 (1) x scalar(@value_requestor_type);
10398 # Note: the following valid token types are not assigned here to
10399 # hashes requesting to be followed by values or terms, but are
10400 # instead currently hard-coded into sub operator_expected:
10401 # ) -> :: Q R Z ] b h i k n v w } #
10403 # For simple syntax checking, it is nice to have a list of operators which
10404 # will really be unhappy if not followed by a term. This includes most
10406 %really_want_term = %expecting_term_types;
10408 # with these exceptions...
10409 delete $really_want_term{'U'}; # user sub, depends on prototype
10410 delete $really_want_term{'F'}; # file test works on $_ if no following term
10411 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10413 @q = qw(q qq qx qr s y tr m);
10414 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10416 # Note added 'qw' here
10417 @q = qw(q qq qw qx qr s y tr m);
10418 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10420 # Note: 'class' will be added by sub check_options if -use-feature=class
10422 @is_package{@q} = (1) x scalar(@q);
10426 @is_comma_question_colon{@q} = (1) x scalar(@q);
10428 @q = qw( if elsif unless );
10429 @is_if_elsif_unless{@q} = (1) x scalar(@q);
10432 @is_semicolon_or_t{@q} = (1) x scalar(@q);
10434 @q = qw( if elsif unless case when );
10435 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10437 # Hash of other possible line endings which may occur.
10438 # Keep these coordinated with the regex where this is used.
10439 # Note: chr(13) = chr(015)="\r".
10440 @q = ( chr(13), chr(29), chr(26) );
10441 @other_line_endings{@q} = (1) x scalar(@q);
10443 # These keywords are handled specially in the tokenizer code:
10444 my @special_keywords = qw(
10460 push( @Keywords, @special_keywords );
10462 # Keywords after which list formatting may be used
10463 # WARNING: do not include |map|grep|eval or perl may die on
10464 # syntax errors (map1.t).
10465 my @keyword_taking_list = qw(
10540 @is_keyword_taking_list{@keyword_taking_list} =
10541 (1) x scalar(@keyword_taking_list);
10543 # perl functions which may be unary operators.
10545 # This list is used to decide if a pattern delimited by slashes, /pattern/,
10546 # can follow one of these keywords.
10548 chomp eof eval fc lc pop shift uc undef
10551 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10554 # These are keywords for which an arg may optionally be omitted. They are
10555 # currently only used to disambiguate a ? used as a ternary from one used
10556 # as a (deprecated) pattern delimiter. In the future, they might be used
10557 # to give a warning about ambiguous syntax before a /.
10558 # Note: split has been omitted (see not below).
10559 my @keywords_taking_optional_arg = qw(
10628 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10629 (1) x scalar(@keywords_taking_optional_arg);
10631 # This list is used to decide if a pattern delimited by question marks,
10632 # ?pattern?, can follow one of these keywords. Note that from perl 5.22
10633 # on, a ?pattern? is not recognized, so we can be much more strict than
10634 # with a /pattern/. Note that 'split' is not in this list. In current
10635 # versions of perl a question following split must be a ternary, but
10636 # in older versions it could be a pattern. The guessing algorithm will
10637 # decide. We are combining two lists here to simplify the test.
10638 @q = ( @keywords_taking_optional_arg, @operator_requestor );
10639 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10642 # These are not used in any way yet
10643 # my @unused_keywords = qw(
10649 # The list of keywords was originally extracted from function 'keyword' in
10650 # perl file toke.c version 5.005.03, using this utility, plus a
10651 # little editing: (file getkwd.pl):
10652 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10653 # Add 'get' prefix where necessary, then split into the above lists.
10654 # This list should be updated as necessary.
10655 # The list should not contain these special variables:
10656 # ARGV DATA ENV SIG STDERR STDIN STDOUT
10659 @is_keyword{@Keywords} = (1) x scalar(@Keywords);