1 #####################################################################
3 # The Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 our $VERSION = '20220217';
26 # this can be turned on for extra checking during development
27 use constant DEVEL_MODE => 0;
29 use Perl::Tidy::LineBuffer;
32 # PACKAGE VARIABLES for processing an entire FILE.
33 # These must be package variables because most may get localized during
34 # processing. Most are initialized in sub prepare_for_a_new_file.
40 $last_nonblank_block_type
48 %user_function_prototype
50 %is_block_list_function
51 %saw_function_definition
62 @nesting_sequence_number
63 @current_sequence_number
65 @paren_semicolon_count
66 @paren_structural_type
68 @brace_structural_type
72 @square_bracket_structural_type
75 @nested_statement_type
76 @starting_line_of_current_depth
79 # GLOBAL CONSTANTS for routines in this package,
80 # Initialized in a BEGIN block.
82 %is_indirect_object_taker
84 %expecting_operator_token
85 %expecting_operator_types
89 %is_file_test_operator
95 %is_sort_map_grep_eval_do
100 %is_keyword_taking_list
101 %is_keyword_taking_optional_arg
102 %is_keyword_rejecting_slash_as_pattern_delimiter
103 %is_keyword_rejecting_question_as_pattern_delimiter
104 %is_q_qq_qw_qx_qr_s_y_tr_m
107 %is_comma_question_colon
109 $code_skipping_pattern_begin
110 $code_skipping_pattern_end
113 # GLOBAL VARIABLES which are constant after being configured by user-supplied
114 # parameters. They remain constant as a file is being processed.
117 $rOpts_code_skipping,
118 $code_skipping_pattern_begin,
119 $code_skipping_pattern_end,
122 # possible values of operator_expected()
123 use constant TERM => -1;
124 use constant UNKNOWN => 0;
125 use constant OPERATOR => 1;
127 # possible values of context
128 use constant SCALAR_CONTEXT => -1;
129 use constant UNKNOWN_CONTEXT => 0;
130 use constant LIST_CONTEXT => 1;
132 # Maximum number of little messages; probably need not be changed.
133 use constant MAX_NAG_MESSAGES => 6;
137 # Array index names for $self.
138 # Do not combine with other BEGIN blocks (c101).
141 _rhere_target_list_ => $i++,
142 _in_here_doc_ => $i++,
143 _here_doc_target_ => $i++,
144 _here_quote_character_ => $i++,
150 _in_skipped_ => $i++,
151 _in_attribute_list_ => $i++,
153 _quote_target_ => $i++,
154 _line_start_quote_ => $i++,
155 _starting_level_ => $i++,
156 _know_starting_level_ => $i++,
158 _indent_columns_ => $i++,
159 _look_for_hash_bang_ => $i++,
161 _continuation_indentation_ => $i++,
162 _outdent_labels_ => $i++,
163 _last_line_number_ => $i++,
164 _saw_perl_dash_P_ => $i++,
165 _saw_perl_dash_w_ => $i++,
166 _saw_use_strict_ => $i++,
167 _saw_v_string_ => $i++,
169 _look_for_autoloader_ => $i++,
170 _look_for_selfloader_ => $i++,
171 _saw_autoloader_ => $i++,
172 _saw_selfloader_ => $i++,
173 _saw_hash_bang_ => $i++,
176 _saw_negative_indentation_ => $i++,
177 _started_tokenizing_ => $i++,
178 _line_buffer_object_ => $i++,
179 _debugger_object_ => $i++,
180 _diagnostics_object_ => $i++,
181 _logger_object_ => $i++,
182 _unexpected_error_count_ => $i++,
183 _started_looking_for_here_target_at_ => $i++,
184 _nearly_matched_here_target_at_ => $i++,
185 _line_of_text_ => $i++,
186 _rlower_case_labels_at_ => $i++,
187 _extended_syntax_ => $i++,
188 _maximum_level_ => $i++,
189 _true_brace_error_count_ => $i++,
190 _rOpts_maximum_level_errors_ => $i++,
191 _rOpts_maximum_unexpected_errors_ => $i++,
192 _rOpts_logfile_ => $i++,
197 { ## closure for subs to count instances
199 # methods to count instances
201 sub get_count { return $_count; }
202 sub _increment_count { return ++$_count }
203 sub _decrement_count { return --$_count }
208 $self->_decrement_count();
214 # Catch any undefined sub calls so that we are sure to get
215 # some diagnostic information. This sub should never be called
216 # except for a programming error.
218 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
219 my ( $pkg, $fname, $lno ) = caller();
220 my $my_package = __PACKAGE__;
222 ======================================================================
223 Error detected in package '$my_package', version $VERSION
224 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
225 Called from package: '$pkg'
226 Called from File '$fname' at line '$lno'
227 This error is probably due to a recent programming change
228 ======================================================================
235 Perl::Tidy::Die($msg);
236 croak "unexpected return from Perl::Tidy::Die";
242 # This routine is called for errors that really should not occur
243 # except if there has been a bug introduced by a recent program change.
244 # Please add comments at calls to Fault to explain why the call
245 # should not occur, and where to look to fix it.
246 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
247 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
248 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
249 my $input_stream_name = get_input_stream_name();
252 ==============================================================================
253 While operating on input stream with name: '$input_stream_name'
254 A fault was detected at line $line0 of sub '$subroutine1'
256 which was called from line $line1 of sub '$subroutine2'
258 This is probably an error introduced by a recent programming change.
259 Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
260 ==============================================================================
263 # We shouldn't get here, but this return is to keep Perl-Critic from
270 # See if a pattern will compile. We have to use a string eval here,
271 # but it should be safe because the pattern has been constructed
274 eval "'##'=~/$pattern/";
278 sub make_code_skipping_pattern {
279 my ( $rOpts, $opt_name, $default ) = @_;
280 my $param = $rOpts->{$opt_name};
281 unless ($param) { $param = $default }
282 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
283 if ( $param !~ /^#/ ) {
284 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
286 my $pattern = '^\s*' . $param . '\b';
287 if ( bad_pattern($pattern) ) {
289 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
297 # Check Tokenizer parameters
303 # Install any aliases to 'sub'
304 if ( $rOpts->{'sub-alias-list'} ) {
306 # Note that any 'sub-alias-list' has been preprocessed to
307 # be a trimmed, space-separated list which includes 'sub'
308 # for example, it might be 'sub method fun'
309 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
310 foreach my $word (@sub_alias_list) {
316 if ( $rOpts->{'grep-alias-list'} ) {
318 # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
319 # space-separated list
320 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
321 @{is_grep_alias}{@q} = (1) x scalar(@q);
324 $rOpts_code_skipping = $rOpts->{'code-skipping'};
325 $code_skipping_pattern_begin =
326 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
327 $code_skipping_pattern_end =
328 make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
334 my ( $class, @args ) = @_;
336 # Note: 'tabs' and 'indent_columns' are temporary and should be
339 source_object => undef,
340 debugger_object => undef,
341 diagnostics_object => undef,
342 logger_object => undef,
343 starting_level => undef,
346 look_for_hash_bang => 0,
348 look_for_autoloader => 1,
349 look_for_selfloader => 1,
350 starting_line_number => 1,
351 extended_syntax => 0,
354 my %args = ( %defaults, @args );
356 # we are given an object with a get_line() method to supply source lines
357 my $source_object = $args{source_object};
358 my $rOpts = $args{rOpts};
360 # we create another object with a get_line() and peek_ahead() method
361 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
363 # Tokenizer state data is as follows:
364 # _rhere_target_list_ reference to list of here-doc targets
365 # _here_doc_target_ the target string for a here document
366 # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
367 # to determine if interpolation is done
368 # _quote_target_ character we seek if chasing a quote
369 # _line_start_quote_ line where we started looking for a long quote
370 # _in_here_doc_ flag indicating if we are in a here-doc
371 # _in_pod_ flag set if we are in pod documentation
372 # _in_skipped_ flag set if we are in a skipped section
373 # _in_error_ flag set if we saw severe error (binary in script)
374 # _in_data_ flag set if we are in __DATA__ section
375 # _in_end_ flag set if we are in __END__ section
376 # _in_format_ flag set if we are in a format description
377 # _in_attribute_list_ flag telling if we are looking for attributes
378 # _in_quote_ flag telling if we are chasing a quote
379 # _starting_level_ indentation level of first line
380 # _line_buffer_object_ object with get_line() method to supply source code
381 # _diagnostics_object_ place to write debugging information
382 # _unexpected_error_count_ error count used to limit output
383 # _lower_case_labels_at_ line numbers where lower case labels seen
384 # _hit_bug_ program bug detected
387 $self->[_rhere_target_list_] = [];
388 $self->[_in_here_doc_] = 0;
389 $self->[_here_doc_target_] = "";
390 $self->[_here_quote_character_] = "";
391 $self->[_in_data_] = 0;
392 $self->[_in_end_] = 0;
393 $self->[_in_format_] = 0;
394 $self->[_in_error_] = 0;
395 $self->[_in_pod_] = 0;
396 $self->[_in_skipped_] = 0;
397 $self->[_in_attribute_list_] = 0;
398 $self->[_in_quote_] = 0;
399 $self->[_quote_target_] = "";
400 $self->[_line_start_quote_] = -1;
401 $self->[_starting_level_] = $args{starting_level};
402 $self->[_know_starting_level_] = defined( $args{starting_level} );
403 $self->[_tabsize_] = $args{tabsize};
404 $self->[_indent_columns_] = $args{indent_columns};
405 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
406 $self->[_trim_qw_] = $args{trim_qw};
407 $self->[_continuation_indentation_] = $args{continuation_indentation};
408 $self->[_outdent_labels_] = $args{outdent_labels};
409 $self->[_last_line_number_] = $args{starting_line_number} - 1;
410 $self->[_saw_perl_dash_P_] = 0;
411 $self->[_saw_perl_dash_w_] = 0;
412 $self->[_saw_use_strict_] = 0;
413 $self->[_saw_v_string_] = 0;
414 $self->[_hit_bug_] = 0;
415 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
416 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
417 $self->[_saw_autoloader_] = 0;
418 $self->[_saw_selfloader_] = 0;
419 $self->[_saw_hash_bang_] = 0;
420 $self->[_saw_end_] = 0;
421 $self->[_saw_data_] = 0;
422 $self->[_saw_negative_indentation_] = 0;
423 $self->[_started_tokenizing_] = 0;
424 $self->[_line_buffer_object_] = $line_buffer_object;
425 $self->[_debugger_object_] = $args{debugger_object};
426 $self->[_diagnostics_object_] = $args{diagnostics_object};
427 $self->[_logger_object_] = $args{logger_object};
428 $self->[_unexpected_error_count_] = 0;
429 $self->[_started_looking_for_here_target_at_] = 0;
430 $self->[_nearly_matched_here_target_at_] = undef;
431 $self->[_line_of_text_] = "";
432 $self->[_rlower_case_labels_at_] = undef;
433 $self->[_extended_syntax_] = $args{extended_syntax};
434 $self->[_maximum_level_] = 0;
435 $self->[_true_brace_error_count_] = 0;
436 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
437 $self->[_rOpts_maximum_unexpected_errors_] =
438 $rOpts->{'maximum-unexpected-errors'};
439 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
440 $self->[_rOpts_] = $rOpts;
443 $tokenizer_self = $self;
445 prepare_for_a_new_file();
446 find_starting_indentation_level();
448 # This is not a full class yet, so die if an attempt is made to
449 # create more than one object.
451 if ( _increment_count() > 1 ) {
453 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
460 # interface to Perl::Tidy::Logger routines
463 my $logger_object = $tokenizer_self->[_logger_object_];
464 if ($logger_object) {
465 $logger_object->warning($msg);
470 sub get_input_stream_name {
471 my $input_stream_name = "";
472 my $logger_object = $tokenizer_self->[_logger_object_];
473 if ($logger_object) {
474 $input_stream_name = $logger_object->get_input_stream_name();
476 return $input_stream_name;
481 my $logger_object = $tokenizer_self->[_logger_object_];
482 if ($logger_object) {
483 $logger_object->complain($msg);
488 sub write_logfile_entry {
490 my $logger_object = $tokenizer_self->[_logger_object_];
491 if ($logger_object) {
492 $logger_object->write_logfile_entry($msg);
497 sub interrupt_logfile {
498 my $logger_object = $tokenizer_self->[_logger_object_];
499 if ($logger_object) {
500 $logger_object->interrupt_logfile();
506 my $logger_object = $tokenizer_self->[_logger_object_];
507 if ($logger_object) {
508 $logger_object->resume_logfile();
513 sub increment_brace_error {
514 my $logger_object = $tokenizer_self->[_logger_object_];
515 if ($logger_object) {
516 $logger_object->increment_brace_error();
521 sub report_definite_bug {
522 $tokenizer_self->[_hit_bug_] = 1;
523 my $logger_object = $tokenizer_self->[_logger_object_];
524 if ($logger_object) {
525 $logger_object->report_definite_bug();
532 my $logger_object = $tokenizer_self->[_logger_object_];
533 if ($logger_object) {
534 $logger_object->brace_warning($msg);
539 sub get_saw_brace_error {
540 my $logger_object = $tokenizer_self->[_logger_object_];
541 if ($logger_object) {
542 return $logger_object->get_saw_brace_error();
549 sub get_unexpected_error_count {
551 return $self->[_unexpected_error_count_];
554 # interface to Perl::Tidy::Diagnostics routines
555 sub write_diagnostics {
557 if ( $tokenizer_self->[_diagnostics_object_] ) {
558 $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
563 sub get_maximum_level {
564 return $tokenizer_self->[_maximum_level_];
567 sub report_tokenization_errors {
571 # Report any tokenization errors and return a flag '$severe_error'.
572 # Set $severe_error = 1 if the tokenizations errors are so severe that
573 # the formatter should not attempt to format the file. Instead, it will
574 # just output the file verbatim.
576 # set severe error flag if tokenizer has encountered file reading problems
577 # (i.e. unexpected binary characters)
578 my $severe_error = $self->[_in_error_];
580 my $maxle = $self->[_rOpts_maximum_level_errors_];
581 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
582 $maxle = 1 unless defined($maxle);
583 $maxue = 0 unless defined($maxue);
585 my $level = get_indentation_level();
586 if ( $level != $tokenizer_self->[_starting_level_] ) {
587 warning("final indentation level: $level\n");
588 my $level_diff = $tokenizer_self->[_starting_level_] - $level;
589 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
591 # Set severe error flag if the level error is greater than 1.
592 # The formatter can function for any level error but it is probably
593 # best not to attempt formatting for a high level error.
594 if ( $maxle >= 0 && $level_diff > $maxle ) {
597 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
602 check_final_nesting_depths();
604 # Likewise, large numbers of brace errors usually indicate non-perl
605 # scirpts, so set the severe error flag at a low number. This is similar
606 # to the level check, but different because braces may balance but be
607 # incorrectly interlaced.
608 if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
612 if ( $tokenizer_self->[_look_for_hash_bang_]
613 && !$tokenizer_self->[_saw_hash_bang_] )
616 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
619 if ( $tokenizer_self->[_in_format_] ) {
620 warning("hit EOF while in format description\n");
623 if ( $tokenizer_self->[_in_skipped_] ) {
625 "hit EOF while in lines skipped with --code-skipping\n");
628 if ( $tokenizer_self->[_in_pod_] ) {
630 # Just write log entry if this is after __END__ or __DATA__
631 # because this happens to often, and it is not likely to be
633 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
635 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
641 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
647 if ( $tokenizer_self->[_in_here_doc_] ) {
649 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
650 my $started_looking_for_here_target_at =
651 $tokenizer_self->[_started_looking_for_here_target_at_];
652 if ($here_doc_target) {
654 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
659 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
660 (Perl will match to the end of file but this may not be intended).
663 my $nearly_matched_here_target_at =
664 $tokenizer_self->[_nearly_matched_here_target_at_];
665 if ($nearly_matched_here_target_at) {
667 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
672 # Something is seriously wrong if we ended inside a quote
673 if ( $tokenizer_self->[_in_quote_] ) {
675 my $line_start_quote = $tokenizer_self->[_line_start_quote_];
676 my $quote_target = $tokenizer_self->[_quote_target_];
678 ( $tokenizer_self->[_in_attribute_list_] )
682 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
686 if ( $tokenizer_self->[_hit_bug_] ) {
690 # Multiple "unexpected" type tokenization errors usually indicate parsing
691 # non-perl scripts, or that something is seriously wrong, so we should
692 # avoid formatting them. This can happen for example if we run perltidy on
693 # a shell script or an html file. But unfortunately this check can
694 # interfere with some extended syntaxes, such as RPerl, so it has to be off
696 my $ue_count = $tokenizer_self->[_unexpected_error_count_];
697 if ( $maxue > 0 && $ue_count > $maxue ) {
699 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
704 unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
706 write_logfile_entry("Suggest including '-w parameter'\n");
709 write_logfile_entry("Suggest including 'use warnings;'\n");
713 if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
714 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
717 unless ( $tokenizer_self->[_saw_use_strict_] ) {
718 write_logfile_entry("Suggest including 'use strict;'\n");
721 # it is suggested that labels have at least one upper case character
722 # for legibility and to avoid code breakage as new keywords are introduced
723 if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
724 my @lower_case_labels_at =
725 @{ $tokenizer_self->[_rlower_case_labels_at_] };
727 "Suggest using upper case characters in label(s)\n");
729 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
731 return $severe_error;
734 sub report_v_string {
736 # warn if this version can't handle v-strings
738 unless ( $tokenizer_self->[_saw_v_string_] ) {
739 $tokenizer_self->[_saw_v_string_] =
740 $tokenizer_self->[_last_line_number_];
744 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
750 sub is_valid_token_type {
752 return $is_valid_token_type{$type};
755 sub get_input_line_number {
756 return $tokenizer_self->[_last_line_number_];
759 # returns the next tokenized line
764 # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
765 # $square_bracket_depth, $paren_depth
767 my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
768 $tokenizer_self->[_line_of_text_] = $input_line;
770 return unless ($input_line);
772 my $input_line_number = ++$tokenizer_self->[_last_line_number_];
774 my $write_logfile_entry = sub {
776 write_logfile_entry("Line $input_line_number: $msg");
780 # Find and remove what characters terminate this line, including any
782 my $input_line_separator = "";
783 if ( chomp($input_line) ) { $input_line_separator = $/ }
785 # The first test here very significantly speeds things up, but be sure to
786 # keep the regex and hash %other_line_endings the same.
787 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
788 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
789 $input_line_separator = $2 . $input_line_separator;
793 # for backwards compatibility we keep the line text terminated with
794 # a newline character
796 $tokenizer_self->[_line_of_text_] = $input_line; # update
798 # create a data structure describing this line which will be
799 # returned to the caller.
801 # _line_type codes are:
802 # SYSTEM - system-specific code before hash-bang line
803 # CODE - line of perl code (including comments)
804 # POD_START - line starting pod, such as '=head'
805 # POD - pod documentation text
806 # POD_END - last line of pod section, '=cut'
807 # HERE - text of here-document
808 # HERE_END - last line of here-doc (target word)
809 # FORMAT - format section
810 # FORMAT_END - last line of format section, '.'
811 # SKIP - code skipping section
812 # SKIP_END - last line of code skipping section, '#>>V'
813 # DATA_START - __DATA__ line
814 # DATA - unidentified text following __DATA__
815 # END_START - __END__ line
816 # END - unidentified text following __END__
817 # ERROR - we are in big trouble, probably not a perl script
820 # _curly_brace_depth - depth of curly braces at start of line
821 # _square_bracket_depth - depth of square brackets at start of line
822 # _paren_depth - depth of parens at start of line
823 # _starting_in_quote - this line continues a multi-line quote
824 # (so don't trim leading blanks!)
825 # _ending_in_quote - this line ends in a multi-line quote
826 # (so don't trim trailing blanks!)
827 my $line_of_tokens = {
829 _line_text => $input_line,
830 _line_number => $input_line_number,
831 _guessed_indentation_level => 0,
832 _curly_brace_depth => $brace_depth,
833 _square_bracket_depth => $square_bracket_depth,
834 _paren_depth => $paren_depth,
835 _quote_character => '',
836 ## _rtoken_type => undef,
837 ## _rtokens => undef,
838 ## _rlevels => undef,
839 ## _rslevels => undef,
840 ## _rblock_type => undef,
841 ## _rcontainer_type => undef,
842 ## _rcontainer_environment => undef,
843 ## _rtype_sequence => undef,
844 ## _rnesting_tokens => undef,
845 ## _rci_levels => undef,
846 ## _rnesting_blocks => undef,
847 ## _starting_in_quote => 0,
848 ## _ending_in_quote => 0,
851 # must print line unchanged if we are in a here document
852 if ( $tokenizer_self->[_in_here_doc_] ) {
854 $line_of_tokens->{_line_type} = 'HERE';
855 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
856 my $here_quote_character = $tokenizer_self->[_here_quote_character_];
857 my $candidate_target = $input_line;
858 chomp $candidate_target;
860 # Handle <<~ targets, which are indicated here by a leading space on
861 # the here quote character
862 if ( $here_quote_character =~ /^\s/ ) {
863 $candidate_target =~ s/^\s*//;
865 if ( $candidate_target eq $here_doc_target ) {
866 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
867 $line_of_tokens->{_line_type} = 'HERE_END';
868 $write_logfile_entry->("Exiting HERE document $here_doc_target\n");
870 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
871 if ( @{$rhere_target_list} ) { # there can be multiple here targets
872 ( $here_doc_target, $here_quote_character ) =
873 @{ shift @{$rhere_target_list} };
874 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
875 $tokenizer_self->[_here_quote_character_] =
876 $here_quote_character;
877 $write_logfile_entry->(
878 "Entering HERE document $here_doc_target\n");
879 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
880 $tokenizer_self->[_started_looking_for_here_target_at_] =
884 $tokenizer_self->[_in_here_doc_] = 0;
885 $tokenizer_self->[_here_doc_target_] = "";
886 $tokenizer_self->[_here_quote_character_] = "";
890 # check for error of extra whitespace
891 # note for PERL6: leading whitespace is allowed
893 $candidate_target =~ s/\s*$//;
894 $candidate_target =~ s/^\s*//;
895 if ( $candidate_target eq $here_doc_target ) {
896 $tokenizer_self->[_nearly_matched_here_target_at_] =
900 return $line_of_tokens;
903 # Print line unchanged if we are in a format section
904 elsif ( $tokenizer_self->[_in_format_] ) {
906 if ( $input_line =~ /^\.[\s#]*$/ ) {
908 # Decrement format depth count at a '.' after a 'format'
909 $tokenizer_self->[_in_format_]--;
911 # This is the end when count reaches 0
912 if ( !$tokenizer_self->[_in_format_] ) {
913 $write_logfile_entry->("Exiting format section\n");
914 $line_of_tokens->{_line_type} = 'FORMAT_END';
918 $line_of_tokens->{_line_type} = 'FORMAT';
919 if ( $input_line =~ /^\s*format\s+\w+/ ) {
921 # Increment format depth count at a 'format' within a 'format'
922 # This is a simple way to handle nested formats (issue c019).
923 $tokenizer_self->[_in_format_]++;
926 return $line_of_tokens;
929 # must print line unchanged if we are in pod documentation
930 elsif ( $tokenizer_self->[_in_pod_] ) {
932 $line_of_tokens->{_line_type} = 'POD';
933 if ( $input_line =~ /^=cut/ ) {
934 $line_of_tokens->{_line_type} = 'POD_END';
935 $write_logfile_entry->("Exiting POD section\n");
936 $tokenizer_self->[_in_pod_] = 0;
938 if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
940 "Hash-bang in pod can cause older versions of perl to fail! \n"
944 return $line_of_tokens;
947 # print line unchanged if in skipped section
948 elsif ( $tokenizer_self->[_in_skipped_] ) {
950 $line_of_tokens->{_line_type} = 'SKIP';
951 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
952 $line_of_tokens->{_line_type} = 'SKIP_END';
953 $write_logfile_entry->("Exiting code-skipping section\n");
954 $tokenizer_self->[_in_skipped_] = 0;
956 return $line_of_tokens;
959 # must print line unchanged if we have seen a severe error (i.e., we
960 # are seeing illegal tokens and cannot continue. Syntax errors do
961 # not pass this route). Calling routine can decide what to do, but
962 # the default can be to just pass all lines as if they were after __END__
963 elsif ( $tokenizer_self->[_in_error_] ) {
964 $line_of_tokens->{_line_type} = 'ERROR';
965 return $line_of_tokens;
968 # print line unchanged if we are __DATA__ section
969 elsif ( $tokenizer_self->[_in_data_] ) {
971 # ...but look for POD
972 # Note that the _in_data and _in_end flags remain set
973 # so that we return to that state after seeing the
974 # end of a pod section
975 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
976 $line_of_tokens->{_line_type} = 'POD_START';
977 $write_logfile_entry->("Entering POD section\n");
978 $tokenizer_self->[_in_pod_] = 1;
979 return $line_of_tokens;
982 $line_of_tokens->{_line_type} = 'DATA';
983 return $line_of_tokens;
987 # print line unchanged if we are in __END__ section
988 elsif ( $tokenizer_self->[_in_end_] ) {
990 # ...but look for POD
991 # Note that the _in_data and _in_end flags remain set
992 # so that we return to that state after seeing the
993 # end of a pod section
994 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
995 $line_of_tokens->{_line_type} = 'POD_START';
996 $write_logfile_entry->("Entering POD section\n");
997 $tokenizer_self->[_in_pod_] = 1;
998 return $line_of_tokens;
1001 $line_of_tokens->{_line_type} = 'END';
1002 return $line_of_tokens;
1006 # check for a hash-bang line if we haven't seen one
1007 if ( !$tokenizer_self->[_saw_hash_bang_] ) {
1008 if ( $input_line =~ /^\#\!.*perl\b/ ) {
1009 $tokenizer_self->[_saw_hash_bang_] = $input_line_number;
1011 # check for -w and -P flags
1012 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1013 $tokenizer_self->[_saw_perl_dash_P_] = 1;
1016 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1017 $tokenizer_self->[_saw_perl_dash_w_] = 1;
1021 $input_line_number > 1
1023 # leave any hash bang in a BEGIN block alone
1024 # i.e. see 'debugger-duck_type.t'
1026 $last_nonblank_block_type
1027 && $last_nonblank_block_type eq 'BEGIN'
1029 && !$tokenizer_self->[_look_for_hash_bang_]
1031 # Try to avoid giving a false alarm at a simple comment.
1032 # These look like valid hash-bang lines:
1036 #!c:\perl\bin\perl.exe
1038 # These are comments:
1040 #! sunos does not yet provide a /usr/bin/perl
1042 # Comments typically have multiple spaces, which suggests
1044 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1048 # this is helpful for VMS systems; we may have accidentally
1049 # tokenized some DCL commands
1050 if ( $tokenizer_self->[_started_tokenizing_] ) {
1052 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1056 complain("Useless hash-bang after line 1\n");
1060 # Report the leading hash-bang as a system line
1061 # This will prevent -dac from deleting it
1063 $line_of_tokens->{_line_type} = 'SYSTEM';
1064 return $line_of_tokens;
1069 # wait for a hash-bang before parsing if the user invoked us with -x
1070 if ( $tokenizer_self->[_look_for_hash_bang_]
1071 && !$tokenizer_self->[_saw_hash_bang_] )
1073 $line_of_tokens->{_line_type} = 'SYSTEM';
1074 return $line_of_tokens;
1077 # a first line of the form ': #' will be marked as SYSTEM
1078 # since lines of this form may be used by tcsh
1079 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1080 $line_of_tokens->{_line_type} = 'SYSTEM';
1081 return $line_of_tokens;
1084 # now we know that it is ok to tokenize the line...
1085 # the line tokenizer will modify any of these private variables:
1086 # _rhere_target_list_
1094 my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
1095 tokenize_this_line($line_of_tokens);
1097 # Now finish defining the return structure and return it
1098 $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
1100 # handle severe error (binary data in script)
1101 if ( $tokenizer_self->[_in_error_] ) {
1102 $tokenizer_self->[_in_quote_] = 0; # to avoid any more messages
1103 warning("Giving up after error\n");
1104 $line_of_tokens->{_line_type} = 'ERROR';
1105 reset_indentation_level(0); # avoid error messages
1106 return $line_of_tokens;
1109 # handle start of pod documentation
1110 if ( $tokenizer_self->[_in_pod_] ) {
1112 # This gets tricky..above a __DATA__ or __END__ section, perl
1113 # accepts '=cut' as the start of pod section. But afterwards,
1114 # only pod utilities see it and they may ignore an =cut without
1115 # leading =head. In any case, this isn't good.
1116 if ( $input_line =~ /^=cut\b/ ) {
1117 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
1119 complain("=cut while not in pod ignored\n");
1120 $tokenizer_self->[_in_pod_] = 0;
1121 $line_of_tokens->{_line_type} = 'POD_END';
1124 $line_of_tokens->{_line_type} = 'POD_START';
1126 "=cut starts a pod section .. this can fool pod utilities.\n"
1127 ) unless (DEVEL_MODE);
1128 $write_logfile_entry->("Entering POD section\n");
1133 $line_of_tokens->{_line_type} = 'POD_START';
1134 $write_logfile_entry->("Entering POD section\n");
1137 return $line_of_tokens;
1140 # handle start of skipped section
1141 if ( $tokenizer_self->[_in_skipped_] ) {
1143 $line_of_tokens->{_line_type} = 'SKIP';
1144 $write_logfile_entry->("Entering code-skipping section\n");
1145 return $line_of_tokens;
1148 # Update indentation levels for log messages.
1149 # Skip blank lines and also block comments, unless a logfile is requested.
1150 # Note that _line_of_text_ is the input line but trimmed from left to right.
1151 my $lot = $tokenizer_self->[_line_of_text_];
1152 if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) {
1153 my $rlevels = $line_of_tokens->{_rlevels};
1154 $line_of_tokens->{_guessed_indentation_level} =
1155 guess_old_indentation_level($input_line);
1158 # see if this line contains here doc targets
1159 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
1160 if ( @{$rhere_target_list} ) {
1162 my ( $here_doc_target, $here_quote_character ) =
1163 @{ shift @{$rhere_target_list} };
1164 $tokenizer_self->[_in_here_doc_] = 1;
1165 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
1166 $tokenizer_self->[_here_quote_character_] = $here_quote_character;
1167 $write_logfile_entry->("Entering HERE document $here_doc_target\n");
1168 $tokenizer_self->[_started_looking_for_here_target_at_] =
1172 # NOTE: __END__ and __DATA__ statements are written unformatted
1173 # because they can theoretically contain additional characters
1174 # which are not tokenized (and cannot be read with <DATA> either!).
1175 if ( $tokenizer_self->[_in_data_] ) {
1176 $line_of_tokens->{_line_type} = 'DATA_START';
1177 $write_logfile_entry->("Starting __DATA__ section\n");
1178 $tokenizer_self->[_saw_data_] = 1;
1180 # keep parsing after __DATA__ if use SelfLoader was seen
1181 if ( $tokenizer_self->[_saw_selfloader_] ) {
1182 $tokenizer_self->[_in_data_] = 0;
1183 $write_logfile_entry->(
1184 "SelfLoader seen, continuing; -nlsl deactivates\n");
1187 return $line_of_tokens;
1190 elsif ( $tokenizer_self->[_in_end_] ) {
1191 $line_of_tokens->{_line_type} = 'END_START';
1192 $write_logfile_entry->("Starting __END__ section\n");
1193 $tokenizer_self->[_saw_end_] = 1;
1195 # keep parsing after __END__ if use AutoLoader was seen
1196 if ( $tokenizer_self->[_saw_autoloader_] ) {
1197 $tokenizer_self->[_in_end_] = 0;
1198 $write_logfile_entry->(
1199 "AutoLoader seen, continuing; -nlal deactivates\n");
1201 return $line_of_tokens;
1204 # now, finally, we know that this line is type 'CODE'
1205 $line_of_tokens->{_line_type} = 'CODE';
1207 # remember if we have seen any real code
1208 if ( !$tokenizer_self->[_started_tokenizing_]
1209 && $input_line !~ /^\s*$/
1210 && $input_line !~ /^\s*#/ )
1212 $tokenizer_self->[_started_tokenizing_] = 1;
1215 if ( $tokenizer_self->[_debugger_object_] ) {
1216 $tokenizer_self->[_debugger_object_]
1217 ->write_debug_entry($line_of_tokens);
1220 # Note: if keyword 'format' occurs in this line code, it is still CODE
1221 # (keyword 'format' need not start a line)
1222 if ( $tokenizer_self->[_in_format_] ) {
1223 $write_logfile_entry->("Entering format section\n");
1226 if ( $tokenizer_self->[_in_quote_]
1227 and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
1230 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1231 if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
1234 $tokenizer_self->[_line_start_quote_] = $input_line_number;
1235 $write_logfile_entry->(
1236 "Start multi-line quote or pattern ending in $quote_target\n");
1239 elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
1240 && !$tokenizer_self->[_in_quote_] )
1242 $tokenizer_self->[_line_start_quote_] = -1;
1243 $write_logfile_entry->("End of multi-line quote or pattern\n");
1246 # we are returning a line of CODE
1247 return $line_of_tokens;
1250 sub find_starting_indentation_level {
1252 # We need to find the indentation level of the first line of the
1253 # script being formatted. Often it will be zero for an entire file,
1254 # but if we are formatting a local block of code (within an editor for
1255 # example) it may not be zero. The user may specify this with the
1256 # -sil=n parameter but normally doesn't so we have to guess.
1258 # USES GLOBAL VARIABLES: $tokenizer_self
1259 my $starting_level = 0;
1261 # use value if given as parameter
1262 if ( $tokenizer_self->[_know_starting_level_] ) {
1263 $starting_level = $tokenizer_self->[_starting_level_];
1266 # if we know there is a hash_bang line, the level must be zero
1267 elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
1268 $tokenizer_self->[_know_starting_level_] = 1;
1271 # otherwise figure it out from the input file
1276 # keep looking at lines until we find a hash bang or piece of code
1279 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
1282 # if first line is #! then assume starting level is zero
1283 if ( $i == 1 && $line =~ /^\#\!/ ) {
1284 $starting_level = 0;
1287 next if ( $line =~ /^\s*#/ ); # skip past comments
1288 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1289 $starting_level = guess_old_indentation_level($line);
1292 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1293 write_logfile_entry("$msg");
1295 $tokenizer_self->[_starting_level_] = $starting_level;
1296 reset_indentation_level($starting_level);
1300 sub guess_old_indentation_level {
1303 # Guess the indentation level of an input line.
1305 # For the first line of code this result will define the starting
1306 # indentation level. It will mainly be non-zero when perltidy is applied
1307 # within an editor to a local block of code.
1309 # This is an impossible task in general because we can't know what tabs
1310 # meant for the old script and how many spaces were used for one
1311 # indentation level in the given input script. For example it may have
1312 # been previously formatted with -i=7 -et=3. But we can at least try to
1313 # make sure that perltidy guesses correctly if it is applied repeatedly to
1314 # a block of code within an editor, so that the block stays at the same
1315 # level when perltidy is applied repeatedly.
1317 # USES GLOBAL VARIABLES: $tokenizer_self
1320 # find leading tabs, spaces, and any statement label
1322 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1324 # If there are leading tabs, we use the tab scheme for this run, if
1325 # any, so that the code will remain stable when editing.
1326 if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1328 if ($2) { $spaces += length($2) }
1330 # correct for outdented labels
1331 if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1332 $spaces += $tokenizer_self->[_continuation_indentation_];
1336 # compute indentation using the value of -i for this run.
1337 # If -i=0 is used for this run (which is possible) it doesn't matter
1338 # what we do here but we'll guess that the old run used 4 spaces per level.
1339 my $indent_columns = $tokenizer_self->[_indent_columns_];
1340 $indent_columns = 4 if ( !$indent_columns );
1341 $level = int( $spaces / $indent_columns );
1345 # This is a currently unused debug routine
1346 sub dump_functions {
1349 foreach my $pkg ( keys %is_user_function ) {
1350 $fh->print("\nnon-constant subs in package $pkg\n");
1352 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1354 if ( $is_block_list_function{$pkg}{$sub} ) {
1355 $msg = 'block_list';
1358 if ( $is_block_function{$pkg}{$sub} ) {
1361 $fh->print("$sub $msg\n");
1365 foreach my $pkg ( keys %is_constant ) {
1366 $fh->print("\nconstants and constant subs in package $pkg\n");
1368 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1369 $fh->print("$sub\n");
1375 sub prepare_for_a_new_file {
1377 # previous tokens needed to determine what to expect next
1378 $last_nonblank_token = ';'; # the only possible starting state which
1379 $last_nonblank_type = ';'; # will make a leading brace a code block
1380 $last_nonblank_block_type = '';
1382 # scalars for remembering statement types across multiple lines
1383 $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
1384 $in_attribute_list = 0;
1386 # scalars for remembering where we are in the file
1387 $current_package = "main";
1388 $context = UNKNOWN_CONTEXT;
1390 # hashes used to remember function information
1391 %is_constant = (); # user-defined constants
1392 %is_user_function = (); # user-defined functions
1393 %user_function_prototype = (); # their prototypes
1394 %is_block_function = ();
1395 %is_block_list_function = ();
1396 %saw_function_definition = ();
1397 %saw_use_module = ();
1399 # variables used to track depths of various containers
1400 # and report nesting errors
1403 $square_bracket_depth = 0;
1404 @current_depth = (0) x scalar @closing_brace_names;
1407 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1408 @current_sequence_number = ();
1409 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1412 @paren_semicolon_count = ();
1413 @paren_structural_type = ();
1415 @brace_structural_type = ();
1416 @brace_context = ();
1417 @brace_package = ();
1418 @square_bracket_type = ();
1419 @square_bracket_structural_type = ();
1421 @nested_ternary_flag = ();
1422 @nested_statement_type = ();
1423 @starting_line_of_current_depth = ();
1425 $paren_type[$paren_depth] = '';
1426 $paren_semicolon_count[$paren_depth] = 0;
1427 $paren_structural_type[$brace_depth] = '';
1428 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1429 $brace_structural_type[$brace_depth] = '';
1430 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1431 $brace_package[$paren_depth] = $current_package;
1432 $square_bracket_type[$square_bracket_depth] = '';
1433 $square_bracket_structural_type[$square_bracket_depth] = '';
1435 initialize_tokenizer_state();
1439 { ## closure for sub tokenize_this_line
1441 use constant BRACE => 0;
1442 use constant SQUARE_BRACKET => 1;
1443 use constant PAREN => 2;
1444 use constant QUESTION_COLON => 3;
1446 # TV1: scalars for processing one LINE.
1447 # Re-initialized on each entry to sub tokenize_this_line.
1449 $block_type, $container_type, $expecting,
1450 $i, $i_tok, $input_line,
1451 $input_line_number, $last_nonblank_i, $max_token_index,
1452 $next_tok, $next_type, $peeked_ahead,
1453 $prototype, $rhere_target_list, $rtoken_map,
1454 $rtoken_type, $rtokens, $tok,
1455 $type, $type_sequence, $indent_flag,
1458 # TV2: refs to ARRAYS for processing one LINE
1459 # Re-initialized on each call.
1460 my $routput_token_list = []; # stack of output token indexes
1461 my $routput_token_type = []; # token types
1462 my $routput_block_type = []; # types of code block
1463 my $routput_container_type = []; # paren types, such as if, elsif, ..
1464 my $routput_type_sequence = []; # nesting sequential number
1465 my $routput_indent_flag = []; #
1467 # TV3: SCALARS for quote variables. These are initialized with a
1468 # subroutine call and continually updated as lines are processed.
1469 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1470 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1472 # TV4: SCALARS for multi-line identifiers and
1473 # statements. These are initialized with a subroutine call
1474 # and continually updated as lines are processed.
1475 my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1477 # TV5: SCALARS for tracking indentation level.
1478 # Initialized once and continually updated as lines are
1481 $nesting_token_string, $nesting_type_string,
1482 $nesting_block_string, $nesting_block_flag,
1483 $nesting_list_string, $nesting_list_flag,
1484 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1485 $in_statement_continuation, $level_in_tokenizer,
1486 $slevel_in_tokenizer, $rslevel_stack,
1489 # TV6: SCALARS for remembering several previous
1490 # tokens. Initialized once and continually updated as
1491 # lines are processed.
1493 $last_nonblank_container_type, $last_nonblank_type_sequence,
1494 $last_last_nonblank_token, $last_last_nonblank_type,
1495 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1496 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1499 # ----------------------------------------------------------------
1500 # beginning of tokenizer variable access and manipulation routines
1501 # ----------------------------------------------------------------
1503 sub initialize_tokenizer_state {
1505 # TV1: initialized on each call
1506 # TV2: initialized on each call
1510 $quote_character = "";
1513 $quoted_string_1 = "";
1514 $quoted_string_2 = "";
1515 $allowed_quote_modifiers = "";
1518 $id_scan_state = '';
1521 $indented_if_level = 0;
1524 $nesting_token_string = "";
1525 $nesting_type_string = "";
1526 $nesting_block_string = '1'; # initially in a block
1527 $nesting_block_flag = 1;
1528 $nesting_list_string = '0'; # initially not in a list
1529 $nesting_list_flag = 0; # initially not in a list
1530 $ci_string_in_tokenizer = "";
1531 $continuation_string_in_tokenizer = "0";
1532 $in_statement_continuation = 0;
1533 $level_in_tokenizer = 0;
1534 $slevel_in_tokenizer = 0;
1535 $rslevel_stack = [];
1538 $last_nonblank_container_type = '';
1539 $last_nonblank_type_sequence = '';
1540 $last_last_nonblank_token = ';';
1541 $last_last_nonblank_type = ';';
1542 $last_last_nonblank_block_type = '';
1543 $last_last_nonblank_container_type = '';
1544 $last_last_nonblank_type_sequence = '';
1545 $last_nonblank_prototype = "";
1549 sub save_tokenizer_state {
1552 $block_type, $container_type, $expecting,
1553 $i, $i_tok, $input_line,
1554 $input_line_number, $last_nonblank_i, $max_token_index,
1555 $next_tok, $next_type, $peeked_ahead,
1556 $prototype, $rhere_target_list, $rtoken_map,
1557 $rtoken_type, $rtokens, $tok,
1558 $type, $type_sequence, $indent_flag,
1562 $routput_token_list, $routput_token_type,
1563 $routput_block_type, $routput_container_type,
1564 $routput_type_sequence, $routput_indent_flag,
1568 $in_quote, $quote_type,
1569 $quote_character, $quote_pos,
1570 $quote_depth, $quoted_string_1,
1571 $quoted_string_2, $allowed_quote_modifiers,
1575 [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1578 $nesting_token_string, $nesting_type_string,
1579 $nesting_block_string, $nesting_block_flag,
1580 $nesting_list_string, $nesting_list_flag,
1581 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1582 $in_statement_continuation, $level_in_tokenizer,
1583 $slevel_in_tokenizer, $rslevel_stack,
1587 $last_nonblank_container_type,
1588 $last_nonblank_type_sequence,
1589 $last_last_nonblank_token,
1590 $last_last_nonblank_type,
1591 $last_last_nonblank_block_type,
1592 $last_last_nonblank_container_type,
1593 $last_last_nonblank_type_sequence,
1594 $last_nonblank_prototype,
1596 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1599 sub restore_tokenizer_state {
1601 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1603 $block_type, $container_type, $expecting,
1604 $i, $i_tok, $input_line,
1605 $input_line_number, $last_nonblank_i, $max_token_index,
1606 $next_tok, $next_type, $peeked_ahead,
1607 $prototype, $rhere_target_list, $rtoken_map,
1608 $rtoken_type, $rtokens, $tok,
1609 $type, $type_sequence, $indent_flag,
1613 $routput_token_list, $routput_token_type,
1614 $routput_block_type, $routput_container_type,
1615 $routput_type_sequence, $routput_indent_flag,
1619 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1620 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1623 ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1627 $nesting_token_string, $nesting_type_string,
1628 $nesting_block_string, $nesting_block_flag,
1629 $nesting_list_string, $nesting_list_flag,
1630 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1631 $in_statement_continuation, $level_in_tokenizer,
1632 $slevel_in_tokenizer, $rslevel_stack,
1636 $last_nonblank_container_type,
1637 $last_nonblank_type_sequence,
1638 $last_last_nonblank_token,
1639 $last_last_nonblank_type,
1640 $last_last_nonblank_block_type,
1641 $last_last_nonblank_container_type,
1642 $last_last_nonblank_type_sequence,
1643 $last_nonblank_prototype,
1648 sub split_pretoken {
1652 # Split the leading $numc characters from the current token (at index=$i)
1653 # which is pre-type 'w' and insert the remainder back into the pretoken
1654 # stream with appropriate settings. Since we are splitting a pre-type 'w',
1655 # there are three cases, depending on if the remainder starts with a digit:
1656 # Case 1: remainder is type 'd', all digits
1657 # Case 2: remainder is type 'd' and type 'w': digits and other characters
1658 # Case 3: remainder is type 'w'
1660 # Examples, for $numc=1:
1661 # $tok => $tok_0 $tok_1 $tok_2
1662 # 'x10' => 'x' '10' # case 1
1663 # 'x10if' => 'x' '10' 'if' # case 2
1664 # '0ne => 'O' 'ne' # case 3
1667 # $tok_1 is a possible string of digits (pre-type 'd')
1668 # $tok_2 is a possible word (pre-type 'w')
1670 # return 1 if successful
1671 # return undef if error (shouldn't happen)
1673 # Calling routine should update '$type' and '$tok' if successful.
1675 my $pretoken = $rtokens->[$i];
1677 && length($pretoken) > $numc
1678 && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1681 # Split $tok into up to 3 tokens:
1682 my $tok_0 = substr( $pretoken, 0, $numc );
1683 my $tok_1 = defined($1) ? $1 : "";
1684 my $tok_2 = defined($2) ? $2 : "";
1686 my $len_0 = length($tok_0);
1687 my $len_1 = length($tok_1);
1688 my $len_2 = length($tok_2);
1690 my $pre_type_0 = 'w';
1691 my $pre_type_1 = 'd';
1692 my $pre_type_2 = 'w';
1694 my $pos_0 = $rtoken_map->[$i];
1695 my $pos_1 = $pos_0 + $len_0;
1696 my $pos_2 = $pos_1 + $len_1;
1698 my $isplice = $i + 1;
1700 # Splice in any digits
1702 splice @{$rtoken_map}, $isplice, 0, $pos_1;
1703 splice @{$rtokens}, $isplice, 0, $tok_1;
1704 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1709 # Splice in any trailing word
1711 splice @{$rtoken_map}, $isplice, 0, $pos_2;
1712 splice @{$rtokens}, $isplice, 0, $tok_2;
1713 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1717 $rtokens->[$i] = $tok_0;
1722 # Shouldn't get here
1725 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1732 sub get_indentation_level {
1734 # patch to avoid reporting error if indented if is not terminated
1735 if ($indented_if_level) { return $level_in_tokenizer - 1 }
1736 return $level_in_tokenizer;
1739 sub reset_indentation_level {
1740 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1741 push @{$rslevel_stack}, $slevel_in_tokenizer;
1747 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1748 return $peeked_ahead;
1751 # ------------------------------------------------------------
1752 # end of tokenizer variable access and manipulation routines
1753 # ------------------------------------------------------------
1755 # ------------------------------------------------------------
1756 # beginning of various scanner interface routines
1757 # ------------------------------------------------------------
1758 sub scan_replacement_text {
1760 # check for here-docs in replacement text invoked by
1761 # a substitution operator with executable modifier 'e'.
1766 # $rht = reference to any here-doc targets
1767 my ($replacement_text) = @_;
1770 return unless ( $replacement_text =~ /<</ );
1772 write_logfile_entry("scanning replacement text for here-doc targets\n");
1774 # save the logger object for error messages
1775 my $logger_object = $tokenizer_self->[_logger_object_];
1777 # localize all package variables
1779 $tokenizer_self, $last_nonblank_token,
1780 $last_nonblank_type, $last_nonblank_block_type,
1781 $statement_type, $in_attribute_list,
1782 $current_package, $context,
1783 %is_constant, %is_user_function,
1784 %user_function_prototype, %is_block_function,
1785 %is_block_list_function, %saw_function_definition,
1786 $brace_depth, $paren_depth,
1787 $square_bracket_depth, @current_depth,
1788 @total_depth, $total_depth,
1789 @nesting_sequence_number, @current_sequence_number,
1790 @paren_type, @paren_semicolon_count,
1791 @paren_structural_type, @brace_type,
1792 @brace_structural_type, @brace_context,
1793 @brace_package, @square_bracket_type,
1794 @square_bracket_structural_type, @depth_array,
1795 @starting_line_of_current_depth, @nested_ternary_flag,
1796 @nested_statement_type, $next_sequence_number,
1799 # save all lexical variables
1800 my $rstate = save_tokenizer_state();
1801 _decrement_count(); # avoid error check for multiple tokenizers
1803 # make a new tokenizer
1805 my $rpending_logfile_message;
1806 my $source_object = Perl::Tidy::LineSource->new(
1807 input_file => \$replacement_text,
1809 rpending_logfile_message => $rpending_logfile_message,
1811 my $tokenizer = Perl::Tidy::Tokenizer->new(
1812 source_object => $source_object,
1813 logger_object => $logger_object,
1814 starting_line_number => $input_line_number,
1817 # scan the replacement text
1818 1 while ( $tokenizer->get_line() );
1820 # remove any here doc targets
1822 if ( $tokenizer_self->[_in_here_doc_] ) {
1826 $tokenizer_self->[_here_doc_target_],
1827 $tokenizer_self->[_here_quote_character_]
1829 if ( $tokenizer_self->[_rhere_target_list_] ) {
1830 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1831 $tokenizer_self->[_rhere_target_list_] = undef;
1833 $tokenizer_self->[_in_here_doc_] = undef;
1836 # now its safe to report errors
1837 my $severe_error = $tokenizer->report_tokenization_errors();
1839 # TODO: Could propagate a severe error up
1841 # restore all tokenizer lexical variables
1842 restore_tokenizer_state($rstate);
1844 # return the here doc targets
1848 sub scan_bare_identifier {
1849 ( $i, $tok, $type, $prototype ) =
1850 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1851 $rtoken_map, $max_token_index );
1855 sub scan_identifier {
1856 ( $i, $tok, $type, $id_scan_state, $identifier ) =
1857 scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
1858 $max_token_index, $expecting, $paren_type[$paren_depth] );
1860 # Check for signal to fix a special variable adjacent to a keyword,
1861 # such as '$^One$0'.
1862 if ( $id_scan_state eq '^' ) {
1864 # Try to fix it by splitting the pretoken
1866 && $rtokens->[ $i - 1 ] eq '^'
1867 && split_pretoken(1) )
1869 $identifier = substr( $identifier, 0, 3 );
1874 # This shouldn't happen ...
1875 my $var = substr( $tok, 0, 3 );
1876 my $excess = substr( $tok, 3 );
1877 interrupt_logfile();
1879 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
1880 A space may be needed after '$var'.
1884 $id_scan_state = "";
1889 use constant VERIFY_FASTSCAN => 0;
1890 my %fast_scan_context;
1893 %fast_scan_context = (
1894 '$' => SCALAR_CONTEXT,
1895 '*' => SCALAR_CONTEXT,
1896 '@' => LIST_CONTEXT,
1897 '%' => LIST_CONTEXT,
1898 '&' => UNKNOWN_CONTEXT,
1902 sub scan_identifier_fast {
1904 # This is a wrapper for sub scan_identifier. It does a fast preliminary
1905 # scan for certain common identifiers:
1906 # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
1907 # If it does not find one of these, or this is a restart, it calls the
1908 # original scanner directly.
1910 # This gives the same results as the full scanner in about 1/4 the
1911 # total runtime for a typical input stream.
1914 my $tok_begin = $tok;
1917 ###############################
1918 # quick scan with leading sigil
1919 ###############################
1920 if ( !$id_scan_state
1921 && $i + 1 <= $max_token_index
1922 && $fast_scan_context{$tok} )
1924 $context = $fast_scan_context{$tok};
1926 # look for $var, @var, ...
1927 if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
1928 my $pretype_next = "";
1929 my $i_next = $i + 2;
1930 if ( $i_next <= $max_token_index ) {
1931 if ( $rtoken_type->[$i_next] eq 'b'
1932 && $i_next < $max_token_index )
1936 $pretype_next = $rtoken_type->[$i_next];
1938 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
1940 # Found type 'i' like '$var', '@var', or '%var'
1941 $identifier = $tok . $rtokens->[ $i + 1 ];
1945 $fast_scan_type = $type;
1949 # Look for @{ or %{ .
1950 # But we must let the full scanner handle things ${ because it may
1951 # keep going to get a complete identifier like '${#}' .
1953 $rtoken_type->[ $i + 1 ] eq '{'
1954 && ( $tok_begin eq '@'
1955 || $tok_begin eq '%' )
1961 $fast_scan_type = $type;
1965 ############################
1966 # Quick scan with leading ->
1967 # Look for ->[ and ->{
1968 ############################
1971 && $i < $max_token_index
1972 && ( $rtokens->[ $i + 1 ] eq '{'
1973 || $rtokens->[ $i + 1 ] eq '[' )
1977 $fast_scan_type = $type;
1979 $context = UNKNOWN_CONTEXT;
1982 #######################################
1983 # Verify correctness during development
1984 #######################################
1985 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
1987 # We will call the full method
1988 my $identifier_simple = $identifier;
1989 my $tok_simple = $tok;
1990 my $fast_scan_type = $type;
1992 my $context_simple = $context;
1998 if ( $tok ne $tok_simple
1999 || $type ne $fast_scan_type
2001 || $identifier ne $identifier_simple
2003 || $context ne $context_simple )
2006 scan_identifier_fast differs from scan_identifier:
2007 simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2008 full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2013 ###################################################
2014 # call full scanner if fast method did not succeed
2015 ###################################################
2016 if ( !$fast_scan_type ) {
2023 ( $i, $tok, $type, $id_scan_state ) =
2024 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2025 $id_scan_state, $max_token_index );
2031 ( $i, $type, $number ) =
2032 scan_number_do( $input_line, $i, $rtoken_map, $type,
2037 use constant VERIFY_FASTNUM => 0;
2039 sub scan_number_fast {
2041 # This is a wrapper for sub scan_number. It does a fast preliminary
2042 # scan for a simple integer. It calls the original scan_number if it
2043 # does not find one.
2046 my $tok_begin = $tok;
2049 ##################################
2050 # Quick check for (signed) integer
2051 ##################################
2053 # This will be the string of digits:
2056 my $typ_d = $rtoken_type->[$i_d];
2058 # check for signed integer
2061 && ( $typ_d eq '+' || $typ_d eq '-' )
2062 && $i_d < $max_token_index )
2066 $tok_d = $rtokens->[$i_d];
2067 $typ_d = $rtoken_type->[$i_d];
2074 $i_d == $max_token_index
2075 || ( $i_d < $max_token_index
2076 && $rtoken_type->[ $i_d + 1 ] ne '.'
2077 && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2081 # Let let full scanner handle multi-digit integers beginning with
2082 # '0' because there could be error messages. For example, '009' is
2083 # not a valid number.
2085 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2086 $number = $sign . $tok_d;
2092 #######################################
2093 # Verify correctness during development
2094 #######################################
2095 if ( VERIFY_FASTNUM && defined($number) ) {
2097 # We will call the full method
2098 my $type_simple = $type;
2100 my $number_simple = $number;
2104 $number = scan_number();
2106 if ( $type ne $type_simple
2107 || ( $i != $i_simple && $i <= $max_token_index )
2108 || $number ne $number_simple )
2111 scan_number_fast differs from scan_number:
2112 simple: i=$i_simple, type=$type_simple, number=$number_simple
2113 full: i=$i, type=$type, number=$number
2118 #########################################
2119 # call full scanner if may not be integer
2120 #########################################
2121 if ( !defined($number) ) {
2122 $number = scan_number();
2127 # a sub to warn if token found where term expected
2128 sub error_if_expecting_TERM {
2129 if ( $expecting == TERM ) {
2130 if ( $really_want_term{$last_nonblank_type} ) {
2131 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2132 $rtoken_map, $rtoken_type, $input_line );
2139 # a sub to warn if token found where operator expected
2140 sub error_if_expecting_OPERATOR {
2142 if ( $expecting == OPERATOR ) {
2143 if ( !defined($thing) ) { $thing = $tok }
2144 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2145 $rtoken_map, $rtoken_type, $input_line );
2146 if ( $i_tok == 0 ) {
2147 interrupt_logfile();
2148 warning("Missing ';' or ',' above?\n");
2156 # ------------------------------------------------------------
2157 # end scanner interfaces
2158 # ------------------------------------------------------------
2161 @_ = qw(for foreach);
2162 @is_for_foreach{@_} = (1) x scalar(@_);
2164 my %is_my_our_state;
2165 @_ = qw(my our state);
2166 @is_my_our_state{@_} = (1) x scalar(@_);
2168 # These keywords may introduce blocks after parenthesized expressions,
2170 # keyword ( .... ) { BLOCK }
2171 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
2172 my %is_blocktype_with_paren;
2174 qw(if elsif unless while until for foreach switch case given when catch);
2175 @is_blocktype_with_paren{@_} = (1) x scalar(@_);
2177 my %is_case_default;
2178 @_ = qw(case default);
2179 @is_case_default{@_} = (1) x scalar(@_);
2181 # ------------------------------------------------------------
2182 # begin hash of code for handling most token types
2183 # ------------------------------------------------------------
2184 my $tokenization_code = {
2186 # no special code for these types yet, but syntax checks
2221 error_if_expecting_TERM()
2222 if ( $expecting == TERM );
2225 error_if_expecting_TERM()
2226 if ( $expecting == TERM );
2230 # start looking for a scalar
2231 error_if_expecting_OPERATOR("Scalar")
2232 if ( $expecting == OPERATOR );
2233 scan_identifier_fast();
2235 if ( $identifier eq '$^W' ) {
2236 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2239 # Check for identifier in indirect object slot
2240 # (vorboard.pl, sort.t). Something like:
2241 # /^(print|printf|sort|exec|system)$/
2243 $is_indirect_object_taker{$last_nonblank_token}
2244 || ( ( $last_nonblank_token eq '(' )
2245 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2246 || ( $last_nonblank_type eq 'w'
2247 || $last_nonblank_type eq 'U' ) # possible object
2251 # An identifier followed by '->' is not indirect object;
2252 # fixes b1175, b1176
2253 my ( $next_nonblank_type, $i_next ) =
2254 find_next_noncomment_type( $i, $rtokens, $max_token_index );
2255 $type = 'Z' if ( $next_nonblank_type ne '->' );
2261 $paren_semicolon_count[$paren_depth] = 0;
2263 $container_type = $want_paren;
2266 elsif ( $statement_type =~ /^sub\b/ ) {
2267 $container_type = $statement_type;
2270 $container_type = $last_nonblank_token;
2272 # We can check for a syntax error here of unexpected '(',
2273 # but this is going to get messy...
2275 $expecting == OPERATOR
2277 # Be sure this is not a method call of the form
2278 # &method(...), $method->(..), &{method}(...),
2279 # $ref[2](list) is ok & short for $ref[2]->(list)
2280 # NOTE: at present, braces in something like &{ xxx }
2281 # are not marked as a block, we might have a method call.
2282 # Added ')' to fix case c017, something like ()()()
2283 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2288 # ref: camel 3 p 703.
2289 if ( $last_last_nonblank_token eq 'do' ) {
2291 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2296 # if this is an empty list, (), then it is not an
2297 # error; for example, we might have a constant pi and
2298 # invoke it with pi() or just pi;
2299 my ( $next_nonblank_token, $i_next ) =
2300 find_next_nonblank_token( $i, $rtokens,
2303 # Patch for c029: give up error check if
2304 # a side comment follows
2305 if ( $next_nonblank_token ne ')'
2306 && $next_nonblank_token ne '#' )
2310 error_if_expecting_OPERATOR('(');
2312 if ( $last_nonblank_type eq 'C' ) {
2314 "$last_nonblank_token has a void prototype\n";
2316 elsif ( $last_nonblank_type eq 'i' ) {
2318 && $last_nonblank_token =~ /^\$/ )
2321 "Do you mean '$last_nonblank_token->(' ?\n";
2325 interrupt_logfile();
2329 } ## end if ( $next_nonblank_token...
2330 } ## end else [ if ( $last_last_nonblank_token...
2331 } ## end if ( $expecting == OPERATOR...
2333 $paren_type[$paren_depth] = $container_type;
2334 ( $type_sequence, $indent_flag ) =
2335 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2337 # propagate types down through nested parens
2338 # for example: the second paren in 'if ((' would be structural
2339 # since the first is.
2341 if ( $last_nonblank_token eq '(' ) {
2342 $type = $last_nonblank_type;
2345 # We exclude parens as structural after a ',' because it
2346 # causes subtle problems with continuation indentation for
2347 # something like this, where the first 'or' will not get
2352 # ( not defined $check )
2354 # or $check eq "new"
2355 # or $check eq "old",
2358 # Likewise, we exclude parens where a statement can start
2359 # because of problems with continuation indentation, like
2362 # ($firstline =~ /^#\!.*perl/)
2363 # and (print $File::Find::name, "\n")
2366 # (ref($usage_fref) =~ /CODE/)
2368 # : (&blast_usage, &blast_params, &blast_general_params);
2374 if ( $last_nonblank_type eq ')' ) {
2376 "Syntax error? found token '$last_nonblank_type' then '('\n"
2379 $paren_structural_type[$paren_depth] = $type;
2383 ( $type_sequence, $indent_flag ) =
2384 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2386 if ( $paren_structural_type[$paren_depth] eq '{' ) {
2390 $container_type = $paren_type[$paren_depth];
2392 # restore statement type as 'sub' at closing paren of a signature
2393 # so that a subsequent ':' is identified as an attribute
2394 if ( $container_type =~ /^sub\b/ ) {
2395 $statement_type = $container_type;
2399 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2400 my $num_sc = $paren_semicolon_count[$paren_depth];
2401 if ( $num_sc > 0 && $num_sc != 2 ) {
2402 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2406 if ( $paren_depth > 0 ) { $paren_depth-- }
2409 if ( $last_nonblank_type eq ',' ) {
2410 complain("Repeated ','s \n");
2413 # Note that we have to check both token and type here because a
2414 # comma following a qw list can have last token='(' but type = 'q'
2415 elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' )
2417 warning("Unexpected leading ',' after a '('\n");
2420 # patch for operator_expected: note if we are in the list (use.t)
2421 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2425 $context = UNKNOWN_CONTEXT;
2426 $statement_type = '';
2430 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2431 { # mark ; in for loop
2433 # Be careful: we do not want a semicolon such as the
2434 # following to be included:
2436 # for (sort {strcoll($a,$b);} keys %investments) {
2438 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2439 && $square_bracket_depth ==
2440 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2444 $paren_semicolon_count[$paren_depth]++;
2450 error_if_expecting_OPERATOR("String")
2451 if ( $expecting == OPERATOR );
2454 $allowed_quote_modifiers = "";
2457 error_if_expecting_OPERATOR("String")
2458 if ( $expecting == OPERATOR );
2461 $allowed_quote_modifiers = "";
2464 error_if_expecting_OPERATOR("String")
2465 if ( $expecting == OPERATOR );
2468 $allowed_quote_modifiers = "";
2473 # a pattern cannot follow certain keywords which take optional
2474 # arguments, like 'shift' and 'pop'. See also '?'.
2476 $last_nonblank_type eq 'k'
2477 && $is_keyword_rejecting_slash_as_pattern_delimiter{
2478 $last_nonblank_token}
2483 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2485 ( $is_pattern, $msg ) =
2486 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2490 write_diagnostics("DIVIDE:$msg\n");
2491 write_logfile_entry($msg);
2494 else { $is_pattern = ( $expecting == TERM ) }
2499 $allowed_quote_modifiers = '[msixpodualngc]';
2501 else { # not a pattern; check for a /= token
2503 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2509 #DEBUG - collecting info on what tokens follow a divide
2510 # for development of guessing algorithm
2511 #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2512 # #write_diagnostics( "DIVIDE? $input_line\n" );
2518 # if we just saw a ')', we will label this block with
2519 # its type. We need to do this to allow sub
2520 # code_block_type to determine if this brace starts a
2521 # code block or anonymous hash. (The type of a paren
2522 # pair is the preceding token, such as 'if', 'else',
2524 $container_type = "";
2526 # ATTRS: for a '{' following an attribute list, reset
2527 # things to look like we just saw the sub name
2528 if ( $statement_type =~ /^sub\b/ ) {
2529 $last_nonblank_token = $statement_type;
2530 $last_nonblank_type = 'i';
2531 $statement_type = "";
2534 # patch for SWITCH/CASE: hide these keywords from an immediately
2535 # following opening brace
2536 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2537 && $statement_type eq $last_nonblank_token )
2539 $last_nonblank_token = ";";
2542 elsif ( $last_nonblank_token eq ')' ) {
2543 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2545 # defensive move in case of a nesting error (pbug.t)
2546 # in which this ')' had no previous '('
2547 # this nesting error will have been caught
2548 if ( !defined($last_nonblank_token) ) {
2549 $last_nonblank_token = 'if';
2552 # check for syntax error here;
2553 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2554 if ( $tokenizer_self->[_extended_syntax_] ) {
2556 # we append a trailing () to mark this as an unknown
2557 # block type. This allows perltidy to format some
2558 # common extensions of perl syntax.
2559 # This is used by sub code_block_type
2560 $last_nonblank_token .= '()';
2564 join( ' ', sort keys %is_blocktype_with_paren );
2566 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2572 # patch for paren-less for/foreach glitch, part 2.
2573 # see note below under 'qw'
2574 elsif ($last_nonblank_token eq 'qw'
2575 && $is_for_foreach{$want_paren} )
2577 $last_nonblank_token = $want_paren;
2578 if ( $last_last_nonblank_token eq $want_paren ) {
2580 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2587 # now identify which of the three possible types of
2588 # curly braces we have: hash index container, anonymous
2589 # hash reference, or code block.
2591 # non-structural (hash index) curly brace pair
2592 # get marked 'L' and 'R'
2593 if ( is_non_structural_brace() ) {
2596 # patch for SWITCH/CASE:
2597 # allow paren-less identifier after 'when'
2598 # if the brace is preceded by a space
2599 if ( $statement_type eq 'when'
2600 && $last_nonblank_type eq 'i'
2601 && $last_last_nonblank_type eq 'k'
2602 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2605 $block_type = $statement_type;
2609 # code and anonymous hash have the same type, '{', but are
2610 # distinguished by 'block_type',
2611 # which will be blank for an anonymous hash
2614 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2617 # patch to promote bareword type to function taking block
2619 && $last_nonblank_type eq 'w'
2620 && $last_nonblank_i >= 0 )
2622 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2623 $routput_token_type->[$last_nonblank_i] =
2624 $is_grep_alias{$block_type} ? 'k' : 'G';
2628 # patch for SWITCH/CASE: if we find a stray opening block brace
2629 # where we might accept a 'case' or 'when' block, then take it
2630 if ( $statement_type eq 'case'
2631 || $statement_type eq 'when' )
2633 if ( !$block_type || $block_type eq '}' ) {
2634 $block_type = $statement_type;
2639 $brace_type[ ++$brace_depth ] = $block_type;
2640 $brace_package[$brace_depth] = $current_package;
2641 $brace_structural_type[$brace_depth] = $type;
2642 $brace_context[$brace_depth] = $context;
2643 ( $type_sequence, $indent_flag ) =
2644 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2647 $block_type = $brace_type[$brace_depth];
2648 if ($block_type) { $statement_type = '' }
2649 if ( defined( $brace_package[$brace_depth] ) ) {
2650 $current_package = $brace_package[$brace_depth];
2653 # can happen on brace error (caught elsewhere)
2656 ( $type_sequence, $indent_flag ) =
2657 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2659 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2663 # propagate type information for 'do' and 'eval' blocks, and also
2664 # for smartmatch operator. This is necessary to enable us to know
2665 # if an operator or term is expected next.
2666 if ( $is_block_operator{$block_type} ) {
2670 $context = $brace_context[$brace_depth];
2671 if ( $brace_depth > 0 ) { $brace_depth--; }
2673 '&' => sub { # maybe sub call? start looking
2675 # We have to check for sub call unless we are sure we
2676 # are expecting an operator. This example from s2p
2677 # got mistaken as a q operator in an early version:
2678 # print BODY &q(<<'EOT');
2679 if ( $expecting != OPERATOR ) {
2681 # But only look for a sub call if we are expecting a term or
2682 # if there is no existing space after the &.
2683 # For example we probably don't want & as sub call here:
2684 # Fcntl::S_IRUSR & $mode;
2685 if ( $expecting == TERM || $next_type ne 'b' ) {
2686 scan_identifier_fast();
2692 '<' => sub { # angle operator or less than?
2694 if ( $expecting != OPERATOR ) {
2696 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2697 $expecting, $max_token_index );
2699 ## This message is not very helpful and quite confusing if the above
2700 ## routine decided not to write a message with the line number.
2701 ## if ( $type eq '<' && $expecting == TERM ) {
2702 ## error_if_expecting_TERM();
2703 ## interrupt_logfile();
2704 ## warning("Unterminated <> operator?\n");
2705 ## resume_logfile();
2712 '?' => sub { # ?: conditional or starting pattern?
2716 # Patch for rt #126965
2717 # a pattern cannot follow certain keywords which take optional
2718 # arguments, like 'shift' and 'pop'. See also '/'.
2720 $last_nonblank_type eq 'k'
2721 && $is_keyword_rejecting_question_as_pattern_delimiter{
2722 $last_nonblank_token}
2728 # patch for RT#131288, user constant function without prototype
2729 # last type is 'U' followed by ?.
2730 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2733 elsif ( $expecting == UNKNOWN ) {
2735 # In older versions of Perl, a bare ? can be a pattern
2736 # delimiter. In perl version 5.22 this was
2737 # dropped, but we have to support it in order to format
2738 # older programs. See:
2739 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2740 # For example, the following line worked
2742 # ?(.*)? && (print $1,"\n");
2743 # In current versions it would have to be written with slashes:
2744 # /(.*)/ && (print $1,"\n");
2746 ( $is_pattern, $msg ) =
2747 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2750 if ($msg) { write_logfile_entry($msg) }
2752 else { $is_pattern = ( $expecting == TERM ) }
2757 $allowed_quote_modifiers = '[msixpodualngc]';
2760 ( $type_sequence, $indent_flag ) =
2761 increase_nesting_depth( QUESTION_COLON,
2762 $rtoken_map->[$i_tok] );
2765 '*' => sub { # typeglob, or multiply?
2767 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2768 if ( $next_type ne 'b'
2769 && $next_type ne '('
2770 && $next_type ne '#' ) # Fix c036
2775 if ( $expecting == TERM ) {
2776 scan_identifier_fast();
2780 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2785 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2789 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2797 '.' => sub { # what kind of . ?
2799 if ( $expecting != OPERATOR ) {
2801 if ( $type eq '.' ) {
2802 error_if_expecting_TERM()
2803 if ( $expecting == TERM );
2811 # if this is the first nonblank character, call it a label
2812 # since perl seems to just swallow it
2813 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2817 # ATTRS: check for a ':' which introduces an attribute list
2818 # either after a 'sub' keyword or within a paren list
2819 elsif ( $statement_type =~ /^sub\b/ ) {
2821 $in_attribute_list = 1;
2824 # Within a signature, unless we are in a ternary. For example,
2825 # from 't/filter_example.t':
2826 # method foo4 ( $class: $bar ) { $class->bar($bar) }
2827 elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2828 && !is_balanced_closing_container(QUESTION_COLON) )
2831 $in_attribute_list = 1;
2834 # check for scalar attribute, such as
2835 # my $foo : shared = 1;
2836 elsif ($is_my_our_state{$statement_type}
2837 && $current_depth[QUESTION_COLON] == 0 )
2840 $in_attribute_list = 1;
2843 # Look for Switch::Plain syntax if an error would otherwise occur
2844 # here. Note that we do not need to check if the extended syntax
2845 # flag is set because otherwise an error would occur, and we would
2846 # then have to output a message telling the user to set the
2847 # extended syntax flag to avoid the error.
2851 # Note that the line 'default:' will be parsed as a label elsewhere.
2852 elsif ( $is_case_default{$statement_type}
2853 && !is_balanced_closing_container(QUESTION_COLON) )
2855 # mark it as a perltidy label type
2859 # otherwise, it should be part of a ?/: operator
2861 ( $type_sequence, $indent_flag ) =
2862 decrease_nesting_depth( QUESTION_COLON,
2863 $rtoken_map->[$i_tok] );
2864 if ( $last_nonblank_token eq '?' ) {
2865 warning("Syntax error near ? :\n");
2869 '+' => sub { # what kind of plus?
2871 if ( $expecting == TERM ) {
2872 my $number = scan_number_fast();
2874 # unary plus is safest assumption if not a number
2875 if ( !defined($number) ) { $type = 'p'; }
2877 elsif ( $expecting == OPERATOR ) {
2880 if ( $next_type eq 'w' ) { $type = 'p' }
2885 error_if_expecting_OPERATOR("Array")
2886 if ( $expecting == OPERATOR );
2887 scan_identifier_fast();
2889 '%' => sub { # hash or modulo?
2891 # first guess is hash if no following blank or paren
2892 if ( $expecting == UNKNOWN ) {
2893 if ( $next_type ne 'b' && $next_type ne '(' ) {
2897 if ( $expecting == TERM ) {
2898 scan_identifier_fast();
2902 $square_bracket_type[ ++$square_bracket_depth ] =
2903 $last_nonblank_token;
2904 ( $type_sequence, $indent_flag ) =
2905 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2907 # It may seem odd, but structural square brackets have
2908 # type '{' and '}'. This simplifies the indentation logic.
2909 if ( !is_non_structural_brace() ) {
2912 $square_bracket_structural_type[$square_bracket_depth] = $type;
2915 ( $type_sequence, $indent_flag ) =
2916 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2918 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
2923 # propagate type information for smartmatch operator. This is
2924 # necessary to enable us to know if an operator or term is expected
2926 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
2927 $tok = $square_bracket_type[$square_bracket_depth];
2930 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
2932 '-' => sub { # what kind of minus?
2934 if ( ( $expecting != OPERATOR )
2935 && $is_file_test_operator{$next_tok} )
2937 my ( $next_nonblank_token, $i_next ) =
2938 find_next_nonblank_token( $i + 1, $rtokens,
2941 # check for a quoted word like "-w=>xx";
2942 # it is sufficient to just check for a following '='
2943 if ( $next_nonblank_token eq '=' ) {
2952 elsif ( $expecting == TERM ) {
2953 my $number = scan_number_fast();
2955 # maybe part of bareword token? unary is safest
2956 if ( !defined($number) ) { $type = 'm'; }
2959 elsif ( $expecting == OPERATOR ) {
2963 if ( $next_type eq 'w' ) {
2971 # check for special variables like ${^WARNING_BITS}
2972 if ( $expecting == TERM ) {
2974 if ( $last_nonblank_token eq '{'
2975 && ( $next_tok !~ /^\d/ )
2976 && ( $next_tok =~ /^\w/ ) )
2979 if ( $next_tok eq 'W' ) {
2980 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2982 $tok = $tok . $next_tok;
2986 # Optional coding to try to catch syntax errors. This can
2987 # be removed if it ever causes incorrect warning messages.
2988 # The '{^' should be preceded by either by a type or '$#'
2991 # *${^LAST_FH}{NAME} ok
2993 # $hash{^HOWDY} error
2995 # Note that a type sigil '$' may be tokenized as 'Z'
2996 # after something like 'print', so allow type 'Z'
2997 if ( $last_last_nonblank_type ne 't'
2998 && $last_last_nonblank_type ne 'Z'
2999 && $last_last_nonblank_token ne '$#' )
3001 warning("Possible syntax error near '{^'\n");
3006 unless ( error_if_expecting_TERM() ) {
3008 # Something like this is valid but strange:
3010 complain("The '^' seems unusual here\n");
3016 '::' => sub { # probably a sub call
3017 scan_bare_identifier();
3019 '<<' => sub { # maybe a here-doc?
3021 ## This check removed because it could be a deprecated here-doc with
3022 ## no specified target. See example in log 16 Sep 2020.
3024 ## unless ( $i < $max_token_index )
3025 ## ; # here-doc not possible if end of line
3027 if ( $expecting != OPERATOR ) {
3028 my ( $found_target, $here_doc_target, $here_quote_character,
3031 $found_target, $here_doc_target, $here_quote_character, $i,
3034 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3037 if ($found_target) {
3038 push @{$rhere_target_list},
3039 [ $here_doc_target, $here_quote_character ];
3041 if ( length($here_doc_target) > 80 ) {
3042 my $truncated = substr( $here_doc_target, 0, 80 );
3043 complain("Long here-target: '$truncated' ...\n");
3045 elsif ( !$here_doc_target ) {
3047 'Use of bare << to mean <<"" is deprecated' . "\n" )
3048 unless ($here_quote_character);
3050 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3052 "Unconventional here-target: '$here_doc_target'\n");
3055 elsif ( $expecting == TERM ) {
3056 unless ($saw_error) {
3058 # shouldn't happen..arriving here implies an error in
3059 # the logic in sub 'find_here_doc'
3062 Program bug; didn't find here doc target
3066 "Possible program error: didn't find here doc target\n"
3068 report_definite_bug();
3075 '<<~' => sub { # a here-doc, new type added in v26
3077 unless ( $i < $max_token_index )
3078 ; # here-doc not possible if end of line
3079 if ( $expecting != OPERATOR ) {
3080 my ( $found_target, $here_doc_target, $here_quote_character,
3083 $found_target, $here_doc_target, $here_quote_character, $i,
3086 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3089 if ($found_target) {
3091 if ( length($here_doc_target) > 80 ) {
3092 my $truncated = substr( $here_doc_target, 0, 80 );
3093 complain("Long here-target: '$truncated' ...\n");
3095 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3097 "Unconventional here-target: '$here_doc_target'\n");
3100 # Note that we put a leading space on the here quote
3101 # character indicate that it may be preceded by spaces
3102 $here_quote_character = " " . $here_quote_character;
3103 push @{$rhere_target_list},
3104 [ $here_doc_target, $here_quote_character ];
3107 elsif ( $expecting == TERM ) {
3108 unless ($saw_error) {
3110 # shouldn't happen..arriving here implies an error in
3111 # the logic in sub 'find_here_doc'
3114 Program bug; didn't find here doc target
3118 "Possible program error: didn't find here doc target\n"
3120 report_definite_bug();
3125 error_if_expecting_OPERATOR();
3130 # if -> points to a bare word, we must scan for an identifier,
3131 # otherwise something like ->y would look like the y operator
3133 # NOTE: this will currently allow things like
3134 # '->@array' '->*VAR' '->%hash'
3135 # to get parsed as identifiers, even though these are not currently
3136 # allowed syntax. To catch syntax errors like this we could first
3137 # check that the next character and skip this call if it is one of
3138 # ' @ % * '. A disadvantage with doing this is that this would
3139 # have to be fixed if the perltidy syntax is ever extended to make
3140 # any of these valid. So for now this check is not done.
3141 scan_identifier_fast();
3144 # type = 'pp' for pre-increment, '++' for post-increment
3146 if ( $expecting == TERM ) { $type = 'pp' }
3147 elsif ( $expecting == UNKNOWN ) {
3149 my ( $next_nonblank_token, $i_next ) =
3150 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3152 # Fix for c042: look past a side comment
3153 if ( $next_nonblank_token eq '#' ) {
3154 ( $next_nonblank_token, $i_next ) =
3155 find_next_nonblank_token( $max_token_index,
3156 $rtokens, $max_token_index );
3159 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3164 if ( $last_nonblank_type eq $tok ) {
3165 complain("Repeated '=>'s \n");
3168 # patch for operator_expected: note if we are in the list (use.t)
3169 # TODO: make version numbers a new token type
3170 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3173 # type = 'mm' for pre-decrement, '--' for post-decrement
3176 if ( $expecting == TERM ) { $type = 'mm' }
3177 elsif ( $expecting == UNKNOWN ) {
3178 my ( $next_nonblank_token, $i_next ) =
3179 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3181 # Fix for c042: look past a side comment
3182 if ( $next_nonblank_token eq '#' ) {
3183 ( $next_nonblank_token, $i_next ) =
3184 find_next_nonblank_token( $max_token_index,
3185 $rtokens, $max_token_index );
3188 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3193 error_if_expecting_TERM()
3194 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3198 error_if_expecting_TERM()
3199 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3203 error_if_expecting_TERM()
3204 if ( $expecting == TERM );
3208 # ------------------------------------------------------------
3209 # end hash of code for handling individual token types
3210 # ------------------------------------------------------------
3212 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
3214 # These block types terminate statements and do not need a trailing
3216 # patched for SWITCH/CASE/
3217 my %is_zero_continuation_block_type;
3218 @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
3219 if elsif else unless while until for foreach switch case given when);
3220 @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
3222 my %is_logical_container;
3223 @_ = qw(if elsif unless while and or err not && ! || for foreach);
3224 @is_logical_container{@_} = (1) x scalar(@_);
3228 @is_binary_type{@_} = (1) x scalar(@_);
3230 my %is_binary_keyword;
3231 @_ = qw(and or err eq ne cmp);
3232 @is_binary_keyword{@_} = (1) x scalar(@_);
3234 # 'L' is token for opening { at hash key
3235 my %is_opening_type;
3237 @is_opening_type{@_} = (1) x scalar(@_);
3239 # 'R' is token for closing } at hash key
3240 my %is_closing_type;
3242 @is_closing_type{@_} = (1) x scalar(@_);
3244 my %is_redo_last_next_goto;
3245 @_ = qw(redo last next goto);
3246 @is_redo_last_next_goto{@_} = (1) x scalar(@_);
3249 @_ = qw(use require);
3250 @is_use_require{@_} = (1) x scalar(@_);
3252 # This hash holds the array index in $tokenizer_self for these keywords:
3253 # Fix for issue c035: removed 'format' from this hash
3255 '__END__' => _in_end_,
3256 '__DATA__' => _in_data_,
3259 # original ref: camel 3 p 147,
3260 # but perl may accept undocumented flags
3261 # perl 5.10 adds 'p' (preserve)
3262 # Perl version 5.22 added 'n'
3263 # From http://perldoc.perl.org/perlop.html we have
3264 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
3265 # s/PATTERN/REPLACEMENT/msixpodualngcer
3266 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
3267 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
3268 # qr/STRING/msixpodualn
3269 my %quote_modifiers = (
3270 's' => '[msixpodualngcer]',
3273 'm' => '[msixpodualngc]',
3274 'qr' => '[msixpodualn]',
3281 # table showing how many quoted things to look for after quote operator..
3282 # s, y, tr have 2 (pattern and replacement)
3283 # others have 1 (pattern only)
3296 use constant DEBUG_TOKENIZE => 0;
3298 sub tokenize_this_line {
3300 # This routine breaks a line of perl code into tokens which are of use in
3301 # indentation and reformatting. One of my goals has been to define tokens
3302 # such that a newline may be inserted between any pair of tokens without
3303 # changing or invalidating the program. This version comes close to this,
3304 # although there are necessarily a few exceptions which must be caught by
3305 # the formatter. Many of these involve the treatment of bare words.
3307 # The tokens and their types are returned in arrays. See previous
3308 # routine for their names.
3310 # See also the array "valid_token_types" in the BEGIN section for an
3313 # To simplify things, token types are either a single character, or they
3314 # are identical to the tokens themselves.
3316 # As a debugging aid, the -D flag creates a file containing a side-by-side
3317 # comparison of the input string and its tokenization for each line of a file.
3318 # This is an invaluable debugging aid.
3320 # In addition to tokens, and some associated quantities, the tokenizer
3321 # also returns flags indication any special line types. These include
3322 # quotes, here_docs, formats.
3324 # -----------------------------------------------------------------------
3326 # How to add NEW_TOKENS:
3328 # New token types will undoubtedly be needed in the future both to keep up
3329 # with changes in perl and to help adapt the tokenizer to other applications.
3331 # Here are some notes on the minimal steps. I wrote these notes while
3332 # adding the 'v' token type for v-strings, which are things like version
3333 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
3334 # can use your editor to search for the string "NEW_TOKENS" to find the
3335 # appropriate sections to change):
3337 # *. Try to talk somebody else into doing it! If not, ..
3339 # *. Make a backup of your current version in case things don't work out!
3341 # *. Think of a new, unused character for the token type, and add to
3342 # the array @valid_token_types in the BEGIN section of this package.
3343 # For example, I used 'v' for v-strings.
3345 # *. Implement coding to recognize the $type of the token in this routine.
3346 # This is the hardest part, and is best done by imitating or modifying
3347 # some of the existing coding. For example, to recognize v-strings, I
3348 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
3349 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
3351 # *. Update sub operator_expected. This update is critically important but
3352 # the coding is trivial. Look at the comments in that routine for help.
3353 # For v-strings, which should behave like numbers, I just added 'v' to the
3354 # regex used to handle numbers and strings (types 'n' and 'Q').
3356 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
3357 # Perl::Tidy::Formatter for breaking lines around this token type. You can
3358 # skip this step and take the default at first, then adjust later to get
3359 # desired results. For adding type 'v', I looked at sub bond_strength and
3360 # saw that number type 'n' was using default strengths, so I didn't do
3361 # anything. I may tune it up someday if I don't like the way line
3362 # breaks with v-strings look.
3364 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
3365 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
3366 # and saw that type 'n' used spaces on both sides, so I just added 'v'
3367 # to the array @spaces_both_sides.
3369 # *. Update HtmlWriter package so that users can colorize the token as
3370 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
3371 # that package. For v-strings, I initially chose to use a default color
3372 # equal to the default for numbers, but it might be nice to change that
3375 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
3377 # *. Run lots and lots of debug tests. Start with special files designed
3378 # to test the new token type. Run with the -D flag to create a .DEBUG
3379 # file which shows the tokenization. When these work ok, test as many old
3380 # scripts as possible. Start with all of the '.t' files in the 'test'
3381 # directory of the distribution file. Compare .tdy output with previous
3382 # version and updated version to see the differences. Then include as
3383 # many more files as possible. My own technique has been to collect a huge
3384 # number of perl scripts (thousands!) into one directory and run perltidy
3385 # *, then run diff between the output of the previous version and the
3388 # *. For another example, search for the smartmatch operator '~~'
3389 # with your editor to see where updates were made for it.
3391 # -----------------------------------------------------------------------
3393 my $line_of_tokens = shift;
3394 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
3396 # patch while coding change is underway
3397 # make callers private data to allow access
3398 # $tokenizer_self = $caller_tokenizer_self;
3400 # extract line number for use in error messages
3401 $input_line_number = $line_of_tokens->{_line_number};
3403 # reinitialize for multi-line quote
3404 $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
3406 # check for pod documentation
3407 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
3408 && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
3411 # must not be in multi-line quote
3412 # and must not be in an equation
3414 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
3416 $tokenizer_self->[_in_pod_] = 1;
3421 $input_line = $untrimmed_input_line;
3425 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
3426 # This will be used below to avoid quoting a bare word followed by
3430 # trim start of this line unless we are continuing a quoted line
3431 # do not trim end because we might end in a quote (test: deken4.pl)
3432 # Perl::Tidy::Formatter will delete needless trailing blanks
3433 unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
3434 $input_line =~ s/^\s+//; # trim left end
3436 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
3437 && $input_line =~ /^\s*__(END|DATA)__\s*$/;
3440 # update the copy of the line for use in error messages
3441 # This must be exactly what we give the pre_tokenizer
3442 $tokenizer_self->[_line_of_text_] = $input_line;
3444 # re-initialize for the main loop
3445 $routput_token_list = []; # stack of output token indexes
3446 $routput_token_type = []; # token types
3447 $routput_block_type = []; # types of code block
3448 $routput_container_type = []; # paren types, such as if, elsif, ..
3449 $routput_type_sequence = []; # nesting sequential number
3451 $rhere_target_list = [];
3453 $tok = $last_nonblank_token;
3454 $type = $last_nonblank_type;
3455 $prototype = $last_nonblank_prototype;
3456 $last_nonblank_i = -1;
3457 $block_type = $last_nonblank_block_type;
3458 $container_type = $last_nonblank_container_type;
3459 $type_sequence = $last_nonblank_type_sequence;
3463 # tokenization is done in two stages..
3464 # stage 1 is a very simple pre-tokenization
3465 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
3467 # optimize for a full-line comment
3468 if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
3469 $max_tokens_wanted = 1; # no use tokenizing a comment
3471 # and check for skipped section
3472 if ( $rOpts_code_skipping
3473 && $input_line =~ /$code_skipping_pattern_begin/ )
3475 $tokenizer_self->[_in_skipped_] = 1;
3480 # start by breaking the line into pre-tokens
3481 ( $rtokens, $rtoken_map, $rtoken_type ) =
3482 pre_tokenize( $input_line, $max_tokens_wanted );
3484 $max_token_index = scalar( @{$rtokens} ) - 1;
3485 push( @{$rtokens}, ' ', ' ', ' ' ); # extra whitespace simplifies logic
3486 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
3487 push( @{$rtoken_type}, 'b', 'b', 'b' );
3489 # initialize for main loop
3490 foreach my $ii ( 0 .. $max_token_index + 3 ) {
3491 $routput_token_type->[$ii] = "";
3492 $routput_block_type->[$ii] = "";
3493 $routput_container_type->[$ii] = "";
3494 $routput_type_sequence->[$ii] = "";
3495 $routput_indent_flag->[$ii] = 0;
3500 # ------------------------------------------------------------
3501 # begin main tokenization loop
3502 # ------------------------------------------------------------
3504 # we are looking at each pre-token of one line and combining them
3506 while ( ++$i <= $max_token_index ) {
3508 if ($in_quote) { # continue looking for end of a quote
3509 $type = $quote_type;
3511 unless ( @{$routput_token_list} )
3512 { # initialize if continuation line
3513 push( @{$routput_token_list}, $i );
3514 $routput_token_type->[$i] = $type;
3518 # Removed to fix b1280. This is not needed and was causing the
3519 # starting type 'qw' to be lost, leading to mis-tokenization of
3520 # a trailing block brace in a parenless for stmt 'for .. qw.. {'
3521 ##$tok = $quote_character if ($quote_character);
3523 # scan for the end of the quote or pattern
3525 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
3526 $quoted_string_1, $quoted_string_2
3529 $i, $in_quote, $quote_character,
3530 $quote_pos, $quote_depth, $quoted_string_1,
3531 $quoted_string_2, $rtokens, $rtoken_map,
3535 # all done if we didn't find it
3536 last if ($in_quote);
3538 # save pattern and replacement text for rescanning
3539 my $qs1 = $quoted_string_1;
3540 my $qs2 = $quoted_string_2;
3542 # re-initialize for next search
3543 $quote_character = '';
3546 $quoted_string_1 = "";
3547 $quoted_string_2 = "";
3548 last if ( ++$i > $max_token_index );
3550 # look for any modifiers
3551 if ($allowed_quote_modifiers) {
3553 # check for exact quote modifiers
3554 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
3555 my $str = $rtokens->[$i];
3557 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
3558 my $pos = pos($str);
3559 my $char = substr( $str, $pos - 1, 1 );
3560 $saw_modifier_e ||= ( $char eq 'e' );
3563 # For an 'e' quote modifier we must scan the replacement
3564 # text for here-doc targets...
3565 # but if the modifier starts a new line we can skip
3566 # this because either the here doc will be fully
3567 # contained in the replacement text (so we can
3568 # ignore it) or Perl will not find it.
3569 # See test 'here2.in'.
3570 if ( $saw_modifier_e && $i_tok >= 0 ) {
3572 my $rht = scan_replacement_text($qs1);
3574 # Change type from 'Q' to 'h' for quotes with
3575 # here-doc targets so that the formatter (see sub
3576 # process_line_of_CODE) will not make any line
3577 # breaks after this point.
3579 push @{$rhere_target_list}, @{$rht};
3582 my $ilast = $routput_token_list->[-1];
3583 $routput_token_type->[$ilast] = $type;
3588 if ( defined( pos($str) ) ) {
3591 if ( pos($str) == length($str) ) {
3592 last if ( ++$i > $max_token_index );
3595 # Looks like a joined quote modifier
3596 # and keyword, maybe something like
3597 # s/xxx/yyy/gefor @k=...
3598 # Example is "galgen.pl". Would have to split
3599 # the word and insert a new token in the
3600 # pre-token list. This is so rare that I haven't
3601 # done it. Will just issue a warning citation.
3603 # This error might also be triggered if my quote
3604 # modifier characters are incomplete
3608 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
3609 Please put a space between quote modifiers and trailing keywords.
3612 # print "token $rtokens->[$i]\n";
3613 # my $num = length($str) - pos($str);
3614 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
3615 # print "continuing with new token $rtokens->[$i]\n";
3617 # skipping past this token does least damage
3618 last if ( ++$i > $max_token_index );
3623 # example file: rokicki4.pl
3624 # This error might also be triggered if my quote
3625 # modifier characters are incomplete
3626 write_logfile_entry(
3627 "Note: found word $str at quote modifier location\n"
3633 $allowed_quote_modifiers = "";
3637 unless ( $type eq 'b' || $tok eq 'CORE::' ) {
3639 # try to catch some common errors
3640 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
3642 if ( $last_nonblank_token eq 'eq' ) {
3643 complain("Should 'eq' be '==' here ?\n");
3645 elsif ( $last_nonblank_token eq 'ne' ) {
3646 complain("Should 'ne' be '!=' here ?\n");
3650 # fix c090, only rotate vars if a new token will be stored
3651 if ( $i_tok >= 0 ) {
3652 $last_last_nonblank_token = $last_nonblank_token;
3653 $last_last_nonblank_type = $last_nonblank_type;
3654 $last_last_nonblank_block_type = $last_nonblank_block_type;
3655 $last_last_nonblank_container_type =
3656 $last_nonblank_container_type;
3657 $last_last_nonblank_type_sequence =
3658 $last_nonblank_type_sequence;
3660 # Fix part #3 for git82: propagate type 'Z' though L-R pair
3661 unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
3662 $last_nonblank_token = $tok;
3663 $last_nonblank_type = $type;
3665 $last_nonblank_prototype = $prototype;
3666 $last_nonblank_block_type = $block_type;
3667 $last_nonblank_container_type = $container_type;
3668 $last_nonblank_type_sequence = $type_sequence;
3669 $last_nonblank_i = $i_tok;
3672 # Patch for c030: Fix things in case a '->' got separated from
3673 # the subsequent identifier by a side comment. We need the
3674 # last_nonblank_token to have a leading -> to avoid triggering
3675 # an operator expected error message at the next '('. See also
3677 if ( $last_last_nonblank_token eq '->' ) {
3678 if ( $last_nonblank_type eq 'w'
3679 || $last_nonblank_type eq 'i'
3680 && substr( $last_nonblank_token, 0, 1 ) eq '$' )
3682 $last_nonblank_token = '->' . $last_nonblank_token;
3683 $last_nonblank_type = 'i';
3688 # store previous token type
3689 if ( $i_tok >= 0 ) {
3690 $routput_token_type->[$i_tok] = $type;
3691 $routput_block_type->[$i_tok] = $block_type;
3692 $routput_container_type->[$i_tok] = $container_type;
3693 $routput_type_sequence->[$i_tok] = $type_sequence;
3694 $routput_indent_flag->[$i_tok] = $indent_flag;
3696 my $pre_tok = $rtokens->[$i]; # get the next pre-token
3697 my $pre_type = $rtoken_type->[$i]; # and type
3699 $type = $pre_type; # to be modified as necessary
3700 $block_type = ""; # blank for all tokens except code block braces
3701 $container_type = ""; # blank for all tokens except some parens
3702 $type_sequence = ""; # blank for all tokens except ?/:
3704 $prototype = ""; # blank for all tokens except user defined subs
3707 # this pre-token will start an output token
3708 push( @{$routput_token_list}, $i_tok );
3710 # continue gathering identifier if necessary
3711 # but do not start on blanks and comments
3712 if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {
3714 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
3721 if ($id_scan_state) {
3723 # Still scanning ...
3724 # Check for side comment between sub and prototype (c061)
3726 # done if nothing left to scan on this line
3727 last if ( $i > $max_token_index );
3729 my ( $next_nonblank_token, $i_next ) =
3730 find_next_nonblank_token_on_this_line( $i, $rtokens,
3733 # done if it was just some trailing space
3734 last if ( $i_next > $max_token_index );
3736 # something remains on the line ... must be a side comment
3740 next if ( ( $i > 0 ) || $type );
3742 # didn't find any token; start over
3747 # handle whitespace tokens..
3748 next if ( $type eq 'b' );
3749 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : ' ';
3750 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
3752 # Build larger tokens where possible, since we are not in a quote.
3754 # First try to assemble digraphs. The following tokens are
3755 # excluded and handled specially:
3756 # '/=' is excluded because the / might start a pattern.
3757 # 'x=' is excluded since it might be $x=, with $ on previous line
3758 # '**' and *= might be typeglobs of punctuation variables
3759 # I have allowed tokens starting with <, such as <=,
3760 # because I don't think these could be valid angle operators.
3761 # test file: storrs4.pl
3762 my $test_tok = $tok . $rtokens->[ $i + 1 ];
3763 my $combine_ok = $is_digraph{$test_tok};
3765 # check for special cases which cannot be combined
3768 # '//' must be defined_or operator if an operator is expected.
3769 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
3770 # could be migrated here for clarity
3772 # Patch for RT#102371, misparsing a // in the following snippet:
3773 # state $b //= ccc();
3774 # The solution is to always accept the digraph (or trigraph) after
3775 # token type 'Z' (possible file handle). The reason is that
3776 # sub operator_expected gives TERM expected here, which is
3777 # wrong in this case.
3778 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
3779 my $next_type = $rtokens->[ $i + 1 ];
3781 operator_expected( [ $prev_type, $tok, $next_type ] );
3783 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
3784 $combine_ok = 0 if ( $expecting == TERM );
3787 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
3788 # Accept the digraphs '**' only after type 'Z'
3789 # Otherwise postpone the decision.
3790 if ( $test_tok eq '**' ) {
3791 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
3798 && ( $test_tok ne '/=' ) # might be pattern
3799 && ( $test_tok ne 'x=' ) # might be $x
3800 && ( $test_tok ne '*=' ) # typeglob?
3802 # Moved above as part of fix for
3803 # RT #114359: Missparsing of "print $x ** 0.5;
3804 # && ( $test_tok ne '**' ) # typeglob?
3810 # Now try to assemble trigraphs. Note that all possible
3811 # perl trigraphs can be constructed by appending a character
3813 $test_tok = $tok . $rtokens->[ $i + 1 ];
3815 if ( $is_trigraph{$test_tok} ) {
3820 # The only current tetragraph is the double diamond operator
3821 # and its first three characters are not a trigraph, so
3822 # we do can do a special test for it
3823 elsif ( $test_tok eq '<<>' ) {
3824 $test_tok .= $rtokens->[ $i + 2 ];
3825 if ( $is_tetragraph{$test_tok} ) {
3833 $next_tok = $rtokens->[ $i + 1 ];
3834 $next_type = $rtoken_type->[ $i + 1 ];
3836 DEBUG_TOKENIZE && do {
3839 $last_nonblank_token, $tok,
3840 $next_tok, $brace_depth,
3841 $brace_type[$brace_depth], $paren_depth,
3842 $paren_type[$paren_depth]
3844 print STDOUT "TOKENIZE:(@debug_list)\n";
3847 # Turn off attribute list on first non-blank, non-bareword.
3848 # Added '#' to fix c038.
3849 if ( $pre_type ne 'w' && $pre_type ne '#' ) {
3850 $in_attribute_list = 0;
3853 ###############################################################
3854 # We have the next token, $tok.
3855 # Now we have to examine this token and decide what it is
3856 # and define its $type
3858 # section 1: bare words
3859 ###############################################################
3861 if ( $pre_type eq 'w' ) {
3863 operator_expected( [ $prev_type, $tok, $next_type ] );
3865 # Patch for c043, part 3: A bareword after '->' expects a TERM
3866 # FIXME: It would be cleaner to give method calls a new type 'M'
3867 # and update sub operator_expected to handle this.
3868 if ( $last_nonblank_type eq '->' ) {
3872 my ( $next_nonblank_token, $i_next ) =
3873 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3875 # ATTRS: handle sub and variable attributes
3876 if ($in_attribute_list) {
3878 # treat bare word followed by open paren like qw(
3879 if ( $next_nonblank_token eq '(' ) {
3881 # For something like:
3883 # we should let do_scan_sub see it so that it can see
3884 # the prototype. All other attributes get parsed as a
3886 if ( $tok eq 'prototype' ) {
3887 $id_scan_state = 'prototype';
3889 # start just after the word 'prototype'
3891 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3893 input_line => $input_line,
3898 rtokens => $rtokens,
3899 rtoken_map => $rtoken_map,
3900 id_scan_state => $id_scan_state,
3901 max_token_index => $max_token_index
3905 # If successful, mark as type 'q' to be consistent with other
3906 # attributes. Note that type 'w' would also work.
3907 if ( $i > $i_beg ) {
3912 # If not successful, continue and parse as a quote.
3915 # All other attribute lists must be parsed as quotes
3916 # (see 'signatures.t' for good examples)
3917 $in_quote = $quote_items{'q'};
3918 $allowed_quote_modifiers = $quote_modifiers{'q'};
3924 # handle bareword not followed by open paren
3931 # quote a word followed by => operator
3932 # unless the word __END__ or __DATA__ and the only word on
3934 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
3936 if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
3937 if ( $is_constant{$current_package}{$tok} ) {
3940 elsif ( $is_user_function{$current_package}{$tok} ) {
3943 $user_function_prototype{$current_package}{$tok};
3945 elsif ( $tok =~ /^v\d+$/ ) {
3947 report_v_string($tok);
3951 # Bareword followed by a fat comma ... see 'git18.in'
3952 # If tok is something like 'x17' then it could
3953 # actually be operator x followed by number 17.
3954 # For example, here:
3955 # 123x17 => [ 792, 1224 ],
3956 # (a key of 123 repeated 17 times, perhaps not
3957 # what was intended). We will mark x17 as type
3958 # 'n' and it will be split. If the previous token
3959 # was also a bareword then it is not very clear is
3960 # going on. In this case we will not be sure that
3961 # an operator is expected, so we just mark it as a
3962 # bareword. Perl is a little murky in what it does
3963 # with stuff like this, and its behavior can change
3964 # over time. Something like
3965 # a x18 => [792, 1224], will compile as
3966 # a key with 18 a's. But something like
3967 # push @array, a x18;
3968 # is a syntax error.
3970 $expecting == OPERATOR
3971 && substr( $tok, 0, 1 ) eq 'x'
3972 && ( length($tok) == 1
3973 || substr( $tok, 1, 1 ) =~ /^\d/ )
3977 if ( split_pretoken(1) ) {
3986 error_if_expecting_OPERATOR();
3994 # quote a bare word within braces..like xxx->{s}; note that we
3995 # must be sure this is not a structural brace, to avoid
3996 # mistaking {s} in the following for a quoted bare word:
3997 # for(@[){s}bla}BLA}
3998 # Also treat q in something like var{-q} as a bare word, not qoute operator
4000 $next_nonblank_token eq '}'
4002 $last_nonblank_type eq 'L'
4003 || ( $last_nonblank_type eq 'm'
4004 && $last_last_nonblank_type eq 'L' )
4012 # Scan a bare word following a -> as an identifir; it could
4013 # have a long package name. Fixes c037, c041.
4014 if ( $last_nonblank_token eq '->' ) {
4015 scan_bare_identifier();
4017 # Patch for c043, part 4; use type 'w' after a '->'.
4018 # This is just a safety check on sub scan_bare_identifier,
4019 # which should get this case correct.
4024 # a bare word immediately followed by :: is not a keyword;
4025 # use $tok_kw when testing for keywords to avoid a mistake
4027 if ( $rtokens->[ $i + 1 ] eq ':'
4028 && $rtokens->[ $i + 2 ] eq ':' )
4033 # Decide if 'sub :' can be the start of a sub attribute list.
4034 # We will decide based on if the colon is followed by a
4035 # bareword which is not a keyword.
4036 # Changed inext+1 to inext to fixed case b1190.
4037 my $sub_attribute_ok_here;
4038 if ( $is_sub{$tok_kw}
4039 && $expecting != OPERATOR
4040 && $next_nonblank_token eq ':' )
4042 my ( $nn_nonblank_token, $i_nn ) =
4043 find_next_nonblank_token( $i_next,
4044 $rtokens, $max_token_index );
4045 $sub_attribute_ok_here =
4046 $nn_nonblank_token =~ /^\w/
4047 && $nn_nonblank_token !~ /^\d/
4048 && !$is_keyword{$nn_nonblank_token};
4051 # handle operator x (now we know it isn't $x=)
4053 $expecting == OPERATOR
4054 && substr( $tok, 0, 1 ) eq 'x'
4055 && ( length($tok) == 1
4056 || substr( $tok, 1, 1 ) =~ /^\d/ )
4060 if ( $tok eq 'x' ) {
4061 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
4072 # Split a pretoken like 'x10' into 'x' and '10'.
4073 # Note: In previous versions of perltidy it was marked
4074 # as a number, $type = 'n', and fixed downstream by the
4077 if ( split_pretoken(1) ) {
4083 elsif ( $tok_kw eq 'CORE::' ) {
4084 $type = $tok = $tok_kw;
4087 elsif ( ( $tok eq 'strict' )
4088 and ( $last_nonblank_token eq 'use' ) )
4090 $tokenizer_self->[_saw_use_strict_] = 1;
4091 scan_bare_identifier();
4094 elsif ( ( $tok eq 'warnings' )
4095 and ( $last_nonblank_token eq 'use' ) )
4097 $tokenizer_self->[_saw_perl_dash_w_] = 1;
4099 # scan as identifier, so that we pick up something like:
4100 # use warnings::register
4101 scan_bare_identifier();
4105 $tok eq 'AutoLoader'
4106 && $tokenizer_self->[_look_for_autoloader_]
4108 $last_nonblank_token eq 'use'
4110 # these regexes are from AutoSplit.pm, which we want
4112 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4113 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4117 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4118 $tokenizer_self->[_saw_autoloader_] = 1;
4119 $tokenizer_self->[_look_for_autoloader_] = 0;
4120 scan_bare_identifier();
4124 $tok eq 'SelfLoader'
4125 && $tokenizer_self->[_look_for_selfloader_]
4126 && ( $last_nonblank_token eq 'use'
4127 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4128 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4131 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4132 $tokenizer_self->[_saw_selfloader_] = 1;
4133 $tokenizer_self->[_look_for_selfloader_] = 0;
4134 scan_bare_identifier();
4137 elsif ( ( $tok eq 'constant' )
4138 and ( $last_nonblank_token eq 'use' ) )
4140 scan_bare_identifier();
4141 my ( $next_nonblank_token, $i_next ) =
4142 find_next_nonblank_token( $i, $rtokens,
4145 if ($next_nonblank_token) {
4147 if ( $is_keyword{$next_nonblank_token} ) {
4149 # Assume qw is used as a quote and okay, as in:
4150 # use constant qw{ DEBUG 0 };
4151 # Not worth trying to parse for just a warning
4153 # NOTE: This warning is deactivated because recent
4154 # versions of perl do not complain here, but
4155 # the coding is retained for reference.
4156 if ( 0 && $next_nonblank_token ne 'qw' ) {
4158 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
4164 $is_constant{$current_package}{$next_nonblank_token}
4170 # various quote operators
4171 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4173 if ( $expecting == OPERATOR ) {
4175 # Be careful not to call an error for a qw quote
4176 # where a parenthesized list is allowed. For example,
4177 # it could also be a for/foreach construct such as
4179 # foreach my $key qw\Uno Due Tres Quadro\ {
4180 # print "Set $key\n";
4184 # Or it could be a function call.
4185 # NOTE: Braces in something like &{ xxx } are not
4186 # marked as a block, we might have a method call.
4187 # &method(...), $method->(..), &{method}(...),
4188 # $ref[2](list) is ok & short for $ref[2]->(list)
4190 # See notes in 'sub code_block_type' and
4191 # 'sub is_non_structural_brace'
4195 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
4196 || $is_for_foreach{$want_paren} )
4199 error_if_expecting_OPERATOR();
4202 $in_quote = $quote_items{$tok};
4203 $allowed_quote_modifiers = $quote_modifiers{$tok};
4205 # All quote types are 'Q' except possibly qw quotes.
4206 # qw quotes are special in that they may generally be trimmed
4207 # of leading and trailing whitespace. So they are given a
4208 # separate type, 'q', unless requested otherwise.
4210 ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
4213 $quote_type = $type;
4216 # check for a statement label
4218 ( $next_nonblank_token eq ':' )
4219 && ( $rtokens->[ $i_next + 1 ] ne ':' )
4220 && ( $i_next <= $max_token_index ) # colon on same line
4221 && !$sub_attribute_ok_here # like 'sub : lvalue' ?
4225 if ( $tok !~ /[A-Z]/ ) {
4226 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4236 elsif ( $is_sub{$tok_kw} ) {
4237 error_if_expecting_OPERATOR()
4238 if ( $expecting == OPERATOR );
4239 initialize_subname();
4244 elsif ( $is_package{$tok_kw} ) {
4245 error_if_expecting_OPERATOR()
4246 if ( $expecting == OPERATOR );
4250 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4251 # more restrictive. Require a new statement to be ok here.
4252 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4253 $type = ';'; # make tokenizer look for TERM next
4254 $tokenizer_self->[_in_format_] = 1;
4258 # Note on token types for format, __DATA__, __END__:
4259 # It simplifies things to give these type ';', so that when we
4260 # start rescanning we will be expecting a token of type TERM.
4261 # We will switch to type 'k' before outputting the tokens.
4262 elsif ( $is_END_DATA{$tok_kw} ) {
4263 $type = ';'; # make tokenizer look for TERM next
4265 # Remember that we are in one of these three sections
4266 $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4270 elsif ( $is_keyword{$tok_kw} ) {
4273 # Since for and foreach may not be followed immediately
4274 # by an opening paren, we have to remember which keyword
4275 # is associated with the next '('
4276 if ( $is_for_foreach{$tok} ) {
4277 if ( new_statement_ok() ) {
4282 # recognize 'use' statements, which are special
4283 elsif ( $is_use_require{$tok} ) {
4284 $statement_type = $tok;
4285 error_if_expecting_OPERATOR()
4286 if ( $expecting == OPERATOR );
4289 # remember my and our to check for trailing ": shared"
4290 elsif ( $is_my_our_state{$tok} ) {
4291 $statement_type = $tok;
4294 # Check for misplaced 'elsif' and 'else', but allow isolated
4295 # else or elsif blocks to be formatted. This is indicated
4296 # by a last noblank token of ';'
4297 elsif ( $tok eq 'elsif' ) {
4298 if ( $last_nonblank_token ne ';'
4299 && $last_nonblank_block_type !~
4300 /^(if|elsif|unless)$/ )
4303 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
4307 elsif ( $tok eq 'else' ) {
4309 # patched for SWITCH/CASE
4311 $last_nonblank_token ne ';'
4312 && $last_nonblank_block_type !~
4313 /^(if|elsif|unless|case|when)$/
4315 # patch to avoid an unwanted error message for
4316 # the case of a parenless 'case' (RT 105484):
4317 # switch ( 1 ) { case x { 2 } else { } }
4318 && $statement_type !~
4319 /^(if|elsif|unless|case|when)$/
4323 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
4327 elsif ( $tok eq 'continue' ) {
4328 if ( $last_nonblank_token ne ';'
4329 && $last_nonblank_block_type !~
4330 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
4333 # note: ';' '{' and '}' in list above
4334 # because continues can follow bare blocks;
4335 # ':' is labeled block
4337 ############################################
4338 # NOTE: This check has been deactivated because
4339 # continue has an alternative usage for given/when
4340 # blocks in perl 5.10
4341 ## warning("'$tok' should follow a block\n");
4342 ############################################
4346 # patch for SWITCH/CASE if 'case' and 'when are
4347 # treated as keywords. Also 'default' for Switch::Plain
4348 elsif ($tok eq 'when'
4350 || $tok eq 'default' )
4352 $statement_type = $tok; # next '{' is block
4356 # indent trailing if/unless/while/until
4357 # outdenting will be handled by later indentation loop
4358 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
4370 ## if ( $tok =~ /^(if|unless|while|until)$/
4371 ## && $next_nonblank_token ne '(' )
4373 ## $indent_flag = 1;
4377 # check for inline label following
4378 # /^(redo|last|next|goto)$/
4379 elsif (( $last_nonblank_type eq 'k' )
4380 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4389 scan_bare_identifier();
4391 if ( $statement_type eq 'use'
4392 && $last_nonblank_token eq 'use' )
4394 $saw_use_module{$current_package}->{$tok} = 1;
4397 if ( $type eq 'w' ) {
4399 if ( $expecting == OPERATOR ) {
4401 # Patch to avoid error message for RPerl overloaded
4402 # operator functions: use overload
4407 # FIXME: this should eventually be generalized
4408 if ( $saw_use_module{$current_package}->{'RPerl'}
4409 && $tok =~ /^sse_(mul|div|add|sub)$/ )
4414 # Fix part 1 for git #63 in which a comment falls
4415 # between an -> and the following word. An
4416 # alternate fix would be to change operator_expected
4417 # to return an UNKNOWN for this type.
4418 elsif ( $last_nonblank_type eq '->' ) {
4422 # don't complain about possible indirect object
4426 # sub new($) { ... }
4427 # $b = new A::; # calls A::new
4428 # $c = new A; # same thing but suspicious
4429 # This will call A::new but we have a 'new' in
4430 # main:: which looks like a constant.
4432 elsif ( $last_nonblank_type eq 'C' ) {
4433 if ( $tok !~ /::$/ ) {
4435 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
4436 Maybe indirectet object notation?
4441 error_if_expecting_OPERATOR("bareword");
4445 # mark bare words immediately followed by a paren as
4447 $next_tok = $rtokens->[ $i + 1 ];
4448 if ( $next_tok eq '(' ) {
4450 # Fix part 2 for git #63. Leave type as 'w' to keep
4451 # the type the same as if the -> were not separated
4452 $type = 'U' unless ( $last_nonblank_type eq '->' );
4455 # underscore after file test operator is file handle
4456 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4460 # patch for SWITCH/CASE if 'case' and 'when are
4461 # not treated as keywords:
4465 && $brace_type[$brace_depth] eq 'switch'
4468 && $brace_type[$brace_depth] eq 'given' )
4471 $statement_type = $tok; # next '{' is block
4472 $type = 'k'; # for keyword syntax coloring
4475 # patch for SWITCH/CASE if switch and given not keywords
4476 # Switch is not a perl 5 keyword, but we will gamble
4477 # and mark switch followed by paren as a keyword. This
4478 # is only necessary to get html syntax coloring nice,
4479 # and does not commit this as being a switch/case.
4480 if ( $next_nonblank_token eq '('
4481 && ( $tok eq 'switch' || $tok eq 'given' ) )
4483 $type = 'k'; # for keyword syntax coloring
4489 ###############################################################
4490 # section 2: strings of digits
4491 ###############################################################
4492 elsif ( $pre_type eq 'd' ) {
4494 operator_expected( [ $prev_type, $tok, $next_type ] );
4495 error_if_expecting_OPERATOR("Number")
4496 if ( $expecting == OPERATOR );
4498 my $number = scan_number_fast();
4499 if ( !defined($number) ) {
4501 # shouldn't happen - we should always get a number
4504 non-number beginning with digit--program bug
4508 "Unexpected error condition: non-number beginning with digit\n"
4510 report_definite_bug();
4514 ###############################################################
4515 # section 3: all other tokens
4516 ###############################################################
4519 last if ( $tok eq '#' );
4520 my $code = $tokenization_code->{$tok};
4523 operator_expected( [ $prev_type, $tok, $next_type ] );
4530 # -----------------------------
4531 # end of main tokenization loop
4532 # -----------------------------
4534 if ( $i_tok >= 0 ) {
4535 $routput_token_type->[$i_tok] = $type;
4536 $routput_block_type->[$i_tok] = $block_type;
4537 $routput_container_type->[$i_tok] = $container_type;
4538 $routput_type_sequence->[$i_tok] = $type_sequence;
4539 $routput_indent_flag->[$i_tok] = $indent_flag;
4542 unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
4543 $last_last_nonblank_token = $last_nonblank_token;
4544 $last_last_nonblank_type = $last_nonblank_type;
4545 $last_last_nonblank_block_type = $last_nonblank_block_type;
4546 $last_last_nonblank_container_type = $last_nonblank_container_type;
4547 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
4548 $last_nonblank_token = $tok;
4549 $last_nonblank_type = $type;
4550 $last_nonblank_block_type = $block_type;
4551 $last_nonblank_container_type = $container_type;
4552 $last_nonblank_type_sequence = $type_sequence;
4553 $last_nonblank_prototype = $prototype;
4556 # reset indentation level if necessary at a sub or package
4557 # in an attempt to recover from a nesting error
4558 if ( $level_in_tokenizer < 0 ) {
4559 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4560 reset_indentation_level(0);
4561 brace_warning("resetting level to 0 at $1 $2\n");
4565 # all done tokenizing this line ...
4566 # now prepare the final list of tokens and types
4568 my @token_type = (); # stack of output token types
4569 my @block_type = (); # stack of output code block types
4570 my @container_type = (); # stack of output code container types
4571 my @type_sequence = (); # stack of output type sequence numbers
4572 my @tokens = (); # output tokens
4573 my @levels = (); # structural brace levels of output tokens
4574 my @slevels = (); # secondary nesting levels of output tokens
4575 my @nesting_tokens = (); # string of tokens leading to this depth
4576 my @nesting_types = (); # string of token types leading to this depth
4577 my @nesting_blocks = (); # string of block types leading to this depth
4578 my @nesting_lists = (); # string of list types leading to this depth
4579 my @ci_string = (); # string needed to compute continuation indentation
4580 my @container_environment = (); # BLOCK or LIST
4581 my $container_environment = '';
4582 my $im = -1; # previous $i value
4585 # Count the number of '1's in the string (previously sub ones_count)
4586 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4588 # Computing Token Indentation
4590 # The final section of the tokenizer forms tokens and also computes
4591 # parameters needed to find indentation. It is much easier to do it
4592 # in the tokenizer than elsewhere. Here is a brief description of how
4593 # indentation is computed. Perl::Tidy computes indentation as the sum
4596 # (1) structural indentation, such as if/else/elsif blocks
4597 # (2) continuation indentation, such as long parameter call lists.
4599 # These are occasionally called primary and secondary indentation.
4601 # Structural indentation is introduced by tokens of type '{', although
4602 # the actual tokens might be '{', '(', or '['. Structural indentation
4603 # is of two types: BLOCK and non-BLOCK. Default structural indentation
4604 # is 4 characters if the standard indentation scheme is used.
4606 # Continuation indentation is introduced whenever a line at BLOCK level
4607 # is broken before its termination. Default continuation indentation
4608 # is 2 characters in the standard indentation scheme.
4610 # Both types of indentation may be nested arbitrarily deep and
4611 # interlaced. The distinction between the two is somewhat arbitrary.
4613 # For each token, we will define two variables which would apply if
4614 # the current statement were broken just before that token, so that
4615 # that token started a new line:
4617 # $level = the structural indentation level,
4618 # $ci_level = the continuation indentation level
4620 # The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
4621 # assuming defaults. However, in some special cases it is customary
4622 # to modify $ci_level from this strict value.
4624 # The total structural indentation is easy to compute by adding and
4625 # subtracting 1 from a saved value as types '{' and '}' are seen. The
4626 # running value of this variable is $level_in_tokenizer.
4628 # The total continuation is much more difficult to compute, and requires
4629 # several variables. These variables are:
4631 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4632 # each indentation level, if there are intervening open secondary
4633 # structures just prior to that level.
4634 # $continuation_string_in_tokenizer = a string of 1's and 0's indicating
4635 # if the last token at that level is "continued", meaning that it
4636 # is not the first token of an expression.
4637 # $nesting_block_string = a string of 1's and 0's indicating, for each
4638 # indentation level, if the level is of type BLOCK or not.
4639 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
4640 # $nesting_list_string = a string of 1's and 0's indicating, for each
4641 # indentation level, if it is appropriate for list formatting.
4642 # If so, continuation indentation is used to indent long list items.
4643 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
4644 # @{$rslevel_stack} = a stack of total nesting depths at each
4645 # structural indentation level, where "total nesting depth" means
4646 # the nesting depth that would occur if every nesting token -- '{', '[',
4647 # and '(' -- , regardless of context, is used to compute a nesting
4650 #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
4651 #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
4653 my ( $ci_string_i, $level_i, $nesting_block_string_i,
4654 $nesting_list_string_i, $nesting_token_string_i,
4655 $nesting_type_string_i, );
4657 foreach my $i ( @{$routput_token_list} )
4658 { # scan the list of pre-tokens indexes
4660 # self-checking for valid token types
4661 my $type = $routput_token_type->[$i];
4662 my $forced_indentation_flag = $routput_indent_flag->[$i];
4664 # See if we should undo the $forced_indentation_flag.
4665 # Forced indentation after 'if', 'unless', 'while' and 'until'
4666 # expressions without trailing parens is optional and doesn't
4667 # always look good. It is usually okay for a trailing logical
4668 # expression, but if the expression is a function call, code block,
4669 # or some kind of list it puts in an unwanted extra indentation
4670 # level which is hard to remove.
4672 # Example where extra indentation looks ok:
4674 # if $det_a < 0 and $det_b > 0
4675 # or $det_a > 0 and $det_b < 0;
4677 # Example where extra indentation is not needed because
4678 # the eval brace also provides indentation:
4679 # print "not " if defined eval {
4680 # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
4683 # The following rule works fairly well:
4684 # Undo the flag if the end of this line, or start of the next
4685 # line, is an opening container token or a comma.
4686 # This almost always works, but if not after another pass it will
4688 if ( $forced_indentation_flag && $type eq 'k' ) {
4690 my $ilast = $routput_token_list->[$ixlast];
4691 my $toklast = $routput_token_type->[$ilast];
4692 if ( $toklast eq '#' ) {
4694 $ilast = $routput_token_list->[$ixlast];
4695 $toklast = $routput_token_type->[$ilast];
4697 if ( $toklast eq 'b' ) {
4699 $ilast = $routput_token_list->[$ixlast];
4700 $toklast = $routput_token_type->[$ilast];
4702 if ( $toklast =~ /^[\{,]$/ ) {
4703 $forced_indentation_flag = 0;
4706 ( $toklast, my $i_next ) =
4707 find_next_nonblank_token( $max_token_index, $rtokens,
4709 if ( $toklast =~ /^[\{,]$/ ) {
4710 $forced_indentation_flag = 0;
4715 # if we are already in an indented if, see if we should outdent
4716 if ($indented_if_level) {
4718 # don't try to nest trailing if's - shouldn't happen
4719 if ( $type eq 'k' ) {
4720 $forced_indentation_flag = 0;
4723 # check for the normal case - outdenting at next ';'
4724 elsif ( $type eq ';' ) {
4725 if ( $level_in_tokenizer == $indented_if_level ) {
4726 $forced_indentation_flag = -1;
4727 $indented_if_level = 0;
4731 # handle case of missing semicolon
4732 elsif ( $type eq '}' ) {
4733 if ( $level_in_tokenizer == $indented_if_level ) {
4734 $indented_if_level = 0;
4736 # TBD: This could be a subroutine call
4737 $level_in_tokenizer--;
4738 if ( @{$rslevel_stack} > 1 ) {
4739 pop( @{$rslevel_stack} );
4741 if ( length($nesting_block_string) > 1 )
4742 { # true for valid script
4743 chop $nesting_block_string;
4744 chop $nesting_list_string;
4751 my $tok = $rtokens->[$i]; # the token, but ONLY if same as pretoken
4752 $level_i = $level_in_tokenizer;
4754 # This can happen by running perltidy on non-scripts
4755 # although it could also be bug introduced by programming change.
4756 # Perl silently accepts a 032 (^Z) and takes it as the end
4757 if ( !$is_valid_token_type{$type} ) {
4758 my $val = ord($type);
4760 "unexpected character decimal $val ($type) in script\n");
4761 $tokenizer_self->[_in_error_] = 1;
4764 # ----------------------------------------------------------------
4765 # TOKEN TYPE PATCHES
4766 # output __END__, __DATA__, and format as type 'k' instead of ';'
4767 # to make html colors correct, etc.
4768 my $fix_type = $type;
4769 if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
4771 # output anonymous 'sub' as keyword
4772 if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
4774 # -----------------------------------------------------------------
4776 $nesting_token_string_i = $nesting_token_string;
4777 $nesting_type_string_i = $nesting_type_string;
4778 $nesting_block_string_i = $nesting_block_string;
4779 $nesting_list_string_i = $nesting_list_string;
4781 # set primary indentation levels based on structural braces
4782 # Note: these are set so that the leading braces have a HIGHER
4783 # level than their CONTENTS, which is convenient for indentation
4784 # Also, define continuation indentation for each token.
4785 if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
4788 # use environment before updating
4789 $container_environment =
4790 $nesting_block_flag ? 'BLOCK'
4791 : $nesting_list_flag ? 'LIST'
4794 # if the difference between total nesting levels is not 1,
4795 # there are intervening non-structural nesting types between
4796 # this '{' and the previous unclosed '{'
4797 my $intervening_secondary_structure = 0;
4798 if ( @{$rslevel_stack} ) {
4799 $intervening_secondary_structure =
4800 $slevel_in_tokenizer - $rslevel_stack->[-1];
4803 # Continuation Indentation
4805 # Having tried setting continuation indentation both in the formatter and
4806 # in the tokenizer, I can say that setting it in the tokenizer is much,
4807 # much easier. The formatter already has too much to do, and can't
4808 # make decisions on line breaks without knowing what 'ci' will be at
4809 # arbitrary locations.
4811 # But a problem with setting the continuation indentation (ci) here
4812 # in the tokenizer is that we do not know where line breaks will actually
4813 # be. As a result, we don't know if we should propagate continuation
4814 # indentation to higher levels of structure.
4816 # For nesting of only structural indentation, we never need to do this.
4817 # For example, in a long if statement, like this
4819 # if ( !$output_block_type[$i]
4820 # && ($in_statement_continuation) )
4825 # the second line has ci but we do normally give the lines within the BLOCK
4826 # any ci. This would be true if we had blocks nested arbitrarily deeply.
4828 # But consider something like this, where we have created a break after
4829 # an opening paren on line 1, and the paren is not (currently) a
4830 # structural indentation token:
4832 # my $file = $menubar->Menubutton(
4833 # qw/-text File -underline 0 -menuitems/ => [
4835 # Cascade => '~View',
4839 # The second line has ci, so it would seem reasonable to propagate it
4840 # down, giving the third line 1 ci + 1 indentation. This suggests the
4841 # following rule, which is currently used to propagating ci down: if there
4842 # are any non-structural opening parens (or brackets, or braces), before
4843 # an opening structural brace, then ci is propagated down, and otherwise
4844 # not. The variable $intervening_secondary_structure contains this
4845 # information for the current token, and the string
4846 # "$ci_string_in_tokenizer" is a stack of previous values of this
4849 # save the current states
4850 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
4851 $level_in_tokenizer++;
4853 if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] )
4855 $tokenizer_self->[_maximum_level_] = $level_in_tokenizer;
4858 if ($forced_indentation_flag) {
4860 # break BEFORE '?' when there is forced indentation
4861 if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
4862 if ( $type eq 'k' ) {
4863 $indented_if_level = $level_in_tokenizer;
4866 # do not change container environment here if we are not
4867 # at a real list. Adding this check prevents "blinkers"
4868 # often near 'unless" clauses, such as in the following
4873 ## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
4876 $nesting_block_string .= "$nesting_block_flag";
4880 if ( $routput_block_type->[$i] ) {
4881 $nesting_block_flag = 1;
4882 $nesting_block_string .= '1';
4885 $nesting_block_flag = 0;
4886 $nesting_block_string .= '0';
4890 # we will use continuation indentation within containers
4891 # which are not blocks and not logical expressions
4893 if ( !$routput_block_type->[$i] ) {
4895 # propagate flag down at nested open parens
4896 if ( $routput_container_type->[$i] eq '(' ) {
4897 $bit = 1 if $nesting_list_flag;
4900 # use list continuation if not a logical grouping
4901 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
4905 $is_logical_container{ $routput_container_type->[$i]
4909 $nesting_list_string .= $bit;
4910 $nesting_list_flag = $bit;
4912 $ci_string_in_tokenizer .=
4913 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
4915 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4916 $continuation_string_in_tokenizer .=
4917 ( $in_statement_continuation > 0 ) ? '1' : '0';
4919 # Sometimes we want to give an opening brace continuation indentation,
4920 # and sometimes not. For code blocks, we don't do it, so that the leading
4921 # '{' gets outdented, like this:
4923 # if ( !$output_block_type[$i]
4924 # && ($in_statement_continuation) )
4927 # For other types, we will give them continuation indentation. For example,
4928 # here is how a list looks with the opening paren indented:
4931 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
4932 # [ "homer", "marge", "bart" ], );
4934 # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
4936 my $total_ci = $ci_string_sum;
4938 !$routput_block_type->[$i] # patch: skip for BLOCK
4939 && ($in_statement_continuation)
4940 && !( $forced_indentation_flag && $type eq ':' )
4943 $total_ci += $in_statement_continuation
4944 unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
4947 $ci_string_i = $total_ci;
4948 $in_statement_continuation = 0;
4953 || $forced_indentation_flag < 0 )
4956 # only a nesting error in the script would prevent popping here
4957 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
4959 $level_i = --$level_in_tokenizer;
4961 # restore previous level values
4962 if ( length($nesting_block_string) > 1 )
4963 { # true for valid script
4964 chop $nesting_block_string;
4965 $nesting_block_flag =
4966 substr( $nesting_block_string, -1 ) eq '1';
4967 chop $nesting_list_string;
4968 $nesting_list_flag =
4969 substr( $nesting_list_string, -1 ) eq '1';
4971 chop $ci_string_in_tokenizer;
4973 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4975 $in_statement_continuation =
4976 chop $continuation_string_in_tokenizer;
4978 # zero continuation flag at terminal BLOCK '}' which
4980 my $block_type_i = $routput_block_type->[$i];
4981 if ($block_type_i) {
4983 # ...These include non-anonymous subs
4984 # note: could be sub ::abc { or sub 'abc
4985 if ( $block_type_i =~ m/^sub\s*/gc ) {
4987 # note: older versions of perl require the /gc modifier
4988 # here or else the \G does not work.
4989 if ( $block_type_i =~ /\G('|::|\w)/gc ) {
4990 $in_statement_continuation = 0;
4994 # ...and include all block types except user subs with
4995 # block prototypes and these: (sort|grep|map|do|eval)
4996 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
4998 $is_zero_continuation_block_type{$block_type_i} )
5000 $in_statement_continuation = 0;
5003 # ..but these are not terminal types:
5004 # /^(sort|grep|map|do|eval)$/ )
5005 elsif ($is_sort_map_grep_eval_do{$block_type_i}
5006 || $is_grep_alias{$block_type_i} )
5010 # ..and a block introduced by a label
5012 elsif ( $block_type_i =~ /:$/ ) {
5013 $in_statement_continuation = 0;
5016 # user function with block prototype
5018 $in_statement_continuation = 0;
5022 # If we are in a list, then
5023 # we must set continuation indentation at the closing
5024 # paren of something like this (paren after $check):
5027 # ( not defined $check )
5029 # or $check eq "new"
5030 # or $check eq "old",
5032 elsif ( $tok eq ')' ) {
5033 $in_statement_continuation = 1
5034 if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5037 elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
5040 # use environment after updating
5041 $container_environment =
5042 $nesting_block_flag ? 'BLOCK'
5043 : $nesting_list_flag ? 'LIST'
5045 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5046 $nesting_block_string_i = $nesting_block_string;
5047 $nesting_list_string_i = $nesting_list_string;
5050 # not a structural indentation type..
5053 $container_environment =
5054 $nesting_block_flag ? 'BLOCK'
5055 : $nesting_list_flag ? 'LIST'
5058 # zero the continuation indentation at certain tokens so
5059 # that they will be at the same level as its container. For
5060 # commas, this simplifies the -lp indentation logic, which
5061 # counts commas. For ?: it makes them stand out.
5062 if ($nesting_list_flag) {
5063 ## $type =~ /^[,\?\:]$/
5064 if ( $is_comma_question_colon{$type} ) {
5065 $in_statement_continuation = 0;
5069 # be sure binary operators get continuation indentation
5071 $container_environment
5072 && ( $type eq 'k' && $is_binary_keyword{$tok}
5073 || $is_binary_type{$type} )
5076 $in_statement_continuation = 1;
5079 # continuation indentation is sum of any open ci from previous
5080 # levels plus the current level
5081 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5083 # update continuation flag ...
5084 # if this isn't a blank or comment..
5085 if ( $type ne 'b' && $type ne '#' ) {
5087 # and we are in a BLOCK
5088 if ($nesting_block_flag) {
5090 # the next token after a ';' and label starts a new stmt
5091 if ( $type eq ';' || $type eq 'J' ) {
5092 $in_statement_continuation = 0;
5095 # otherwise, we are continuing the current statement
5097 $in_statement_continuation = 1;
5101 # if we are not in a BLOCK..
5104 # do not use continuation indentation if not list
5105 # environment (could be within if/elsif clause)
5106 if ( !$nesting_list_flag ) {
5107 $in_statement_continuation = 0;
5110 # otherwise, the token after a ',' starts a new term
5112 # Patch FOR RT#99961; no continuation after a ';'
5113 # This is needed because perltidy currently marks
5114 # a block preceded by a type character like % or @
5115 # as a non block, to simplify formatting. But these
5116 # are actually blocks and can have semicolons.
5117 # See code_block_type() and is_non_structural_brace().
5118 elsif ( $type eq ',' || $type eq ';' ) {
5119 $in_statement_continuation = 0;
5122 # otherwise, we are continuing the current term
5124 $in_statement_continuation = 1;
5130 if ( $level_in_tokenizer < 0 ) {
5131 unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
5132 $tokenizer_self->[_saw_negative_indentation_] = 1;
5133 warning("Starting negative indentation\n");
5137 # set secondary nesting levels based on all containment token types
5138 # Note: these are set so that the nesting depth is the depth
5139 # of the PREVIOUS TOKEN, which is convenient for setting
5140 # the strength of token bonds
5141 my $slevel_i = $slevel_in_tokenizer;
5144 if ( $is_opening_type{$type} ) {
5145 $slevel_in_tokenizer++;
5146 $nesting_token_string .= $tok;
5147 $nesting_type_string .= $type;
5151 elsif ( $is_closing_type{$type} ) {
5152 $slevel_in_tokenizer--;
5153 my $char = chop $nesting_token_string;
5155 if ( $char ne $matching_start_token{$tok} ) {
5156 $nesting_token_string .= $char . $tok;
5157 $nesting_type_string .= $type;
5160 chop $nesting_type_string;
5164 push( @block_type, $routput_block_type->[$i] );
5165 push( @ci_string, $ci_string_i );
5166 push( @container_environment, $container_environment );
5167 push( @container_type, $routput_container_type->[$i] );
5168 push( @levels, $level_i );
5169 push( @nesting_tokens, $nesting_token_string_i );
5170 push( @nesting_types, $nesting_type_string_i );
5171 push( @slevels, $slevel_i );
5172 push( @token_type, $fix_type );
5173 push( @type_sequence, $routput_type_sequence->[$i] );
5174 push( @nesting_blocks, $nesting_block_string );
5175 push( @nesting_lists, $nesting_list_string );
5177 # now form the previous token
5180 $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
5184 substr( $input_line, $rtoken_map->[$im], $num ) );
5190 $num = length($input_line) - $rtoken_map->[$im]; # make the last token
5192 push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
5195 $tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
5196 $tokenizer_self->[_in_quote_] = $in_quote;
5197 $tokenizer_self->[_quote_target_] =
5198 $in_quote ? matching_end_token($quote_character) : "";
5199 $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
5201 $line_of_tokens->{_rtoken_type} = \@token_type;
5202 $line_of_tokens->{_rtokens} = \@tokens;
5203 $line_of_tokens->{_rblock_type} = \@block_type;
5204 $line_of_tokens->{_rcontainer_type} = \@container_type;
5205 $line_of_tokens->{_rcontainer_environment} = \@container_environment;
5206 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5207 $line_of_tokens->{_rlevels} = \@levels;
5208 $line_of_tokens->{_rslevels} = \@slevels;
5209 $line_of_tokens->{_rnesting_tokens} = \@nesting_tokens;
5210 $line_of_tokens->{_rci_levels} = \@ci_string;
5211 $line_of_tokens->{_rnesting_blocks} = \@nesting_blocks;
5215 } ## end tokenize_this_line
5217 #########i#############################################################
5218 # Tokenizer routines which assist in identifying token types
5219 #######################################################################
5221 # hash lookup table of operator expected values
5222 my %op_expected_table;
5224 # exceptions to perl's weird parsing rules after type 'Z'
5225 my %is_weird_parsing_rule_exception;
5227 my %is_paren_dollar;
5233 # Always expecting TERM following these types:
5234 # note: this is identical to '@value_requestor_type' defined later.
5236 ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5237 || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5238 &= // >> ~. &. |. ^.
5239 ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5242 push @q, '('; # for completeness, not currently a token type
5243 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5245 # Always UNKNOWN following these types:
5246 # Fix for c030: added '->' to this list
5248 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5250 # Always expecting OPERATOR ...
5251 # 'n' and 'v' are currently excluded because they might be VERSION numbers
5252 # 'i' is currently excluded because it might be a package
5253 # 'q' is currently excluded because it might be a prototype
5254 # Fix for c030: removed '->' from this list:
5255 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
5257 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5259 # Fix for git #62: added '*' and '%'
5261 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5264 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5267 @{is_n_v}{@q} = (1) x scalar(@q);
5271 use constant DEBUG_OPERATOR_EXPECTED => 0;
5273 sub operator_expected {
5275 # Returns a parameter indicating what types of tokens can occur next
5278 # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5280 # $prev_type is the type of the previous token (blank or not)
5281 # $tok is the current token
5282 # $next_type is the type of the next token (blank or not)
5284 # Many perl symbols have two or more meanings. For example, '<<'
5285 # can be a shift operator or a here-doc operator. The
5286 # interpretation of these symbols depends on the current state of
5287 # the tokenizer, which may either be expecting a term or an
5288 # operator. For this example, a << would be a shift if an OPERATOR
5289 # is expected, and a here-doc if a TERM is expected. This routine
5290 # is called to make this decision for any current token. It returns
5291 # one of three possible values:
5293 # OPERATOR - operator expected (or at least, not a term)
5294 # UNKNOWN - can't tell
5295 # TERM - a term is expected (or at least, not an operator)
5297 # The decision is based on what has been seen so far. This
5298 # information is stored in the "$last_nonblank_type" and
5299 # "$last_nonblank_token" variables. For example, if the
5300 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5301 # if $last_nonblank_type is 'n' (numeric), we are expecting an
5304 # If a UNKNOWN is returned, the calling routine must guess. A major
5305 # goal of this tokenizer is to minimize the possibility of returning
5306 # UNKNOWN, because a wrong guess can spoil the formatting of a
5309 # Adding NEW_TOKENS: it is critically important that this routine be
5310 # updated to allow it to determine if an operator or term is to be
5311 # expected after the new token. Doing this simply involves adding
5312 # the new token character to one of the regexes in this routine or
5313 # to one of the hash lists
5314 # that it uses, which are initialized in the BEGIN section.
5315 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5318 # When possible, token types should be selected such that we can determine
5319 # the 'operator_expected' value by a simple hash lookup. If there are
5320 # exceptions, that is an indication that a new type is needed.
5330 # Many types are can be obtained by a table lookup given the previous type.
5331 # This typically handles half or more of the calls.
5332 my $op_expected = $op_expected_table{$last_nonblank_type};
5333 if ( defined($op_expected) ) {
5334 $msg = "Table lookup";
5338 ######################
5339 # Handle special cases
5340 ######################
5342 $op_expected = UNKNOWN;
5343 my ( $prev_type, $tok, $next_type ) = @{$rarg};
5345 # Types 'k', '}' and 'Z' depend on context
5346 # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
5347 # context but that dependence could eventually be eliminated with better
5348 # token type definition
5351 if ( $last_nonblank_type eq 'i' ) {
5352 $op_expected = OPERATOR;
5354 # FIXME: it would be cleaner to make this a special type
5355 # expecting VERSION or {} after package NAMESPACE
5356 # TODO: maybe mark these words as type 'Y'?
5357 if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
5358 && $statement_type =~ /^package\b/
5359 && $last_nonblank_token =~ /^package\b/ )
5361 $op_expected = TERM;
5366 elsif ( $last_nonblank_type eq 'k' ) {
5367 $op_expected = TERM;
5368 if ( $expecting_operator_token{$last_nonblank_token} ) {
5369 $op_expected = OPERATOR;
5371 elsif ( $expecting_term_token{$last_nonblank_token} ) {
5373 # Exceptions from TERM:
5375 # // may follow perl functions which may be unary operators
5376 # see test file dor.t (defined or);
5379 && $next_type eq '/'
5380 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5381 $last_nonblank_token}
5384 $op_expected = OPERATOR;
5387 # Patch to allow a ? following 'split' to be a depricated pattern
5388 # delimiter. This patch is coordinated with the omission of split
5390 # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5391 # will force perltidy to guess.
5393 && $last_nonblank_token eq 'split' )
5395 $op_expected = UNKNOWN;
5400 # closing container token...
5402 # Note that the actual token for type '}' may also be a ')'.
5404 # Also note that $last_nonblank_token is not the token corresponding to
5405 # $last_nonblank_type when the type is a closing container. In that
5406 # case it is the token before the corresponding opening container token.
5407 # So for example, for this snippet
5408 # $a = do { BLOCK } / 2;
5409 # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5411 elsif ( $last_nonblank_type eq '}' ) {
5412 $op_expected = UNKNOWN;
5414 # handle something after 'do' and 'eval'
5415 if ( $is_block_operator{$last_nonblank_token} ) {
5417 # something like $a = do { BLOCK } / 2;
5418 $op_expected = OPERATOR; # block mode following }
5421 ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
5422 elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5423 || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5425 $op_expected = OPERATOR;
5426 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5429 # Check for smartmatch operator before preceding brace or square
5430 # bracket. For example, at the ? after the ] in the following
5431 # expressions we are expecting an operator:
5433 # qr/3/ ~~ ['1234'] ? 1 : 0;
5434 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5435 elsif ( $last_nonblank_token eq '~~' ) {
5436 $op_expected = OPERATOR;
5439 # A right brace here indicates the end of a simple block. All
5440 # non-structural right braces have type 'R' all braces associated with
5441 # block operator keywords have been given those keywords as
5442 # "last_nonblank_token" and caught above. (This statement is order
5443 # dependent, and must come after checking $last_nonblank_token).
5446 # patch for dor.t (defined or).
5448 && $next_type eq '/'
5449 && $last_nonblank_token eq ']' )
5451 $op_expected = OPERATOR;
5454 # Patch for RT #116344: misparse a ternary operator after an
5455 # anonymous hash, like this:
5456 # return ref {} ? 1 : 0;
5457 # The right brace should really be marked type 'R' in this case,
5458 # and it is safest to return an UNKNOWN here. Expecting a TERM will
5459 # cause the '?' to always be interpreted as a pattern delimiter
5460 # rather than introducing a ternary operator.
5461 elsif ( $tok eq '?' ) {
5462 $op_expected = UNKNOWN;
5465 $op_expected = TERM;
5470 # number or v-string...
5471 # An exception is for VERSION numbers a 'use' statement. It has the format
5472 # use Module VERSION LIST
5473 # We could avoid this exception by writing a special sub to parse 'use'
5474 # statements and perhaps mark these numbers with a new type V (for VERSION)
5475 ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5476 elsif ( $is_n_v{$last_nonblank_type} ) {
5477 $op_expected = OPERATOR;
5478 if ( $statement_type eq 'use' ) {
5479 $op_expected = UNKNOWN;
5484 # FIXME: labeled prototype words should probably be given type 'A' or maybe
5485 # 'J'; not 'q'; or maybe mark as type 'Y'
5486 elsif ( $last_nonblank_type eq 'q' ) {
5487 $op_expected = OPERATOR;
5488 if ( $last_nonblank_token eq 'prototype' )
5489 ##|| $last_nonblank_token eq 'switch' )
5491 $op_expected = TERM;
5495 # file handle or similar
5496 elsif ( $last_nonblank_type eq 'Z' ) {
5498 $op_expected = UNKNOWN;
5501 if ( $last_nonblank_token =~ /^\w/ ) {
5502 $op_expected = UNKNOWN;
5505 # Exception to weird parsing rules for 'x(' ... see case b1205:
5506 # In something like 'print $vv x(...' the x is an operator;
5507 # Likewise in 'print $vv x$ww' the x is an operatory (case b1207)
5508 # otherwise x follows the weird parsing rules.
5509 elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
5510 $op_expected = OPERATOR;
5513 # The 'weird parsing rules' of next section do not work for '<' and '?'
5514 # It is best to mark them as unknown. Test case:
5516 elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5517 $op_expected = UNKNOWN;
5520 # For possible file handle like "$a", Perl uses weird parsing rules.
5522 # print $a/2,"/hi"; - division
5523 # print $a / 2,"/hi"; - division
5524 # print $a/ 2,"/hi"; - division
5525 # print $a /2,"/hi"; - pattern (and error)!
5526 # Some examples where this logic works okay, for '&','*','+':
5527 # print $fh &xsi_protos(@mods);
5528 # my $x = new $CompressClass *FH;
5529 # print $OUT +( $count % 15 ? ", " : "\n\t" );
5530 elsif ($prev_type eq 'b'
5531 && $next_type ne 'b' )
5533 $op_expected = TERM;
5536 # Note that '?' and '<' have been moved above
5537 # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5538 elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5540 # Do not complain in 'use' statements, which have special syntax.
5541 # For example, from RT#130344:
5542 # use lib $FindBin::Bin . '/lib';
5543 if ( $statement_type ne 'use' ) {
5545 "operator in possible indirect object location not recommended\n"
5548 $op_expected = OPERATOR;
5554 $op_expected = UNKNOWN;
5559 DEBUG_OPERATOR_EXPECTED && do {
5561 "OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5564 return $op_expected;
5566 } ## end of sub operator_expected
5568 sub new_statement_ok {
5570 # return true if the current token can start a new statement
5571 # USES GLOBAL VARIABLES: $last_nonblank_type
5573 return label_ok() # a label would be ok here
5575 || $last_nonblank_type eq 'J'; # or we follow a label
5581 # Decide if a bare word followed by a colon here is a label
5582 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5583 # $brace_depth, @brace_type
5585 # if it follows an opening or closing code block curly brace..
5586 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5587 && $last_nonblank_type eq $last_nonblank_token )
5590 # it is a label if and only if the curly encloses a code block
5591 return $brace_type[$brace_depth];
5594 # otherwise, it is a label if and only if it follows a ';' (real or fake)
5597 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5601 sub code_block_type {
5603 # Decide if this is a block of code, and its type.
5604 # Must be called only when $type = $token = '{'
5605 # The problem is to distinguish between the start of a block of code
5606 # and the start of an anonymous hash reference
5607 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5608 # to indicate the type of code block. (For example, 'last_nonblank_token'
5609 # might be 'if' for an if block, 'else' for an else block, etc).
5610 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5611 # $last_nonblank_block_type, $brace_depth, @brace_type
5613 # handle case of multiple '{'s
5615 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5617 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5618 if ( $last_nonblank_token eq '{'
5619 && $last_nonblank_type eq $last_nonblank_token )
5622 # opening brace where a statement may appear is probably
5623 # a code block but might be and anonymous hash reference
5624 if ( $brace_type[$brace_depth] ) {
5625 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5629 # cannot start a code block within an anonymous hash
5635 elsif ( $last_nonblank_token eq ';' ) {
5637 # an opening brace where a statement may appear is probably
5638 # a code block but might be and anonymous hash reference
5639 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5643 # handle case of '}{'
5644 elsif ($last_nonblank_token eq '}'
5645 && $last_nonblank_type eq $last_nonblank_token )
5648 # a } { situation ...
5649 # could be hash reference after code block..(blktype1.t)
5650 if ($last_nonblank_block_type) {
5651 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5655 # must be a block if it follows a closing hash reference
5657 return $last_nonblank_token;
5661 ################################################################
5662 # NOTE: braces after type characters start code blocks, but for
5663 # simplicity these are not identified as such. See also
5664 # sub is_non_structural_brace.
5665 ################################################################
5667 ## elsif ( $last_nonblank_type eq 't' ) {
5668 ## return $last_nonblank_token;
5671 # brace after label:
5672 elsif ( $last_nonblank_type eq 'J' ) {
5673 return $last_nonblank_token;
5676 # otherwise, look at previous token. This must be a code block if
5677 # it follows any of these:
5678 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
5679 elsif ($is_code_block_token{$last_nonblank_token}
5680 || $is_grep_alias{$last_nonblank_token} )
5683 # Bug Patch: Note that the opening brace after the 'if' in the following
5684 # snippet is an anonymous hash ref and not a code block!
5685 # print 'hi' if { x => 1, }->{x};
5686 # We can identify this situation because the last nonblank type
5687 # will be a keyword (instead of a closing peren)
5688 if ( $last_nonblank_token =~ /^(if|unless)$/
5689 && $last_nonblank_type eq 'k' )
5694 return $last_nonblank_token;
5698 # or a sub or package BLOCK
5699 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5700 && $last_nonblank_token =~ /^(sub|package)\b/ )
5702 return $last_nonblank_token;
5706 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5707 && ( $is_sub{$last_nonblank_token} ) )
5712 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
5713 return $statement_type;
5716 # user-defined subs with block parameters (like grep/map/eval)
5717 elsif ( $last_nonblank_type eq 'G' ) {
5718 return $last_nonblank_token;
5722 elsif ( $last_nonblank_type eq 'w' ) {
5724 # check for syntax 'use MODULE LIST'
5725 # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
5726 return "" if ( $statement_type eq 'use' );
5728 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5732 # Patch for bug # RT #94338 reported by Daniel Trizen
5733 # for-loop in a parenthesized block-map triggering an error message:
5734 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
5735 # Check for a code block within a parenthesized function call
5736 elsif ( $last_nonblank_token eq '(' ) {
5737 my $paren_type = $paren_type[$paren_depth];
5738 if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
5740 # We will mark this as a code block but use type 't' instead
5741 # of the name of the contining function. This will allow for
5742 # correct parsing but will usually produce better formatting.
5743 # Braces with block type 't' are not broken open automatically
5744 # in the formatter as are other code block types, and this usually
5746 return 't'; # (Not $paren_type)
5753 # handle unknown syntax ') {'
5754 # we previously appended a '()' to mark this case
5755 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
5756 return $last_nonblank_token;
5759 # anything else must be anonymous hash reference
5765 sub decide_if_code_block {
5767 # USES GLOBAL VARIABLES: $last_nonblank_token
5768 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5770 my ( $next_nonblank_token, $i_next ) =
5771 find_next_nonblank_token( $i, $rtokens, $max_token_index );
5773 # we are at a '{' where a statement may appear.
5774 # We must decide if this brace starts an anonymous hash or a code
5776 # return "" if anonymous hash, and $last_nonblank_token otherwise
5778 # initialize to be code BLOCK
5779 my $code_block_type = $last_nonblank_token;
5781 # Check for the common case of an empty anonymous hash reference:
5782 # Maybe something like sub { { } }
5783 if ( $next_nonblank_token eq '}' ) {
5784 $code_block_type = "";
5789 # To guess if this '{' is an anonymous hash reference, look ahead
5790 # and test as follows:
5792 # it is a hash reference if next come:
5793 # - a string or digit followed by a comma or =>
5794 # - bareword followed by =>
5795 # otherwise it is a code block
5797 # Examples of anonymous hash ref:
5801 # Examples of code blocks:
5802 # {1; print "hello\n", 1;}
5805 # We are only going to look ahead one more (nonblank/comment) line.
5806 # Strange formatting could cause a bad guess, but that's unlikely.
5810 # Ignore the rest of this line if it is a side comment
5811 if ( $next_nonblank_token ne '#' ) {
5812 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
5813 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
5815 my ( $rpre_tokens, $rpre_types ) =
5816 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
5817 # generous, and prevents
5819 # time in mangled files
5820 if ( defined($rpre_types) && @{$rpre_types} ) {
5821 push @pre_types, @{$rpre_types};
5822 push @pre_tokens, @{$rpre_tokens};
5825 # put a sentinel token to simplify stopping the search
5826 push @pre_types, '}';
5827 push @pre_types, '}';
5830 $jbeg = 1 if $pre_types[0] eq 'b';
5832 # first look for one of these
5834 # - bareword with leading -
5838 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
5840 # find the closing quote; don't worry about escapes
5841 my $quote_mark = $pre_types[$j];
5842 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
5843 if ( $pre_types[$k] eq $quote_mark ) {
5845 my $next = $pre_types[$j];
5850 elsif ( $pre_types[$j] eq 'd' ) {
5853 elsif ( $pre_types[$j] eq 'w' ) {
5856 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
5861 $j++ if $pre_types[$j] eq 'b';
5863 # Patched for RT #95708
5866 # it is a comma which is not a pattern delimeter except for qw
5868 $pre_types[$j] eq ','
5869 && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
5873 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
5876 $code_block_type = "";
5880 if ($code_block_type) {
5882 # Patch for cases b1085 b1128: It is uncertain if this is a block.
5883 # If this brace follows a bareword, then append a space as a signal
5884 # to the formatter that this may not be a block brace. To find the
5885 # corresponding code in Formatter.pm search for 'b1085'.
5886 $code_block_type .= " " if ( $code_block_type =~ /^\w/ );
5890 return $code_block_type;
5893 sub report_unexpected {
5895 # report unexpected token type and show where it is
5896 # USES GLOBAL VARIABLES: $tokenizer_self
5897 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
5898 $rpretoken_type, $input_line )
5901 if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
5902 my $msg = "found $found where $expecting expected";
5903 my $pos = $rpretoken_map->[$i_tok];
5904 interrupt_logfile();
5905 my $input_line_number = $tokenizer_self->[_last_line_number_];
5906 my ( $offset, $numbered_line, $underline ) =
5907 make_numbered_line( $input_line_number, $input_line, $pos );
5908 $underline = write_on_underline( $underline, $pos - $offset, '^' );
5911 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
5912 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
5914 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
5915 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
5918 $num = $pos - $pos_prev;
5920 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
5923 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
5924 $trailer = " (previous token underlined)";
5926 $underline =~ s/\s+$//;
5927 warning( $numbered_line . "\n" );
5928 warning( $underline . "\n" );
5929 warning( $msg . $trailer . "\n" );
5935 my %is_sigil_or_paren;
5936 my %is_R_closing_sb;
5940 my @q = qw< $ & % * @ ) >;
5941 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
5944 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
5947 sub is_non_structural_brace {
5949 # Decide if a brace or bracket is structural or non-structural
5950 # by looking at the previous token and type
5951 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
5953 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
5954 # Tentatively deactivated because it caused the wrong operator expectation
5956 # $user = @vars[1] / 100;
5957 # Must update sub operator_expected before re-implementing.
5958 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
5962 ################################################################
5963 # NOTE: braces after type characters start code blocks, but for
5964 # simplicity these are not identified as such. See also
5965 # sub code_block_type
5966 ################################################################
5968 ##if ($last_nonblank_type eq 't') {return 0}
5970 # otherwise, it is non-structural if it is decorated
5971 # by type information.
5972 # For example, the '{' here is non-structural: ${xxx}
5973 # Removed '::' to fix c074
5974 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
5976 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
5977 $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
5978 || substr( $last_nonblank_token, 0, 2 ) eq '->'
5980 # or if we follow a hash or array closing curly brace or bracket
5981 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
5982 # because the first '}' would have been given type 'R'
5983 ##|| $last_nonblank_type =~ /^([R\]])$/
5984 || $is_R_closing_sb{$last_nonblank_type}
5988 #########i#############################################################
5989 # Tokenizer routines for tracking container nesting depths
5990 #######################################################################
5992 # The following routines keep track of nesting depths of the nesting
5993 # types, ( [ { and ?. This is necessary for determining the indentation
5994 # level, and also for debugging programs. Not only do they keep track of
5995 # nesting depths of the individual brace types, but they check that each
5996 # of the other brace types is balanced within matching pairs. For
5997 # example, if the program sees this sequence:
6001 # then it can determine that there is an extra left paren somewhere
6002 # between the { and the }. And so on with every other possible
6003 # combination of outer and inner brace types. For another
6008 # which has an extra ] within the parens.
6010 # The brace types have indexes 0 .. 3 which are indexes into
6013 # The pair ? : are treated as just another nesting type, with ? acting
6014 # as the opening brace and : acting as the closing brace.
6018 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6020 # saves the nesting depth of brace type $b (where $b is either of the other
6021 # nesting types) when brace type $a enters a new depth. When this depth
6022 # decreases, a check is made that the current depth of brace types $b is
6023 # unchanged, or otherwise there must have been an error. This can
6024 # be very useful for localizing errors, particularly when perl runs to
6025 # the end of a large file (such as this one) and announces that there
6026 # is a problem somewhere.
6028 # A numerical sequence number is maintained for every nesting type,
6029 # so that each matching pair can be uniquely identified in a simple
6032 sub increase_nesting_depth {
6033 my ( $aa, $pos ) = @_;
6035 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6036 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6038 $current_depth[$aa]++;
6040 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6041 my $input_line_number = $tokenizer_self->[_last_line_number_];
6042 my $input_line = $tokenizer_self->[_line_of_text_];
6044 # Sequence numbers increment by number of items. This keeps
6045 # a unique set of numbers but still allows the relative location
6046 # of any type to be determined.
6048 ########################################################################
6049 # OLD SEQNO METHOD for incrementing sequence numbers.
6050 # Keep this coding awhile for possible testing.
6051 ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
6052 ## my $seqno = $nesting_sequence_number[$aa];
6054 # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence
6055 # numbers to be used as array indexes, and allows them to be compared.
6056 my $seqno = $next_sequence_number++;
6057 ########################################################################
6059 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6061 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6062 [ $input_line_number, $input_line, $pos ];
6064 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6065 next if ( $bb == $aa );
6066 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6069 # set a flag for indenting a nested ternary statement
6071 if ( $aa == QUESTION_COLON ) {
6072 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6073 if ( $current_depth[$aa] > 1 ) {
6074 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6075 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6076 if ( $pdepth == $total_depth - 1 ) {
6078 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6084 # Fix part #1 for git82: save last token type for propagation of type 'Z'
6085 $nested_statement_type[$aa][ $current_depth[$aa] ] =
6086 [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6087 $statement_type = "";
6088 return ( $seqno, $indent );
6091 sub is_balanced_closing_container {
6093 # Return true if a closing container can go here without error
6094 # Return false if not
6097 # cannot close if there was no opening
6098 return unless ( $current_depth[$aa] > 0 );
6100 # check that any other brace types $bb contained within would be balanced
6101 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6102 next if ( $bb == $aa );
6104 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6105 $current_depth[$bb] );
6108 # OK, everything will be balanced
6112 sub decrease_nesting_depth {
6114 my ( $aa, $pos ) = @_;
6116 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6117 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6120 my $input_line_number = $tokenizer_self->[_last_line_number_];
6121 my $input_line = $tokenizer_self->[_line_of_text_];
6125 if ( $current_depth[$aa] > 0 ) {
6127 # set a flag for un-indenting after seeing a nested ternary statement
6128 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6129 if ( $aa == QUESTION_COLON ) {
6130 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6133 # Fix part #2 for git82: use saved type for propagation of type 'Z'
6134 # through type L-R braces. Perl seems to allow ${bareword}
6135 # as an indirect object, but nothing much more complex than that.
6136 ( $statement_type, my $saved_type, my $saved_token ) =
6137 @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6139 && $saved_type eq 'Z'
6140 && $last_nonblank_type eq 'w'
6141 && $brace_structural_type[$brace_depth] eq 'L' )
6143 $last_nonblank_type = $saved_type;
6146 # check that any brace types $bb contained within are balanced
6147 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6148 next if ( $bb == $aa );
6150 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6151 $current_depth[$bb] )
6154 $current_depth[$bb] -
6155 $depth_array[$aa][$bb][ $current_depth[$aa] ];
6157 # don't whine too many times
6158 my $saw_brace_error = get_saw_brace_error();
6160 $saw_brace_error <= MAX_NAG_MESSAGES
6162 # if too many closing types have occurred, we probably
6163 # already caught this error
6164 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6167 interrupt_logfile();
6169 $starting_line_of_current_depth[$aa]
6170 [ $current_depth[$aa] ];
6172 my $rel = [ $input_line_number, $input_line, $pos ];
6176 if ( $diff == 1 || $diff == -1 ) {
6184 ? $opening_brace_names[$bb]
6185 : $closing_brace_names[$bb];
6186 write_error_indicator_pair( @{$rsl}, '^' );
6188 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6193 $starting_line_of_current_depth[$bb]
6194 [ $current_depth[$bb] ];
6197 " The most recent un-matched $bname is on line $ml\n";
6198 write_error_indicator_pair( @{$rml}, '^' );
6200 write_error_indicator_pair( @{$rel}, '^' );
6204 increment_brace_error();
6207 $current_depth[$aa]--;
6211 my $saw_brace_error = get_saw_brace_error();
6212 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6214 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6216 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6218 increment_brace_error();
6220 # keep track of errors in braces alone (ignoring ternary nesting errors)
6221 $tokenizer_self->[_true_brace_error_count_]++
6222 if ( $closing_brace_names[$aa] ne "':'" );
6224 return ( $seqno, $outdent );
6227 sub check_final_nesting_depths {
6229 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6231 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6233 if ( $current_depth[$aa] ) {
6235 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6238 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6239 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6241 indicate_error( $msg, @{$rsl}, '^' );
6242 increment_brace_error();
6248 #########i#############################################################
6249 # Tokenizer routines for looking ahead in input stream
6250 #######################################################################
6252 sub peek_ahead_for_n_nonblank_pre_tokens {
6254 # returns next n pretokens if they exist
6255 # returns undef's if hits eof without seeing any pretokens
6256 # USES GLOBAL VARIABLES: $tokenizer_self
6257 my $max_pretokens = shift;
6260 my ( $rpre_tokens, $rmap, $rpre_types );
6263 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6265 $line =~ s/^\s*//; # trim leading blanks
6266 next if ( length($line) <= 0 ); # skip blank
6267 next if ( $line =~ /^#/ ); # skip comment
6268 ( $rpre_tokens, $rmap, $rpre_types ) =
6269 pre_tokenize( $line, $max_pretokens );
6272 return ( $rpre_tokens, $rpre_types );
6275 # look ahead for next non-blank, non-comment line of code
6276 sub peek_ahead_for_nonblank_token {
6278 # USES GLOBAL VARIABLES: $tokenizer_self
6279 my ( $rtokens, $max_token_index ) = @_;
6284 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6286 $line =~ s/^\s*//; # trim leading blanks
6287 next if ( length($line) <= 0 ); # skip blank
6288 next if ( $line =~ /^#/ ); # skip comment
6290 # Updated from 2 to 3 to get trigraphs, added for case b1175
6291 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6292 my $j = $max_token_index + 1;
6294 foreach my $tok ( @{$rtok} ) {
6295 last if ( $tok =~ "\n" );
6296 $rtokens->[ ++$j ] = $tok;
6303 #########i#############################################################
6304 # Tokenizer guessing routines for ambiguous situations
6305 #######################################################################
6307 sub guess_if_pattern_or_conditional {
6309 # this routine is called when we have encountered a ? following an
6310 # unknown bareword, and we must decide if it starts a pattern or not
6312 # $i - token index of the ? starting possible pattern
6313 # output parameters:
6314 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6315 # msg = a warning or diagnostic message
6316 # USES GLOBAL VARIABLES: $last_nonblank_token
6318 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6320 my $msg = "guessing that ? after $last_nonblank_token starts a ";
6322 if ( $i >= $max_token_index ) {
6323 $msg .= "conditional (no end to pattern found on the line)\n";
6328 my $next_token = $rtokens->[$i]; # first token after ?
6330 # look for a possible ending ? on this line..
6332 my $quote_depth = 0;
6333 my $quote_character = '';
6337 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6340 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6341 $quote_pos, $quote_depth, $max_token_index );
6345 # we didn't find an ending ? on this line,
6346 # so we bias towards conditional
6348 $msg .= "conditional (no ending ? on this line)\n";
6350 # we found an ending ?, so we bias towards a pattern
6354 # Watch out for an ending ? in quotes, like this
6355 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6359 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6360 my $tok = $rtokens->[$ii];
6361 if ( $tok eq ":" ) { $colons++ }
6362 if ( $tok eq "'" ) { $s_quote++ }
6363 if ( $tok eq '"' ) { $d_quote++ }
6365 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6367 $msg .= "found ending ? but unbalanced quote chars\n";
6369 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6371 $msg .= "pattern (found ending ? and pattern expected)\n";
6374 $msg .= "pattern (uncertain, but found ending ?)\n";
6378 return ( $is_pattern, $msg );
6381 my %is_known_constant;
6382 my %is_known_function;
6386 # Constants like 'pi' in Trig.pm are common
6387 my @q = qw(pi pi2 pi4 pip2 pip4);
6388 @{is_known_constant}{@q} = (1) x scalar(@q);
6390 # parenless calls of 'ok' are common
6392 @{is_known_function}{@q} = (1) x scalar(@q);
6395 sub guess_if_pattern_or_division {
6397 # this routine is called when we have encountered a / following an
6398 # unknown bareword, and we must decide if it starts a pattern or is a
6401 # $i - token index of the / starting possible pattern
6402 # output parameters:
6403 # $is_pattern = 0 if probably division, =1 if probably a pattern
6404 # msg = a warning or diagnostic message
6405 # USES GLOBAL VARIABLES: $last_nonblank_token
6406 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6408 my $msg = "guessing that / after $last_nonblank_token starts a ";
6410 if ( $i >= $max_token_index ) {
6411 $msg .= "division (no end to pattern found on the line)\n";
6415 my $divide_possible =
6416 is_possible_numerator( $i, $rtokens, $max_token_index );
6418 if ( $divide_possible < 0 ) {
6419 $msg = "pattern (division not possible here)\n";
6425 my $next_token = $rtokens->[$i]; # first token after slash
6427 # One of the things we can look at is the spacing around the slash.
6428 # There # are four possible spacings around the first slash:
6430 # return pi/two;#/; -/-
6431 # return pi/ two;#/; -/+
6432 # return pi / two;#/; +/+
6433 # return pi /two;#/; +/- <-- possible pattern
6435 # Spacing rule: a space before the slash but not after the slash
6436 # usually indicates a pattern. We can use this to break ties.
6438 my $is_pattern_by_spacing =
6439 ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6441 # look for a possible ending / on this line..
6443 my $quote_depth = 0;
6444 my $quote_character = '';
6448 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6451 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6452 $quote_pos, $quote_depth, $max_token_index );
6456 # we didn't find an ending / on this line, so we bias towards
6458 if ( $divide_possible >= 0 ) {
6460 $msg .= "division (no ending / on this line)\n";
6464 # assuming a multi-line pattern ... this is risky, but division
6465 # does not seem possible. If this fails, it would either be due
6466 # to a syntax error in the code, or the division_expected logic
6467 # needs to be fixed.
6468 $msg = "multi-line pattern (division not possible)\n";
6473 # we found an ending /, so we bias slightly towards a pattern
6476 my $pattern_expected =
6477 pattern_expected( $i, $rtokens, $max_token_index );
6479 if ( $pattern_expected >= 0 ) {
6481 # pattern looks possible...
6482 if ( $divide_possible >= 0 ) {
6484 # Both pattern and divide can work here...
6486 # Increase weight of divide if a pure number follows
6487 $divide_possible += $next_token =~ /^\d+$/;
6489 # Check for known constants in the numerator, like 'pi'
6490 if ( $is_known_constant{$last_nonblank_token} ) {
6492 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6496 # A very common bare word in pattern expressions is 'ok'
6497 elsif ( $is_known_function{$last_nonblank_token} ) {
6499 "pattern (division works too but saw '$last_nonblank_token')\n";
6503 # If one rule is more definite, use it
6504 elsif ( $divide_possible > $pattern_expected ) {
6506 "division (more likely based on following tokens)\n";
6510 # otherwise, use the spacing rule
6511 elsif ($is_pattern_by_spacing) {
6513 "pattern (guess on spacing, but division possible too)\n";
6518 "division (guess on spacing, but pattern is possible too)\n";
6523 # divide_possible < 0 means divide can not work here
6526 $msg .= "pattern (division not possible)\n";
6530 # pattern does not look possible...
6533 if ( $divide_possible >= 0 ) {
6535 $msg .= "division (pattern not possible)\n";
6538 # Neither pattern nor divide look possible...go by spacing
6540 if ($is_pattern_by_spacing) {
6541 $msg .= "pattern (guess on spacing)\n";
6545 $msg .= "division (guess on spacing)\n";
6554 return ( $is_pattern, $msg );
6557 # try to resolve here-doc vs. shift by looking ahead for
6558 # non-code or the end token (currently only looks for end token)
6559 # returns 1 if it is probably a here doc, 0 if not
6560 sub guess_if_here_doc {
6562 # This is how many lines we will search for a target as part of the
6563 # guessing strategy. It is a constant because there is probably
6564 # little reason to change it.
6565 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6567 my $HERE_DOC_WINDOW = 40;
6569 my $next_token = shift;
6570 my $here_doc_expected = 0;
6573 my $msg = "checking <<";
6576 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6580 if ( $line =~ /^$next_token$/ ) {
6581 $msg .= " -- found target $next_token ahead $k lines\n";
6582 $here_doc_expected = 1; # got it
6585 last if ( $k >= $HERE_DOC_WINDOW );
6588 unless ($here_doc_expected) {
6590 if ( !defined($line) ) {
6591 $here_doc_expected = -1; # hit eof without seeing target
6592 $msg .= " -- must be shift; target $next_token not in file\n";
6595 else { # still unsure..taking a wild guess
6597 if ( !$is_constant{$current_package}{$next_token} ) {
6598 $here_doc_expected = 1;
6600 " -- guessing it's a here-doc ($next_token not a constant)\n";
6604 " -- guessing it's a shift ($next_token is a constant)\n";
6608 write_logfile_entry($msg);
6609 return $here_doc_expected;
6612 #########i#############################################################
6613 # Tokenizer Routines for scanning identifiers and related items
6614 #######################################################################
6616 sub scan_bare_identifier_do {
6618 # this routine is called to scan a token starting with an alphanumeric
6619 # variable or package separator, :: or '.
6620 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6621 # $last_nonblank_type,@paren_type, $paren_depth
6623 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
6627 my $package = undef;
6631 # we have to back up one pretoken at a :: since each : is one pretoken
6632 if ( $tok eq '::' ) { $i_beg-- }
6633 if ( $tok eq '->' ) { $i_beg-- }
6634 my $pos_beg = $rtoken_map->[$i_beg];
6635 pos($input_line) = $pos_beg;
6642 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
6644 my $pos = pos($input_line);
6645 my $numc = $pos - $pos_beg;
6646 $tok = substr( $input_line, $pos_beg, $numc );
6648 # type 'w' includes anything without leading type info
6649 # ($,%,@,*) including something like abc::def::ghi
6653 if ( defined($2) ) { $sub_name = $2; }
6654 if ( defined($1) ) {
6657 # patch: don't allow isolated package name which just ends
6658 # in the old style package separator (single quote). Example:
6660 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
6664 $package =~ s/\'/::/g;
6665 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6666 $package =~ s/::$//;
6669 $package = $current_package;
6671 # patched for c043, part 1: keyword does not follow '->'
6672 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
6677 # if it is a bareword.. patched for c043, part 2: not following '->'
6678 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
6680 # check for v-string with leading 'v' type character
6681 # (This seems to have precedence over filehandle, type 'Y')
6682 if ( $tok =~ /^v\d[_\d]*$/ ) {
6684 # we only have the first part - something like 'v101' -
6686 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
6687 $pos = pos($input_line);
6688 $numc = $pos - $pos_beg;
6689 $tok = substr( $input_line, $pos_beg, $numc );
6693 # warn if this version can't handle v-strings
6694 report_v_string($tok);
6697 elsif ( $is_constant{$package}{$sub_name} ) {
6701 # bareword after sort has implied empty prototype; for example:
6702 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
6703 # This has priority over whatever the user has specified.
6704 elsif ($last_nonblank_token eq 'sort'
6705 && $last_nonblank_type eq 'k' )
6710 # Note: strangely, perl does not seem to really let you create
6711 # functions which act like eval and do, in the sense that eval
6712 # and do may have operators following the final }, but any operators
6713 # that you create with prototype (&) apparently do not allow
6714 # trailing operators, only terms. This seems strange.
6715 # If this ever changes, here is the update
6716 # to make perltidy behave accordingly:
6718 # elsif ( $is_block_function{$package}{$tok} ) {
6719 # $tok='eval'; # patch to do braces like eval - doesn't work
6722 # FIXME: This could become a separate type to allow for different
6724 elsif ( $is_block_function{$package}{$sub_name} ) {
6727 elsif ( $is_block_list_function{$package}{$sub_name} ) {
6730 elsif ( $is_user_function{$package}{$sub_name} ) {
6732 $prototype = $user_function_prototype{$package}{$sub_name};
6735 # check for indirect object
6738 # added 2001-03-27: must not be followed immediately by '('
6740 ( $input_line !~ m/\G\(/gc )
6745 # preceded by keyword like 'print', 'printf' and friends
6746 $is_indirect_object_taker{$last_nonblank_token}
6748 # or preceded by something like 'print(' or 'printf('
6750 ( $last_nonblank_token eq '(' )
6751 && $is_indirect_object_taker{ $paren_type[$paren_depth]
6759 # may not be indirect object unless followed by a space;
6760 # updated 2021-01-16 to consider newline to be a space.
6761 # updated for case b990 to look for either ';' or space
6762 if ( pos($input_line) == length($input_line)
6763 || $input_line =~ m/\G[;\s]/gc )
6768 # Perl's indirect object notation is a very bad
6769 # thing and can cause subtle bugs, especially for
6770 # beginning programmers. And I haven't even been
6771 # able to figure out a sane warning scheme which
6772 # doesn't get in the way of good scripts.
6774 # Complain if a filehandle has any lower case
6775 # letters. This is suggested good practice.
6776 # Use 'sub_name' because something like
6777 # main::MYHANDLE is ok for filehandle
6778 if ( $sub_name =~ /[a-z]/ ) {
6780 # could be bug caused by older perltidy if
6782 if ( $input_line =~ m/\G\s*\(/gc ) {
6784 "Caution: unknown word '$tok' in indirect object slot\n"
6790 # bareword not followed by a space -- may not be filehandle
6791 # (may be function call defined in a 'use' statement)
6798 # Now we must convert back from character position
6799 # to pre_token index.
6800 # I don't think an error flag can occur here ..but who knows
6803 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6805 warning("scan_bare_identifier: Possibly invalid tokenization\n");
6809 # no match but line not blank - could be syntax error
6810 # perl will take '::' alone without complaint
6814 # change this warning to log message if it becomes annoying
6815 warning("didn't find identifier after leading ::\n");
6817 return ( $i, $tok, $type, $prototype );
6822 # This is the new scanner and will eventually replace scan_identifier.
6823 # Only type 'sub' and 'package' are implemented.
6824 # Token types $ * % @ & -> are not yet implemented.
6826 # Scan identifier following a type token.
6827 # The type of call depends on $id_scan_state: $id_scan_state = ''
6828 # for starting call, in which case $tok must be the token defining
6831 # If the type token is the last nonblank token on the line, a value
6832 # of $id_scan_state = $tok is returned, indicating that further
6833 # calls must be made to get the identifier. If the type token is
6834 # not the last nonblank token on the line, the identifier is
6835 # scanned and handled and a value of '' is returned.
6836 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
6837 # $statement_type, $tokenizer_self
6839 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
6842 use constant DEBUG_NSCAN => 0;
6844 my ( $i_beg, $pos_beg );
6846 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
6847 #my ($a,$b,$c) = caller;
6848 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
6850 # on re-entry, start scanning at first token on the line
6851 if ($id_scan_state) {
6856 # on initial entry, start scanning just after type token
6859 $id_scan_state = $tok;
6863 # find $i_beg = index of next nonblank token,
6864 # and handle empty lines
6866 my $next_nonblank_token = $rtokens->[$i_beg];
6867 if ( $i_beg > $max_token_index ) {
6872 # only a '#' immediately after a '$' is not a comment
6873 if ( $next_nonblank_token eq '#' ) {
6874 unless ( $tok eq '$' ) {
6879 if ( $next_nonblank_token =~ /^\s/ ) {
6880 ( $next_nonblank_token, $i_beg ) =
6881 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
6883 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
6889 # handle non-blank line; identifier, if any, must follow
6890 unless ($blank_line) {
6892 if ( $is_sub{$id_scan_state} ) {
6893 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
6895 input_line => $input_line,
6900 rtokens => $rtokens,
6901 rtoken_map => $rtoken_map,
6902 id_scan_state => $id_scan_state,
6903 max_token_index => $max_token_index
6908 elsif ( $is_package{$id_scan_state} ) {
6909 ( $i, $tok, $type ) =
6910 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
6911 $rtoken_map, $max_token_index );
6912 $id_scan_state = '';
6916 warning("invalid token in scan_id: $tok\n");
6917 $id_scan_state = '';
6921 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
6926 Program bug in scan_id: undefined type but scan_state=$id_scan_state
6930 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
6932 report_definite_bug();
6937 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
6939 return ( $i, $tok, $type, $id_scan_state );
6942 sub check_prototype {
6943 my ( $proto, $package, $subname ) = @_;
6944 return unless ( defined($package) && defined($subname) );
6945 if ( defined($proto) ) {
6946 $proto =~ s/^\s*\(\s*//;
6947 $proto =~ s/\s*\)$//;
6949 $is_user_function{$package}{$subname} = 1;
6950 $user_function_prototype{$package}{$subname} = "($proto)";
6952 # prototypes containing '&' must be treated specially..
6953 if ( $proto =~ /\&/ ) {
6955 # right curly braces of prototypes ending in
6956 # '&' may be followed by an operator
6957 if ( $proto =~ /\&$/ ) {
6958 $is_block_function{$package}{$subname} = 1;
6961 # right curly braces of prototypes NOT ending in
6962 # '&' may NOT be followed by an operator
6963 elsif ( $proto !~ /\&$/ ) {
6964 $is_block_list_function{$package}{$subname} = 1;
6969 $is_constant{$package}{$subname} = 1;
6973 $is_user_function{$package}{$subname} = 1;
6978 sub do_scan_package {
6980 # do_scan_package parses a package name
6981 # it is called with $i_beg equal to the index of the first nonblank
6982 # token following a 'package' token.
6983 # USES GLOBAL VARIABLES: $current_package,
6986 # package NAMESPACE VERSION
6987 # package NAMESPACE BLOCK
6988 # package NAMESPACE VERSION BLOCK
6990 # If VERSION is provided, package sets the $VERSION variable in the given
6991 # namespace to a version object with the VERSION provided. VERSION must be
6992 # a "strict" style version number as defined by the version module: a
6993 # positive decimal number (integer or decimal-fraction) without
6994 # exponentiation or else a dotted-decimal v-string with a leading 'v'
6995 # character and at least three components.
6996 # reference http://perldoc.perl.org/functions/package.html
6998 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7001 my $package = undef;
7002 my $pos_beg = $rtoken_map->[$i_beg];
7003 pos($input_line) = $pos_beg;
7005 # handle non-blank line; package name, if any, must follow
7006 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7008 $package = ( defined($1) && $1 ) ? $1 : 'main';
7009 $package =~ s/\'/::/g;
7010 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7011 $package =~ s/::$//;
7012 my $pos = pos($input_line);
7013 my $numc = $pos - $pos_beg;
7014 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7017 # Now we must convert back from character position
7018 # to pre_token index.
7019 # I don't think an error flag can occur here ..but ?
7022 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7023 if ($error) { warning("Possibly invalid package\n") }
7024 $current_package = $package;
7026 # we should now have package NAMESPACE
7027 # now expecting VERSION, BLOCK, or ; to follow ...
7028 # package NAMESPACE VERSION
7029 # package NAMESPACE BLOCK
7030 # package NAMESPACE VERSION BLOCK
7031 my ( $next_nonblank_token, $i_next ) =
7032 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7034 # check that something recognizable follows, but do not parse.
7035 # A VERSION number will be parsed later as a number or v-string in the
7036 # normal way. What is important is to set the statement type if
7037 # everything looks okay so that the operator_expected() routine
7038 # knows that the number is in a package statement.
7039 # Examples of valid primitive tokens that might follow are:
7041 # FIX: added a '#' since a side comment may also follow
7042 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
7043 $statement_type = $tok;
7047 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7052 # no match but line not blank --
7053 # could be a label with name package, like package: , for example.
7058 return ( $i, $tok, $type );
7061 my %is_special_variable_char;
7065 # These are the only characters which can (currently) form special
7066 # variables, like $^W: (issue c066).
7068 qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7069 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7072 sub scan_identifier_do {
7074 # This routine assembles tokens into identifiers. It maintains a
7075 # scan state, id_scan_state. It updates id_scan_state based upon
7076 # current id_scan_state and token, and returns an updated
7077 # id_scan_state and the next index after the identifier.
7079 # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
7080 # $last_nonblank_type
7082 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7083 $expecting, $container_type )
7085 use constant DEBUG_SCAN_ID => 0;
7088 my $tok_begin = $rtokens->[$i_begin];
7089 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7090 my $id_scan_state_begin = $id_scan_state;
7091 my $identifier_begin = $identifier;
7092 my $tok = $tok_begin;
7094 my $tok_is_blank; # a flag to speed things up
7096 my $in_prototype_or_signature =
7097 $container_type && $container_type =~ /^sub\b/;
7099 # these flags will be used to help figure out the type:
7103 # allow old package separator (') except in 'use' statement
7104 my $allow_tick = ( $last_nonblank_token ne 'use' );
7106 #########################################################
7107 # get started by defining a type and a state if necessary
7108 #########################################################
7110 if ( !$id_scan_state ) {
7111 $context = UNKNOWN_CONTEXT;
7114 if ( $tok eq '>' ) {
7120 if ( $tok eq '$' || $tok eq '*' ) {
7121 $id_scan_state = '$';
7122 $context = SCALAR_CONTEXT;
7124 elsif ( $tok eq '%' || $tok eq '@' ) {
7125 $id_scan_state = '$';
7126 $context = LIST_CONTEXT;
7128 elsif ( $tok eq '&' ) {
7129 $id_scan_state = '&';
7131 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
7132 $saw_alpha = 0; # 'sub' is considered type info here
7133 $id_scan_state = '$';
7134 $identifier .= ' '; # need a space to separate sub from sub name
7136 elsif ( $tok eq '::' ) {
7137 $id_scan_state = 'A';
7139 elsif ( $tok =~ /^\w/ ) {
7140 $id_scan_state = ':';
7143 elsif ( $tok eq '->' ) {
7144 $id_scan_state = '$';
7148 # shouldn't happen: bad call parameter
7150 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
7151 if (DEVEL_MODE) { Fault($msg) }
7152 if ( !$tokenizer_self->[_in_error_] ) {
7154 $tokenizer_self->[_in_error_] = 1;
7156 $id_scan_state = '';
7159 $saw_type = !$saw_alpha;
7163 $saw_alpha = ( $tok =~ /^\w/ );
7164 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
7167 ###############################
7168 # loop to gather the identifier
7169 ###############################
7173 while ( $i < $max_token_index ) {
7174 my $last_tok_is_blank = $tok_is_blank;
7175 if ($tok_is_blank) { $tok_is_blank = undef }
7176 else { $i_save = $i }
7178 $tok = $rtokens->[ ++$i ];
7180 # patch to make digraph :: if necessary
7181 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
7186 ########################
7187 # Starting variable name
7188 ########################
7190 if ( $id_scan_state eq '$' ) {
7192 if ( $tok eq '$' ) {
7194 $identifier .= $tok;
7196 # we've got a punctuation variable if end of line (punct.t)
7197 if ( $i == $max_token_index ) {
7199 $id_scan_state = '';
7203 elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7205 $id_scan_state = ':'; # now need ::
7206 $identifier .= $tok;
7208 elsif ( $tok eq '::' ) {
7209 $id_scan_state = 'A';
7210 $identifier .= $tok;
7213 # POSTDEFREF ->@ ->% ->& ->*
7214 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7215 $identifier .= $tok;
7217 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7219 $id_scan_state = ':'; # now need ::
7220 $identifier .= $tok;
7222 # Perl will accept leading digits in identifiers,
7223 # although they may not always produce useful results.
7224 # Something like $main::0 is ok. But this also works:
7226 # sub howdy::123::bubba{ print "bubba $54321!\n" }
7227 # howdy::123::bubba();
7230 elsif ( $tok eq '#' ) {
7232 # side comment or identifier?
7235 # A '#' starts a comment if it follows a space. For example,
7236 # the following is equivalent to $ans=40.
7241 # a # inside a prototype or signature can only start a
7243 && !$in_prototype_or_signature
7245 # these are valid punctuation vars: *# %# @# $#
7246 # May also be '$#array' or POSTDEFREF ->$#
7247 && ( $identifier =~ /^[\%\@\$\*]$/ || $identifier =~ /\$$/ )
7251 $identifier .= $tok; # keep same state, a $ could follow
7255 # otherwise it is a side comment
7256 if ( $identifier eq '->' ) { }
7257 elsif ( $id_scan_state eq '$' ) { $type = 't' }
7258 else { $type = 'i' }
7260 $id_scan_state = '';
7265 elsif ( $tok eq '{' ) {
7267 # check for something like ${#} or ${©}
7271 || $identifier eq '@'
7272 || $identifier eq '$#'
7274 && $i + 2 <= $max_token_index
7275 && $rtokens->[ $i + 2 ] eq '}'
7276 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7279 my $next2 = $rtokens->[ $i + 2 ];
7280 my $next1 = $rtokens->[ $i + 1 ];
7281 $identifier .= $tok . $next1 . $next2;
7283 $id_scan_state = '';
7287 # skip something like ${xxx} or ->{
7288 $id_scan_state = '';
7290 # if this is the first token of a line, any tokens for this
7291 # identifier have already been accumulated
7292 if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
7297 # space ok after leading $ % * & @
7298 elsif ( $tok =~ /^\s*$/ ) {
7302 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7304 if ( length($identifier) > 1 ) {
7305 $id_scan_state = '';
7307 $type = 'i'; # probably punctuation variable
7312 # spaces after $'s are common, and space after @
7313 # is harmless, so only complain about space
7314 # after other type characters. Space after $ and
7315 # @ will be removed in formatting. Report space
7316 # after % and * because they might indicate a
7317 # parsing error. In other words '% ' might be a
7318 # modulo operator. Delete this warning if it
7320 if ( $identifier !~ /^[\@\$]$/ ) {
7322 "Space in identifier, following $identifier\n";
7328 # space after '->' is ok
7330 elsif ( $tok eq '^' ) {
7332 # check for some special variables like $^ $^W
7333 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7334 $identifier .= $tok;
7337 # There may be one more character, not a space, after the ^
7338 my $next1 = $rtokens->[ $i + 1 ];
7339 my $chr = substr( $next1, 0, 1 );
7340 if ( $is_special_variable_char{$chr} ) {
7342 # It is something like $^W
7343 # Test case (c066) : $^Oeq'linux'
7345 $identifier .= $next1;
7347 # If pretoken $next1 is more than one character long,
7348 # set a flag indicating that it needs to be split.
7349 $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
7355 # Simple test case (c065): '$aa=$^if($bb)';
7356 $id_scan_state = "";
7361 $id_scan_state = '';
7366 else { # something else
7368 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7370 # We might be in an extrusion of
7371 # sub foo2 ( $first, $, $third ) {
7372 # looking at a line starting with a comma, like
7375 # in this case the comma ends the signature variable
7376 # '$' which will have been previously marked type 't'
7378 if ( $i == $i_begin ) {
7383 # at a # we have to mark as type 't' because more may
7384 # follow, otherwise, in a signature we can let '$' be an
7385 # identifier here for better formatting.
7386 # See 'mangle4.in' for a test case.
7389 if ( $id_scan_state eq '$' && $tok eq '#' ) {
7394 $id_scan_state = '';
7398 # check for various punctuation variables
7399 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7400 $identifier .= $tok;
7403 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
7405 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7407 $identifier .= $tok;
7410 elsif ( $identifier eq '$#' ) {
7412 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7414 # perl seems to allow just these: $#: $#- $#+
7415 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7417 $identifier .= $tok;
7421 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7424 elsif ( $identifier eq '$$' ) {
7426 # perl does not allow references to punctuation
7427 # variables without braces. For example, this
7431 # You would have to use
7434 # '$$' alone is punctuation variable for PID
7436 if ( $tok eq '{' ) { $type = 't' }
7437 else { $type = 'i' }
7439 elsif ( $identifier eq '->' ) {
7444 if ( length($identifier) == 1 ) { $identifier = ''; }
7446 $id_scan_state = '';
7451 ###################################
7452 # looking for alphanumeric after ::
7453 ###################################
7455 elsif ( $id_scan_state eq 'A' ) {
7457 $tok_is_blank = $tok =~ /^\s*$/;
7459 if ( $tok =~ /^\w/ ) { # found it
7460 $identifier .= $tok;
7461 $id_scan_state = ':'; # now need ::
7464 elsif ( $tok eq "'" && $allow_tick ) {
7465 $identifier .= $tok;
7466 $id_scan_state = ':'; # now need ::
7469 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7470 $id_scan_state = '(';
7471 $identifier .= $tok;
7473 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7474 $id_scan_state = ')';
7475 $identifier .= $tok;
7478 $id_scan_state = '';
7484 ###################################
7485 # looking for :: after alphanumeric
7486 ###################################
7488 elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
7490 $tok_is_blank = $tok =~ /^\s*$/;
7492 if ( $tok eq '::' ) { # got it
7493 $identifier .= $tok;
7494 $id_scan_state = 'A'; # now require alpha
7496 elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
7497 $identifier .= $tok;
7498 $id_scan_state = ':'; # now need ::
7501 elsif ( $tok eq "'" && $allow_tick ) { # tick
7503 if ( $is_keyword{$identifier} ) {
7504 $id_scan_state = ''; # that's all
7508 $identifier .= $tok;
7511 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7512 $id_scan_state = '(';
7513 $identifier .= $tok;
7515 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7516 $id_scan_state = ')';
7517 $identifier .= $tok;
7520 $id_scan_state = ''; # that's all
7526 ##############################
7527 # looking for '(' of prototype
7528 ##############################
7530 elsif ( $id_scan_state eq '(' ) {
7532 if ( $tok eq '(' ) { # got it
7533 $identifier .= $tok;
7534 $id_scan_state = ')'; # now find the end of it
7536 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
7537 $identifier .= $tok;
7541 $id_scan_state = ''; # that's all - no prototype
7547 ##############################
7548 # looking for ')' of prototype
7549 ##############################
7551 elsif ( $id_scan_state eq ')' ) {
7553 $tok_is_blank = $tok =~ /^\s*$/;
7555 if ( $tok eq ')' ) { # got it
7556 $identifier .= $tok;
7557 $id_scan_state = ''; # all done
7560 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7561 $identifier .= $tok;
7563 else { # probable error in script, but keep going
7564 warning("Unexpected '$tok' while seeking end of prototype\n");
7565 $identifier .= $tok;
7573 elsif ( $id_scan_state eq '&' ) {
7575 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
7576 $id_scan_state = ':'; # now need ::
7578 $identifier .= $tok;
7580 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7581 $id_scan_state = ':'; # now need ::
7583 $identifier .= $tok;
7585 elsif ( $tok =~ /^\s*$/ ) { # allow space
7588 elsif ( $tok eq '::' ) { # leading ::
7589 $id_scan_state = 'A'; # accept alpha next
7590 $identifier .= $tok;
7592 elsif ( $tok eq '{' ) {
7593 if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
7595 $id_scan_state = '';
7598 elsif ( $tok eq '^' ) {
7599 if ( $identifier eq '&' ) {
7601 # Special variable (c066)
7602 $identifier .= $tok;
7605 # There may be one more character, not a space, after the ^
7606 my $next1 = $rtokens->[ $i + 1 ];
7607 my $chr = substr( $next1, 0, 1 );
7608 if ( $is_special_variable_char{$chr} ) {
7610 # It is something like &^O
7612 $identifier .= $next1;
7614 # If pretoken $next1 is more than one character long,
7615 # set a flag indicating that it needs to be split.
7616 $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
7621 $id_scan_state = "";
7633 # punctuation variable?
7634 # testfile: cunningham4.pl
7636 # We have to be careful here. If we are in an unknown state,
7637 # we will reject the punctuation variable. In the following
7638 # example the '&' is a binary operator but we are in an unknown
7639 # state because there is no sigil on 'Prima', so we don't
7640 # know what it is. But it is a bad guess that
7641 # '&~' is a function variable.
7642 # $self->{text}->{colorMap}->[
7643 # Prima::PodView::COLOR_CODE_FOREGROUND
7644 # & ~tb::COLOR_INDEX ] =
7647 # Fix for case c033: a '#' here starts a side comment
7648 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
7649 $identifier .= $tok;
7656 $id_scan_state = '';
7661 ######################
7662 # unknown state - quit
7663 ######################
7665 else { # can get here due to error in initialization
7666 $id_scan_state = '';
7670 } ## end of main loop
7672 if ( $id_scan_state eq ')' ) {
7673 warning("Hit end of line while seeking ) to end prototype\n");
7676 # once we enter the actual identifier, it may not extend beyond
7677 # the end of the current line
7678 if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
7679 $id_scan_state = '';
7682 # Patch: the deprecated variable $# does not combine with anything on the
7684 if ( $identifier eq '$#' ) { $id_scan_state = '' }
7686 if ( $i < 0 ) { $i = 0 }
7688 # Be sure a token type is defined
7695 # The type without the -> should be the same as with the -> so
7696 # that if they get separated we get the same bond strengths,
7698 if ( $identifier =~ /^->/
7699 && $last_nonblank_type eq 'w'
7700 && substr( $identifier, 2, 1 ) =~ /^\w/ )
7704 else { $type = 'i' }
7706 elsif ( $identifier eq '->' ) {
7710 ( length($identifier) > 1 )
7712 # In something like '@$=' we have an identifier '@$'
7713 # In something like '$${' we have type '$$' (and only
7714 # part of an identifier)
7715 && !( $identifier =~ /\$$/ && $tok eq '{' )
7716 && ( $identifier !~ /^(sub |package )$/ )
7721 else { $type = 't' }
7723 elsif ($saw_alpha) {
7725 # type 'w' includes anything without leading type info
7726 # ($,%,@,*) including something like abc::def::ghi
7731 } # this can happen on a restart
7734 # See if we formed an identifier...
7737 if ($message) { write_logfile_entry($message) }
7740 # did not find an identifier, back up
7748 DEBUG_SCAN_ID && do {
7749 my ( $a, $b, $c ) = caller;
7751 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
7753 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
7755 return ( $i, $tok, $type, $id_scan_state, $identifier );
7758 { ## closure for sub do_scan_sub
7760 my %warn_if_lexical;
7764 # lexical subs with these names can cause parsing errors in this version
7765 my @q = qw( m q qq qr qw qx s tr y );
7766 @{warn_if_lexical}{@q} = (1) x scalar(@q);
7769 # saved package and subnames in case prototype is on separate line
7770 my ( $package_saved, $subname_saved );
7772 # initialize subname each time a new 'sub' keyword is encountered
7773 sub initialize_subname {
7774 $package_saved = "";
7775 $subname_saved = "";
7782 PROTOTYPE_CALL => 3,
7787 # do_scan_sub parses a sub name and prototype.
7789 # At present there are three basic CALL TYPES which are
7790 # distinguished by the starting value of '$tok':
7791 # 1. $tok='sub', id_scan_state='sub'
7792 # it is called with $i_beg equal to the index of the first nonblank
7793 # token following a 'sub' token.
7794 # 2. $tok='(', id_scan_state='sub',
7795 # it is called with $i_beg equal to the index of a '(' which may
7796 # start a prototype.
7797 # 3. $tok='prototype', id_scan_state='prototype'
7798 # it is called with $i_beg equal to the index of a '(' which is
7799 # preceded by ': prototype' and has $id_scan_state eq 'prototype'
7803 # A single type 1 call will get both the sub and prototype
7804 # sub foo1 ( $$ ) { }
7807 # The subname will be obtained with a 'sub' call
7808 # The prototype on line 2 will be obtained with a '(' call
7814 # The subname will be obtained with a 'sub' call
7815 # The prototype will be obtained with a 'prototype' call
7816 # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
7817 # ^ <---type 1 ^ <---type 3
7819 # TODO: add future error checks to be sure we have a valid
7820 # sub name. For example, 'sub &doit' is wrong. Also, be sure
7821 # a name is given if and only if a non-anonymous sub is
7823 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
7824 # $in_attribute_list, %saw_function_definition,
7827 my ($rinput_hash) = @_;
7829 my $input_line = $rinput_hash->{input_line};
7830 my $i = $rinput_hash->{i};
7831 my $i_beg = $rinput_hash->{i_beg};
7832 my $tok = $rinput_hash->{tok};
7833 my $type = $rinput_hash->{type};
7834 my $rtokens = $rinput_hash->{rtokens};
7835 my $rtoken_map = $rinput_hash->{rtoken_map};
7836 my $id_scan_state = $rinput_hash->{id_scan_state};
7837 my $max_token_index = $rinput_hash->{max_token_index};
7841 # Determine the CALL TYPE
7846 $tok eq 'prototype' ? PROTOTYPE_CALL
7847 : $tok eq '(' ? PAREN_CALL
7850 $id_scan_state = ""; # normally we get everything in one call
7851 my $subname = $subname_saved;
7852 my $package = $package_saved;
7857 my $pos_beg = $rtoken_map->[$i_beg];
7858 pos($input_line) = $pos_beg;
7860 # Look for the sub NAME if this is a SUB call
7862 $call_type == SUB_CALL
7863 && $input_line =~ m/\G\s*
7864 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
7865 (\w+) # NAME - required
7872 my $is_lexical_sub =
7873 $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
7874 if ( $is_lexical_sub && $1 ) {
7875 warning("'my' sub $subname cannot be in package '$1'\n");
7876 $is_lexical_sub = 0;
7879 if ($is_lexical_sub) {
7881 # lexical subs use the block sequence number as a package name
7883 $current_sequence_number[BRACE][ $current_depth[BRACE] ];
7884 $seqno = 1 unless ( defined($seqno) );
7886 if ( $warn_if_lexical{$subname} ) {
7888 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
7893 $package = ( defined($1) && $1 ) ? $1 : $current_package;
7894 $package =~ s/\'/::/g;
7895 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7896 $package =~ s/::$//;
7899 my $pos = pos($input_line);
7900 my $numc = $pos - $pos_beg;
7901 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
7904 # remember the sub name in case another call is needed to
7906 $package_saved = $package;
7907 $subname_saved = $subname;
7910 # Now look for PROTO ATTRS for all call types
7911 # Look for prototype/attributes which are usually on the same
7912 # line as the sub name but which might be on a separate line.
7913 # For example, we might have an anonymous sub with attributes,
7914 # or a prototype on a separate line from its sub name
7916 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
7917 # does not look like a prototype, we assume it is a SIGNATURE and we
7918 # will stop and let the the standard tokenizer handle it. In
7919 # particular, we stop if we see any nested parens, braces, or commas.
7920 # Also note, a valid prototype cannot contain any alphabetic character
7921 # -- see https://perldoc.perl.org/perlsub
7922 # But it appears that an underscore is valid in a prototype, so the
7923 # regex below uses [A-Za-z] rather than \w
7924 # This is the old regex which has been replaced:
7925 # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
7926 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
7928 $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
7929 (\s*:)? # ATTRS leading ':'
7937 # Append the prototype to the starting token if it is 'sub' or
7938 # 'prototype'. This is not necessary but for compatibility with
7939 # previous versions when the -csc flag is used:
7940 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
7944 # If we just entered the sub at an opening paren on this call, not
7945 # a following :prototype, label it with the previous token. This is
7946 # necessary to propagate the sub name to its opening block.
7947 elsif ( $call_type == PAREN_CALL ) {
7948 $tok = $last_nonblank_token;
7953 # Patch part #1 to fixes cases b994 and b1053:
7954 # Mark an anonymous sub keyword without prototype as type 'k', i.e.
7955 # 'sub : lvalue { ...'
7957 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
7962 # ATTRS: if there are attributes, back up and let the ':' be
7963 # found later by the scanner.
7964 my $pos = pos($input_line);
7966 $pos -= length($attrs);
7969 my $next_nonblank_token = $tok;
7971 # catch case of line with leading ATTR ':' after anonymous sub
7972 if ( $pos == $pos_beg && $tok eq ':' ) {
7974 $in_attribute_list = 1;
7977 # Otherwise, if we found a match we must convert back from
7978 # string position to the pre_token index for continued parsing.
7981 # I don't think an error flag can occur here ..but ?
7983 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
7985 if ($error) { warning("Possibly invalid sub\n") }
7987 # Patch part #2 to fixes cases b994 and b1053:
7988 # Do not let spaces be part of the token of an anonymous sub
7989 # keyword which we marked as type 'k' above...i.e. for
7991 # 'sub : lvalue { ...'
7992 # Back up and let it be parsed as a blank
7996 && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8001 # check for multiple definitions of a sub
8002 ( $next_nonblank_token, my $i_next ) =
8003 find_next_nonblank_token_on_this_line( $i, $rtokens,
8007 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8008 { # skip blank or side comment
8009 my ( $rpre_tokens, $rpre_types ) =
8010 peek_ahead_for_n_nonblank_pre_tokens(1);
8011 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8012 $next_nonblank_token = $rpre_tokens->[0];
8015 $next_nonblank_token = '}';
8019 # See what's next...
8020 if ( $next_nonblank_token eq '{' ) {
8023 # Check for multiple definitions of a sub, but
8024 # it is ok to have multiple sub BEGIN, etc,
8025 # so we do not complain if name is all caps
8026 if ( $saw_function_definition{$subname}{$package}
8027 && $subname !~ /^[A-Z]+$/ )
8029 my $lno = $saw_function_definition{$subname}{$package};
8030 if ( $package =~ /^\d/ ) {
8032 "already saw definition of lexical 'sub $subname' at line $lno\n"
8038 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8039 ) unless (DEVEL_MODE);
8042 $saw_function_definition{$subname}{$package} =
8043 $tokenizer_self->[_last_line_number_];
8046 elsif ( $next_nonblank_token eq ';' ) {
8048 elsif ( $next_nonblank_token eq '}' ) {
8051 # ATTRS - if an attribute list follows, remember the name
8052 # of the sub so the next opening brace can be labeled.
8053 # Setting 'statement_type' causes any ':'s to introduce
8055 elsif ( $next_nonblank_token eq ':' ) {
8056 if ( $call_type == SUB_CALL ) {
8058 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8062 # if we stopped before an open paren ...
8063 elsif ( $next_nonblank_token eq '(' ) {
8065 # If we DID NOT see this paren above then it must be on the
8066 # next line so we will set a flag to come back here and see if
8069 # Otherwise, we assume it is a SIGNATURE rather than a
8070 # PROTOTYPE and let the normal tokenizer handle it as a list
8071 if ( !$saw_opening_paren ) {
8072 $id_scan_state = 'sub'; # we must come back to get proto
8074 if ( $call_type == SUB_CALL ) {
8076 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8079 elsif ($next_nonblank_token) { # EOF technically ok
8080 $subname = "" unless defined($subname);
8082 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8085 check_prototype( $proto, $package, $subname );
8088 # no match to either sub name or prototype, but line not blank
8092 return ( $i, $tok, $type, $id_scan_state );
8096 #########i###############################################################
8097 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8098 #########################################################################
8100 sub find_next_nonblank_token {
8101 my ( $i, $rtokens, $max_token_index ) = @_;
8103 # Returns the next nonblank token after the token at index $i
8104 # To skip past a side comment, and any subsequent block comments
8105 # and blank lines, call with i=$max_token_index
8107 if ( $i >= $max_token_index ) {
8108 if ( !peeked_ahead() ) {
8110 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8114 my $next_nonblank_token = $rtokens->[ ++$i ];
8115 return ( " ", $i ) unless defined($next_nonblank_token);
8117 if ( $next_nonblank_token =~ /^\s*$/ ) {
8118 $next_nonblank_token = $rtokens->[ ++$i ];
8119 return ( " ", $i ) unless defined($next_nonblank_token);
8121 return ( $next_nonblank_token, $i );
8124 sub find_next_noncomment_type {
8125 my ( $i, $rtokens, $max_token_index ) = @_;
8127 # Given the current character position, look ahead past any comments
8128 # and blank lines and return the next token, including digraphs and
8131 my ( $next_nonblank_token, $i_next ) =
8132 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8134 # skip past any side comment
8135 if ( $next_nonblank_token eq '#' ) {
8136 ( $next_nonblank_token, $i_next ) =
8137 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8140 goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq " " );
8142 # check for possible a digraph
8143 goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8144 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8145 goto RETURN if ( !$is_digraph{$test2} );
8146 $next_nonblank_token = $test2;
8147 $i_next = $i_next + 1;
8149 # check for possible a trigraph
8150 goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8151 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8152 goto RETURN if ( !$is_trigraph{$test3} );
8153 $next_nonblank_token = $test3;
8154 $i_next = $i_next + 1;
8157 return ( $next_nonblank_token, $i_next );
8160 sub is_possible_numerator {
8162 # Look at the next non-comment character and decide if it could be a
8168 my ( $i, $rtokens, $max_token_index ) = @_;
8169 my $is_possible_numerator = 0;
8171 my $next_token = $rtokens->[ $i + 1 ];
8172 if ( $next_token eq '=' ) { $i++; } # handle /=
8173 my ( $next_nonblank_token, $i_next ) =
8174 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8176 if ( $next_nonblank_token eq '#' ) {
8177 ( $next_nonblank_token, $i_next ) =
8178 find_next_nonblank_token( $max_token_index, $rtokens,
8182 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8183 $is_possible_numerator = 1;
8185 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8186 $is_possible_numerator = 0;
8189 $is_possible_numerator = -1;
8192 return $is_possible_numerator;
8195 { ## closure for sub pattern_expected
8200 # List of tokens which may follow a pattern. Note that we will not
8201 # have formed digraphs at this point, so we will see '&' instead of
8202 # '&&' and '|' instead of '||'
8204 # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8205 my @q = qw( & && | || ? : + - * and or while if unless);
8206 push @q, ')', '}', ']', '>', ',', ';';
8207 @{pattern_test}{@q} = (1) x scalar(@q);
8210 sub pattern_expected {
8212 # This a filter for a possible pattern.
8213 # It looks at the token after a possible pattern and tries to
8214 # determine if that token could end a pattern.
8219 my ( $i, $rtokens, $max_token_index ) = @_;
8222 my $next_token = $rtokens->[ $i + 1 ];
8223 if ( $next_token =~ /^[msixpodualgc]/ ) {
8225 } # skip possible modifier
8226 my ( $next_nonblank_token, $i_next ) =
8227 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8229 if ( $pattern_test{$next_nonblank_token} ) {
8234 # Added '#' to fix issue c044
8235 if ( $next_nonblank_token =~ /^\s*$/
8236 || $next_nonblank_token eq '#' )
8248 sub find_next_nonblank_token_on_this_line {
8249 my ( $i, $rtokens, $max_token_index ) = @_;
8250 my $next_nonblank_token;
8252 if ( $i < $max_token_index ) {
8253 $next_nonblank_token = $rtokens->[ ++$i ];
8255 if ( $next_nonblank_token =~ /^\s*$/ ) {
8257 if ( $i < $max_token_index ) {
8258 $next_nonblank_token = $rtokens->[ ++$i ];
8263 $next_nonblank_token = "";
8265 return ( $next_nonblank_token, $i );
8268 sub find_angle_operator_termination {
8270 # We are looking at a '<' and want to know if it is an angle operator.
8272 # $i = pretoken index of ending '>' if found, current $i otherwise
8273 # $type = 'Q' if found, '>' otherwise
8274 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
8277 pos($input_line) = 1 + $rtoken_map->[$i];
8281 # we just have to find the next '>' if a term is expected
8282 if ( $expecting == TERM ) { $filter = '[\>]' }
8284 # we have to guess if we don't know what is expected
8285 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
8287 # shouldn't happen - we shouldn't be here if operator is expected
8291 Bad call to find_angle_operator_termination
8294 return ( $i, $type );
8297 # To illustrate what we might be looking at, in case we are
8298 # guessing, here are some examples of valid angle operators
8305 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
8306 # <${PREFIX}*img*.$IMAGE_TYPE>
8307 # <img*.$IMAGE_TYPE>
8308 # <Timg*.$IMAGE_TYPE>
8309 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
8311 # Here are some examples of lines which do not have angle operators:
8312 # return unless $self->[2]++ < $#{$self->[1]};
8315 # the following line from dlister.pl caused trouble:
8316 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
8318 # If the '<' starts an angle operator, it must end on this line and
8319 # it must not have certain characters like ';' and '=' in it. I use
8320 # this to limit the testing. This filter should be improved if
8323 if ( $input_line =~ /($filter)/g ) {
8327 # We MAY have found an angle operator termination if we get
8328 # here, but we need to do more to be sure we haven't been
8330 my $pos = pos($input_line);
8332 my $pos_beg = $rtoken_map->[$i];
8333 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
8335 # Test for '<' after possible filehandle, issue c103
8336 # print $fh <>; # syntax error
8337 # print $fh <DATA>; # ok
8338 # print $fh < DATA>; # syntax error at '>'
8339 # print STDERR < DATA>; # ok, prints word 'DATA'
8340 # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
8341 if ( $last_nonblank_type eq 'Z' ) {
8343 # $str includes brackets; something like '<DATA>'
8344 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
8345 && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
8347 return ( $i, $type );
8351 # Reject if the closing '>' follows a '-' as in:
8352 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
8353 if ( $expecting eq UNKNOWN ) {
8354 my $check = substr( $input_line, $pos - 2, 1 );
8355 if ( $check eq '-' ) {
8356 return ( $i, $type );
8360 ######################################debug#####
8361 #write_diagnostics( "ANGLE? :$str\n");
8362 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
8363 ######################################debug#####
8367 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8369 # It may be possible that a quote ends midway in a pretoken.
8370 # If this happens, it may be necessary to split the pretoken.
8374 unexpected error condition returned by inverse_pretoken_map
8378 "Possible tokinization error..please check this line\n");
8381 # count blanks on inside of brackets
8382 my $blank_count = 0;
8383 $blank_count++ if ( $str =~ /<\s+/ );
8384 $blank_count++ if ( $str =~ /\s+>/ );
8386 # Now let's see where we stand....
8387 # OK if math op not possible
8388 if ( $expecting == TERM ) {
8391 # OK if there are no more than 2 non-blank pre-tokens inside
8392 # (not possible to write 2 token math between < and >)
8393 # This catches most common cases
8394 elsif ( $i <= $i_beg + 3 + $blank_count ) {
8396 # No longer any need to document this common case
8397 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
8400 # OK if there is some kind of identifier inside
8401 # print $fh <tvg::INPUT>;
8402 elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
8403 write_diagnostics("ANGLE (contains identifier): $str\n");
8409 # Let's try a Brace Test: any braces inside must balance
8411 while ( $str =~ /\{/g ) { $br++ }
8412 while ( $str =~ /\}/g ) { $br-- }
8414 while ( $str =~ /\[/g ) { $sb++ }
8415 while ( $str =~ /\]/g ) { $sb-- }
8417 while ( $str =~ /\(/g ) { $pr++ }
8418 while ( $str =~ /\)/g ) { $pr-- }
8420 # if braces do not balance - not angle operator
8421 if ( $br || $sb || $pr ) {
8425 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
8428 # we should keep doing more checks here...to be continued
8429 # Tentatively accepting this as a valid angle operator.
8430 # There are lots more things that can be checked.
8433 "ANGLE-Guessing yes: $str expecting=$expecting\n");
8434 write_logfile_entry("Guessing angle operator here: $str\n");
8439 # didn't find ending >
8441 if ( $expecting == TERM ) {
8442 warning("No ending > for angle operator\n");
8446 return ( $i, $type );
8449 sub scan_number_do {
8451 # scan a number in any of the formats that Perl accepts
8452 # Underbars (_) are allowed in decimal numbers.
8453 # input parameters -
8454 # $input_line - the string to scan
8455 # $i - pre_token index to start scanning
8456 # $rtoken_map - reference to the pre_token map giving starting
8457 # character position in $input_line of token $i
8458 # output parameters -
8459 # $i - last pre_token index of the number just scanned
8460 # number - the number (characters); or undef if not a number
8462 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
8463 my $pos_beg = $rtoken_map->[$i];
8467 my $type = $input_type;
8469 my $first_char = substr( $input_line, $pos_beg, 1 );
8471 # Look for bad starting characters; Shouldn't happen..
8472 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
8475 Program bug - scan_number given bad first character = '$first_char'
8478 return ( $i, $type, $number );
8481 # handle v-string without leading 'v' character ('Two Dot' rule)
8483 # Here is the format prior to including underscores:
8484 ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
8485 pos($input_line) = $pos_beg;
8486 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
8487 $pos = pos($input_line);
8488 my $numc = $pos - $pos_beg;
8489 $number = substr( $input_line, $pos_beg, $numc );
8491 report_v_string($number);
8494 # handle octal, hex, binary
8495 if ( !defined($number) ) {
8496 pos($input_line) = $pos_beg;
8498 # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
8499 # For reference, the format prior to hex floating point is:
8500 # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
8501 # (hex) (octal) (binary)
8505 /\G[+-]?0( # leading [signed] 0
8507 # a hex float, i.e. '0x0.b17217f7d1cf78p0'
8508 ([xX][0-9a-fA-F_]* # X and optional leading digits
8509 (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
8510 [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
8511 [0-9a-fA-F_]*) # optional Additional exponent digits
8514 |([xX][0-9a-fA-F_]+)
8517 |([oO]?[0-7_]+ # string of octal digits
8518 (\.([0-7][0-7_]*)?)? # optional decimal and fraction
8519 [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
8520 [0-7_]*) # Additional exponent digits with underscores
8523 |([oO]?[0-7_]+) # string of octal digits
8526 |([bB][01_]* # 'b' with string of binary digits
8527 (\.([01][01_]*)?)? # optional decimal and fraction
8528 [Pp][+-]?[01] # Required exponent indicator, no underscore
8529 [01_]*) # additional exponent bits
8532 |([bB][01_]+) # 'b' with string of binary digits
8537 $pos = pos($input_line);
8538 my $numc = $pos - $pos_beg;
8539 $number = substr( $input_line, $pos_beg, $numc );
8545 if ( !defined($number) ) {
8546 pos($input_line) = $pos_beg;
8548 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
8549 $pos = pos($input_line);
8551 # watch out for things like 0..40 which would give 0. by this;
8552 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
8553 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
8557 my $numc = $pos - $pos_beg;
8558 $number = substr( $input_line, $pos_beg, $numc );
8563 # filter out non-numbers like e + - . e2 .e3 +e6
8564 # the rule: at least one digit, and any 'e' must be preceded by a digit
8566 $number !~ /\d/ # no digits
8567 || ( $number =~ /^(.*)[eE]/
8568 && $1 !~ /\d/ ) # or no digits before the 'e'
8572 $type = $input_type;
8573 return ( $i, $type, $number );
8576 # Found a number; now we must convert back from character position
8577 # to pre_token index. An error here implies user syntax error.
8578 # An example would be an invalid octal number like '009'.
8581 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8582 if ($error) { warning("Possibly invalid number\n") }
8584 return ( $i, $type, $number );
8587 sub inverse_pretoken_map {
8589 # Starting with the current pre_token index $i, scan forward until
8590 # finding the index of the next pre_token whose position is $pos.
8591 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
8594 while ( ++$i <= $max_token_index ) {
8596 if ( $pos <= $rtoken_map->[$i] ) {
8598 # Let the calling routine handle errors in which we do not
8599 # land on a pre-token boundary. It can happen by running
8600 # perltidy on some non-perl scripts, for example.
8601 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
8606 return ( $i, $error );
8611 # find the target of a here document, if any
8613 # $i - token index of the second < of <<
8614 # ($i must be less than the last token index if this is called)
8615 # output parameters:
8616 # $found_target = 0 didn't find target; =1 found target
8617 # HERE_TARGET - the target string (may be empty string)
8618 # $i - unchanged if not here doc,
8619 # or index of the last token of the here target
8620 # $saw_error - flag noting unbalanced quote on here target
8621 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
8623 my $found_target = 0;
8624 my $here_doc_target = '';
8625 my $here_quote_character = '';
8627 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
8628 $next_token = $rtokens->[ $i + 1 ];
8630 # perl allows a backslash before the target string (heredoc.t)
8632 if ( $next_token eq '\\' ) {
8634 $next_token = $rtokens->[ $i + 2 ];
8637 ( $next_nonblank_token, $i_next_nonblank ) =
8638 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
8640 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
8643 my $quote_depth = 0;
8648 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
8651 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
8652 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
8654 if ($in_quote) { # didn't find end of quote, so no target found
8656 if ( $expecting == TERM ) {
8658 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
8663 else { # found ending quote
8667 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
8668 $tokj = $rtokens->[$j];
8670 # we have to remove any backslash before the quote character
8671 # so that the here-doc-target exactly matches this string
8675 && $rtokens->[ $j + 1 ] eq $here_quote_character );
8676 $here_doc_target .= $tokj;
8681 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
8683 write_logfile_entry(
8684 "found blank here-target after <<; suggest using \"\"\n");
8687 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
8689 my $here_doc_expected;
8690 if ( $expecting == UNKNOWN ) {
8691 $here_doc_expected = guess_if_here_doc($next_token);
8694 $here_doc_expected = 1;
8697 if ($here_doc_expected) {
8699 $here_doc_target = $next_token;
8706 if ( $expecting == TERM ) {
8708 write_logfile_entry("Note: bare here-doc operator <<\n");
8715 # patch to neglect any prepended backslash
8716 if ( $found_target && $backslash ) { $i++ }
8718 return ( $found_target, $here_doc_target, $here_quote_character, $i,
8724 # follow (or continue following) quoted string(s)
8725 # $in_quote return code:
8727 # 1 - still must find end of quote whose target is $quote_character
8728 # 2 - still looking for end of first of two quotes
8730 # Returns updated strings:
8731 # $quoted_string_1 = quoted string seen while in_quote=1
8732 # $quoted_string_2 = quoted string seen while in_quote=2
8734 $i, $in_quote, $quote_character,
8735 $quote_pos, $quote_depth, $quoted_string_1,
8736 $quoted_string_2, $rtokens, $rtoken_map,
8740 my $in_quote_starting = $in_quote;
8743 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
8746 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8749 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
8750 $quote_pos, $quote_depth, $max_token_index );
8751 $quoted_string_2 .= $quoted_string;
8752 if ( $in_quote == 1 ) {
8753 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
8754 $quote_character = '';
8757 $quoted_string_2 .= "\n";
8761 if ( $in_quote == 1 ) { # one (more) quote to follow
8764 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8767 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
8768 $quote_pos, $quote_depth, $max_token_index );
8769 $quoted_string_1 .= $quoted_string;
8770 if ( $in_quote == 1 ) {
8771 $quoted_string_1 .= "\n";
8774 return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8775 $quoted_string_1, $quoted_string_2 );
8778 sub follow_quoted_string {
8780 # scan for a specific token, skipping escaped characters
8781 # if the quote character is blank, use the first non-blank character
8783 # $rtokens = reference to the array of tokens
8784 # $i = the token index of the first character to search
8785 # $in_quote = number of quoted strings being followed
8786 # $beginning_tok = the starting quote character
8787 # $quote_pos = index to check next for alphanumeric delimiter
8788 # output parameters:
8789 # $i = the token index of the ending quote character
8790 # $in_quote = decremented if found end, unchanged if not
8791 # $beginning_tok = the starting quote character
8792 # $quote_pos = index to check next for alphanumeric delimiter
8793 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
8794 # $quoted_string = the text of the quote (without quotation tokens)
8795 my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
8798 my ( $tok, $end_tok );
8800 my $quoted_string = "";
8804 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
8807 # get the corresponding end token
8808 if ( $beginning_tok !~ /^\s*$/ ) {
8809 $end_tok = matching_end_token($beginning_tok);
8812 # a blank token means we must find and use the first non-blank one
8814 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
8816 while ( $i < $max_token_index ) {
8817 $tok = $rtokens->[ ++$i ];
8819 if ( $tok !~ /^\s*$/ ) {
8821 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
8822 $i = $max_token_index;
8826 if ( length($tok) > 1 ) {
8827 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
8828 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
8831 $beginning_tok = $tok;
8834 $end_tok = matching_end_token($beginning_tok);
8840 $allow_quote_comments = 1;
8845 # There are two different loops which search for the ending quote
8846 # character. In the rare case of an alphanumeric quote delimiter, we
8847 # have to look through alphanumeric tokens character-by-character, since
8848 # the pre-tokenization process combines multiple alphanumeric
8849 # characters, whereas for a non-alphanumeric delimiter, only tokens of
8850 # length 1 can match.
8852 ###################################################################
8853 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
8854 # "quote_pos" is the position the current word to begin searching
8855 ###################################################################
8856 if ( $beginning_tok =~ /\w/ ) {
8858 # Note this because it is not recommended practice except
8859 # for obfuscated perl contests
8860 if ( $in_quote == 1 ) {
8861 write_logfile_entry(
8862 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
8865 # Note: changed < to <= here to fix c109. Relying on extra end blanks.
8866 while ( $i <= $max_token_index ) {
8868 if ( $quote_pos == 0 || ( $i < 0 ) ) {
8869 $tok = $rtokens->[ ++$i ];
8871 if ( $tok eq '\\' ) {
8873 # retain backslash unless it hides the end token
8874 $quoted_string .= $tok
8875 unless $rtokens->[ $i + 1 ] eq $end_tok;
8877 last if ( $i >= $max_token_index );
8878 $tok = $rtokens->[ ++$i ];
8881 my $old_pos = $quote_pos;
8883 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
8887 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
8889 if ( $quote_pos > 0 ) {
8892 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
8894 # NOTE: any quote modifiers will be at the end of '$tok'. If we
8895 # wanted to check them, this is the place to get them. But
8896 # this quote form is rarely used in practice, so it isn't
8901 if ( $quote_depth == 0 ) {
8907 if ( $old_pos <= length($tok) ) {
8908 $quoted_string .= substr( $tok, $old_pos );
8914 ########################################################################
8915 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
8916 ########################################################################
8919 while ( $i < $max_token_index ) {
8920 $tok = $rtokens->[ ++$i ];
8922 if ( $tok eq $end_tok ) {
8925 if ( $quote_depth == 0 ) {
8930 elsif ( $tok eq $beginning_tok ) {
8933 elsif ( $tok eq '\\' ) {
8935 # retain backslash unless it hides the beginning or end token
8936 $tok = $rtokens->[ ++$i ];
8937 $quoted_string .= '\\'
8938 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
8940 $quoted_string .= $tok;
8943 if ( $i > $max_token_index ) { $i = $max_token_index }
8944 return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
8948 sub indicate_error {
8949 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
8950 interrupt_logfile();
8952 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
8957 sub write_error_indicator_pair {
8958 my ( $line_number, $input_line, $pos, $carrat ) = @_;
8959 my ( $offset, $numbered_line, $underline ) =
8960 make_numbered_line( $line_number, $input_line, $pos );
8961 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
8962 warning( $numbered_line . "\n" );
8963 $underline =~ s/\s*$//;
8964 warning( $underline . "\n" );
8968 sub make_numbered_line {
8970 # Given an input line, its line number, and a character position of
8971 # interest, create a string not longer than 80 characters of the form
8972 # $lineno: sub_string
8973 # such that the sub_string of $str contains the position of interest
8975 # Here is an example of what we want, in this case we add trailing
8976 # '...' because the line is long.
8978 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
8980 # Here is another example, this time in which we used leading '...'
8981 # because of excessive length:
8983 # 2: ... er of the World Wide Web Consortium's
8985 # input parameters are:
8986 # $lineno = line number
8987 # $str = the text of the line
8988 # $pos = position of interest (the error) : 0 = first character
8991 # - $offset = an offset which corrects the position in case we only
8992 # display part of a line, such that $pos-$offset is the effective
8993 # position from the start of the displayed line.
8994 # - $numbered_line = the numbered line as above,
8995 # - $underline = a blank 'underline' which is all spaces with the same
8996 # number of characters as the numbered line.
8998 my ( $lineno, $str, $pos ) = @_;
8999 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9000 my $excess = length($str) - $offset - 68;
9001 my $numc = ( $excess > 0 ) ? 68 : undef;
9003 if ( defined($numc) ) {
9004 if ( $offset == 0 ) {
9005 $str = substr( $str, $offset, $numc - 4 ) . " ...";
9008 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9013 if ( $offset == 0 ) {
9016 $str = "... " . substr( $str, $offset + 4 );
9020 my $numbered_line = sprintf( "%d: ", $lineno );
9021 $offset -= length($numbered_line);
9022 $numbered_line .= $str;
9023 my $underline = " " x length($numbered_line);
9024 return ( $offset, $numbered_line, $underline );
9027 sub write_on_underline {
9029 # The "underline" is a string that shows where an error is; it starts
9030 # out as a string of blanks with the same length as the numbered line of
9031 # code above it, and we have to add marking to show where an error is.
9032 # In the example below, we want to write the string '--^' just below
9033 # the line of bad code:
9035 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9037 # We are given the current underline string, plus a position and a
9038 # string to write on it.
9040 # In the above example, there will be 2 calls to do this:
9041 # First call: $pos=19, pos_chr=^
9042 # Second call: $pos=16, pos_chr=---
9044 # This is a trivial thing to do with substr, but there is some
9047 my ( $underline, $pos, $pos_chr ) = @_;
9049 # check for error..shouldn't happen
9050 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9053 my $excess = length($pos_chr) + $pos - length($underline);
9054 if ( $excess > 0 ) {
9055 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9057 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9058 return ($underline);
9063 # Break a string, $str, into a sequence of preliminary tokens. We
9064 # are interested in these types of tokens:
9065 # words (type='w'), example: 'max_tokens_wanted'
9066 # digits (type = 'd'), example: '0755'
9067 # whitespace (type = 'b'), example: ' '
9068 # any other single character (i.e. punct; type = the character itself).
9069 # We cannot do better than this yet because we might be in a quoted
9070 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
9072 my ( $str, $max_tokens_wanted ) = @_;
9074 # we return references to these 3 arrays:
9075 my @tokens = (); # array of the tokens themselves
9076 my @token_map = (0); # string position of start of each token
9077 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9082 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9085 # note that this must come before words!
9086 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9089 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9091 # single-character punctuation
9092 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9096 return ( \@tokens, \@token_map, \@type );
9100 push @token_map, pos($str);
9102 } while ( --$max_tokens_wanted != 0 );
9104 return ( \@tokens, \@token_map, \@type );
9109 # this is an old debug routine
9110 # not called, but saved for reference
9111 my ( $rtokens, $rtoken_map ) = @_;
9112 my $num = scalar( @{$rtokens} );
9114 foreach my $i ( 0 .. $num - 1 ) {
9115 my $len = length( $rtokens->[$i] );
9116 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9121 { ## closure for sub matching end token
9122 my %matching_end_token;
9125 %matching_end_token = (
9133 sub matching_end_token {
9135 # return closing character for a pattern
9136 my $beginning_token = shift;
9137 if ( $matching_end_token{$beginning_token} ) {
9138 return $matching_end_token{$beginning_token};
9140 return ($beginning_token);
9144 sub dump_token_types {
9145 my ( $class, $fh ) = @_;
9147 # This should be the latest list of token types in use
9148 # adding NEW_TOKENS: add a comment here
9149 $fh->print(<<'END_OF_LIST');
9151 Here is a list of the token types currently used for lines of type 'CODE'.
9152 For the following tokens, the "type" of a token is just the token itself.
9154 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9155 ( ) <= >= == =~ !~ != ++ -- /= x=
9156 ... **= <<= >>= &&= ||= //= <=>
9157 , + - / * | % ! x ~ = \ ? : . < > ^ &
9159 The following additional token types are defined:
9162 b blank (white space)
9163 { indent: opening structural curly brace or square bracket or paren
9164 (code block, anonymous hash reference, or anonymous array reference)
9165 } outdent: right structural curly brace or square bracket or paren
9166 [ left non-structural square bracket (enclosing an array index)
9167 ] right non-structural square bracket
9168 ( left non-structural paren (all but a list right of an =)
9169 ) right non-structural paren
9170 L left non-structural curly brace (enclosing a key)
9171 R right non-structural curly brace
9172 ; terminal semicolon
9173 f indicates a semicolon in a "for" statement
9174 h here_doc operator <<
9176 Q indicates a quote or pattern
9177 q indicates a qw quote block
9179 C user-defined constant or constant function (with void prototype = ())
9180 U user-defined function taking parameters
9181 G user-defined function taking block parameter (like grep/map/eval)
9182 M (unused, but reserved for subroutine definition name)
9183 P (unused, but -html uses it to label pod text)
9184 t type indicater such as %,$,@,*,&,sub
9185 w bare word (perhaps a subroutine call)
9186 i identifier of some type (with leading %, $, @, *, &, sub, -> )
9189 F a file test operator (like -e)
9191 Z identifier in indirect object slot: may be file handle, object
9192 J LABEL: code block label
9193 j LABEL after next, last, redo, goto
9196 pp pre-increment operator ++
9197 mm pre-decrement operator --
9198 A : used as attribute separator
9200 Here are the '_line_type' codes used internally:
9201 SYSTEM - system-specific code before hash-bang line
9202 CODE - line of perl code (including comments)
9203 POD_START - line starting pod, such as '=head'
9204 POD - pod documentation text
9205 POD_END - last line of pod section, '=cut'
9206 HERE - text of here-document
9207 HERE_END - last line of here-doc (target word)
9208 FORMAT - format section
9209 FORMAT_END - last line of format section, '.'
9210 SKIP - code skipping section
9211 SKIP_END - last line of code skipping section, '#>>V'
9212 DATA_START - __DATA__ line
9213 DATA - unidentified text following __DATA__
9214 END_START - __END__ line
9215 END - unidentified text following __END__
9216 ERROR - we are in big trouble, probably not a perl script
9224 # These names are used in error messages
9225 @opening_brace_names = qw# '{' '[' '(' '?' #;
9226 @closing_brace_names = qw# '}' ']' ')' ':' #;
9231 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9232 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
9234 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
9236 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
9237 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
9239 my @tetragraphs = qw( <<>> );
9240 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
9242 # make a hash of all valid token types for self-checking the tokenizer
9243 # (adding NEW_TOKENS : select a new character and add to this list)
9244 my @valid_token_types = qw#
9245 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
9246 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
9248 push( @valid_token_types, @digraphs );
9249 push( @valid_token_types, @trigraphs );
9250 push( @valid_token_types, @tetragraphs );
9251 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
9252 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
9254 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
9255 my @file_test_operators =
9256 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
9257 @is_file_test_operator{@file_test_operators} =
9258 (1) x scalar(@file_test_operators);
9260 # these functions have prototypes of the form (&), so when they are
9261 # followed by a block, that block MAY BE followed by an operator.
9262 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
9264 @is_block_operator{@q} = (1) x scalar(@q);
9266 # these functions allow an identifier in the indirect object slot
9267 @q = qw( print printf sort exec system say);
9268 @is_indirect_object_taker{@q} = (1) x scalar(@q);
9270 # These tokens may precede a code block
9271 # patched for SWITCH/CASE/CATCH. Actually these could be removed
9272 # now and we could let the extended-syntax coding handle them.
9273 # Added 'default' for Switch::Plain.
9275 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
9276 unless do while until eval for foreach map grep sort
9277 switch case given when default catch try finally);
9278 @is_code_block_token{@q} = (1) x scalar(@q);
9280 # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
9281 # to contrast it with the block types in '%is_zero_continuation_block_type'
9282 @q = qw( sort map grep eval do );
9283 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
9285 %is_grep_alias = ();
9287 # I'll build the list of keywords incrementally
9290 # keywords and tokens after which a value or pattern is expected,
9291 # but not an operator. In other words, these should consume terms
9292 # to their right, or at least they are not expected to be followed
9293 # immediately by operators.
9294 my @value_requestor = qw(
9522 # patched above for SWITCH/CASE given/when err say
9523 # 'err' is a fairly safe addition.
9524 # Added 'default' for Switch::Plain. Note that we could also have
9525 # a separate set of keywords to include if we see 'use Switch::Plain'
9526 push( @Keywords, @value_requestor );
9528 # These are treated the same but are not keywords:
9533 push( @value_requestor, @extra_vr );
9535 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
9537 # this list contains keywords which do not look for arguments,
9538 # so that they might be followed by an operator, or at least
9540 my @operator_requestor = qw(
9564 push( @Keywords, @operator_requestor );
9566 # These are treated the same but are not considered keywords:
9573 push( @operator_requestor, @extra_or );
9575 @expecting_operator_token{@operator_requestor} =
9576 (1) x scalar(@operator_requestor);
9578 # these token TYPES expect trailing operator but not a term
9579 # note: ++ and -- are post-increment and decrement, 'C' = constant
9580 my @operator_requestor_types = qw( ++ -- C <> q );
9581 @expecting_operator_types{@operator_requestor_types} =
9582 (1) x scalar(@operator_requestor_types);
9584 # these token TYPES consume values (terms)
9585 # note: pp and mm are pre-increment and decrement
9586 # f=semicolon in for, F=file test operator
9587 my @value_requestor_type = qw#
9588 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
9589 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
9590 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
9591 f F pp mm Y p m U J G j >> << ^ t
9592 ~. ^. |. &. ^.= |.= &.=
9594 push( @value_requestor_type, ',' )
9595 ; # (perl doesn't like a ',' in a qw block)
9596 @expecting_term_types{@value_requestor_type} =
9597 (1) x scalar(@value_requestor_type);
9599 # Note: the following valid token types are not assigned here to
9600 # hashes requesting to be followed by values or terms, but are
9601 # instead currently hard-coded into sub operator_expected:
9602 # ) -> :: Q R Z ] b h i k n v w } #
9604 # For simple syntax checking, it is nice to have a list of operators which
9605 # will really be unhappy if not followed by a term. This includes most
9607 %really_want_term = %expecting_term_types;
9609 # with these exceptions...
9610 delete $really_want_term{'U'}; # user sub, depends on prototype
9611 delete $really_want_term{'F'}; # file test works on $_ if no following term
9612 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
9615 @q = qw(q qq qw qx qr s y tr m);
9616 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
9619 @is_package{@q} = (1) x scalar(@q);
9623 @is_comma_question_colon{@q} = (1) x scalar(@q);
9625 # Hash of other possible line endings which may occur.
9626 # Keep these coordinated with the regex where this is used.
9627 # Note: chr(13) = chr(015)="\r".
9628 @q = ( chr(13), chr(29), chr(26) );
9629 @other_line_endings{@q} = (1) x scalar(@q);
9631 # These keywords are handled specially in the tokenizer code:
9632 my @special_keywords = qw(
9648 push( @Keywords, @special_keywords );
9650 # Keywords after which list formatting may be used
9651 # WARNING: do not include |map|grep|eval or perl may die on
9652 # syntax errors (map1.t).
9653 my @keyword_taking_list = qw(
9728 @is_keyword_taking_list{@keyword_taking_list} =
9729 (1) x scalar(@keyword_taking_list);
9731 # perl functions which may be unary operators.
9733 # This list is used to decide if a pattern delimited by slashes, /pattern/,
9734 # can follow one of these keywords.
9736 chomp eof eval fc lc pop shift uc undef
9739 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
9742 # These are keywords for which an arg may optionally be omitted. They are
9743 # currently only used to disambiguate a ? used as a ternary from one used
9744 # as a (depricated) pattern delimiter. In the future, they might be used
9745 # to give a warning about ambiguous syntax before a /.
9746 # Note: split has been omitted (see not below).
9747 my @keywords_taking_optional_arg = qw(
9816 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
9817 (1) x scalar(@keywords_taking_optional_arg);
9819 # This list is used to decide if a pattern delmited by question marks,
9820 # ?pattern?, can follow one of these keywords. Note that from perl 5.22
9821 # on, a ?pattern? is not recognized, so we can be much more strict than
9822 # with a /pattern/. Note that 'split' is not in this list. In current
9823 # versions of perl a question following split must be a ternary, but
9824 # in older versions it could be a pattern. The guessing algorithm will
9825 # decide. We are combining two lists here to simplify the test.
9826 @q = ( @keywords_taking_optional_arg, @operator_requestor );
9827 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
9830 # These are not used in any way yet
9831 # my @unused_keywords = qw(
9837 # The list of keywords was originally extracted from function 'keyword' in
9838 # perl file toke.c version 5.005.03, using this utility, plus a
9839 # little editing: (file getkwd.pl):
9840 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
9841 # Add 'get' prefix where necessary, then split into the above lists.
9842 # This list should be updated as necessary.
9843 # The list should not contain these special variables:
9844 # ARGV DATA ENV SIG STDERR STDIN STDOUT
9847 @is_keyword{@Keywords} = (1) x scalar(@Keywords);