1 #####################################################################
3 # The Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 use English qw( -no_match_vars );
26 our $VERSION = '20221112';
28 use Perl::Tidy::LineBuffer;
31 use constant DEVEL_MODE => 0;
32 use constant EMPTY_STRING => q{};
33 use constant SPACE => q{ };
35 # Decimal values of some ascii characters for quick checks
36 use constant ORD_TAB => 9;
37 use constant ORD_SPACE => 32;
38 use constant ORD_PRINTABLE_MIN => 33;
39 use constant ORD_PRINTABLE_MAX => 126;
41 # PACKAGE VARIABLES for processing an entire FILE.
42 # These must be package variables because most may get localized during
43 # processing. Most are initialized in sub prepare_for_a_new_file.
49 $last_nonblank_block_type
57 %user_function_prototype
59 %is_block_list_function
60 %saw_function_definition
71 @nesting_sequence_number
72 @current_sequence_number
74 @paren_semicolon_count
75 @paren_structural_type
77 @brace_structural_type
81 @square_bracket_structural_type
84 @nested_statement_type
85 @starting_line_of_current_depth
88 # GLOBAL CONSTANTS for routines in this package,
89 # Initialized in a BEGIN block.
91 %is_indirect_object_taker
93 %expecting_operator_token
94 %expecting_operator_types
99 %is_file_test_operator
105 %is_sort_map_grep_eval_do
111 %is_keyword_taking_list
112 %is_keyword_taking_optional_arg
113 %is_keyword_rejecting_slash_as_pattern_delimiter
114 %is_keyword_rejecting_question_as_pattern_delimiter
115 %is_q_qq_qx_qr_s_y_tr_m
116 %is_q_qq_qw_qx_qr_s_y_tr_m
119 %is_comma_question_colon
121 %is_if_elsif_unless_case_when
123 %is_END_DATA_format_sub
125 $code_skipping_pattern_begin
126 $code_skipping_pattern_end
129 # GLOBAL VARIABLES which are constant after being configured by user-supplied
130 # parameters. They remain constant as a file is being processed.
133 $rOpts_code_skipping,
134 $code_skipping_pattern_begin,
135 $code_skipping_pattern_end,
138 # possible values of operator_expected()
139 use constant TERM => -1;
140 use constant UNKNOWN => 0;
141 use constant OPERATOR => 1;
143 # possible values of context
144 use constant SCALAR_CONTEXT => -1;
145 use constant UNKNOWN_CONTEXT => 0;
146 use constant LIST_CONTEXT => 1;
148 # Maximum number of little messages; probably need not be changed.
149 use constant MAX_NAG_MESSAGES => 6;
153 # Array index names for $self.
154 # Do not combine with other BEGIN blocks (c101).
157 _rhere_target_list_ => $i++,
158 _in_here_doc_ => $i++,
159 _here_doc_target_ => $i++,
160 _here_quote_character_ => $i++,
166 _in_skipped_ => $i++,
167 _in_attribute_list_ => $i++,
169 _quote_target_ => $i++,
170 _line_start_quote_ => $i++,
171 _starting_level_ => $i++,
172 _know_starting_level_ => $i++,
174 _indent_columns_ => $i++,
175 _look_for_hash_bang_ => $i++,
177 _continuation_indentation_ => $i++,
178 _outdent_labels_ => $i++,
179 _last_line_number_ => $i++,
180 _saw_perl_dash_P_ => $i++,
181 _saw_perl_dash_w_ => $i++,
182 _saw_use_strict_ => $i++,
183 _saw_v_string_ => $i++,
185 _look_for_autoloader_ => $i++,
186 _look_for_selfloader_ => $i++,
187 _saw_autoloader_ => $i++,
188 _saw_selfloader_ => $i++,
189 _saw_hash_bang_ => $i++,
192 _saw_negative_indentation_ => $i++,
193 _started_tokenizing_ => $i++,
194 _line_buffer_object_ => $i++,
195 _debugger_object_ => $i++,
196 _diagnostics_object_ => $i++,
197 _logger_object_ => $i++,
198 _unexpected_error_count_ => $i++,
199 _started_looking_for_here_target_at_ => $i++,
200 _nearly_matched_here_target_at_ => $i++,
201 _line_of_text_ => $i++,
202 _rlower_case_labels_at_ => $i++,
203 _extended_syntax_ => $i++,
204 _maximum_level_ => $i++,
205 _true_brace_error_count_ => $i++,
206 _rOpts_maximum_level_errors_ => $i++,
207 _rOpts_maximum_unexpected_errors_ => $i++,
208 _rOpts_logfile_ => $i++,
213 { ## closure for subs to count instances
215 # methods to count instances
217 sub get_count { return $_count; }
218 sub _increment_count { return ++$_count }
219 sub _decrement_count { return --$_count }
224 $self->_decrement_count();
230 # Catch any undefined sub calls so that we are sure to get
231 # some diagnostic information. This sub should never be called
232 # except for a programming error.
234 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
235 my ( $pkg, $fname, $lno ) = caller();
236 my $my_package = __PACKAGE__;
238 ======================================================================
239 Error detected in package '$my_package', version $VERSION
240 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
241 Called from package: '$pkg'
242 Called from File '$fname' at line '$lno'
243 This error is probably due to a recent programming change
244 ======================================================================
247 } ## end sub AUTOLOAD
251 Perl::Tidy::Die($msg);
252 croak "unexpected return from Perl::Tidy::Die";
258 # This routine is called for errors that really should not occur
259 # except if there has been a bug introduced by a recent program change.
260 # Please add comments at calls to Fault to explain why the call
261 # should not occur, and where to look to fix it.
262 my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
263 my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
264 my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
265 my $input_stream_name = get_input_stream_name();
268 ==============================================================================
269 While operating on input stream with name: '$input_stream_name'
270 A fault was detected at line $line0 of sub '$subroutine1'
272 which was called from line $line1 of sub '$subroutine2'
274 This is probably an error introduced by a recent programming change.
275 Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
276 ==============================================================================
279 # We shouldn't get here, but this return is to keep Perl-Critic from
286 # See if a pattern will compile. We have to use a string eval here,
287 # but it should be safe because the pattern has been constructed
290 my $ok = eval "'##'=~/$pattern/";
291 return !defined($ok) || $EVAL_ERROR;
294 sub make_code_skipping_pattern {
295 my ( $rOpts, $opt_name, $default ) = @_;
296 my $param = $rOpts->{$opt_name};
297 unless ($param) { $param = $default }
298 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
299 if ( $param !~ /^#/ ) {
300 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
302 my $pattern = '^\s*' . $param . '\b';
303 if ( bad_pattern($pattern) ) {
305 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
309 } ## end sub make_code_skipping_pattern
313 # Check Tokenizer parameters
319 %is_END_DATA_format_sub = (
326 # Install any aliases to 'sub'
327 if ( $rOpts->{'sub-alias-list'} ) {
329 # Note that any 'sub-alias-list' has been preprocessed to
330 # be a trimmed, space-separated list which includes 'sub'
331 # for example, it might be 'sub method fun'
332 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
333 foreach my $word (@sub_alias_list) {
335 $is_END_DATA_format_sub{$word} = 1;
340 if ( $rOpts->{'grep-alias-list'} ) {
342 # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
343 # space-separated list
344 my @q = split /\s+/, $rOpts->{'grep-alias-list'};
345 @{is_grep_alias}{@q} = (1) x scalar(@q);
348 $rOpts_code_skipping = $rOpts->{'code-skipping'};
349 $code_skipping_pattern_begin =
350 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
351 $code_skipping_pattern_end =
352 make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
354 } ## end sub check_options
358 my ( $class, @args ) = @_;
360 # Note: 'tabs' and 'indent_columns' are temporary and should be
363 source_object => undef,
364 debugger_object => undef,
365 diagnostics_object => undef,
366 logger_object => undef,
367 starting_level => undef,
370 look_for_hash_bang => 0,
372 look_for_autoloader => 1,
373 look_for_selfloader => 1,
374 starting_line_number => 1,
375 extended_syntax => 0,
378 my %args = ( %defaults, @args );
380 # we are given an object with a get_line() method to supply source lines
381 my $source_object = $args{source_object};
382 my $rOpts = $args{rOpts};
384 # we create another object with a get_line() and peek_ahead() method
385 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
387 # Tokenizer state data is as follows:
388 # _rhere_target_list_ reference to list of here-doc targets
389 # _here_doc_target_ the target string for a here document
390 # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
391 # to determine if interpolation is done
392 # _quote_target_ character we seek if chasing a quote
393 # _line_start_quote_ line where we started looking for a long quote
394 # _in_here_doc_ flag indicating if we are in a here-doc
395 # _in_pod_ flag set if we are in pod documentation
396 # _in_skipped_ flag set if we are in a skipped section
397 # _in_error_ flag set if we saw severe error (binary in script)
398 # _in_data_ flag set if we are in __DATA__ section
399 # _in_end_ flag set if we are in __END__ section
400 # _in_format_ flag set if we are in a format description
401 # _in_attribute_list_ flag telling if we are looking for attributes
402 # _in_quote_ flag telling if we are chasing a quote
403 # _starting_level_ indentation level of first line
404 # _line_buffer_object_ object with get_line() method to supply source code
405 # _diagnostics_object_ place to write debugging information
406 # _unexpected_error_count_ error count used to limit output
407 # _lower_case_labels_at_ line numbers where lower case labels seen
408 # _hit_bug_ program bug detected
411 $self->[_rhere_target_list_] = [];
412 $self->[_in_here_doc_] = 0;
413 $self->[_here_doc_target_] = EMPTY_STRING;
414 $self->[_here_quote_character_] = EMPTY_STRING;
415 $self->[_in_data_] = 0;
416 $self->[_in_end_] = 0;
417 $self->[_in_format_] = 0;
418 $self->[_in_error_] = 0;
419 $self->[_in_pod_] = 0;
420 $self->[_in_skipped_] = 0;
421 $self->[_in_attribute_list_] = 0;
422 $self->[_in_quote_] = 0;
423 $self->[_quote_target_] = EMPTY_STRING;
424 $self->[_line_start_quote_] = -1;
425 $self->[_starting_level_] = $args{starting_level};
426 $self->[_know_starting_level_] = defined( $args{starting_level} );
427 $self->[_tabsize_] = $args{tabsize};
428 $self->[_indent_columns_] = $args{indent_columns};
429 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
430 $self->[_trim_qw_] = $args{trim_qw};
431 $self->[_continuation_indentation_] = $args{continuation_indentation};
432 $self->[_outdent_labels_] = $args{outdent_labels};
433 $self->[_last_line_number_] = $args{starting_line_number} - 1;
434 $self->[_saw_perl_dash_P_] = 0;
435 $self->[_saw_perl_dash_w_] = 0;
436 $self->[_saw_use_strict_] = 0;
437 $self->[_saw_v_string_] = 0;
438 $self->[_hit_bug_] = 0;
439 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
440 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
441 $self->[_saw_autoloader_] = 0;
442 $self->[_saw_selfloader_] = 0;
443 $self->[_saw_hash_bang_] = 0;
444 $self->[_saw_end_] = 0;
445 $self->[_saw_data_] = 0;
446 $self->[_saw_negative_indentation_] = 0;
447 $self->[_started_tokenizing_] = 0;
448 $self->[_line_buffer_object_] = $line_buffer_object;
449 $self->[_debugger_object_] = $args{debugger_object};
450 $self->[_diagnostics_object_] = $args{diagnostics_object};
451 $self->[_logger_object_] = $args{logger_object};
452 $self->[_unexpected_error_count_] = 0;
453 $self->[_started_looking_for_here_target_at_] = 0;
454 $self->[_nearly_matched_here_target_at_] = undef;
455 $self->[_line_of_text_] = EMPTY_STRING;
456 $self->[_rlower_case_labels_at_] = undef;
457 $self->[_extended_syntax_] = $args{extended_syntax};
458 $self->[_maximum_level_] = 0;
459 $self->[_true_brace_error_count_] = 0;
460 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
461 $self->[_rOpts_maximum_unexpected_errors_] =
462 $rOpts->{'maximum-unexpected-errors'};
463 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
464 $self->[_rOpts_] = $rOpts;
466 # These vars are used for guessing indentation and must be positive
467 $self->[_tabsize_] = 8 if ( !$self->[_tabsize_] );
468 $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
472 $tokenizer_self = $self;
474 prepare_for_a_new_file();
475 $self->find_starting_indentation_level();
477 # This is not a full class yet, so die if an attempt is made to
478 # create more than one object.
480 if ( _increment_count() > 1 ) {
482 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
489 # interface to Perl::Tidy::Logger routines
492 my $logger_object = $tokenizer_self->[_logger_object_];
493 if ($logger_object) {
494 $logger_object->warning($msg);
499 sub get_input_stream_name {
500 my $input_stream_name = EMPTY_STRING;
501 my $logger_object = $tokenizer_self->[_logger_object_];
502 if ($logger_object) {
503 $input_stream_name = $logger_object->get_input_stream_name();
505 return $input_stream_name;
510 my $logger_object = $tokenizer_self->[_logger_object_];
511 if ($logger_object) {
512 my $input_line_number = $tokenizer_self->[_last_line_number_] + 1;
513 $msg = "Line $input_line_number: $msg";
514 $logger_object->complain($msg);
517 } ## end sub complain
519 sub write_logfile_entry {
521 my $logger_object = $tokenizer_self->[_logger_object_];
522 if ($logger_object) {
523 $logger_object->write_logfile_entry($msg);
528 sub interrupt_logfile {
529 my $logger_object = $tokenizer_self->[_logger_object_];
530 if ($logger_object) {
531 $logger_object->interrupt_logfile();
537 my $logger_object = $tokenizer_self->[_logger_object_];
538 if ($logger_object) {
539 $logger_object->resume_logfile();
544 sub increment_brace_error {
545 my $logger_object = $tokenizer_self->[_logger_object_];
546 if ($logger_object) {
547 $logger_object->increment_brace_error();
552 sub report_definite_bug {
553 $tokenizer_self->[_hit_bug_] = 1;
554 my $logger_object = $tokenizer_self->[_logger_object_];
555 if ($logger_object) {
556 $logger_object->report_definite_bug();
563 my $logger_object = $tokenizer_self->[_logger_object_];
564 if ($logger_object) {
565 $logger_object->brace_warning($msg);
570 sub get_saw_brace_error {
571 my $logger_object = $tokenizer_self->[_logger_object_];
572 if ($logger_object) {
573 return $logger_object->get_saw_brace_error();
580 sub get_unexpected_error_count {
582 return $self->[_unexpected_error_count_];
585 # interface to Perl::Tidy::Diagnostics routines
586 sub write_diagnostics {
588 if ( $tokenizer_self->[_diagnostics_object_] ) {
589 $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
594 sub get_maximum_level {
595 return $tokenizer_self->[_maximum_level_];
598 sub report_tokenization_errors {
602 # Report any tokenization errors and return a flag '$severe_error'.
603 # Set $severe_error = 1 if the tokenization errors are so severe that
604 # the formatter should not attempt to format the file. Instead, it will
605 # just output the file verbatim.
607 # set severe error flag if tokenizer has encountered file reading problems
608 # (i.e. unexpected binary characters)
609 my $severe_error = $self->[_in_error_];
611 my $maxle = $self->[_rOpts_maximum_level_errors_];
612 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
613 $maxle = 1 unless defined($maxle);
614 $maxue = 0 unless defined($maxue);
616 my $level = get_indentation_level();
617 if ( $level != $tokenizer_self->[_starting_level_] ) {
618 warning("final indentation level: $level\n");
619 my $level_diff = $tokenizer_self->[_starting_level_] - $level;
620 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
622 # Set severe error flag if the level error is greater than 1.
623 # The formatter can function for any level error but it is probably
624 # best not to attempt formatting for a high level error.
625 if ( $maxle >= 0 && $level_diff > $maxle ) {
628 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
633 check_final_nesting_depths();
635 # Likewise, large numbers of brace errors usually indicate non-perl
636 # scripts, so set the severe error flag at a low number. This is similar
637 # to the level check, but different because braces may balance but be
638 # incorrectly interlaced.
639 if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
643 if ( $tokenizer_self->[_look_for_hash_bang_]
644 && !$tokenizer_self->[_saw_hash_bang_] )
647 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
650 if ( $tokenizer_self->[_in_format_] ) {
651 warning("hit EOF while in format description\n");
654 if ( $tokenizer_self->[_in_skipped_] ) {
656 "hit EOF while in lines skipped with --code-skipping\n");
659 if ( $tokenizer_self->[_in_pod_] ) {
661 # Just write log entry if this is after __END__ or __DATA__
662 # because this happens to often, and it is not likely to be
664 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
666 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
672 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
678 if ( $tokenizer_self->[_in_here_doc_] ) {
680 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
681 my $started_looking_for_here_target_at =
682 $tokenizer_self->[_started_looking_for_here_target_at_];
683 if ($here_doc_target) {
685 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
690 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
691 (Perl will match to the end of file but this may not be intended).
694 my $nearly_matched_here_target_at =
695 $tokenizer_self->[_nearly_matched_here_target_at_];
696 if ($nearly_matched_here_target_at) {
698 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
703 # Something is seriously wrong if we ended inside a quote
704 if ( $tokenizer_self->[_in_quote_] ) {
706 my $line_start_quote = $tokenizer_self->[_line_start_quote_];
707 my $quote_target = $tokenizer_self->[_quote_target_];
709 ( $tokenizer_self->[_in_attribute_list_] )
713 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
717 if ( $tokenizer_self->[_hit_bug_] ) {
721 # Multiple "unexpected" type tokenization errors usually indicate parsing
722 # non-perl scripts, or that something is seriously wrong, so we should
723 # avoid formatting them. This can happen for example if we run perltidy on
724 # a shell script or an html file. But unfortunately this check can
725 # interfere with some extended syntaxes, such as RPerl, so it has to be off
727 my $ue_count = $tokenizer_self->[_unexpected_error_count_];
728 if ( $maxue > 0 && $ue_count > $maxue ) {
730 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
735 unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
737 write_logfile_entry("Suggest including '-w parameter'\n");
740 write_logfile_entry("Suggest including 'use warnings;'\n");
744 if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
745 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
748 unless ( $tokenizer_self->[_saw_use_strict_] ) {
749 write_logfile_entry("Suggest including 'use strict;'\n");
752 # it is suggested that labels have at least one upper case character
753 # for legibility and to avoid code breakage as new keywords are introduced
754 if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
755 my @lower_case_labels_at =
756 @{ $tokenizer_self->[_rlower_case_labels_at_] };
758 "Suggest using upper case characters in label(s)\n");
759 local $LIST_SEPARATOR = ')(';
760 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
762 return $severe_error;
763 } ## end sub report_tokenization_errors
765 sub report_v_string {
767 # warn if this version can't handle v-strings
769 unless ( $tokenizer_self->[_saw_v_string_] ) {
770 $tokenizer_self->[_saw_v_string_] =
771 $tokenizer_self->[_last_line_number_];
775 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
779 } ## end sub report_v_string
781 sub is_valid_token_type {
783 return $is_valid_token_type{$type};
786 sub get_input_line_number {
787 return $tokenizer_self->[_last_line_number_];
790 sub log_numbered_msg {
791 my ( $self, $msg ) = @_;
793 # write input line number + message to logfile
794 my $input_line_number = $self->[_last_line_number_];
795 write_logfile_entry("Line $input_line_number: $msg");
799 # returns the next tokenized line
804 # USES GLOBAL VARIABLES:
805 # $brace_depth, $square_bracket_depth, $paren_depth
807 my $input_line = $self->[_line_buffer_object_]->get_line();
808 $self->[_line_of_text_] = $input_line;
810 return unless ($input_line);
812 my $input_line_number = ++$self->[_last_line_number_];
814 # Find and remove what characters terminate this line, including any
816 my $input_line_separator = EMPTY_STRING;
817 if ( chomp($input_line) ) {
818 $input_line_separator = $INPUT_RECORD_SEPARATOR;
821 # The first test here very significantly speeds things up, but be sure to
822 # keep the regex and hash %other_line_endings the same.
823 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
824 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
825 $input_line_separator = $2 . $input_line_separator;
829 # for backwards compatibility we keep the line text terminated with
830 # a newline character
832 $self->[_line_of_text_] = $input_line;
834 # create a data structure describing this line which will be
835 # returned to the caller.
837 # _line_type codes are:
838 # SYSTEM - system-specific code before hash-bang line
839 # CODE - line of perl code (including comments)
840 # POD_START - line starting pod, such as '=head'
841 # POD - pod documentation text
842 # POD_END - last line of pod section, '=cut'
843 # HERE - text of here-document
844 # HERE_END - last line of here-doc (target word)
845 # FORMAT - format section
846 # FORMAT_END - last line of format section, '.'
847 # SKIP - code skipping section
848 # SKIP_END - last line of code skipping section, '#>>V'
849 # DATA_START - __DATA__ line
850 # DATA - unidentified text following __DATA__
851 # END_START - __END__ line
852 # END - unidentified text following __END__
853 # ERROR - we are in big trouble, probably not a perl script
856 # _curly_brace_depth - depth of curly braces at start of line
857 # _square_bracket_depth - depth of square brackets at start of line
858 # _paren_depth - depth of parens at start of line
859 # _starting_in_quote - this line continues a multi-line quote
860 # (so don't trim leading blanks!)
861 # _ending_in_quote - this line ends in a multi-line quote
862 # (so don't trim trailing blanks!)
863 my $line_of_tokens = {
865 _line_text => $input_line,
866 _line_number => $input_line_number,
867 _guessed_indentation_level => 0,
868 _curly_brace_depth => $brace_depth,
869 _square_bracket_depth => $square_bracket_depth,
870 _paren_depth => $paren_depth,
871 _quote_character => EMPTY_STRING,
872 ## Skip these needless initializations for efficiency:
873 ## _rtoken_type => undef,
874 ## _rtokens => undef,
875 ## _rlevels => undef,
876 ## _rblock_type => undef,
877 ## _rtype_sequence => undef,
878 ## _rci_levels => undef,
879 ## _starting_in_quote => 0,
880 ## _ending_in_quote => 0,
883 # must print line unchanged if we are in a here document
884 if ( $self->[_in_here_doc_] ) {
886 $line_of_tokens->{_line_type} = 'HERE';
887 my $here_doc_target = $self->[_here_doc_target_];
888 my $here_quote_character = $self->[_here_quote_character_];
889 my $candidate_target = $input_line;
890 chomp $candidate_target;
892 # Handle <<~ targets, which are indicated here by a leading space on
893 # the here quote character
894 if ( $here_quote_character =~ /^\s/ ) {
895 $candidate_target =~ s/^\s*//;
897 if ( $candidate_target eq $here_doc_target ) {
898 $self->[_nearly_matched_here_target_at_] = undef;
899 $line_of_tokens->{_line_type} = 'HERE_END';
900 $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
902 my $rhere_target_list = $self->[_rhere_target_list_];
903 if ( @{$rhere_target_list} ) { # there can be multiple here targets
904 ( $here_doc_target, $here_quote_character ) =
905 @{ shift @{$rhere_target_list} };
906 $self->[_here_doc_target_] = $here_doc_target;
907 $self->[_here_quote_character_] = $here_quote_character;
908 $self->log_numbered_msg(
909 "Entering HERE document $here_doc_target\n");
910 $self->[_nearly_matched_here_target_at_] = undef;
911 $self->[_started_looking_for_here_target_at_] =
915 $self->[_in_here_doc_] = 0;
916 $self->[_here_doc_target_] = EMPTY_STRING;
917 $self->[_here_quote_character_] = EMPTY_STRING;
921 # check for error of extra whitespace
922 # note for PERL6: leading whitespace is allowed
924 $candidate_target =~ s/\s*$//;
925 $candidate_target =~ s/^\s*//;
926 if ( $candidate_target eq $here_doc_target ) {
927 $self->[_nearly_matched_here_target_at_] = $input_line_number;
930 return $line_of_tokens;
933 # Print line unchanged if we are in a format section
934 elsif ( $self->[_in_format_] ) {
936 if ( $input_line =~ /^\.[\s#]*$/ ) {
938 # Decrement format depth count at a '.' after a 'format'
939 $self->[_in_format_]--;
941 # This is the end when count reaches 0
942 if ( !$self->[_in_format_] ) {
943 $self->log_numbered_msg("Exiting format section\n");
944 $line_of_tokens->{_line_type} = 'FORMAT_END';
948 $line_of_tokens->{_line_type} = 'FORMAT';
949 if ( $input_line =~ /^\s*format\s+\w+/ ) {
951 # Increment format depth count at a 'format' within a 'format'
952 # This is a simple way to handle nested formats (issue c019).
953 $self->[_in_format_]++;
956 return $line_of_tokens;
959 # must print line unchanged if we are in pod documentation
960 elsif ( $self->[_in_pod_] ) {
962 $line_of_tokens->{_line_type} = 'POD';
963 if ( $input_line =~ /^=cut/ ) {
964 $line_of_tokens->{_line_type} = 'POD_END';
965 $self->log_numbered_msg("Exiting POD section\n");
966 $self->[_in_pod_] = 0;
968 if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
970 "Hash-bang in pod can cause older versions of perl to fail! \n"
974 return $line_of_tokens;
977 # print line unchanged if in skipped section
978 elsif ( $self->[_in_skipped_] ) {
980 $line_of_tokens->{_line_type} = 'SKIP';
981 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
982 $line_of_tokens->{_line_type} = 'SKIP_END';
983 $self->log_numbered_msg("Exiting code-skipping section\n");
984 $self->[_in_skipped_] = 0;
986 return $line_of_tokens;
989 # must print line unchanged if we have seen a severe error (i.e., we
990 # are seeing illegal tokens and cannot continue. Syntax errors do
991 # not pass this route). Calling routine can decide what to do, but
992 # the default can be to just pass all lines as if they were after __END__
993 elsif ( $self->[_in_error_] ) {
994 $line_of_tokens->{_line_type} = 'ERROR';
995 return $line_of_tokens;
998 # print line unchanged if we are __DATA__ section
999 elsif ( $self->[_in_data_] ) {
1001 # ...but look for POD
1002 # Note that the _in_data and _in_end flags remain set
1003 # so that we return to that state after seeing the
1004 # end of a pod section
1005 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1006 $line_of_tokens->{_line_type} = 'POD_START';
1007 $self->log_numbered_msg("Entering POD section\n");
1008 $self->[_in_pod_] = 1;
1009 return $line_of_tokens;
1012 $line_of_tokens->{_line_type} = 'DATA';
1013 return $line_of_tokens;
1017 # print line unchanged if we are in __END__ section
1018 elsif ( $self->[_in_end_] ) {
1020 # ...but look for POD
1021 # Note that the _in_data and _in_end flags remain set
1022 # so that we return to that state after seeing the
1023 # end of a pod section
1024 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1025 $line_of_tokens->{_line_type} = 'POD_START';
1026 $self->log_numbered_msg("Entering POD section\n");
1027 $self->[_in_pod_] = 1;
1028 return $line_of_tokens;
1031 $line_of_tokens->{_line_type} = 'END';
1032 return $line_of_tokens;
1036 # check for a hash-bang line if we haven't seen one
1037 if ( !$self->[_saw_hash_bang_] ) {
1038 if ( $input_line =~ /^\#\!.*perl\b/ ) {
1039 $self->[_saw_hash_bang_] = $input_line_number;
1041 # check for -w and -P flags
1042 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1043 $self->[_saw_perl_dash_P_] = 1;
1046 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1047 $self->[_saw_perl_dash_w_] = 1;
1051 $input_line_number > 1
1053 # leave any hash bang in a BEGIN block alone
1054 # i.e. see 'debugger-duck_type.t'
1056 $last_nonblank_block_type
1057 && $last_nonblank_block_type eq 'BEGIN'
1059 && !$self->[_look_for_hash_bang_]
1061 # Try to avoid giving a false alarm at a simple comment.
1062 # These look like valid hash-bang lines:
1066 #!c:\perl\bin\perl.exe
1068 # These are comments:
1070 #! sunos does not yet provide a /usr/bin/perl
1072 # Comments typically have multiple spaces, which suggests
1074 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1078 # this is helpful for VMS systems; we may have accidentally
1079 # tokenized some DCL commands
1080 if ( $self->[_started_tokenizing_] ) {
1082 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1086 complain("Useless hash-bang after line 1\n");
1090 # Report the leading hash-bang as a system line
1091 # This will prevent -dac from deleting it
1093 $line_of_tokens->{_line_type} = 'SYSTEM';
1094 return $line_of_tokens;
1099 # wait for a hash-bang before parsing if the user invoked us with -x
1100 if ( $self->[_look_for_hash_bang_]
1101 && !$self->[_saw_hash_bang_] )
1103 $line_of_tokens->{_line_type} = 'SYSTEM';
1104 return $line_of_tokens;
1107 # a first line of the form ': #' will be marked as SYSTEM
1108 # since lines of this form may be used by tcsh
1109 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1110 $line_of_tokens->{_line_type} = 'SYSTEM';
1111 return $line_of_tokens;
1114 # now we know that it is ok to tokenize the line...
1115 # the line tokenizer will modify any of these private variables:
1116 # _rhere_target_list_
1124 my $ending_in_quote_last = $self->[_in_quote_];
1125 $self->tokenize_this_line($line_of_tokens);
1127 # Now finish defining the return structure and return it
1128 $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1130 # handle severe error (binary data in script)
1131 if ( $self->[_in_error_] ) {
1132 $self->[_in_quote_] = 0; # to avoid any more messages
1133 warning("Giving up after error\n");
1134 $line_of_tokens->{_line_type} = 'ERROR';
1135 reset_indentation_level(0); # avoid error messages
1136 return $line_of_tokens;
1139 # handle start of pod documentation
1140 if ( $self->[_in_pod_] ) {
1142 # This gets tricky..above a __DATA__ or __END__ section, perl
1143 # accepts '=cut' as the start of pod section. But afterwards,
1144 # only pod utilities see it and they may ignore an =cut without
1145 # leading =head. In any case, this isn't good.
1146 if ( $input_line =~ /^=cut\b/ ) {
1147 if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1148 complain("=cut while not in pod ignored\n");
1149 $self->[_in_pod_] = 0;
1150 $line_of_tokens->{_line_type} = 'POD_END';
1153 $line_of_tokens->{_line_type} = 'POD_START';
1155 "=cut starts a pod section .. this can fool pod utilities.\n"
1156 ) unless (DEVEL_MODE);
1157 $self->log_numbered_msg("Entering POD section\n");
1162 $line_of_tokens->{_line_type} = 'POD_START';
1163 $self->log_numbered_msg("Entering POD section\n");
1166 return $line_of_tokens;
1169 # handle start of skipped section
1170 if ( $self->[_in_skipped_] ) {
1172 $line_of_tokens->{_line_type} = 'SKIP';
1173 $self->log_numbered_msg("Entering code-skipping section\n");
1174 return $line_of_tokens;
1177 # see if this line contains here doc targets
1178 my $rhere_target_list = $self->[_rhere_target_list_];
1179 if ( @{$rhere_target_list} ) {
1181 my ( $here_doc_target, $here_quote_character ) =
1182 @{ shift @{$rhere_target_list} };
1183 $self->[_in_here_doc_] = 1;
1184 $self->[_here_doc_target_] = $here_doc_target;
1185 $self->[_here_quote_character_] = $here_quote_character;
1186 $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1187 $self->[_started_looking_for_here_target_at_] = $input_line_number;
1190 # NOTE: __END__ and __DATA__ statements are written unformatted
1191 # because they can theoretically contain additional characters
1192 # which are not tokenized (and cannot be read with <DATA> either!).
1193 if ( $self->[_in_data_] ) {
1194 $line_of_tokens->{_line_type} = 'DATA_START';
1195 $self->log_numbered_msg("Starting __DATA__ section\n");
1196 $self->[_saw_data_] = 1;
1198 # keep parsing after __DATA__ if use SelfLoader was seen
1199 if ( $self->[_saw_selfloader_] ) {
1200 $self->[_in_data_] = 0;
1201 $self->log_numbered_msg(
1202 "SelfLoader seen, continuing; -nlsl deactivates\n");
1205 return $line_of_tokens;
1208 elsif ( $self->[_in_end_] ) {
1209 $line_of_tokens->{_line_type} = 'END_START';
1210 $self->log_numbered_msg("Starting __END__ section\n");
1211 $self->[_saw_end_] = 1;
1213 # keep parsing after __END__ if use AutoLoader was seen
1214 if ( $self->[_saw_autoloader_] ) {
1215 $self->[_in_end_] = 0;
1216 $self->log_numbered_msg(
1217 "AutoLoader seen, continuing; -nlal deactivates\n");
1219 return $line_of_tokens;
1222 # now, finally, we know that this line is type 'CODE'
1223 $line_of_tokens->{_line_type} = 'CODE';
1225 # remember if we have seen any real code
1226 if ( !$self->[_started_tokenizing_]
1227 && $input_line !~ /^\s*$/
1228 && $input_line !~ /^\s*#/ )
1230 $self->[_started_tokenizing_] = 1;
1233 if ( $self->[_debugger_object_] ) {
1234 $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1237 # Note: if keyword 'format' occurs in this line code, it is still CODE
1238 # (keyword 'format' need not start a line)
1239 if ( $self->[_in_format_] ) {
1240 $self->log_numbered_msg("Entering format section\n");
1243 if ( $self->[_in_quote_]
1244 and ( $self->[_line_start_quote_] < 0 ) )
1247 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1248 if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1249 $self->[_line_start_quote_] = $input_line_number;
1250 $self->log_numbered_msg(
1251 "Start multi-line quote or pattern ending in $quote_target\n");
1254 elsif ( ( $self->[_line_start_quote_] >= 0 )
1255 && !$self->[_in_quote_] )
1257 $self->[_line_start_quote_] = -1;
1258 $self->log_numbered_msg("End of multi-line quote or pattern\n");
1261 # we are returning a line of CODE
1262 return $line_of_tokens;
1263 } ## end sub get_line
1265 sub find_starting_indentation_level {
1267 # We need to find the indentation level of the first line of the
1268 # script being formatted. Often it will be zero for an entire file,
1269 # but if we are formatting a local block of code (within an editor for
1270 # example) it may not be zero. The user may specify this with the
1271 # -sil=n parameter but normally doesn't so we have to guess.
1274 my $starting_level = 0;
1276 # use value if given as parameter
1277 if ( $self->[_know_starting_level_] ) {
1278 $starting_level = $self->[_starting_level_];
1281 # if we know there is a hash_bang line, the level must be zero
1282 elsif ( $self->[_look_for_hash_bang_] ) {
1283 $self->[_know_starting_level_] = 1;
1286 # otherwise figure it out from the input file
1291 # keep looking at lines until we find a hash bang or piece of code
1292 my $msg = EMPTY_STRING;
1293 while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
1295 # if first line is #! then assume starting level is zero
1296 if ( $i == 1 && $line =~ /^\#\!/ ) {
1297 $starting_level = 0;
1300 next if ( $line =~ /^\s*#/ ); # skip past comments
1301 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1302 $starting_level = guess_old_indentation_level($line);
1305 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1306 write_logfile_entry("$msg");
1308 $self->[_starting_level_] = $starting_level;
1309 reset_indentation_level($starting_level);
1311 } ## end sub find_starting_indentation_level
1313 sub guess_old_indentation_level {
1316 # Guess the indentation level of an input line.
1318 # For the first line of code this result will define the starting
1319 # indentation level. It will mainly be non-zero when perltidy is applied
1320 # within an editor to a local block of code.
1322 # This is an impossible task in general because we can't know what tabs
1323 # meant for the old script and how many spaces were used for one
1324 # indentation level in the given input script. For example it may have
1325 # been previously formatted with -i=7 -et=3. But we can at least try to
1326 # make sure that perltidy guesses correctly if it is applied repeatedly to
1327 # a block of code within an editor, so that the block stays at the same
1328 # level when perltidy is applied repeatedly.
1330 # USES GLOBAL VARIABLES: $tokenizer_self
1333 # find leading tabs, spaces, and any statement label
1335 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1337 # If there are leading tabs, we use the tab scheme for this run, if
1338 # any, so that the code will remain stable when editing.
1339 if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1341 if ($2) { $spaces += length($2) }
1343 # correct for outdented labels
1344 if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1345 $spaces += $tokenizer_self->[_continuation_indentation_];
1349 # compute indentation using the value of -i for this run.
1350 # If -i=0 is used for this run (which is possible) it doesn't matter
1351 # what we do here but we'll guess that the old run used 4 spaces per level.
1352 my $indent_columns = $tokenizer_self->[_indent_columns_];
1353 $indent_columns = 4 if ( !$indent_columns );
1354 $level = int( $spaces / $indent_columns );
1356 } ## end sub guess_old_indentation_level
1358 # This is a currently unused debug routine
1359 sub dump_functions {
1362 foreach my $pkg ( keys %is_user_function ) {
1363 $fh->print("\nnon-constant subs in package $pkg\n");
1365 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1366 my $msg = EMPTY_STRING;
1367 if ( $is_block_list_function{$pkg}{$sub} ) {
1368 $msg = 'block_list';
1371 if ( $is_block_function{$pkg}{$sub} ) {
1374 $fh->print("$sub $msg\n");
1378 foreach my $pkg ( keys %is_constant ) {
1379 $fh->print("\nconstants and constant subs in package $pkg\n");
1381 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1382 $fh->print("$sub\n");
1386 } ## end sub dump_functions
1388 sub prepare_for_a_new_file {
1390 # previous tokens needed to determine what to expect next
1391 $last_nonblank_token = ';'; # the only possible starting state which
1392 $last_nonblank_type = ';'; # will make a leading brace a code block
1393 $last_nonblank_block_type = EMPTY_STRING;
1395 # scalars for remembering statement types across multiple lines
1396 $statement_type = EMPTY_STRING; # '' or 'use' or 'sub..' or 'case..'
1397 $in_attribute_list = 0;
1399 # scalars for remembering where we are in the file
1400 $current_package = "main";
1401 $context = UNKNOWN_CONTEXT;
1403 # hashes used to remember function information
1404 %is_constant = (); # user-defined constants
1405 %is_user_function = (); # user-defined functions
1406 %user_function_prototype = (); # their prototypes
1407 %is_block_function = ();
1408 %is_block_list_function = ();
1409 %saw_function_definition = ();
1410 %saw_use_module = ();
1412 # variables used to track depths of various containers
1413 # and report nesting errors
1416 $square_bracket_depth = 0;
1417 @current_depth = (0) x scalar @closing_brace_names;
1420 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1421 @current_sequence_number = ();
1422 $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
1425 @paren_semicolon_count = ();
1426 @paren_structural_type = ();
1428 @brace_structural_type = ();
1429 @brace_context = ();
1430 @brace_package = ();
1431 @square_bracket_type = ();
1432 @square_bracket_structural_type = ();
1434 @nested_ternary_flag = ();
1435 @nested_statement_type = ();
1436 @starting_line_of_current_depth = ();
1438 $paren_type[$paren_depth] = EMPTY_STRING;
1439 $paren_semicolon_count[$paren_depth] = 0;
1440 $paren_structural_type[$brace_depth] = EMPTY_STRING;
1441 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1442 $brace_structural_type[$brace_depth] = EMPTY_STRING;
1443 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1444 $brace_package[$paren_depth] = $current_package;
1445 $square_bracket_type[$square_bracket_depth] = EMPTY_STRING;
1446 $square_bracket_structural_type[$square_bracket_depth] = EMPTY_STRING;
1448 initialize_tokenizer_state();
1450 } ## end sub prepare_for_a_new_file
1452 { ## closure for sub tokenize_this_line
1454 use constant BRACE => 0;
1455 use constant SQUARE_BRACKET => 1;
1456 use constant PAREN => 2;
1457 use constant QUESTION_COLON => 3;
1459 # TV1: scalars for processing one LINE.
1460 # Re-initialized on each entry to sub tokenize_this_line.
1462 $block_type, $container_type, $expecting,
1463 $i, $i_tok, $input_line,
1464 $input_line_number, $last_nonblank_i, $max_token_index,
1465 $next_tok, $next_type, $peeked_ahead,
1466 $prototype, $rhere_target_list, $rtoken_map,
1467 $rtoken_type, $rtokens, $tok,
1468 $type, $type_sequence, $indent_flag,
1471 # TV2: refs to ARRAYS for processing one LINE
1472 # Re-initialized on each call.
1473 my $routput_token_list = []; # stack of output token indexes
1474 my $routput_token_type = []; # token types
1475 my $routput_block_type = []; # types of code block
1476 my $routput_container_type = []; # paren types, such as if, elsif, ..
1477 my $routput_type_sequence = []; # nesting sequential number
1478 my $routput_indent_flag = []; #
1480 # TV3: SCALARS for quote variables. These are initialized with a
1481 # subroutine call and continually updated as lines are processed.
1482 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1483 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1485 # TV4: SCALARS for multi-line identifiers and
1486 # statements. These are initialized with a subroutine call
1487 # and continually updated as lines are processed.
1488 my ( $id_scan_state, $identifier, $want_paren );
1490 # TV5: SCALARS for tracking indentation level.
1491 # Initialized once and continually updated as lines are
1494 $nesting_token_string, $nesting_type_string,
1495 $nesting_block_string, $nesting_block_flag,
1496 $nesting_list_string, $nesting_list_flag,
1497 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1498 $in_statement_continuation, $level_in_tokenizer,
1499 $slevel_in_tokenizer, $rslevel_stack,
1502 # TV6: SCALARS for remembering several previous
1503 # tokens. Initialized once and continually updated as
1504 # lines are processed.
1506 $last_nonblank_container_type, $last_nonblank_type_sequence,
1507 $last_last_nonblank_token, $last_last_nonblank_type,
1508 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1509 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1512 # ----------------------------------------------------------------
1513 # beginning of tokenizer variable access and manipulation routines
1514 # ----------------------------------------------------------------
1516 sub initialize_tokenizer_state {
1518 # TV1: initialized on each call
1519 # TV2: initialized on each call
1523 $quote_character = EMPTY_STRING;
1526 $quoted_string_1 = EMPTY_STRING;
1527 $quoted_string_2 = EMPTY_STRING;
1528 $allowed_quote_modifiers = EMPTY_STRING;
1531 $id_scan_state = EMPTY_STRING;
1532 $identifier = EMPTY_STRING;
1533 $want_paren = EMPTY_STRING;
1536 $nesting_token_string = EMPTY_STRING;
1537 $nesting_type_string = EMPTY_STRING;
1538 $nesting_block_string = '1'; # initially in a block
1539 $nesting_block_flag = 1;
1540 $nesting_list_string = '0'; # initially not in a list
1541 $nesting_list_flag = 0; # initially not in a list
1542 $ci_string_in_tokenizer = EMPTY_STRING;
1543 $continuation_string_in_tokenizer = "0";
1544 $in_statement_continuation = 0;
1545 $level_in_tokenizer = 0;
1546 $slevel_in_tokenizer = 0;
1547 $rslevel_stack = [];
1550 $last_nonblank_container_type = EMPTY_STRING;
1551 $last_nonblank_type_sequence = EMPTY_STRING;
1552 $last_last_nonblank_token = ';';
1553 $last_last_nonblank_type = ';';
1554 $last_last_nonblank_block_type = EMPTY_STRING;
1555 $last_last_nonblank_container_type = EMPTY_STRING;
1556 $last_last_nonblank_type_sequence = EMPTY_STRING;
1557 $last_nonblank_prototype = EMPTY_STRING;
1559 } ## end sub initialize_tokenizer_state
1561 sub save_tokenizer_state {
1564 $block_type, $container_type, $expecting,
1565 $i, $i_tok, $input_line,
1566 $input_line_number, $last_nonblank_i, $max_token_index,
1567 $next_tok, $next_type, $peeked_ahead,
1568 $prototype, $rhere_target_list, $rtoken_map,
1569 $rtoken_type, $rtokens, $tok,
1570 $type, $type_sequence, $indent_flag,
1574 $routput_token_list, $routput_token_type,
1575 $routput_block_type, $routput_container_type,
1576 $routput_type_sequence, $routput_indent_flag,
1580 $in_quote, $quote_type,
1581 $quote_character, $quote_pos,
1582 $quote_depth, $quoted_string_1,
1583 $quoted_string_2, $allowed_quote_modifiers,
1586 my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1589 $nesting_token_string, $nesting_type_string,
1590 $nesting_block_string, $nesting_block_flag,
1591 $nesting_list_string, $nesting_list_flag,
1592 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1593 $in_statement_continuation, $level_in_tokenizer,
1594 $slevel_in_tokenizer, $rslevel_stack,
1598 $last_nonblank_container_type,
1599 $last_nonblank_type_sequence,
1600 $last_last_nonblank_token,
1601 $last_last_nonblank_type,
1602 $last_last_nonblank_block_type,
1603 $last_last_nonblank_container_type,
1604 $last_last_nonblank_type_sequence,
1605 $last_nonblank_prototype,
1607 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1608 } ## end sub save_tokenizer_state
1610 sub restore_tokenizer_state {
1612 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1614 $block_type, $container_type, $expecting,
1615 $i, $i_tok, $input_line,
1616 $input_line_number, $last_nonblank_i, $max_token_index,
1617 $next_tok, $next_type, $peeked_ahead,
1618 $prototype, $rhere_target_list, $rtoken_map,
1619 $rtoken_type, $rtokens, $tok,
1620 $type, $type_sequence, $indent_flag,
1624 $routput_token_list, $routput_token_type,
1625 $routput_block_type, $routput_container_type,
1626 $routput_type_sequence, $routput_indent_flag,
1630 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1631 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1634 ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
1637 $nesting_token_string, $nesting_type_string,
1638 $nesting_block_string, $nesting_block_flag,
1639 $nesting_list_string, $nesting_list_flag,
1640 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1641 $in_statement_continuation, $level_in_tokenizer,
1642 $slevel_in_tokenizer, $rslevel_stack,
1646 $last_nonblank_container_type,
1647 $last_nonblank_type_sequence,
1648 $last_last_nonblank_token,
1649 $last_last_nonblank_type,
1650 $last_last_nonblank_block_type,
1651 $last_last_nonblank_container_type,
1652 $last_last_nonblank_type_sequence,
1653 $last_nonblank_prototype,
1656 } ## end sub restore_tokenizer_state
1658 sub split_pretoken {
1662 # Split the leading $numc characters from the current token (at index=$i)
1663 # which is pre-type 'w' and insert the remainder back into the pretoken
1664 # stream with appropriate settings. Since we are splitting a pre-type 'w',
1665 # there are three cases, depending on if the remainder starts with a digit:
1666 # Case 1: remainder is type 'd', all digits
1667 # Case 2: remainder is type 'd' and type 'w': digits and other characters
1668 # Case 3: remainder is type 'w'
1670 # Examples, for $numc=1:
1671 # $tok => $tok_0 $tok_1 $tok_2
1672 # 'x10' => 'x' '10' # case 1
1673 # 'x10if' => 'x' '10' 'if' # case 2
1674 # '0ne => 'O' 'ne' # case 3
1677 # $tok_1 is a possible string of digits (pre-type 'd')
1678 # $tok_2 is a possible word (pre-type 'w')
1680 # return 1 if successful
1681 # return undef if error (shouldn't happen)
1683 # Calling routine should update '$type' and '$tok' if successful.
1685 my $pretoken = $rtokens->[$i];
1687 && length($pretoken) > $numc
1688 && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1691 # Split $tok into up to 3 tokens:
1692 my $tok_0 = substr( $pretoken, 0, $numc );
1693 my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1694 my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1696 my $len_0 = length($tok_0);
1697 my $len_1 = length($tok_1);
1698 my $len_2 = length($tok_2);
1700 my $pre_type_0 = 'w';
1701 my $pre_type_1 = 'd';
1702 my $pre_type_2 = 'w';
1704 my $pos_0 = $rtoken_map->[$i];
1705 my $pos_1 = $pos_0 + $len_0;
1706 my $pos_2 = $pos_1 + $len_1;
1708 my $isplice = $i + 1;
1710 # Splice in any digits
1712 splice @{$rtoken_map}, $isplice, 0, $pos_1;
1713 splice @{$rtokens}, $isplice, 0, $tok_1;
1714 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1719 # Splice in any trailing word
1721 splice @{$rtoken_map}, $isplice, 0, $pos_2;
1722 splice @{$rtokens}, $isplice, 0, $tok_2;
1723 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1727 $rtokens->[$i] = $tok_0;
1732 # Shouldn't get here
1735 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1740 } ## end sub split_pretoken
1742 sub get_indentation_level {
1743 return $level_in_tokenizer;
1746 sub reset_indentation_level {
1747 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1748 push @{$rslevel_stack}, $slevel_in_tokenizer;
1754 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1755 return $peeked_ahead;
1758 # ------------------------------------------------------------
1759 # end of tokenizer variable access and manipulation routines
1760 # ------------------------------------------------------------
1762 #------------------------------
1763 # beginning of tokenizer hashes
1764 #------------------------------
1766 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1768 # These block types terminate statements and do not need a trailing
1770 # patched for SWITCH/CASE/
1771 my %is_zero_continuation_block_type;
1773 @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1774 if elsif else unless while until for foreach switch case given when);
1775 @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1777 my %is_logical_container;
1778 @q = qw(if elsif unless while and or err not && ! || for foreach);
1779 @is_logical_container{@q} = (1) x scalar(@q);
1783 @is_binary_type{@q} = (1) x scalar(@q);
1785 my %is_binary_keyword;
1786 @q = qw(and or err eq ne cmp);
1787 @is_binary_keyword{@q} = (1) x scalar(@q);
1789 # 'L' is token for opening { at hash key
1790 my %is_opening_type;
1792 @is_opening_type{@q} = (1) x scalar(@q);
1794 # 'R' is token for closing } at hash key
1795 my %is_closing_type;
1797 @is_closing_type{@q} = (1) x scalar(@q);
1799 my %is_redo_last_next_goto;
1800 @q = qw(redo last next goto);
1801 @is_redo_last_next_goto{@q} = (1) x scalar(@q);
1804 @q = qw(use require);
1805 @is_use_require{@q} = (1) x scalar(@q);
1807 # This hash holds the array index in $tokenizer_self for these keywords:
1808 # Fix for issue c035: removed 'format' from this hash
1810 '__END__' => _in_end_,
1811 '__DATA__' => _in_data_,
1814 my %is_list_end_type;
1817 @is_list_end_type{@q} = (1) x scalar(@q);
1819 # original ref: camel 3 p 147,
1820 # but perl may accept undocumented flags
1821 # perl 5.10 adds 'p' (preserve)
1822 # Perl version 5.22 added 'n'
1823 # From http://perldoc.perl.org/perlop.html we have
1824 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
1825 # s/PATTERN/REPLACEMENT/msixpodualngcer
1826 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
1827 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
1828 # qr/STRING/msixpodualn
1829 my %quote_modifiers = (
1830 's' => '[msixpodualngcer]',
1833 'm' => '[msixpodualngc]',
1834 'qr' => '[msixpodualn]',
1835 'q' => EMPTY_STRING,
1836 'qq' => EMPTY_STRING,
1837 'qw' => EMPTY_STRING,
1838 'qx' => EMPTY_STRING,
1841 # table showing how many quoted things to look for after quote operator..
1842 # s, y, tr have 2 (pattern and replacement)
1843 # others have 1 (pattern only)
1857 @q = qw(for foreach);
1858 @is_for_foreach{@q} = (1) x scalar(@q);
1860 my %is_my_our_state;
1861 @q = qw(my our state);
1862 @is_my_our_state{@q} = (1) x scalar(@q);
1864 # These keywords may introduce blocks after parenthesized expressions,
1866 # keyword ( .... ) { BLOCK }
1867 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1868 my %is_blocktype_with_paren;
1870 qw(if elsif unless while until for foreach switch case given when catch);
1871 @is_blocktype_with_paren{@q} = (1) x scalar(@q);
1873 my %is_case_default;
1874 @q = qw(case default);
1875 @is_case_default{@q} = (1) x scalar(@q);
1877 #------------------------
1878 # end of tokenizer hashes
1879 #------------------------
1881 # ------------------------------------------------------------
1882 # beginning of various scanner interface routines
1883 # ------------------------------------------------------------
1884 sub scan_replacement_text {
1886 # check for here-docs in replacement text invoked by
1887 # a substitution operator with executable modifier 'e'.
1892 # $rht = reference to any here-doc targets
1893 my ($replacement_text) = @_;
1896 return unless ( $replacement_text =~ /<</ );
1898 write_logfile_entry("scanning replacement text for here-doc targets\n");
1900 # save the logger object for error messages
1901 my $logger_object = $tokenizer_self->[_logger_object_];
1903 # localize all package variables
1905 $tokenizer_self, $last_nonblank_token,
1906 $last_nonblank_type, $last_nonblank_block_type,
1907 $statement_type, $in_attribute_list,
1908 $current_package, $context,
1909 %is_constant, %is_user_function,
1910 %user_function_prototype, %is_block_function,
1911 %is_block_list_function, %saw_function_definition,
1912 $brace_depth, $paren_depth,
1913 $square_bracket_depth, @current_depth,
1914 @total_depth, $total_depth,
1915 @nesting_sequence_number, @current_sequence_number,
1916 @paren_type, @paren_semicolon_count,
1917 @paren_structural_type, @brace_type,
1918 @brace_structural_type, @brace_context,
1919 @brace_package, @square_bracket_type,
1920 @square_bracket_structural_type, @depth_array,
1921 @starting_line_of_current_depth, @nested_ternary_flag,
1922 @nested_statement_type, $next_sequence_number,
1925 # save all lexical variables
1926 my $rstate = save_tokenizer_state();
1927 _decrement_count(); # avoid error check for multiple tokenizers
1929 # make a new tokenizer
1931 my $source_object = Perl::Tidy::LineSource->new(
1932 input_file => \$replacement_text,
1935 my $tokenizer = Perl::Tidy::Tokenizer->new(
1936 source_object => $source_object,
1937 logger_object => $logger_object,
1938 starting_line_number => $input_line_number,
1941 # scan the replacement text
1942 1 while ( $tokenizer->get_line() );
1944 # remove any here doc targets
1946 if ( $tokenizer_self->[_in_here_doc_] ) {
1950 $tokenizer_self->[_here_doc_target_],
1951 $tokenizer_self->[_here_quote_character_]
1953 if ( $tokenizer_self->[_rhere_target_list_] ) {
1954 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1955 $tokenizer_self->[_rhere_target_list_] = undef;
1957 $tokenizer_self->[_in_here_doc_] = undef;
1960 # now its safe to report errors
1961 my $severe_error = $tokenizer->report_tokenization_errors();
1963 # TODO: Could propagate a severe error up
1965 # restore all tokenizer lexical variables
1966 restore_tokenizer_state($rstate);
1968 # return the here doc targets
1970 } ## end sub scan_replacement_text
1972 sub scan_bare_identifier {
1973 ( $i, $tok, $type, $prototype ) =
1974 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1975 $rtoken_map, $max_token_index );
1979 sub scan_identifier {
1981 $i, $tok, $type, $id_scan_state, $identifier,
1982 my $split_pretoken_flag
1984 = scan_complex_identifier( $i, $id_scan_state, $identifier, $rtokens,
1985 $max_token_index, $expecting, $paren_type[$paren_depth] );
1987 # Check for signal to fix a special variable adjacent to a keyword,
1988 # such as '$^One$0'.
1989 if ($split_pretoken_flag) {
1991 # Try to fix it by splitting the pretoken
1993 && $rtokens->[ $i - 1 ] eq '^'
1994 && split_pretoken(1) )
1996 $identifier = substr( $identifier, 0, 3 );
2001 # This shouldn't happen ...
2002 my $var = substr( $tok, 0, 3 );
2003 my $excess = substr( $tok, 3 );
2004 interrupt_logfile();
2006 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2007 A space may be needed after '$var'.
2013 } ## end sub scan_identifier
2015 use constant VERIFY_FASTSCAN => 0;
2016 my %fast_scan_context;
2019 %fast_scan_context = (
2020 '$' => SCALAR_CONTEXT,
2021 '*' => SCALAR_CONTEXT,
2022 '@' => LIST_CONTEXT,
2023 '%' => LIST_CONTEXT,
2024 '&' => UNKNOWN_CONTEXT,
2028 sub scan_simple_identifier {
2030 # This is a wrapper for sub scan_identifier. It does a fast preliminary
2031 # scan for certain common identifiers:
2032 # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2033 # If it does not find one of these, or this is a restart, it calls the
2034 # original scanner directly.
2036 # This gives the same results as the full scanner in about 1/4 the
2037 # total runtime for a typical input stream.
2043 # || ---- $i_next [= next nonblank pretoken ]
2044 # |----$i_plus_1 [= a bareword ]
2045 # ---$i_begin [= a sigil]
2048 my $tok_begin = $tok;
2049 my $i_plus_1 = $i + 1;
2052 #-------------------------------------------------------
2053 # Do full scan for anything following a pointer, such as
2054 # $cref->&*; # a postderef
2055 #-------------------------------------------------------
2056 if ( $last_nonblank_token eq '->' ) {
2060 #------------------------------
2061 # quick scan with leading sigil
2062 #------------------------------
2063 elsif ( !$id_scan_state
2064 && $i_plus_1 <= $max_token_index
2065 && $fast_scan_context{$tok} )
2067 $context = $fast_scan_context{$tok};
2069 # look for $var, @var, ...
2070 if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
2071 my $pretype_next = EMPTY_STRING;
2072 if ( $i_plus_1 < $max_token_index ) {
2073 my $i_next = $i_plus_1 + 1;
2074 if ( $rtoken_type->[$i_next] eq 'b'
2075 && $i_next < $max_token_index )
2079 $pretype_next = $rtoken_type->[$i_next];
2081 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2083 # Found type 'i' like '$var', '@var', or '%var'
2084 $identifier = $tok . $rtokens->[$i_plus_1];
2088 $fast_scan_type = $type;
2092 # Look for @{ or %{ .
2093 # But we must let the full scanner handle things ${ because it may
2094 # keep going to get a complete identifier like '${#}' .
2096 $rtoken_type->[$i_plus_1] eq '{'
2097 && ( $tok_begin eq '@'
2098 || $tok_begin eq '%' )
2104 $fast_scan_type = $type;
2108 #---------------------------
2109 # Quick scan with leading ->
2110 # Look for ->[ and ->{
2111 #---------------------------
2114 && $i < $max_token_index
2115 && ( $rtokens->[$i_plus_1] eq '{'
2116 || $rtokens->[$i_plus_1] eq '[' )
2120 $fast_scan_type = $type;
2122 $context = UNKNOWN_CONTEXT;
2125 #--------------------------------------
2126 # Verify correctness during development
2127 #--------------------------------------
2128 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2130 # We will call the full method
2131 my $identifier_simple = $identifier;
2132 my $tok_simple = $tok;
2134 my $context_simple = $context;
2140 if ( $tok ne $tok_simple
2141 || $type ne $fast_scan_type
2143 || $identifier ne $identifier_simple
2145 || $context ne $context_simple )
2148 scan_simple_identifier differs from scan_identifier:
2149 simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2150 full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2155 #-------------------------------------------------
2156 # call full scanner if fast method did not succeed
2157 #-------------------------------------------------
2158 if ( !$fast_scan_type ) {
2162 } ## end sub scan_simple_identifier
2165 ( $i, $tok, $type, $id_scan_state ) =
2166 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2167 $id_scan_state, $max_token_index );
2173 ( $i, $type, $number ) =
2174 scan_number_do( $input_line, $i, $rtoken_map, $type,
2179 use constant VERIFY_FASTNUM => 0;
2181 sub scan_number_fast {
2183 # This is a wrapper for sub scan_number. It does a fast preliminary
2184 # scan for a simple integer. It calls the original scan_number if it
2185 # does not find one.
2188 my $tok_begin = $tok;
2191 #---------------------------------
2192 # Quick check for (signed) integer
2193 #---------------------------------
2195 # This will be the string of digits:
2198 my $typ_d = $rtoken_type->[$i_d];
2200 # check for signed integer
2201 my $sign = EMPTY_STRING;
2203 && ( $typ_d eq '+' || $typ_d eq '-' )
2204 && $i_d < $max_token_index )
2208 $tok_d = $rtokens->[$i_d];
2209 $typ_d = $rtoken_type->[$i_d];
2216 $i_d == $max_token_index
2217 || ( $i_d < $max_token_index
2218 && $rtoken_type->[ $i_d + 1 ] ne '.'
2219 && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2223 # Let let full scanner handle multi-digit integers beginning with
2224 # '0' because there could be error messages. For example, '009' is
2225 # not a valid number.
2227 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2228 $number = $sign . $tok_d;
2234 #--------------------------------------
2235 # Verify correctness during development
2236 #--------------------------------------
2237 if ( VERIFY_FASTNUM && defined($number) ) {
2239 # We will call the full method
2240 my $type_simple = $type;
2242 my $number_simple = $number;
2246 $number = scan_number();
2248 if ( $type ne $type_simple
2249 || ( $i != $i_simple && $i <= $max_token_index )
2250 || $number ne $number_simple )
2253 scan_number_fast differs from scan_number:
2254 simple: i=$i_simple, type=$type_simple, number=$number_simple
2255 full: i=$i, type=$type, number=$number
2260 #----------------------------------------
2261 # call full scanner if may not be integer
2262 #----------------------------------------
2263 if ( !defined($number) ) {
2264 $number = scan_number();
2267 } ## end sub scan_number_fast
2269 # a sub to warn if token found where term expected
2270 sub error_if_expecting_TERM {
2271 if ( $expecting == TERM ) {
2272 if ( $really_want_term{$last_nonblank_type} ) {
2273 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2274 $rtoken_map, $rtoken_type, $input_line );
2279 } ## end sub error_if_expecting_TERM
2281 # a sub to warn if token found where operator expected
2282 sub error_if_expecting_OPERATOR {
2284 if ( $expecting == OPERATOR ) {
2285 if ( !defined($thing) ) { $thing = $tok }
2286 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2287 $rtoken_map, $rtoken_type, $input_line );
2288 if ( $i_tok == 0 ) {
2289 interrupt_logfile();
2290 warning("Missing ';' or ',' above?\n");
2296 } ## end sub error_if_expecting_OPERATOR
2298 # ------------------------------------------------------------
2299 # end scanner interfaces
2300 # ------------------------------------------------------------
2305 sub do_GREATER_THAN_SIGN {
2308 error_if_expecting_TERM()
2309 if ( $expecting == TERM );
2313 sub do_VERTICAL_LINE {
2316 error_if_expecting_TERM()
2317 if ( $expecting == TERM );
2321 sub do_DOLLAR_SIGN {
2324 # start looking for a scalar
2325 error_if_expecting_OPERATOR("Scalar")
2326 if ( $expecting == OPERATOR );
2327 scan_simple_identifier();
2329 if ( $identifier eq '$^W' ) {
2330 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2333 # Check for identifier in indirect object slot
2334 # (vorboard.pl, sort.t). Something like:
2335 # /^(print|printf|sort|exec|system)$/
2337 $is_indirect_object_taker{$last_nonblank_token}
2338 && $last_nonblank_type eq 'k'
2339 || ( ( $last_nonblank_token eq '(' )
2340 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2341 || ( $last_nonblank_type eq 'w'
2342 || $last_nonblank_type eq 'U' ) # possible object
2346 # An identifier followed by '->' is not indirect object;
2347 # fixes b1175, b1176
2348 my ( $next_nonblank_type, $i_next ) =
2349 find_next_noncomment_type( $i, $rtokens, $max_token_index );
2350 $type = 'Z' if ( $next_nonblank_type ne '->' );
2353 } ## end sub do_DOLLAR_SIGN
2355 sub do_LEFT_PARENTHESIS {
2359 $paren_semicolon_count[$paren_depth] = 0;
2361 $container_type = $want_paren;
2362 $want_paren = EMPTY_STRING;
2364 elsif ( $statement_type =~ /^sub\b/ ) {
2365 $container_type = $statement_type;
2368 $container_type = $last_nonblank_token;
2370 # We can check for a syntax error here of unexpected '(',
2371 # but this is going to get messy...
2373 $expecting == OPERATOR
2375 # Be sure this is not a method call of the form
2376 # &method(...), $method->(..), &{method}(...),
2377 # $ref[2](list) is ok & short for $ref[2]->(list)
2378 # NOTE: at present, braces in something like &{ xxx }
2379 # are not marked as a block, we might have a method call.
2380 # Added ')' to fix case c017, something like ()()()
2381 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2385 # ref: camel 3 p 703.
2386 if ( $last_last_nonblank_token eq 'do' ) {
2388 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2393 # if this is an empty list, (), then it is not an
2394 # error; for example, we might have a constant pi and
2395 # invoke it with pi() or just pi;
2396 my ( $next_nonblank_token, $i_next ) =
2397 find_next_nonblank_token( $i, $rtokens,
2400 # Patch for c029: give up error check if
2401 # a side comment follows
2402 if ( $next_nonblank_token ne ')'
2403 && $next_nonblank_token ne '#' )
2407 error_if_expecting_OPERATOR('(');
2409 if ( $last_nonblank_type eq 'C' ) {
2411 "$last_nonblank_token has a void prototype\n";
2413 elsif ( $last_nonblank_type eq 'i' ) {
2415 && $last_nonblank_token =~ /^\$/ )
2418 "Do you mean '$last_nonblank_token->(' ?\n";
2422 interrupt_logfile();
2426 } ## end if ( $next_nonblank_token...
2427 } ## end else [ if ( $last_last_nonblank_token...
2428 } ## end if ( $expecting == OPERATOR...
2431 # Do not update container type at ') ('; fix for git #105. This will
2432 # propagate the container type onward so that any subsequent brace gets
2433 # correctly marked. I have implemented this as a general rule, which
2434 # should be safe, but if necessary it could be restricted to certain
2435 # container statement types such as 'for'.
2436 $paren_type[$paren_depth] = $container_type
2437 if ( $last_nonblank_token ne ')' );
2439 ( $type_sequence, $indent_flag ) =
2440 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2442 # propagate types down through nested parens
2443 # for example: the second paren in 'if ((' would be structural
2444 # since the first is.
2446 if ( $last_nonblank_token eq '(' ) {
2447 $type = $last_nonblank_type;
2450 # We exclude parens as structural after a ',' because it
2451 # causes subtle problems with continuation indentation for
2452 # something like this, where the first 'or' will not get
2457 # ( not defined $check )
2459 # or $check eq "new"
2460 # or $check eq "old",
2463 # Likewise, we exclude parens where a statement can start
2464 # because of problems with continuation indentation, like
2467 # ($firstline =~ /^#\!.*perl/)
2468 # and (print $File::Find::name, "\n")
2471 # (ref($usage_fref) =~ /CODE/)
2473 # : (&blast_usage, &blast_params, &blast_general_params);
2479 if ( $last_nonblank_type eq ')' ) {
2481 "Syntax error? found token '$last_nonblank_type' then '('\n");
2483 $paren_structural_type[$paren_depth] = $type;
2486 } ## end sub do_LEFT_PARENTHESIS
2488 sub do_RIGHT_PARENTHESIS {
2491 ( $type_sequence, $indent_flag ) =
2492 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2494 if ( $paren_structural_type[$paren_depth] eq '{' ) {
2498 $container_type = $paren_type[$paren_depth];
2500 # restore statement type as 'sub' at closing paren of a signature
2501 # so that a subsequent ':' is identified as an attribute
2502 if ( $container_type =~ /^sub\b/ ) {
2503 $statement_type = $container_type;
2507 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2508 my $num_sc = $paren_semicolon_count[$paren_depth];
2509 if ( $num_sc > 0 && $num_sc != 2 ) {
2510 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2514 if ( $paren_depth > 0 ) { $paren_depth-- }
2516 } ## end sub do_RIGHT_PARENTHESIS
2521 if ( $last_nonblank_type eq ',' ) {
2522 complain("Repeated ','s \n");
2525 # Note that we have to check both token and type here because a
2526 # comma following a qw list can have last token='(' but type = 'q'
2527 elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2528 warning("Unexpected leading ',' after a '('\n");
2531 # patch for operator_expected: note if we are in the list (use.t)
2532 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2535 } ## end sub do_COMMA
2540 $context = UNKNOWN_CONTEXT;
2541 $statement_type = EMPTY_STRING;
2542 $want_paren = EMPTY_STRING;
2545 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2546 { # mark ; in for loop
2548 # Be careful: we do not want a semicolon such as the
2549 # following to be included:
2551 # for (sort {strcoll($a,$b);} keys %investments) {
2553 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2554 && $square_bracket_depth ==
2555 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2559 $paren_semicolon_count[$paren_depth]++;
2563 } ## end sub do_SEMICOLON
2565 sub do_QUOTATION_MARK {
2568 error_if_expecting_OPERATOR("String")
2569 if ( $expecting == OPERATOR );
2572 $allowed_quote_modifiers = EMPTY_STRING;
2574 } ## end sub do_QUOTATION_MARK
2579 error_if_expecting_OPERATOR("String")
2580 if ( $expecting == OPERATOR );
2583 $allowed_quote_modifiers = EMPTY_STRING;
2585 } ## end sub do_APOSTROPHE
2590 error_if_expecting_OPERATOR("String")
2591 if ( $expecting == OPERATOR );
2594 $allowed_quote_modifiers = EMPTY_STRING;
2596 } ## end sub do_BACKTICK
2603 # a pattern cannot follow certain keywords which take optional
2604 # arguments, like 'shift' and 'pop'. See also '?'.
2606 $last_nonblank_type eq 'k'
2607 && $is_keyword_rejecting_slash_as_pattern_delimiter{
2608 $last_nonblank_token}
2613 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2615 ( $is_pattern, $msg ) =
2616 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2620 write_diagnostics("DIVIDE:$msg\n");
2621 write_logfile_entry($msg);
2624 else { $is_pattern = ( $expecting == TERM ) }
2629 $allowed_quote_modifiers = '[msixpodualngc]';
2631 else { # not a pattern; check for a /= token
2633 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2639 #DEBUG - collecting info on what tokens follow a divide
2640 # for development of guessing algorithm
2641 #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2642 # #write_diagnostics( "DIVIDE? $input_line\n" );
2646 } ## end sub do_SLASH
2648 sub do_LEFT_CURLY_BRACKET {
2651 # if we just saw a ')', we will label this block with
2652 # its type. We need to do this to allow sub
2653 # code_block_type to determine if this brace starts a
2654 # code block or anonymous hash. (The type of a paren
2655 # pair is the preceding token, such as 'if', 'else',
2657 $container_type = EMPTY_STRING;
2659 # ATTRS: for a '{' following an attribute list, reset
2660 # things to look like we just saw the sub name
2661 if ( $statement_type =~ /^sub\b/ ) {
2662 $last_nonblank_token = $statement_type;
2663 $last_nonblank_type = 'i';
2664 $statement_type = EMPTY_STRING;
2667 # patch for SWITCH/CASE: hide these keywords from an immediately
2668 # following opening brace
2669 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2670 && $statement_type eq $last_nonblank_token )
2672 $last_nonblank_token = ";";
2675 elsif ( $last_nonblank_token eq ')' ) {
2676 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2678 # defensive move in case of a nesting error (pbug.t)
2679 # in which this ')' had no previous '('
2680 # this nesting error will have been caught
2681 if ( !defined($last_nonblank_token) ) {
2682 $last_nonblank_token = 'if';
2685 # check for syntax error here;
2686 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2687 if ( $tokenizer_self->[_extended_syntax_] ) {
2689 # we append a trailing () to mark this as an unknown
2690 # block type. This allows perltidy to format some
2691 # common extensions of perl syntax.
2692 # This is used by sub code_block_type
2693 $last_nonblank_token .= '()';
2697 join( SPACE, sort keys %is_blocktype_with_paren );
2699 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2705 # patch for paren-less for/foreach glitch, part 2.
2706 # see note below under 'qw'
2707 elsif ($last_nonblank_token eq 'qw'
2708 && $is_for_foreach{$want_paren} )
2710 $last_nonblank_token = $want_paren;
2711 if ( $last_last_nonblank_token eq $want_paren ) {
2713 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2717 $want_paren = EMPTY_STRING;
2720 # now identify which of the three possible types of
2721 # curly braces we have: hash index container, anonymous
2722 # hash reference, or code block.
2724 # non-structural (hash index) curly brace pair
2725 # get marked 'L' and 'R'
2726 if ( is_non_structural_brace() ) {
2729 # patch for SWITCH/CASE:
2730 # allow paren-less identifier after 'when'
2731 # if the brace is preceded by a space
2732 if ( $statement_type eq 'when'
2733 && $last_nonblank_type eq 'i'
2734 && $last_last_nonblank_type eq 'k'
2735 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2738 $block_type = $statement_type;
2742 # code and anonymous hash have the same type, '{', but are
2743 # distinguished by 'block_type',
2744 # which will be blank for an anonymous hash
2747 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2750 # patch to promote bareword type to function taking block
2752 && $last_nonblank_type eq 'w'
2753 && $last_nonblank_i >= 0 )
2755 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2756 $routput_token_type->[$last_nonblank_i] =
2757 $is_grep_alias{$block_type} ? 'k' : 'G';
2761 # patch for SWITCH/CASE: if we find a stray opening block brace
2762 # where we might accept a 'case' or 'when' block, then take it
2763 if ( $statement_type eq 'case'
2764 || $statement_type eq 'when' )
2766 if ( !$block_type || $block_type eq '}' ) {
2767 $block_type = $statement_type;
2772 $brace_type[ ++$brace_depth ] = $block_type;
2773 $brace_package[$brace_depth] = $current_package;
2774 $brace_structural_type[$brace_depth] = $type;
2775 $brace_context[$brace_depth] = $context;
2776 ( $type_sequence, $indent_flag ) =
2777 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2779 } ## end sub do_LEFT_CURLY_BRACKET
2781 sub do_RIGHT_CURLY_BRACKET {
2784 $block_type = $brace_type[$brace_depth];
2785 if ($block_type) { $statement_type = EMPTY_STRING }
2786 if ( defined( $brace_package[$brace_depth] ) ) {
2787 $current_package = $brace_package[$brace_depth];
2790 # can happen on brace error (caught elsewhere)
2793 ( $type_sequence, $indent_flag ) =
2794 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2796 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2800 # propagate type information for 'do' and 'eval' blocks, and also
2801 # for smartmatch operator. This is necessary to enable us to know
2802 # if an operator or term is expected next.
2803 if ( $is_block_operator{$block_type} ) {
2807 $context = $brace_context[$brace_depth];
2808 if ( $brace_depth > 0 ) { $brace_depth--; }
2810 } ## end sub do_RIGHT_CURLY_BRACKET
2814 # '&' = maybe sub call? start looking
2815 # We have to check for sub call unless we are sure we
2816 # are expecting an operator. This example from s2p
2817 # got mistaken as a q operator in an early version:
2818 # print BODY &q(<<'EOT');
2819 if ( $expecting != OPERATOR ) {
2821 # But only look for a sub call if we are expecting a term or
2822 # if there is no existing space after the &.
2823 # For example we probably don't want & as sub call here:
2824 # Fcntl::S_IRUSR & $mode;
2825 if ( $expecting == TERM || $next_type ne 'b' ) {
2826 scan_simple_identifier();
2832 } ## end sub do_AMPERSAND
2834 sub do_LESS_THAN_SIGN {
2836 # '<' - angle operator or less than?
2837 if ( $expecting != OPERATOR ) {
2839 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2840 $expecting, $max_token_index );
2842 ## This message is not very helpful and quite confusing if the above
2843 ## routine decided not to write a message with the line number.
2844 ## if ( $type eq '<' && $expecting == TERM ) {
2845 ## error_if_expecting_TERM();
2846 ## interrupt_logfile();
2847 ## warning("Unterminated <> operator?\n");
2848 ## resume_logfile();
2855 } ## end sub do_LESS_THAN_SIGN
2857 sub do_QUESTION_MARK {
2859 # '?' = conditional or starting pattern?
2862 # Patch for rt #126965
2863 # a pattern cannot follow certain keywords which take optional
2864 # arguments, like 'shift' and 'pop'. See also '/'.
2866 $last_nonblank_type eq 'k'
2867 && $is_keyword_rejecting_question_as_pattern_delimiter{
2868 $last_nonblank_token}
2874 # patch for RT#131288, user constant function without prototype
2875 # last type is 'U' followed by ?.
2876 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2879 elsif ( $expecting == UNKNOWN ) {
2881 # In older versions of Perl, a bare ? can be a pattern
2882 # delimiter. In perl version 5.22 this was
2883 # dropped, but we have to support it in order to format
2884 # older programs. See:
2885 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2886 # For example, the following line worked
2888 # ?(.*)? && (print $1,"\n");
2889 # In current versions it would have to be written with slashes:
2890 # /(.*)/ && (print $1,"\n");
2892 ( $is_pattern, $msg ) =
2893 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2896 if ($msg) { write_logfile_entry($msg) }
2898 else { $is_pattern = ( $expecting == TERM ) }
2903 $allowed_quote_modifiers = '[msixpodualngc]';
2906 ( $type_sequence, $indent_flag ) =
2907 increase_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
2910 } ## end sub do_QUESTION_MARK
2914 # '*' = typeglob, or multiply?
2915 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2916 if ( $next_type ne 'b'
2917 && $next_type ne '('
2918 && $next_type ne '#' ) # Fix c036
2923 if ( $expecting == TERM ) {
2924 scan_simple_identifier();
2928 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2933 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2937 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2945 } ## end sub do_STAR
2949 # '.' = what kind of . ?
2950 if ( $expecting != OPERATOR ) {
2952 if ( $type eq '.' ) {
2953 error_if_expecting_TERM()
2954 if ( $expecting == TERM );
2964 # ':' = label, ternary, attribute, ?
2966 # if this is the first nonblank character, call it a label
2967 # since perl seems to just swallow it
2968 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2972 # ATTRS: check for a ':' which introduces an attribute list
2973 # either after a 'sub' keyword or within a paren list
2974 elsif ( $statement_type =~ /^sub\b/ ) {
2976 $in_attribute_list = 1;
2979 # Within a signature, unless we are in a ternary. For example,
2980 # from 't/filter_example.t':
2981 # method foo4 ( $class: $bar ) { $class->bar($bar) }
2982 elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2983 && !is_balanced_closing_container(QUESTION_COLON) )
2986 $in_attribute_list = 1;
2989 # check for scalar attribute, such as
2990 # my $foo : shared = 1;
2991 elsif ($is_my_our_state{$statement_type}
2992 && $current_depth[QUESTION_COLON] == 0 )
2995 $in_attribute_list = 1;
2998 # Look for Switch::Plain syntax if an error would otherwise occur
2999 # here. Note that we do not need to check if the extended syntax
3000 # flag is set because otherwise an error would occur, and we would
3001 # then have to output a message telling the user to set the
3002 # extended syntax flag to avoid the error.
3006 # Note that the line 'default:' will be parsed as a label elsewhere.
3007 elsif ( $is_case_default{$statement_type}
3008 && !is_balanced_closing_container(QUESTION_COLON) )
3010 # mark it as a perltidy label type
3014 # otherwise, it should be part of a ?/: operator
3016 ( $type_sequence, $indent_flag ) =
3017 decrease_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3018 if ( $last_nonblank_token eq '?' ) {
3019 warning("Syntax error near ? :\n");
3023 } ## end sub do_COLON
3027 # '+' = what kind of plus?
3028 if ( $expecting == TERM ) {
3029 my $number = scan_number_fast();
3031 # unary plus is safest assumption if not a number
3032 if ( !defined($number) ) { $type = 'p'; }
3034 elsif ( $expecting == OPERATOR ) {
3037 if ( $next_type eq 'w' ) { $type = 'p' }
3040 } ## end sub do_PLUS_SIGN
3044 # '@' = sigil for array?
3045 error_if_expecting_OPERATOR("Array")
3046 if ( $expecting == OPERATOR );
3047 scan_simple_identifier();
3051 sub do_PERCENT_SIGN {
3053 # '%' = hash or modulo?
3054 # first guess is hash if no following blank or paren
3055 if ( $expecting == UNKNOWN ) {
3056 if ( $next_type ne 'b' && $next_type ne '(' ) {
3060 if ( $expecting == TERM ) {
3061 scan_simple_identifier();
3064 } ## end sub do_PERCENT_SIGN
3066 sub do_LEFT_SQUARE_BRACKET {
3069 $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token;
3070 ( $type_sequence, $indent_flag ) =
3071 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3073 # It may seem odd, but structural square brackets have
3074 # type '{' and '}'. This simplifies the indentation logic.
3075 if ( !is_non_structural_brace() ) {
3078 $square_bracket_structural_type[$square_bracket_depth] = $type;
3080 } ## end sub do_LEFT_SQUARE_BRACKET
3082 sub do_RIGHT_SQUARE_BRACKET {
3085 ( $type_sequence, $indent_flag ) =
3086 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3088 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) {
3092 # propagate type information for smartmatch operator. This is
3093 # necessary to enable us to know if an operator or term is expected
3095 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
3096 $tok = $square_bracket_type[$square_bracket_depth];
3099 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
3101 } ## end sub do_RIGHT_SQUARE_BRACKET
3105 # '-' = what kind of minus?
3106 if ( ( $expecting != OPERATOR )
3107 && $is_file_test_operator{$next_tok} )
3109 my ( $next_nonblank_token, $i_next ) =
3110 find_next_nonblank_token( $i + 1, $rtokens, $max_token_index );
3112 # check for a quoted word like "-w=>xx";
3113 # it is sufficient to just check for a following '='
3114 if ( $next_nonblank_token eq '=' ) {
3123 elsif ( $expecting == TERM ) {
3124 my $number = scan_number_fast();
3126 # maybe part of bareword token? unary is safest
3127 if ( !defined($number) ) { $type = 'm'; }
3130 elsif ( $expecting == OPERATOR ) {
3134 if ( $next_type eq 'w' ) {
3139 } ## end sub do_MINUS_SIGN
3144 # check for special variables like ${^WARNING_BITS}
3145 if ( $expecting == TERM ) {
3147 if ( $last_nonblank_token eq '{'
3148 && ( $next_tok !~ /^\d/ )
3149 && ( $next_tok =~ /^\w/ ) )
3152 if ( $next_tok eq 'W' ) {
3153 $tokenizer_self->[_saw_perl_dash_w_] = 1;
3155 $tok = $tok . $next_tok;
3159 # Optional coding to try to catch syntax errors. This can
3160 # be removed if it ever causes incorrect warning messages.
3161 # The '{^' should be preceded by either by a type or '$#'
3164 # *${^LAST_FH}{NAME} ok
3166 # $hash{^HOWDY} error
3168 # Note that a type sigil '$' may be tokenized as 'Z'
3169 # after something like 'print', so allow type 'Z'
3170 if ( $last_last_nonblank_type ne 't'
3171 && $last_last_nonblank_type ne 'Z'
3172 && $last_last_nonblank_token ne '$#' )
3174 warning("Possible syntax error near '{^'\n");
3179 unless ( error_if_expecting_TERM() ) {
3181 # Something like this is valid but strange:
3183 complain("The '^' seems unusual here\n");
3188 } ## end sub do_CARAT_SIGN
3190 sub do_DOUBLE_COLON {
3192 # '::' = probably a sub call
3193 scan_bare_identifier();
3199 # '<<' = maybe a here-doc?
3201 ## This check removed because it could be a deprecated here-doc with
3202 ## no specified target. See example in log 16 Sep 2020.
3204 ## unless ( $i < $max_token_index )
3205 ## ; # here-doc not possible if end of line
3207 if ( $expecting != OPERATOR ) {
3208 my ( $found_target, $here_doc_target, $here_quote_character,
3211 $found_target, $here_doc_target, $here_quote_character, $i,
3214 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3217 if ($found_target) {
3218 push @{$rhere_target_list},
3219 [ $here_doc_target, $here_quote_character ];
3221 if ( length($here_doc_target) > 80 ) {
3222 my $truncated = substr( $here_doc_target, 0, 80 );
3223 complain("Long here-target: '$truncated' ...\n");
3225 elsif ( !$here_doc_target ) {
3227 'Use of bare << to mean <<"" is deprecated' . "\n" )
3228 unless ($here_quote_character);
3230 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3232 "Unconventional here-target: '$here_doc_target'\n");
3235 elsif ( $expecting == TERM ) {
3236 unless ($saw_error) {
3238 # shouldn't happen..arriving here implies an error in
3239 # the logic in sub 'find_here_doc'
3242 Program bug; didn't find here doc target
3246 "Possible program error: didn't find here doc target\n"
3248 report_definite_bug();
3255 } ## end sub do_LEFT_SHIFT
3257 sub do_NEW_HERE_DOC {
3259 # '<<~' = a here-doc, new type added in v26
3261 unless ( $i < $max_token_index )
3262 ; # here-doc not possible if end of line
3263 if ( $expecting != OPERATOR ) {
3264 my ( $found_target, $here_doc_target, $here_quote_character,
3267 $found_target, $here_doc_target, $here_quote_character, $i,
3270 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3273 if ($found_target) {
3275 if ( length($here_doc_target) > 80 ) {
3276 my $truncated = substr( $here_doc_target, 0, 80 );
3277 complain("Long here-target: '$truncated' ...\n");
3279 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3281 "Unconventional here-target: '$here_doc_target'\n");
3284 # Note that we put a leading space on the here quote
3285 # character indicate that it may be preceded by spaces
3286 $here_quote_character = SPACE . $here_quote_character;
3287 push @{$rhere_target_list},
3288 [ $here_doc_target, $here_quote_character ];
3291 elsif ( $expecting == TERM ) {
3292 unless ($saw_error) {
3294 # shouldn't happen..arriving here implies an error in
3295 # the logic in sub 'find_here_doc'
3298 Program bug; didn't find here doc target
3302 "Possible program error: didn't find here doc target\n"
3304 report_definite_bug();
3309 error_if_expecting_OPERATOR();
3312 } ## end sub do_NEW_HERE_DOC
3318 } ## end sub do_POINTER
3323 # type = 'pp' for pre-increment, '++' for post-increment
3324 if ( $expecting == TERM ) { $type = 'pp' }
3325 elsif ( $expecting == UNKNOWN ) {
3327 my ( $next_nonblank_token, $i_next ) =
3328 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3330 # Fix for c042: look past a side comment
3331 if ( $next_nonblank_token eq '#' ) {
3332 ( $next_nonblank_token, $i_next ) =
3333 find_next_nonblank_token( $max_token_index,
3334 $rtokens, $max_token_index );
3337 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3340 } ## end sub do_PLUS_PLUS
3345 if ( $last_nonblank_type eq $tok ) {
3346 complain("Repeated '=>'s \n");
3349 # patch for operator_expected: note if we are in the list (use.t)
3350 # TODO: make version numbers a new token type
3351 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3353 } ## end sub do_FAT_COMMA
3355 sub do_MINUS_MINUS {
3358 # type = 'mm' for pre-decrement, '--' for post-decrement
3360 if ( $expecting == TERM ) { $type = 'mm' }
3361 elsif ( $expecting == UNKNOWN ) {
3362 my ( $next_nonblank_token, $i_next ) =
3363 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3365 # Fix for c042: look past a side comment
3366 if ( $next_nonblank_token eq '#' ) {
3367 ( $next_nonblank_token, $i_next ) =
3368 find_next_nonblank_token( $max_token_index,
3369 $rtokens, $max_token_index );
3372 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3375 } ## end sub do_MINUS_MINUS
3377 sub do_LOGICAL_AND {
3380 error_if_expecting_TERM()
3381 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3388 error_if_expecting_TERM()
3389 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
3393 sub do_SLASH_SLASH {
3396 error_if_expecting_TERM()
3397 if ( $expecting == TERM );
3403 # 'd' = string of digits
3404 error_if_expecting_OPERATOR("Number")
3405 if ( $expecting == OPERATOR );
3407 my $number = scan_number_fast();
3408 if ( !defined($number) ) {
3410 # shouldn't happen - we should always get a number
3413 non-number beginning with digit--program bug
3417 "Unexpected error condition: non-number beginning with digit\n"
3419 report_definite_bug();
3422 } ## end sub do_DIGITS
3424 sub do_ATTRIBUTE_LIST {
3426 my ($next_nonblank_token) = @_;
3428 # Called at a bareword encountered while in an attribute list
3429 # returns 'is_attribute':
3430 # true if attribute found
3431 # false if an attribute (continue parsing bareword)
3433 # treat bare word followed by open paren like qw(
3434 if ( $next_nonblank_token eq '(' ) {
3436 # For something like:
3438 # we should let do_scan_sub see it so that it can see
3439 # the prototype. All other attributes get parsed as a
3441 if ( $tok eq 'prototype' ) {
3442 $id_scan_state = 'prototype';
3444 # start just after the word 'prototype'
3446 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3448 input_line => $input_line,
3453 rtokens => $rtokens,
3454 rtoken_map => $rtoken_map,
3455 id_scan_state => $id_scan_state,
3456 max_token_index => $max_token_index,
3460 # If successful, mark as type 'q' to be consistent
3461 # with other attributes. Type 'w' would also work.
3462 if ( $i > $i_beg ) {
3467 # If not successful, continue and parse as a quote.
3470 # All other attribute lists must be parsed as quotes
3471 # (see 'signatures.t' for good examples)
3472 $in_quote = $quote_items{'q'};
3473 $allowed_quote_modifiers = $quote_modifiers{'q'};
3479 # handle bareword not followed by open paren
3485 # attribute not found
3487 } ## end sub do_ATTRIBUTE_LIST
3489 sub do_QUOTED_BAREWORD {
3491 # find type of a bareword followed by a '=>'
3492 if ( $is_constant{$current_package}{$tok} ) {
3495 elsif ( $is_user_function{$current_package}{$tok} ) {
3497 $prototype = $user_function_prototype{$current_package}{$tok};
3499 elsif ( $tok =~ /^v\d+$/ ) {
3501 report_v_string($tok);
3505 # Bareword followed by a fat comma - see 'git18.in'
3506 # If tok is something like 'x17' then it could
3507 # actually be operator x followed by number 17.
3508 # For example, here:
3509 # 123x17 => [ 792, 1224 ],
3510 # (a key of 123 repeated 17 times, perhaps not
3511 # what was intended). We will mark x17 as type
3512 # 'n' and it will be split. If the previous token
3513 # was also a bareword then it is not very clear is
3514 # going on. In this case we will not be sure that
3515 # an operator is expected, so we just mark it as a
3516 # bareword. Perl is a little murky in what it does
3517 # with stuff like this, and its behavior can change
3518 # over time. Something like
3519 # a x18 => [792, 1224], will compile as
3520 # a key with 18 a's. But something like
3521 # push @array, a x18;
3522 # is a syntax error.
3524 $expecting == OPERATOR
3525 && substr( $tok, 0, 1 ) eq 'x'
3526 && ( length($tok) == 1
3527 || substr( $tok, 1, 1 ) =~ /^\d/ )
3531 if ( split_pretoken(1) ) {
3540 error_if_expecting_OPERATOR();
3544 } ## end sub do_QUOTED_BAREWORD
3548 if ( $tok eq 'x' ) {
3549 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3560 # Split a pretoken like 'x10' into 'x' and '10'.
3561 # Note: In previous versions of perltidy it was marked
3562 # as a number, $type = 'n', and fixed downstream by the
3565 if ( split_pretoken(1) ) {
3571 } ## end sub do_X_OPERATOR
3573 sub do_USE_CONSTANT {
3574 scan_bare_identifier();
3575 my ( $next_nonblank_tok2, $i_next2 ) =
3576 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3578 if ($next_nonblank_tok2) {
3580 if ( $is_keyword{$next_nonblank_tok2} ) {
3582 # Assume qw is used as a quote and okay, as in:
3583 # use constant qw{ DEBUG 0 };
3584 # Not worth trying to parse for just a warning
3586 # NOTE: This warning is deactivated because recent
3587 # versions of perl do not complain here, but
3588 # the coding is retained for reference.
3589 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
3591 "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
3597 $is_constant{$current_package}{$next_nonblank_tok2} = 1;
3601 } ## end sub do_USE_CONSTANT
3605 # found a keyword - set any associated flags
3608 # Since for and foreach may not be followed immediately
3609 # by an opening paren, we have to remember which keyword
3610 # is associated with the next '('
3611 if ( $is_for_foreach{$tok} ) {
3612 if ( new_statement_ok() ) {
3617 # recognize 'use' statements, which are special
3618 elsif ( $is_use_require{$tok} ) {
3619 $statement_type = $tok;
3620 error_if_expecting_OPERATOR()
3621 if ( $expecting == OPERATOR );
3624 # remember my and our to check for trailing ": shared"
3625 elsif ( $is_my_our_state{$tok} ) {
3626 $statement_type = $tok;
3629 # Check for misplaced 'elsif' and 'else', but allow isolated
3630 # else or elsif blocks to be formatted. This is indicated
3631 # by a last noblank token of ';'
3632 elsif ( $tok eq 'elsif' ) {
3634 $last_nonblank_token ne ';'
3636 ## !~ /^(if|elsif|unless)$/
3637 && !$is_if_elsif_unless{$last_nonblank_block_type}
3641 "expecting '$tok' to follow one of 'if|elsif|unless'\n");
3644 elsif ( $tok eq 'else' ) {
3646 # patched for SWITCH/CASE
3648 $last_nonblank_token ne ';'
3650 ## !~ /^(if|elsif|unless|case|when)$/
3651 && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
3653 # patch to avoid an unwanted error message for
3654 # the case of a parenless 'case' (RT 105484):
3655 # switch ( 1 ) { case x { 2 } else { } }
3656 ## !~ /^(if|elsif|unless|case|when)$/
3657 && !$is_if_elsif_unless_case_when{$statement_type}
3661 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3666 # patch for SWITCH/CASE if 'case' and 'when are
3667 # treated as keywords. Also 'default' for Switch::Plain
3668 elsif ($tok eq 'when'
3670 || $tok eq 'default' )
3672 $statement_type = $tok; # next '{' is block
3675 # feature 'err' was removed in Perl 5.10. So mark this as
3676 # a bareword unless an operator is expected (see c158).
3677 elsif ( $tok eq 'err' ) {
3678 if ( $expecting != OPERATOR ) { $type = 'w' }
3682 } ## end sub do_KEYWORD
3684 sub do_QUOTE_OPERATOR {
3686 if ( $expecting == OPERATOR ) {
3688 # Be careful not to call an error for a qw quote
3689 # where a parenthesized list is allowed. For example,
3690 # it could also be a for/foreach construct such as
3692 # foreach my $key qw\Uno Due Tres Quadro\ {
3693 # print "Set $key\n";
3697 # Or it could be a function call.
3698 # NOTE: Braces in something like &{ xxx } are not
3699 # marked as a block, we might have a method call.
3700 # &method(...), $method->(..), &{method}(...),
3701 # $ref[2](list) is ok & short for $ref[2]->(list)
3703 # See notes in 'sub code_block_type' and
3704 # 'sub is_non_structural_brace'
3708 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3709 || $is_for_foreach{$want_paren} )
3712 error_if_expecting_OPERATOR();
3715 $in_quote = $quote_items{$tok};
3716 $allowed_quote_modifiers = $quote_modifiers{$tok};
3718 # All quote types are 'Q' except possibly qw quotes.
3719 # qw quotes are special in that they may generally be trimmed
3720 # of leading and trailing whitespace. So they are given a
3721 # separate type, 'q', unless requested otherwise.
3723 ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3726 $quote_type = $type;
3728 } ## end sub do_QUOTE_OPERATOR
3730 sub do_UNKNOWN_BAREWORD {
3732 my ($next_nonblank_token) = @_;
3734 scan_bare_identifier();
3736 if ( $statement_type eq 'use'
3737 && $last_nonblank_token eq 'use' )
3739 $saw_use_module{$current_package}->{$tok} = 1;
3742 if ( $type eq 'w' ) {
3744 if ( $expecting == OPERATOR ) {
3746 # Patch to avoid error message for RPerl overloaded
3747 # operator functions: use overload
3752 # TODO: this could eventually be generalized
3753 if ( $saw_use_module{$current_package}->{'RPerl'}
3754 && $tok =~ /^sse_(mul|div|add|sub)$/ )
3759 # Fix part 1 for git #63 in which a comment falls
3760 # between an -> and the following word. An
3761 # alternate fix would be to change operator_expected
3762 # to return an UNKNOWN for this type.
3763 elsif ( $last_nonblank_type eq '->' ) {
3767 # don't complain about possible indirect object
3771 # sub new($) { ... }
3772 # $b = new A::; # calls A::new
3773 # $c = new A; # same thing but suspicious
3774 # This will call A::new but we have a 'new' in
3775 # main:: which looks like a constant.
3777 elsif ( $last_nonblank_type eq 'C' ) {
3778 if ( $tok !~ /::$/ ) {
3780 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3781 Maybe indirectet object notation?
3786 error_if_expecting_OPERATOR("bareword");
3790 # mark bare words immediately followed by a paren as
3792 $next_tok = $rtokens->[ $i + 1 ];
3793 if ( $next_tok eq '(' ) {
3795 # Patch for issue c151, where we are processing a snippet and
3796 # have not seen that SPACE is a constant. In this case 'x' is
3797 # probably an operator. The only disadvantage with an incorrect
3798 # guess is that the space after it may be incorrect. For example
3799 # $str .= SPACE x ( 16 - length($str) ); See also b1410.
3800 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
3802 # Fix part 2 for git #63. Leave type as 'w' to keep
3803 # the type the same as if the -> were not separated
3804 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
3808 # underscore after file test operator is file handle
3809 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3813 # patch for SWITCH/CASE if 'case' and 'when are
3814 # not treated as keywords:
3816 ( $tok eq 'case' && $brace_type[$brace_depth] eq 'switch' )
3818 && $brace_type[$brace_depth] eq 'given' )
3821 $statement_type = $tok; # next '{' is block
3822 $type = 'k'; # for keyword syntax coloring
3824 if ( $next_nonblank_token eq '(' ) {
3826 # patch for SWITCH/CASE if switch and given not keywords
3827 # Switch is not a perl 5 keyword, but we will gamble
3828 # and mark switch followed by paren as a keyword. This
3829 # is only necessary to get html syntax coloring nice,
3830 # and does not commit this as being a switch/case.
3831 if ( $tok eq 'switch' || $tok eq 'given' ) {
3832 $type = 'k'; # for keyword syntax coloring
3835 # mark 'x' as operator for something like this (see b1410)
3836 # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
3837 elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
3843 } ## end sub do_UNKNOWN_BAREWORD
3845 sub sub_attribute_ok_here {
3847 my ( $tok_kw, $next_nonblank_token, $i_next ) = @_;
3849 # Decide if 'sub :' can be the start of a sub attribute list.
3850 # We will decide based on if the colon is followed by a
3851 # bareword which is not a keyword.
3852 # Changed inext+1 to inext to fixed case b1190.
3853 my $sub_attribute_ok_here;
3854 if ( $is_sub{$tok_kw}
3855 && $expecting != OPERATOR
3856 && $next_nonblank_token eq ':' )
3858 my ( $nn_nonblank_token, $i_nn ) =
3859 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
3860 $sub_attribute_ok_here =
3861 $nn_nonblank_token =~ /^\w/
3862 && $nn_nonblank_token !~ /^\d/
3863 && !$is_keyword{$nn_nonblank_token};
3865 return $sub_attribute_ok_here;
3866 } ## end sub sub_attribute_ok_here
3870 my ($is_END_or_DATA) = @_;
3872 # handle a bareword token:
3874 # true if this token ends the current line
3877 my ( $next_nonblank_token, $i_next ) =
3878 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3880 # a bare word immediately followed by :: is not a keyword;
3881 # use $tok_kw when testing for keywords to avoid a mistake
3883 if ( $rtokens->[ $i + 1 ] eq ':'
3884 && $rtokens->[ $i + 2 ] eq ':' )
3889 if ($in_attribute_list) {
3890 my $is_attribute = do_ATTRIBUTE_LIST($next_nonblank_token);
3891 return if ($is_attribute);
3894 #----------------------------------------
3895 # Starting final if-elsif- chain of tests
3896 #----------------------------------------
3898 # This is the return flag:
3899 # true => this is the last token on the line
3900 # false => keep tokenizing the line
3903 # The following blocks of code must update these vars:
3904 # $type - the final token type, must always be set
3906 # In addition, if additional pretokens are added:
3907 # $tok - the final token
3908 # $i - the index of the last pretoken
3910 # They may also need to check and set various flags
3912 # Scan a bare word following a -> as an identifier; it could
3913 # have a long package name. Fixes c037, c041.
3914 if ( $last_nonblank_token eq '->' ) {
3915 scan_bare_identifier();
3917 # a bareward after '->' gets type 'i'
3921 # Quote a word followed by => operator
3922 # unless the word __END__ or __DATA__ and the only word on
3924 elsif ( !$is_END_or_DATA
3925 && $next_nonblank_token eq '='
3926 && $rtokens->[ $i_next + 1 ] eq '>' )
3928 do_QUOTED_BAREWORD();
3931 # quote a bare word within braces..like xxx->{s}; note that we
3932 # must be sure this is not a structural brace, to avoid
3933 # mistaking {s} in the following for a quoted bare word:
3934 # for(@[){s}bla}BLA}
3935 # Also treat q in something like var{-q} as a bare word, not
3938 $next_nonblank_token eq '}'
3940 $last_nonblank_type eq 'L'
3941 || ( $last_nonblank_type eq 'm'
3942 && $last_last_nonblank_type eq 'L' )
3949 # handle operator x (now we know it isn't $x=)
3951 $expecting == OPERATOR
3952 && substr( $tok, 0, 1 ) eq 'x'
3953 && ( length($tok) == 1
3954 || substr( $tok, 1, 1 ) =~ /^\d/ )
3959 elsif ( $tok_kw eq 'CORE::' ) {
3960 $type = $tok = $tok_kw;
3963 elsif ( ( $tok eq 'strict' )
3964 and ( $last_nonblank_token eq 'use' ) )
3966 $tokenizer_self->[_saw_use_strict_] = 1;
3967 scan_bare_identifier();
3970 elsif ( ( $tok eq 'warnings' )
3971 and ( $last_nonblank_token eq 'use' ) )
3973 $tokenizer_self->[_saw_perl_dash_w_] = 1;
3975 # scan as identifier, so that we pick up something like:
3976 # use warnings::register
3977 scan_bare_identifier();
3981 $tok eq 'AutoLoader'
3982 && $tokenizer_self->[_look_for_autoloader_]
3984 $last_nonblank_token eq 'use'
3986 # these regexes are from AutoSplit.pm, which we want
3988 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3989 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3993 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3994 $tokenizer_self->[_saw_autoloader_] = 1;
3995 $tokenizer_self->[_look_for_autoloader_] = 0;
3996 scan_bare_identifier();
4000 $tok eq 'SelfLoader'
4001 && $tokenizer_self->[_look_for_selfloader_]
4002 && ( $last_nonblank_token eq 'use'
4003 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4004 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4007 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4008 $tokenizer_self->[_saw_selfloader_] = 1;
4009 $tokenizer_self->[_look_for_selfloader_] = 0;
4010 scan_bare_identifier();
4013 elsif ( ( $tok eq 'constant' )
4014 and ( $last_nonblank_token eq 'use' ) )
4019 # various quote operators
4020 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4021 do_QUOTE_OPERATOR();
4024 # check for a statement label
4026 ( $next_nonblank_token eq ':' )
4027 && ( $rtokens->[ $i_next + 1 ] ne ':' )
4028 && ( $i_next <= $max_token_index ) # colon on same line
4030 # like 'sub : lvalue' ?
4031 && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
4035 if ( $tok !~ /[A-Z]/ ) {
4036 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4045 elsif ( $is_sub{$tok_kw} ) {
4046 error_if_expecting_OPERATOR()
4047 if ( $expecting == OPERATOR );
4048 initialize_subname();
4053 elsif ( $is_package{$tok_kw} ) {
4054 error_if_expecting_OPERATOR()
4055 if ( $expecting == OPERATOR );
4059 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4060 # more restrictive. Require a new statement to be ok here.
4061 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4062 $type = ';'; # make tokenizer look for TERM next
4063 $tokenizer_self->[_in_format_] = 1;
4064 $is_last = 1; ## is last token on this line
4067 # Note on token types for format, __DATA__, __END__:
4068 # It simplifies things to give these type ';', so that when we
4069 # start rescanning we will be expecting a token of type TERM.
4070 # We will switch to type 'k' before outputting the tokens.
4071 elsif ( $is_END_DATA{$tok_kw} ) {
4072 $type = ';'; # make tokenizer look for TERM next
4074 # Remember that we are in one of these three sections
4075 $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4076 $is_last = 1; ## is last token on this line
4079 elsif ( $is_keyword{$tok_kw} ) {
4083 # check for inline label following
4084 # /^(redo|last|next|goto)$/
4085 elsif (( $last_nonblank_type eq 'k' )
4086 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4093 do_UNKNOWN_BAREWORD($next_nonblank_token);
4098 } ## end sub do_BAREWORD
4100 sub do_FOLLOW_QUOTE {
4102 # Continue following a quote on a new line
4103 $type = $quote_type;
4105 unless ( @{$routput_token_list} ) { # initialize if continuation line
4106 push( @{$routput_token_list}, $i );
4107 $routput_token_type->[$i] = $type;
4111 # scan for the end of the quote or pattern
4136 # all done if we didn't find it
4137 if ($in_quote) { return }
4139 # save pattern and replacement text for rescanning
4140 my $qs1 = $quoted_string_1;
4142 # re-initialize for next search
4143 $quote_character = EMPTY_STRING;
4146 $quoted_string_1 = EMPTY_STRING;
4147 $quoted_string_2 = EMPTY_STRING;
4148 if ( ++$i > $max_token_index ) { return }
4150 # look for any modifiers
4151 if ($allowed_quote_modifiers) {
4153 # check for exact quote modifiers
4154 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4155 my $str = $rtokens->[$i];
4157 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4158 my $pos = pos($str);
4159 my $char = substr( $str, $pos - 1, 1 );
4160 $saw_modifier_e ||= ( $char eq 'e' );
4163 # For an 'e' quote modifier we must scan the replacement
4164 # text for here-doc targets...
4165 # but if the modifier starts a new line we can skip
4166 # this because either the here doc will be fully
4167 # contained in the replacement text (so we can
4168 # ignore it) or Perl will not find it.
4169 # See test 'here2.in'.
4170 if ( $saw_modifier_e && $i_tok >= 0 ) {
4172 my $rht = scan_replacement_text($qs1);
4174 # Change type from 'Q' to 'h' for quotes with
4175 # here-doc targets so that the formatter (see sub
4176 # process_line_of_CODE) will not make any line
4177 # breaks after this point.
4179 push @{$rhere_target_list}, @{$rht};
4182 my $ilast = $routput_token_list->[-1];
4183 $routput_token_type->[$ilast] = $type;
4188 if ( defined( pos($str) ) ) {
4191 if ( pos($str) == length($str) ) {
4192 if ( ++$i > $max_token_index ) { return }
4195 # Looks like a joined quote modifier
4196 # and keyword, maybe something like
4197 # s/xxx/yyy/gefor @k=...
4198 # Example is "galgen.pl". Would have to split
4199 # the word and insert a new token in the
4200 # pre-token list. This is so rare that I haven't
4201 # done it. Will just issue a warning citation.
4203 # This error might also be triggered if my quote
4204 # modifier characters are incomplete
4208 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4209 Please put a space between quote modifiers and trailing keywords.
4212 # print "token $rtokens->[$i]\n";
4213 # my $num = length($str) - pos($str);
4214 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4215 # print "continuing with new token $rtokens->[$i]\n";
4217 # skipping past this token does least damage
4218 if ( ++$i > $max_token_index ) { return }
4223 # example file: rokicki4.pl
4224 # This error might also be triggered if my quote
4225 # modifier characters are incomplete
4226 write_logfile_entry(
4227 "Note: found word $str at quote modifier location\n");
4232 $allowed_quote_modifiers = EMPTY_STRING;
4235 } ## end sub do_FOLLOW_QUOTE
4237 # ------------------------------------------------------------
4238 # begin hash of code for handling most token types
4239 # ------------------------------------------------------------
4240 my $tokenization_code = {
4242 '>' => \&do_GREATER_THAN_SIGN,
4243 '|' => \&do_VERTICAL_LINE,
4244 '$' => \&do_DOLLAR_SIGN,
4245 '(' => \&do_LEFT_PARENTHESIS,
4246 ')' => \&do_RIGHT_PARENTHESIS,
4248 ';' => \&do_SEMICOLON,
4249 '"' => \&do_QUOTATION_MARK,
4250 "'" => \&do_APOSTROPHE,
4251 '`' => \&do_BACKTICK,
4253 '{' => \&do_LEFT_CURLY_BRACKET,
4254 '}' => \&do_RIGHT_CURLY_BRACKET,
4255 '&' => \&do_AMPERSAND,
4256 '<' => \&do_LESS_THAN_SIGN,
4257 '?' => \&do_QUESTION_MARK,
4261 '+' => \&do_PLUS_SIGN,
4262 '@' => \&do_AT_SIGN,
4263 '%' => \&do_PERCENT_SIGN,
4264 '[' => \&do_LEFT_SQUARE_BRACKET,
4265 ']' => \&do_RIGHT_SQUARE_BRACKET,
4266 '-' => \&do_MINUS_SIGN,
4267 '^' => \&do_CARAT_SIGN,
4268 '::' => \&do_DOUBLE_COLON,
4269 '<<' => \&do_LEFT_SHIFT,
4270 '<<~' => \&do_NEW_HERE_DOC,
4271 '->' => \&do_POINTER,
4272 '++' => \&do_PLUS_PLUS,
4273 '=>' => \&do_FAT_COMMA,
4274 '--' => \&do_MINUS_MINUS,
4275 '&&' => \&do_LOGICAL_AND,
4276 '||' => \&do_LOGICAL_OR,
4277 '//' => \&do_SLASH_SLASH,
4279 # No special code for these types yet, but syntax checks
4314 # ------------------------------------------------------------
4315 # end hash of code for handling individual token types
4316 # ------------------------------------------------------------
4318 use constant DEBUG_TOKENIZE => 0;
4320 sub tokenize_this_line {
4322 # This routine breaks a line of perl code into tokens which are of use in
4323 # indentation and reformatting. One of my goals has been to define tokens
4324 # such that a newline may be inserted between any pair of tokens without
4325 # changing or invalidating the program. This version comes close to this,
4326 # although there are necessarily a few exceptions which must be caught by
4327 # the formatter. Many of these involve the treatment of bare words.
4329 # The tokens and their types are returned in arrays. See previous
4330 # routine for their names.
4332 # See also the array "valid_token_types" in the BEGIN section for an
4335 # To simplify things, token types are either a single character, or they
4336 # are identical to the tokens themselves.
4338 # As a debugging aid, the -D flag creates a file containing a side-by-side
4339 # comparison of the input string and its tokenization for each line of a file.
4340 # This is an invaluable debugging aid.
4342 # In addition to tokens, and some associated quantities, the tokenizer
4343 # also returns flags indication any special line types. These include
4344 # quotes, here_docs, formats.
4346 # -----------------------------------------------------------------------
4348 # How to add NEW_TOKENS:
4350 # New token types will undoubtedly be needed in the future both to keep up
4351 # with changes in perl and to help adapt the tokenizer to other applications.
4353 # Here are some notes on the minimal steps. I wrote these notes while
4354 # adding the 'v' token type for v-strings, which are things like version
4355 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
4356 # can use your editor to search for the string "NEW_TOKENS" to find the
4357 # appropriate sections to change):
4359 # *. Try to talk somebody else into doing it! If not, ..
4361 # *. Make a backup of your current version in case things don't work out!
4363 # *. Think of a new, unused character for the token type, and add to
4364 # the array @valid_token_types in the BEGIN section of this package.
4365 # For example, I used 'v' for v-strings.
4367 # *. Implement coding to recognize the $type of the token in this routine.
4368 # This is the hardest part, and is best done by imitating or modifying
4369 # some of the existing coding. For example, to recognize v-strings, I
4370 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4371 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4373 # *. Update sub operator_expected. This update is critically important but
4374 # the coding is trivial. Look at the comments in that routine for help.
4375 # For v-strings, which should behave like numbers, I just added 'v' to the
4376 # regex used to handle numbers and strings (types 'n' and 'Q').
4378 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4379 # Perl::Tidy::Formatter for breaking lines around this token type. You can
4380 # skip this step and take the default at first, then adjust later to get
4381 # desired results. For adding type 'v', I looked at sub bond_strength and
4382 # saw that number type 'n' was using default strengths, so I didn't do
4383 # anything. I may tune it up someday if I don't like the way line
4384 # breaks with v-strings look.
4386 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4387 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
4388 # and saw that type 'n' used spaces on both sides, so I just added 'v'
4389 # to the array @spaces_both_sides.
4391 # *. Update HtmlWriter package so that users can colorize the token as
4392 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
4393 # that package. For v-strings, I initially chose to use a default color
4394 # equal to the default for numbers, but it might be nice to change that
4397 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4399 # *. Run lots and lots of debug tests. Start with special files designed
4400 # to test the new token type. Run with the -D flag to create a .DEBUG
4401 # file which shows the tokenization. When these work ok, test as many old
4402 # scripts as possible. Start with all of the '.t' files in the 'test'
4403 # directory of the distribution file. Compare .tdy output with previous
4404 # version and updated version to see the differences. Then include as
4405 # many more files as possible. My own technique has been to collect a huge
4406 # number of perl scripts (thousands!) into one directory and run perltidy
4407 # *, then run diff between the output of the previous version and the
4410 # *. For another example, search for the smartmatch operator '~~'
4411 # with your editor to see where updates were made for it.
4413 # -----------------------------------------------------------------------
4415 my ( $self, $line_of_tokens ) = @_;
4416 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4418 # Extract line number for use in error messages
4419 $input_line_number = $line_of_tokens->{_line_number};
4421 # Check for pod documentation
4422 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4423 && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4426 # Must not be in multi-line quote
4427 # and must not be in an equation
4429 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4431 $self->[_in_pod_] = 1;
4436 $input_line = $untrimmed_input_line;
4440 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4441 # This will be used below to avoid quoting a bare word followed by
4445 # Reinitialize the multi-line quote flag
4446 if ( $in_quote && $quote_type eq 'Q' ) {
4447 $line_of_tokens->{_starting_in_quote} = 1;
4450 $line_of_tokens->{_starting_in_quote} = 0;
4452 # Trim start of this line unless we are continuing a quoted line.
4453 # Do not trim end because we might end in a quote (test: deken4.pl)
4454 # Perl::Tidy::Formatter will delete needless trailing blanks
4455 $input_line =~ s/^(\s+)//;
4457 # Calculate a guessed level for nonblank lines to avoid calls to
4458 # sub guess_old_indentation_level()
4459 if ( length($input_line) && $1 ) {
4460 my $leading_spaces = $1;
4461 my $spaces = length($leading_spaces);
4463 # handle leading tabs
4464 if ( ord( substr( $leading_spaces, 0, 1 ) ) == ORD_TAB
4465 && $leading_spaces =~ /^(\t+)/ )
4467 my $tabsize = $self->[_tabsize_];
4468 $spaces += length($1) * ( $tabsize - 1 );
4471 my $indent_columns = $self->[_indent_columns_];
4472 $line_of_tokens->{_guessed_indentation_level} =
4473 int( $spaces / $indent_columns );
4476 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4477 && $input_line =~ /^__(END|DATA)__\s*$/;
4480 # Optimize for a full-line comment.
4482 if ( substr( $input_line, 0, 1 ) eq '#' ) {
4484 # and check for skipped section
4485 if ( $rOpts_code_skipping
4486 && $input_line =~ /$code_skipping_pattern_begin/ )
4488 $self->[_in_skipped_] = 1;
4492 # Optional fast processing of a block comment
4494 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4495 my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4496 $line_of_tokens->{_line_type} = 'CODE';
4497 $line_of_tokens->{_rtokens} = [$input_line];
4498 $line_of_tokens->{_rtoken_type} = ['#'];
4499 $line_of_tokens->{_rlevels} = [$level_in_tokenizer];
4500 $line_of_tokens->{_rci_levels} = [$ci_string_i];
4501 $line_of_tokens->{_rblock_type} = [EMPTY_STRING];
4502 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4503 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4507 # Optimize handling of a blank line
4508 if ( !length($input_line) ) {
4509 $line_of_tokens->{_line_type} = 'CODE';
4510 $line_of_tokens->{_rtokens} = [];
4511 $line_of_tokens->{_rtoken_type} = [];
4512 $line_of_tokens->{_rlevels} = [];
4513 $line_of_tokens->{_rci_levels} = [];
4514 $line_of_tokens->{_rblock_type} = [];
4515 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4516 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4521 # update the copy of the line for use in error messages
4522 # This must be exactly what we give the pre_tokenizer
4523 $self->[_line_of_text_] = $input_line;
4525 # re-initialize for the main loop
4526 $routput_token_list = []; # stack of output token indexes
4527 $routput_token_type = []; # token types
4528 $routput_block_type = []; # types of code block
4529 $routput_container_type = []; # paren types, such as if, elsif, ..
4530 $routput_type_sequence = []; # nesting sequential number
4532 $rhere_target_list = [];
4534 $tok = $last_nonblank_token;
4535 $type = $last_nonblank_type;
4536 $prototype = $last_nonblank_prototype;
4537 $last_nonblank_i = -1;
4538 $block_type = $last_nonblank_block_type;
4539 $container_type = $last_nonblank_container_type;
4540 $type_sequence = $last_nonblank_type_sequence;
4544 $self->tokenizer_main_loop($is_END_or_DATA);
4546 #-----------------------------------------------
4547 # all done tokenizing this line ...
4548 # now prepare the final list of tokens and types
4549 #-----------------------------------------------
4551 $self->tokenizer_wrapup_line($line_of_tokens);
4554 } ## end sub tokenize_this_line
4556 sub tokenizer_main_loop {
4558 my ( $self, $is_END_or_DATA ) = @_;
4560 #---------------------------------
4561 # Break one input line into tokens
4562 #---------------------------------
4565 # $is_END_or_DATA is true for a __END__ or __DATA__ line
4567 # start by breaking the line into pre-tokens
4568 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
4569 ( $rtokens, $rtoken_map, $rtoken_type ) =
4570 pre_tokenize( $input_line, $max_tokens_wanted );
4572 $max_token_index = scalar( @{$rtokens} ) - 1;
4573 push( @{$rtokens}, SPACE, SPACE, SPACE )
4574 ; # extra whitespace simplifies logic
4575 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
4576 push( @{$rtoken_type}, 'b', 'b', 'b' );
4578 # initialize for main loop
4579 if (0) { #<<< this is not necessary
4580 foreach my $ii ( 0 .. $max_token_index + 3 ) {
4581 $routput_token_type->[$ii] = EMPTY_STRING;
4582 $routput_block_type->[$ii] = EMPTY_STRING;
4583 $routput_container_type->[$ii] = EMPTY_STRING;
4584 $routput_type_sequence->[$ii] = EMPTY_STRING;
4585 $routput_indent_flag->[$ii] = 0;
4592 #-----------------------------
4593 # begin main tokenization loop
4594 #-----------------------------
4596 # we are looking at each pre-token of one line and combining them
4598 while ( ++$i <= $max_token_index ) {
4600 # continue looking for the end of a quote
4603 last if ( $in_quote || $i > $max_token_index );
4606 if ( $type ne 'b' && $tok ne 'CORE::' ) {
4608 # try to catch some common errors
4609 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
4611 if ( $last_nonblank_token eq 'eq' ) {
4612 complain("Should 'eq' be '==' here ?\n");
4614 elsif ( $last_nonblank_token eq 'ne' ) {
4615 complain("Should 'ne' be '!=' here ?\n");
4619 # fix c090, only rotate vars if a new token will be stored
4620 if ( $i_tok >= 0 ) {
4621 $last_last_nonblank_token = $last_nonblank_token;
4622 $last_last_nonblank_type = $last_nonblank_type;
4623 $last_last_nonblank_block_type = $last_nonblank_block_type;
4624 $last_last_nonblank_container_type =
4625 $last_nonblank_container_type;
4626 $last_last_nonblank_type_sequence =
4627 $last_nonblank_type_sequence;
4629 # Fix part #3 for git82: propagate type 'Z' though L-R pair
4630 unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
4631 $last_nonblank_token = $tok;
4632 $last_nonblank_type = $type;
4634 $last_nonblank_prototype = $prototype;
4635 $last_nonblank_block_type = $block_type;
4636 $last_nonblank_container_type = $container_type;
4637 $last_nonblank_type_sequence = $type_sequence;
4638 $last_nonblank_i = $i_tok;
4641 # Patch for c030: Fix things in case a '->' got separated from
4642 # the subsequent identifier by a side comment. We need the
4643 # last_nonblank_token to have a leading -> to avoid triggering
4644 # an operator expected error message at the next '('. See also
4646 if ( $last_last_nonblank_token eq '->' ) {
4647 if ( $last_nonblank_type eq 'w'
4648 || $last_nonblank_type eq 'i' )
4650 $last_nonblank_token = '->' . $last_nonblank_token;
4651 $last_nonblank_type = 'i';
4656 # store previous token type
4657 if ( $i_tok >= 0 ) {
4658 $routput_token_type->[$i_tok] = $type;
4659 $routput_block_type->[$i_tok] = $block_type;
4660 $routput_container_type->[$i_tok] = $container_type;
4661 $routput_type_sequence->[$i_tok] = $type_sequence;
4662 $routput_indent_flag->[$i_tok] = $indent_flag;
4665 # get the next pre-token and type
4666 # $tok and $type will be modified to make the output token
4667 my $pre_tok = $tok = $rtokens->[$i]; # get the next pre-token
4668 my $pre_type = $type = $rtoken_type->[$i]; # and type
4670 # remember the starting index of this token; we will be updating $i
4673 # re-initialize various flags for the next output token
4674 $block_type &&= EMPTY_STRING;
4675 $container_type &&= EMPTY_STRING;
4676 $type_sequence &&= EMPTY_STRING;
4678 $prototype &&= EMPTY_STRING;
4680 # this pre-token will start an output token
4681 push( @{$routput_token_list}, $i_tok );
4683 #--------------------------
4684 # handle a whitespace token
4685 #--------------------------
4686 next if ( $pre_type eq 'b' );
4691 last if ( $pre_type eq '#' );
4693 # continue gathering identifier if necessary
4694 if ($id_scan_state) {
4696 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
4703 if ($id_scan_state) {
4705 # Still scanning ...
4706 # Check for side comment between sub and prototype (c061)
4708 # done if nothing left to scan on this line
4709 last if ( $i > $max_token_index );
4711 my ( $next_nonblank_token, $i_next ) =
4712 find_next_nonblank_token_on_this_line( $i, $rtokens,
4715 # done if it was just some trailing space
4716 last if ( $i_next > $max_token_index );
4718 # something remains on the line ... must be a side comment
4722 next if ( ( $i > 0 ) || $type );
4724 # didn't find any token; start over
4729 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE;
4730 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
4732 #-----------------------------------------------------------
4733 # Combine pre-tokens into digraphs and trigraphs if possible
4734 #-----------------------------------------------------------
4736 # See if we can make a digraph...
4737 # The following tokens are excluded and handled specially:
4738 # '/=' is excluded because the / might start a pattern.
4739 # 'x=' is excluded since it might be $x=, with $ on previous line
4740 # '**' and *= might be typeglobs of punctuation variables
4741 # I have allowed tokens starting with <, such as <=,
4742 # because I don't think these could be valid angle operators.
4743 # test file: storrs4.pl
4744 if ( $can_start_digraph{$tok}
4745 && $i < $max_token_index
4746 && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
4750 my $test_tok = $tok . $rtokens->[ $i + 1 ];
4752 # check for special cases which cannot be combined
4754 # '//' must be defined_or operator if an operator is expected.
4755 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
4756 # could be migrated here for clarity
4758 # Patch for RT#102371, misparsing a // in the following snippet:
4759 # state $b //= ccc();
4760 # The solution is to always accept the digraph (or trigraph)
4761 # after type 'Z' (possible file handle). The reason is that
4762 # sub operator_expected gives TERM expected here, which is
4763 # wrong in this case.
4764 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
4766 # note that here $tok = '/' and the next tok and type is '/'
4767 $expecting = operator_expected( [ $prev_type, $tok, '/' ] );
4769 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
4770 $combine_ok = 0 if ( $expecting == TERM );
4773 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
4774 # Accept the digraphs '**' only after type 'Z'
4775 # Otherwise postpone the decision.
4776 if ( $test_tok eq '**' ) {
4777 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
4782 # still ok to combine?
4785 && ( $test_tok ne '/=' ) # might be pattern
4786 && ( $test_tok ne 'x=' ) # might be $x
4787 && ( $test_tok ne '*=' ) # typeglob?
4789 # Moved above as part of fix for
4790 # RT #114359: Missparsing of "print $x ** 0.5;
4791 # && ( $test_tok ne '**' ) # typeglob?
4797 # Now try to assemble trigraphs. Note that all possible
4798 # perl trigraphs can be constructed by appending a character
4800 $test_tok = $tok . $rtokens->[ $i + 1 ];
4802 if ( $is_trigraph{$test_tok} ) {
4807 # The only current tetragraph is the double diamond operator
4808 # and its first three characters are not a trigraph, so
4809 # we do can do a special test for it
4810 elsif ( $test_tok eq '<<>' ) {
4811 $test_tok .= $rtokens->[ $i + 2 ];
4812 if ( $is_tetragraph{$test_tok} ) {
4821 $next_tok = $rtokens->[ $i + 1 ];
4822 $next_type = $rtoken_type->[ $i + 1 ];
4824 DEBUG_TOKENIZE && do {
4825 local $LIST_SEPARATOR = ')(';
4827 $last_nonblank_token, $tok,
4828 $next_tok, $brace_depth,
4829 $brace_type[$brace_depth], $paren_depth,
4830 $paren_type[$paren_depth],
4832 print STDOUT "TOKENIZE:(@debug_list)\n";
4835 # Turn off attribute list on first non-blank, non-bareword.
4836 # Added '#' to fix c038 (later moved above).
4837 if ( $in_attribute_list && $pre_type ne 'w' ) {
4838 $in_attribute_list = 0;
4841 #--------------------------------------------------------
4842 # We have the next token, $tok.
4843 # Now we have to examine this token and decide what it is
4844 # and define its $type
4846 # section 1: bare words
4847 #--------------------------------------------------------
4849 if ( $pre_type eq 'w' ) {
4851 operator_expected( [ $prev_type, $tok, $next_type ] );
4852 my $is_last = do_BAREWORD($is_END_or_DATA);
4856 #-----------------------------
4857 # section 2: strings of digits
4858 #-----------------------------
4859 elsif ( $pre_type eq 'd' ) {
4861 operator_expected( [ $prev_type, $tok, $next_type ] );
4865 #----------------------------
4866 # section 3: all other tokens
4867 #----------------------------
4869 my $code = $tokenization_code->{$tok};
4872 operator_expected( [ $prev_type, $tok, $next_type ] );
4879 # -----------------------------
4880 # end of main tokenization loop
4881 # -----------------------------
4883 # Store the final token
4884 if ( $i_tok >= 0 ) {
4885 $routput_token_type->[$i_tok] = $type;
4886 $routput_block_type->[$i_tok] = $block_type;
4887 $routput_container_type->[$i_tok] = $container_type;
4888 $routput_type_sequence->[$i_tok] = $type_sequence;
4889 $routput_indent_flag->[$i_tok] = $indent_flag;
4892 # Remember last nonblank values
4893 if ( $type ne 'b' && $type ne '#' ) {
4894 $last_last_nonblank_token = $last_nonblank_token;
4895 $last_last_nonblank_type = $last_nonblank_type;
4896 $last_last_nonblank_block_type = $last_nonblank_block_type;
4897 $last_last_nonblank_container_type = $last_nonblank_container_type;
4898 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
4899 $last_nonblank_token = $tok;
4900 $last_nonblank_type = $type;
4901 $last_nonblank_block_type = $block_type;
4902 $last_nonblank_container_type = $container_type;
4903 $last_nonblank_type_sequence = $type_sequence;
4904 $last_nonblank_prototype = $prototype;
4907 # reset indentation level if necessary at a sub or package
4908 # in an attempt to recover from a nesting error
4909 if ( $level_in_tokenizer < 0 ) {
4910 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4911 reset_indentation_level(0);
4912 brace_warning("resetting level to 0 at $1 $2\n");
4916 $self->[_in_attribute_list_] = $in_attribute_list;
4917 $self->[_in_quote_] = $in_quote;
4918 $self->[_quote_target_] =
4919 $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
4920 $self->[_rhere_target_list_] = $rhere_target_list;
4923 } ## end sub tokenizer_main_loop
4925 sub tokenizer_wrapup_line {
4926 my ( $self, $line_of_tokens ) = @_;
4928 #---------------------------------------------------------
4929 # Package a line of tokens for shipping back to the caller
4930 #---------------------------------------------------------
4932 # Most of the remaining work involves defining the two indentation
4933 # parameters that the formatter needs for each token:
4934 # - $level = structural indentation level and
4935 # - $ci_level = continuation indentation level
4937 # The method for setting the indentation level is straightforward.
4938 # But the method used to define the continuation indentation is
4939 # complicated because it has evolved over a long time by trial and
4940 # error. It could undoubtedly be simplified but it works okay as is.
4942 # Here is a brief description of how indentation is computed.
4943 # Perl::Tidy computes indentation as the sum of 2 terms:
4945 # (1) structural indentation, such as if/else/elsif blocks
4946 # (2) continuation indentation, such as long parameter call lists.
4948 # These are occasionally called primary and secondary indentation.
4950 # Structural indentation is introduced by tokens of type '{',
4951 # although the actual tokens might be '{', '(', or '['. Structural
4952 # indentation is of two types: BLOCK and non-BLOCK. Default
4953 # structural indentation is 4 characters if the standard indentation
4956 # Continuation indentation is introduced whenever a line at BLOCK
4957 # level is broken before its termination. Default continuation
4958 # indentation is 2 characters in the standard indentation scheme.
4960 # Both types of indentation may be nested arbitrarily deep and
4961 # interlaced. The distinction between the two is somewhat arbitrary.
4963 # For each token, we will define two variables which would apply if
4964 # the current statement were broken just before that token, so that
4965 # that token started a new line:
4967 # $level = the structural indentation level,
4968 # $ci_level = the continuation indentation level
4970 # The total indentation will be $level * (4 spaces) + $ci_level * (2
4971 # spaces), assuming defaults. However, in some special cases it is
4972 # customary to modify $ci_level from this strict value.
4974 # The total structural indentation is easy to compute by adding and
4975 # subtracting 1 from a saved value as types '{' and '}' are seen.
4976 # The running value of this variable is $level_in_tokenizer.
4978 # The total continuation is much more difficult to compute, and
4979 # requires several variables. These variables are:
4981 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4982 # each indentation level, if there are intervening open secondary
4983 # structures just prior to that level.
4984 # $continuation_string_in_tokenizer = a string of 1's and 0's
4985 # indicating if the last token at that level is "continued", meaning
4986 # that it is not the first token of an expression.
4987 # $nesting_block_string = a string of 1's and 0's indicating, for each
4988 # indentation level, if the level is of type BLOCK or not.
4989 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
4990 # $nesting_list_string = a string of 1's and 0's indicating, for each
4991 # indentation level, if it is appropriate for list formatting.
4992 # If so, continuation indentation is used to indent long list items.
4993 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
4994 # @{$rslevel_stack} = a stack of total nesting depths at each
4995 # structural indentation level, where "total nesting depth" means
4996 # the nesting depth that would occur if every nesting token
4997 # -- '{', '[', # and '(' -- , regardless of context, is used to
4998 # compute a nesting depth.
5000 # Notes on the Continuation Indentation
5002 # There is a sort of chicken-and-egg problem with continuation
5003 # indentation. The formatter can't make decisions on line breaks
5004 # without knowing what 'ci' will be at arbitrary locations.
5006 # But a problem with setting the continuation indentation (ci) here
5007 # in the tokenizer is that we do not know where line breaks will
5008 # actually be. As a result, we don't know if we should propagate
5009 # continuation indentation to higher levels of structure.
5011 # For nesting of only structural indentation, we never need to do
5012 # this. For example, in a long if statement, like this
5014 # if ( !$output_block_type[$i]
5015 # && ($in_statement_continuation) )
5020 # the second line has ci but we do normally give the lines within
5021 # the BLOCK any ci. This would be true if we had blocks nested
5022 # arbitrarily deeply.
5024 # But consider something like this, where we have created a break
5025 # after an opening paren on line 1, and the paren is not (currently)
5026 # a structural indentation token:
5028 # my $file = $menubar->Menubutton(
5029 # qw/-text File -underline 0 -menuitems/ => [
5031 # Cascade => '~View',
5035 # The second line has ci, so it would seem reasonable to propagate
5036 # it down, giving the third line 1 ci + 1 indentation. This
5037 # suggests the following rule, which is currently used to
5038 # propagating ci down: if there are any non-structural opening
5039 # parens (or brackets, or braces), before an opening structural
5040 # brace, then ci is propagated down, and otherwise
5041 # not. The variable $intervening_secondary_structure contains this
5042 # information for the current token, and the string
5043 # "$ci_string_in_tokenizer" is a stack of previous values of this
5046 my @token_type = (); # stack of output token types
5047 my @block_type = (); # stack of output code block types
5048 my @type_sequence = (); # stack of output type sequence numbers
5049 my @tokens = (); # output tokens
5050 my @levels = (); # structural brace levels of output tokens
5051 my @ci_string = (); # string needed to compute continuation indentation
5053 # Count the number of '1's in the string (previously sub ones_count)
5054 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5056 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5058 my ( $ci_string_i, $level_i );
5064 foreach my $i ( @{$routput_token_list} ) {
5066 my $type_i = $routput_token_type->[$i];
5067 $level_i = $level_in_tokenizer;
5069 # Quick handling of indentation levels for blanks and comments
5070 if ( $type_i eq 'b' || $type_i eq '#' ) {
5071 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5077 # $tok_i is the PRE-token. It only equals the token for symbols
5078 my $tok_i = $rtokens->[$i];
5080 # Check for an invalid token type..
5081 # This can happen by running perltidy on non-scripts although
5082 # it could also be bug introduced by programming change. Perl
5083 # silently accepts a 032 (^Z) and takes it as the end
5084 if ( !$is_valid_token_type{$type_i} ) {
5085 my $val = ord($type_i);
5087 "unexpected character decimal $val ($type_i) in script\n"
5089 $self->[_in_error_] = 1;
5092 # $ternary_indentation_flag indicates that we need a change
5093 # in level at a nested ternary, as follows
5094 # 1 => at a nested ternary ?
5095 # -1 => at a nested ternary :
5097 my $ternary_indentation_flag = $routput_indent_flag->[$i];
5099 #-------------------------------------------
5100 # Section 1: handle a level-increasing token
5101 #-------------------------------------------
5102 # set primary indentation levels based on structural braces
5103 # Note: these are set so that the leading braces have a HIGHER
5104 # level than their CONTENTS, which is convenient for indentation
5105 # Also, define continuation indentation for each token.
5108 || $ternary_indentation_flag > 0 )
5111 # if the difference between total nesting levels is not 1,
5112 # there are intervening non-structural nesting types between
5113 # this '{' and the previous unclosed '{'
5114 my $intervening_secondary_structure = 0;
5115 if ( @{$rslevel_stack} ) {
5116 $intervening_secondary_structure =
5117 $slevel_in_tokenizer - $rslevel_stack->[-1];
5120 # save the current states
5121 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
5122 $level_in_tokenizer++;
5124 if ( $level_in_tokenizer > $self->[_maximum_level_] ) {
5125 $self->[_maximum_level_] = $level_in_tokenizer;
5128 if ($ternary_indentation_flag) {
5130 # break BEFORE '?' in a nested ternary
5131 if ( $type_i eq '?' ) {
5132 $level_i = $level_in_tokenizer;
5135 $nesting_block_string .= "$nesting_block_flag";
5136 } ## end if ($ternary_indentation_flag)
5139 if ( $routput_block_type->[$i] ) {
5140 $nesting_block_flag = 1;
5141 $nesting_block_string .= '1';
5144 $nesting_block_flag = 0;
5145 $nesting_block_string .= '0';
5149 # we will use continuation indentation within containers
5150 # which are not blocks and not logical expressions
5152 if ( !$routput_block_type->[$i] ) {
5154 # propagate flag down at nested open parens
5155 if ( $routput_container_type->[$i] eq '(' ) {
5156 $bit = 1 if $nesting_list_flag;
5159 # use list continuation if not a logical grouping
5160 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5164 $is_logical_container{ $routput_container_type
5168 $nesting_list_string .= $bit;
5169 $nesting_list_flag = $bit;
5171 $ci_string_in_tokenizer .=
5172 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5174 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5175 $continuation_string_in_tokenizer .=
5176 ( $in_statement_continuation > 0 ) ? '1' : '0';
5178 # Sometimes we want to give an opening brace
5179 # continuation indentation, and sometimes not. For code
5180 # blocks, we don't do it, so that the leading '{' gets
5181 # outdented, like this:
5183 # if ( !$output_block_type[$i]
5184 # && ($in_statement_continuation) )
5187 # For other types, we will give them continuation
5188 # indentation. For example, here is how a list looks
5189 # with the opening paren indented:
5192 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5193 # [ "homer", "marge", "bart" ], );
5195 # This looks best when 'ci' is one-half of the
5196 # indentation (i.e., 2 and 4)
5198 my $total_ci = $ci_string_sum;
5200 !$routput_block_type->[$i] # patch: skip for BLOCK
5201 && ($in_statement_continuation)
5202 && !( $ternary_indentation_flag && $type_i eq ':' )
5205 $total_ci += $in_statement_continuation
5207 substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5210 $ci_string_i = $total_ci;
5211 $in_statement_continuation = 0;
5212 } ## end if ( $type_i eq '{' ||...})
5214 #-------------------------------------------
5215 # Section 2: handle a level-decreasing token
5216 #-------------------------------------------
5217 elsif ($type_i eq '}'
5219 || $ternary_indentation_flag < 0 )
5222 # only a nesting error in the script would prevent
5224 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
5226 $level_i = --$level_in_tokenizer;
5228 if ( $level_in_tokenizer < 0 ) {
5229 unless ( $self->[_saw_negative_indentation_] ) {
5230 $self->[_saw_negative_indentation_] = 1;
5231 warning("Starting negative indentation\n");
5235 # restore previous level values
5236 if ( length($nesting_block_string) > 1 )
5237 { # true for valid script
5238 chop $nesting_block_string;
5239 $nesting_block_flag =
5240 substr( $nesting_block_string, -1 ) eq '1';
5241 chop $nesting_list_string;
5242 $nesting_list_flag =
5243 substr( $nesting_list_string, -1 ) eq '1';
5245 chop $ci_string_in_tokenizer;
5247 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5249 $in_statement_continuation =
5250 chop $continuation_string_in_tokenizer;
5252 # zero continuation flag at terminal BLOCK '}' which
5254 my $block_type_i = $routput_block_type->[$i];
5255 if ($block_type_i) {
5257 # ...These include non-anonymous subs
5258 # note: could be sub ::abc { or sub 'abc
5259 if ( substr( $block_type_i, 0, 3 ) eq 'sub'
5260 && $block_type_i =~ m/^sub\s*/gc )
5263 # note: older versions of perl require the /gc
5264 # modifier here or else the \G does not work.
5265 $in_statement_continuation = 0
5266 if ( $block_type_i =~ /\G('|::|\w)/gc );
5269 # ...and include all block types except user subs
5270 # with block prototypes and these:
5271 # (sort|grep|map|do|eval)
5273 $is_zero_continuation_block_type{$block_type_i}
5276 $in_statement_continuation = 0;
5279 # ..but these are not terminal types:
5280 # /^(sort|grep|map|do|eval)$/ )
5281 elsif ($is_sort_map_grep_eval_do{$block_type_i}
5282 || $is_grep_alias{$block_type_i} )
5286 # ..and a block introduced by a label
5288 elsif ( $block_type_i =~ /:$/ ) {
5289 $in_statement_continuation = 0;
5292 # user function with block prototype
5294 $in_statement_continuation = 0;
5296 } ## end if ($block_type_i)
5298 # If we are in a list, then
5299 # we must set continuation indentation at the closing
5300 # paren of something like this (paren after $check):
5303 # ( not defined $check )
5305 # or $check eq "new"
5306 # or $check eq "old",
5308 elsif ( $tok_i eq ')' ) {
5309 $in_statement_continuation = 1
5312 $routput_container_type->[$i]
5315 ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5317 } ## end if ( length($nesting_block_string...))
5319 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5320 } ## end elsif ( $type_i eq '}' ||...{)
5322 #-----------------------------------------
5323 # Section 3: handle a constant level token
5324 #-----------------------------------------
5327 # zero the continuation indentation at certain tokens so
5328 # that they will be at the same level as its container. For
5329 # commas, this simplifies the -lp indentation logic, which
5330 # counts commas. For ?: it makes them stand out.
5333 ## $type_i =~ /^[,\?\:]$/
5334 && $is_comma_question_colon{$type_i}
5337 $in_statement_continuation = 0;
5340 # Be sure binary operators get continuation indentation.
5341 # Note: the check on $nesting_block_flag is only needed
5342 # to add ci to binary operators following a 'try' block,
5343 # or similar extended syntax block operator (see c158).
5345 !$in_statement_continuation
5346 && ( $nesting_block_flag || $nesting_list_flag )
5347 && ( $type_i eq 'k' && $is_binary_keyword{$tok_i}
5348 || $is_binary_type{$type_i} )
5351 $in_statement_continuation = 1;
5354 # continuation indentation is sum of any open ci from
5355 # previous levels plus the current level
5356 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5358 # update continuation flag ...
5360 # if we are in a BLOCK
5361 if ($nesting_block_flag) {
5363 # the next token after a ';' and label starts a new stmt
5364 if ( $type_i eq ';' || $type_i eq 'J' ) {
5365 $in_statement_continuation = 0;
5368 # otherwise, we are continuing the current statement
5370 $in_statement_continuation = 1;
5374 # if we are not in a BLOCK..
5377 # do not use continuation indentation if not list
5378 # environment (could be within if/elsif clause)
5379 if ( !$nesting_list_flag ) {
5380 $in_statement_continuation = 0;
5383 # otherwise, the token after a ',' starts a new term
5385 # Patch FOR RT#99961; no continuation after a ';'
5386 # This is needed because perltidy currently marks
5387 # a block preceded by a type character like % or @
5388 # as a non block, to simplify formatting. But these
5389 # are actually blocks and can have semicolons.
5390 # See code_block_type() and is_non_structural_brace().
5391 elsif ( $type_i eq ',' || $type_i eq ';' ) {
5392 $in_statement_continuation = 0;
5395 # otherwise, we are continuing the current term
5397 $in_statement_continuation = 1;
5399 } ## end else [ if ($nesting_block_flag)]
5401 } ## end else [ if ( $type_i eq '{' ||...})]
5403 #-------------------------------------------
5404 # Section 4: operations common to all levels
5405 #-------------------------------------------
5407 # set secondary nesting levels based on all containment token
5408 # types Note: these are set so that the nesting depth is the
5409 # depth of the PREVIOUS TOKEN, which is convenient for setting
5410 # the strength of token bonds
5413 if ( $is_opening_type{$type_i} ) {
5414 $slevel_in_tokenizer++;
5415 $nesting_token_string .= $tok_i;
5416 $nesting_type_string .= $type_i;
5420 elsif ( $is_closing_type{$type_i} ) {
5421 $slevel_in_tokenizer--;
5422 my $char = chop $nesting_token_string;
5424 if ( $char ne $matching_start_token{$tok_i} ) {
5425 $nesting_token_string .= $char . $tok_i;
5426 $nesting_type_string .= $type_i;
5429 chop $nesting_type_string;
5433 # apply token type patch:
5434 # - output anonymous 'sub' as keyword (type 'k')
5435 # - output __END__, __DATA__, and format as type 'k' instead
5436 # of ';' to make html colors correct, etc.
5437 # The following hash tests are equivalent to these older tests:
5438 # if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5439 # if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5440 if ( $is_END_DATA_format_sub{$tok_i}
5441 && $is_semicolon_or_t{$type_i} )
5445 } ## end else [ if ( $type_i eq 'b' ||...)]
5447 #--------------------------------
5448 # Store the values for this token
5449 #--------------------------------
5450 push( @ci_string, $ci_string_i );
5451 push( @levels, $level_i );
5452 push( @block_type, $routput_block_type->[$i] );
5453 push( @type_sequence, $routput_type_sequence->[$i] );
5454 push( @token_type, $type_i );
5456 # Form and store the PREVIOUS token
5457 if ( defined($rtoken_map_im) ) {
5459 $rtoken_map->[$i] - $rtoken_map_im; # how many characters
5463 substr( $input_line, $rtoken_map_im, $numc ) );
5467 # Should not happen unless @{$rtoken_map} is corrupted
5470 "number of characters is '$numc' but should be >0\n");
5474 # or grab some values for the leading token (needed for log output)
5476 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5479 $rtoken_map_im = $rtoken_map->[$i];
5480 } ## end foreach my $i ( @{$routput_token_list...})
5482 #------------------------
5483 # End loop to over tokens
5484 #------------------------
5486 # Form and store the final token of this line
5487 if ( defined($rtoken_map_im) ) {
5488 my $numc = length($input_line) - $rtoken_map_im;
5490 push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) );
5494 # Should not happen unless @{$rtoken_map} is corrupted
5497 "Number of Characters is '$numc' but should be >0\n");
5501 #----------------------------------------------------------
5502 # Wrap up this line of tokens for shipping to the Formatter
5503 #----------------------------------------------------------
5504 $line_of_tokens->{_rtoken_type} = \@token_type;
5505 $line_of_tokens->{_rtokens} = \@tokens;
5506 $line_of_tokens->{_rblock_type} = \@block_type;
5507 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5508 $line_of_tokens->{_rlevels} = \@levels;
5509 $line_of_tokens->{_rci_levels} = \@ci_string;
5512 } ## end sub tokenizer_wrapup_line
5513 } ## end tokenize_this_line
5515 #######################################################################
5516 # Tokenizer routines which assist in identifying token types
5517 #######################################################################
5519 # hash lookup table of operator expected values
5520 my %op_expected_table;
5522 # exceptions to perl's weird parsing rules after type 'Z'
5523 my %is_weird_parsing_rule_exception;
5525 my %is_paren_dollar;
5531 # Always expecting TERM following these types:
5532 # note: this is identical to '@value_requestor_type' defined later.
5534 ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5535 || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5536 &= // >> ~. &. |. ^.
5537 ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5540 push @q, '('; # for completeness, not currently a token type
5541 push @q, '->'; # was previously in UNKNOWN
5542 @{op_expected_table}{@q} = (TERM) x scalar(@q);
5544 # Always UNKNOWN following these types;
5545 # previously had '->' in this list for c030
5547 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5549 # Always expecting OPERATOR ...
5550 # 'n' and 'v' are currently excluded because they might be VERSION numbers
5551 # 'i' is currently excluded because it might be a package
5552 # 'q' is currently excluded because it might be a prototype
5553 # Fix for c030: removed '->' from this list:
5554 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
5556 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5558 # Fix for git #62: added '*' and '%'
5560 @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5563 @{is_paren_dollar}{@q} = (1) x scalar(@q);
5566 @{is_n_v}{@q} = (1) x scalar(@q);
5570 use constant DEBUG_OPERATOR_EXPECTED => 0;
5572 sub operator_expected {
5574 # Returns a parameter indicating what types of tokens can occur next
5577 # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5579 # $prev_type is the type of the previous token (blank or not)
5580 # $tok is the current token
5581 # $next_type is the type of the next token (blank or not)
5583 # Many perl symbols have two or more meanings. For example, '<<'
5584 # can be a shift operator or a here-doc operator. The
5585 # interpretation of these symbols depends on the current state of
5586 # the tokenizer, which may either be expecting a term or an
5587 # operator. For this example, a << would be a shift if an OPERATOR
5588 # is expected, and a here-doc if a TERM is expected. This routine
5589 # is called to make this decision for any current token. It returns
5590 # one of three possible values:
5592 # OPERATOR - operator expected (or at least, not a term)
5593 # UNKNOWN - can't tell
5594 # TERM - a term is expected (or at least, not an operator)
5596 # The decision is based on what has been seen so far. This
5597 # information is stored in the "$last_nonblank_type" and
5598 # "$last_nonblank_token" variables. For example, if the
5599 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5600 # if $last_nonblank_type is 'n' (numeric), we are expecting an
5603 # If a UNKNOWN is returned, the calling routine must guess. A major
5604 # goal of this tokenizer is to minimize the possibility of returning
5605 # UNKNOWN, because a wrong guess can spoil the formatting of a
5608 # Adding NEW_TOKENS: it is critically important that this routine be
5609 # updated to allow it to determine if an operator or term is to be
5610 # expected after the new token. Doing this simply involves adding
5611 # the new token character to one of the regexes in this routine or
5612 # to one of the hash lists
5613 # that it uses, which are initialized in the BEGIN section.
5614 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5617 # When possible, token types should be selected such that we can determine
5618 # the 'operator_expected' value by a simple hash lookup. If there are
5619 # exceptions, that is an indication that a new type is needed.
5627 # Many types are can be obtained by a table lookup given the previous type.
5628 # This typically handles half or more of the calls.
5629 my $op_expected = $op_expected_table{$last_nonblank_type};
5630 if ( defined($op_expected) ) {
5631 DEBUG_OPERATOR_EXPECTED
5633 "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5634 return $op_expected;
5637 #---------------------
5638 # Handle special cases
5639 #---------------------
5641 $op_expected = UNKNOWN;
5642 my ( $prev_type, $tok, $next_type ) = @{$rarg};
5644 # Types 'k', '}' and 'Z' depend on context
5645 # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context.
5648 if ( $last_nonblank_type eq 'i' ) {
5649 $op_expected = OPERATOR;
5651 # TODO: it would be cleaner to make this a special type
5652 # expecting VERSION or {} after package NAMESPACE;
5653 # maybe mark these words as type 'Y'?
5654 if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
5655 && $statement_type =~ /^package\b/
5656 && $last_nonblank_token =~ /^package\b/ )
5658 $op_expected = TERM;
5663 elsif ( $last_nonblank_type eq 'k' ) {
5664 $op_expected = TERM;
5665 if ( $expecting_operator_token{$last_nonblank_token} ) {
5666 $op_expected = OPERATOR;
5668 elsif ( $expecting_term_token{$last_nonblank_token} ) {
5670 # Exceptions from TERM:
5672 # // may follow perl functions which may be unary operators
5673 # see test file dor.t (defined or);
5676 && $next_type eq '/'
5677 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5678 $last_nonblank_token}
5681 $op_expected = OPERATOR;
5684 # Patch to allow a ? following 'split' to be a deprecated pattern
5685 # delimiter. This patch is coordinated with the omission of split
5687 # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5688 # will force perltidy to guess.
5690 && $last_nonblank_token eq 'split' )
5692 $op_expected = UNKNOWN;
5697 # closing container token...
5699 # Note that the actual token for type '}' may also be a ')'.
5701 # Also note that $last_nonblank_token is not the token corresponding to
5702 # $last_nonblank_type when the type is a closing container. In that
5703 # case it is the token before the corresponding opening container token.
5704 # So for example, for this snippet
5705 # $a = do { BLOCK } / 2;
5706 # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5708 elsif ( $last_nonblank_type eq '}' ) {
5709 $op_expected = UNKNOWN;
5711 # handle something after 'do' and 'eval'
5712 if ( $is_block_operator{$last_nonblank_token} ) {
5714 # something like $a = do { BLOCK } / 2;
5715 $op_expected = OPERATOR; # block mode following }
5718 # $last_nonblank_token =~ /^(\)|\$|\-\>)/
5719 elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5720 || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5722 $op_expected = OPERATOR;
5723 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5726 # Check for smartmatch operator before preceding brace or square
5727 # bracket. For example, at the ? after the ] in the following
5728 # expressions we are expecting an operator:
5730 # qr/3/ ~~ ['1234'] ? 1 : 0;
5731 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5732 elsif ( $last_nonblank_token eq '~~' ) {
5733 $op_expected = OPERATOR;
5736 # A right brace here indicates the end of a simple block. All
5737 # non-structural right braces have type 'R' all braces associated with
5738 # block operator keywords have been given those keywords as
5739 # "last_nonblank_token" and caught above. (This statement is order
5740 # dependent, and must come after checking $last_nonblank_token).
5743 # patch for dor.t (defined or).
5745 && $next_type eq '/'
5746 && $last_nonblank_token eq ']' )
5748 $op_expected = OPERATOR;
5751 # Patch for RT #116344: misparse a ternary operator after an
5752 # anonymous hash, like this:
5753 # return ref {} ? 1 : 0;
5754 # The right brace should really be marked type 'R' in this case,
5755 # and it is safest to return an UNKNOWN here. Expecting a TERM will
5756 # cause the '?' to always be interpreted as a pattern delimiter
5757 # rather than introducing a ternary operator.
5758 elsif ( $tok eq '?' ) {
5759 $op_expected = UNKNOWN;
5762 $op_expected = TERM;
5767 # number or v-string...
5768 # An exception is for VERSION numbers a 'use' statement. It has the format
5769 # use Module VERSION LIST
5770 # We could avoid this exception by writing a special sub to parse 'use'
5771 # statements and perhaps mark these numbers with a new type V (for VERSION)
5772 ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5773 elsif ( $is_n_v{$last_nonblank_type} ) {
5774 $op_expected = OPERATOR;
5775 if ( $statement_type eq 'use' ) {
5776 $op_expected = UNKNOWN;
5781 # TODO: labeled prototype words would better be given type 'A' or maybe
5782 # 'J'; not 'q'; or maybe mark as type 'Y'?
5783 elsif ( $last_nonblank_type eq 'q' ) {
5784 $op_expected = OPERATOR;
5785 if ( $last_nonblank_token eq 'prototype' ) {
5786 $op_expected = TERM;
5790 # file handle or similar
5791 elsif ( $last_nonblank_type eq 'Z' ) {
5793 $op_expected = UNKNOWN;
5796 if ( $last_nonblank_token =~ /^\w/ ) {
5797 $op_expected = UNKNOWN;
5800 # Exception to weird parsing rules for 'x(' ... see case b1205:
5801 # In something like 'print $vv x(...' the x is an operator;
5802 # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
5803 # otherwise x follows the weird parsing rules.
5804 elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
5805 $op_expected = OPERATOR;
5808 # The 'weird parsing rules' of next section do not work for '<' and '?'
5809 # It is best to mark them as unknown. Test case:
5811 elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5812 $op_expected = UNKNOWN;
5815 # For possible file handle like "$a", Perl uses weird parsing rules.
5817 # print $a/2,"/hi"; - division
5818 # print $a / 2,"/hi"; - division
5819 # print $a/ 2,"/hi"; - division
5820 # print $a /2,"/hi"; - pattern (and error)!
5821 # Some examples where this logic works okay, for '&','*','+':
5822 # print $fh &xsi_protos(@mods);
5823 # my $x = new $CompressClass *FH;
5824 # print $OUT +( $count % 15 ? ", " : "\n\t" );
5825 elsif ($prev_type eq 'b'
5826 && $next_type ne 'b' )
5828 $op_expected = TERM;
5831 # Note that '?' and '<' have been moved above
5832 # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5833 elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5835 # Do not complain in 'use' statements, which have special syntax.
5836 # For example, from RT#130344:
5837 # use lib $FindBin::Bin . '/lib';
5838 if ( $statement_type ne 'use' ) {
5840 "operator in possible indirect object location not recommended\n"
5843 $op_expected = OPERATOR;
5849 $op_expected = UNKNOWN;
5852 DEBUG_OPERATOR_EXPECTED
5854 "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5856 return $op_expected;
5858 } ## end sub operator_expected
5860 sub new_statement_ok {
5862 # return true if the current token can start a new statement
5863 # USES GLOBAL VARIABLES: $last_nonblank_type
5865 return label_ok() # a label would be ok here
5867 || $last_nonblank_type eq 'J'; # or we follow a label
5869 } ## end sub new_statement_ok
5873 # Decide if a bare word followed by a colon here is a label
5874 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5875 # $brace_depth, @brace_type
5877 # if it follows an opening or closing code block curly brace..
5878 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5879 && $last_nonblank_type eq $last_nonblank_token )
5882 # it is a label if and only if the curly encloses a code block
5883 return $brace_type[$brace_depth];
5886 # otherwise, it is a label if and only if it follows a ';' (real or fake)
5889 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5891 } ## end sub label_ok
5893 sub code_block_type {
5895 # Decide if this is a block of code, and its type.
5896 # Must be called only when $type = $token = '{'
5897 # The problem is to distinguish between the start of a block of code
5898 # and the start of an anonymous hash reference
5899 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5900 # to indicate the type of code block. (For example, 'last_nonblank_token'
5901 # might be 'if' for an if block, 'else' for an else block, etc).
5902 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5903 # $last_nonblank_block_type, $brace_depth, @brace_type
5905 # handle case of multiple '{'s
5907 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5909 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5910 if ( $last_nonblank_token eq '{'
5911 && $last_nonblank_type eq $last_nonblank_token )
5914 # opening brace where a statement may appear is probably
5915 # a code block but might be and anonymous hash reference
5916 if ( $brace_type[$brace_depth] ) {
5917 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5921 # cannot start a code block within an anonymous hash
5923 return EMPTY_STRING;
5927 elsif ( $last_nonblank_token eq ';' ) {
5929 # an opening brace where a statement may appear is probably
5930 # a code block but might be and anonymous hash reference
5931 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5935 # handle case of '}{'
5936 elsif ($last_nonblank_token eq '}'
5937 && $last_nonblank_type eq $last_nonblank_token )
5940 # a } { situation ...
5941 # could be hash reference after code block..(blktype1.t)
5942 if ($last_nonblank_block_type) {
5943 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5947 # must be a block if it follows a closing hash reference
5949 return $last_nonblank_token;
5953 #--------------------------------------------------------------
5954 # NOTE: braces after type characters start code blocks, but for
5955 # simplicity these are not identified as such. See also
5956 # sub is_non_structural_brace.
5957 #--------------------------------------------------------------
5959 ## elsif ( $last_nonblank_type eq 't' ) {
5960 ## return $last_nonblank_token;
5963 # brace after label:
5964 elsif ( $last_nonblank_type eq 'J' ) {
5965 return $last_nonblank_token;
5968 # otherwise, look at previous token. This must be a code block if
5969 # it follows any of these:
5970 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
5971 elsif ($is_code_block_token{$last_nonblank_token}
5972 || $is_grep_alias{$last_nonblank_token} )
5975 # Bug Patch: Note that the opening brace after the 'if' in the following
5976 # snippet is an anonymous hash ref and not a code block!
5977 # print 'hi' if { x => 1, }->{x};
5978 # We can identify this situation because the last nonblank type
5979 # will be a keyword (instead of a closing paren)
5981 $last_nonblank_type eq 'k'
5982 && ( $last_nonblank_token eq 'if'
5983 || $last_nonblank_token eq 'unless' )
5986 return EMPTY_STRING;
5989 return $last_nonblank_token;
5993 # or a sub or package BLOCK
5994 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5995 && $last_nonblank_token =~ /^(sub|package)\b/ )
5997 return $last_nonblank_token;
6001 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6002 && ( $is_sub{$last_nonblank_token} ) )
6007 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6008 return $statement_type;
6011 # user-defined subs with block parameters (like grep/map/eval)
6012 elsif ( $last_nonblank_type eq 'G' ) {
6013 return $last_nonblank_token;
6017 elsif ( $last_nonblank_type eq 'w' ) {
6019 # check for syntax 'use MODULE LIST'
6020 # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6021 return EMPTY_STRING if ( $statement_type eq 'use' );
6023 return decide_if_code_block( $i, $rtokens, $rtoken_type,
6027 # Patch for bug # RT #94338 reported by Daniel Trizen
6028 # for-loop in a parenthesized block-map triggering an error message:
6029 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6030 # Check for a code block within a parenthesized function call
6031 elsif ( $last_nonblank_token eq '(' ) {
6032 my $paren_type = $paren_type[$paren_depth];
6034 # /^(map|grep|sort)$/
6035 if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6037 # We will mark this as a code block but use type 't' instead
6038 # of the name of the containing function. This will allow for
6039 # correct parsing but will usually produce better formatting.
6040 # Braces with block type 't' are not broken open automatically
6041 # in the formatter as are other code block types, and this usually
6043 return 't'; # (Not $paren_type)
6046 return EMPTY_STRING;
6050 # handle unknown syntax ') {'
6051 # we previously appended a '()' to mark this case
6052 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6053 return $last_nonblank_token;
6056 # anything else must be anonymous hash reference
6058 return EMPTY_STRING;
6060 } ## end sub code_block_type
6062 sub decide_if_code_block {
6064 # USES GLOBAL VARIABLES: $last_nonblank_token
6065 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6067 my ( $next_nonblank_token, $i_next ) =
6068 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6070 # we are at a '{' where a statement may appear.
6071 # We must decide if this brace starts an anonymous hash or a code
6073 # return "" if anonymous hash, and $last_nonblank_token otherwise
6075 # initialize to be code BLOCK
6076 my $code_block_type = $last_nonblank_token;
6078 # Check for the common case of an empty anonymous hash reference:
6079 # Maybe something like sub { { } }
6080 if ( $next_nonblank_token eq '}' ) {
6081 $code_block_type = EMPTY_STRING;
6086 # To guess if this '{' is an anonymous hash reference, look ahead
6087 # and test as follows:
6089 # it is a hash reference if next come:
6090 # - a string or digit followed by a comma or =>
6091 # - bareword followed by =>
6092 # otherwise it is a code block
6094 # Examples of anonymous hash ref:
6098 # Examples of code blocks:
6099 # {1; print "hello\n", 1;}
6102 # We are only going to look ahead one more (nonblank/comment) line.
6103 # Strange formatting could cause a bad guess, but that's unlikely.
6107 # Ignore the rest of this line if it is a side comment
6108 if ( $next_nonblank_token ne '#' ) {
6109 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
6110 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
6112 my ( $rpre_tokens, $rpre_types ) =
6113 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
6114 # generous, and prevents
6116 # time in mangled files
6117 if ( defined($rpre_types) && @{$rpre_types} ) {
6118 push @pre_types, @{$rpre_types};
6119 push @pre_tokens, @{$rpre_tokens};
6122 # put a sentinel token to simplify stopping the search
6123 push @pre_types, '}';
6124 push @pre_types, '}';
6127 $jbeg = 1 if $pre_types[0] eq 'b';
6129 # first look for one of these
6131 # - bareword with leading -
6135 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
6137 # find the closing quote; don't worry about escapes
6138 my $quote_mark = $pre_types[$j];
6139 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6140 if ( $pre_types[$k] eq $quote_mark ) {
6142 my $next = $pre_types[$j];
6147 elsif ( $pre_types[$j] eq 'd' ) {
6150 elsif ( $pre_types[$j] eq 'w' ) {
6153 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6158 $j++ if $pre_types[$j] eq 'b';
6160 # Patched for RT #95708
6163 # it is a comma which is not a pattern delimiter except for qw
6165 $pre_types[$j] eq ','
6166 ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6167 && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6171 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6174 $code_block_type = EMPTY_STRING;
6178 if ($code_block_type) {
6180 # Patch for cases b1085 b1128: It is uncertain if this is a block.
6181 # If this brace follows a bareword, then append a space as a signal
6182 # to the formatter that this may not be a block brace. To find the
6183 # corresponding code in Formatter.pm search for 'b1085'.
6184 $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6188 return $code_block_type;
6189 } ## end sub decide_if_code_block
6191 sub report_unexpected {
6193 # report unexpected token type and show where it is
6194 # USES GLOBAL VARIABLES: $tokenizer_self
6195 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6196 $rpretoken_type, $input_line )
6199 if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6200 my $msg = "found $found where $expecting expected";
6201 my $pos = $rpretoken_map->[$i_tok];
6202 interrupt_logfile();
6203 my $input_line_number = $tokenizer_self->[_last_line_number_];
6204 my ( $offset, $numbered_line, $underline ) =
6205 make_numbered_line( $input_line_number, $input_line, $pos );
6206 $underline = write_on_underline( $underline, $pos - $offset, '^' );
6208 my $trailer = EMPTY_STRING;
6209 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6210 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6212 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6213 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6216 $num = $pos - $pos_prev;
6218 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
6221 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6222 $trailer = " (previous token underlined)";
6224 $underline =~ s/\s+$//;
6225 warning( $numbered_line . "\n" );
6226 warning( $underline . "\n" );
6227 warning( $msg . $trailer . "\n" );
6231 } ## end sub report_unexpected
6233 my %is_sigil_or_paren;
6234 my %is_R_closing_sb;
6238 my @q = qw< $ & % * @ ) >;
6239 @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6242 @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6245 sub is_non_structural_brace {
6247 # Decide if a brace or bracket is structural or non-structural
6248 # by looking at the previous token and type
6249 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6251 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6252 # Tentatively deactivated because it caused the wrong operator expectation
6254 # $user = @vars[1] / 100;
6255 # Must update sub operator_expected before re-implementing.
6256 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6260 #--------------------------------------------------------------
6261 # NOTE: braces after type characters start code blocks, but for
6262 # simplicity these are not identified as such. See also
6263 # sub code_block_type
6264 #--------------------------------------------------------------
6266 ##if ($last_nonblank_type eq 't') {return 0}
6268 # otherwise, it is non-structural if it is decorated
6269 # by type information.
6270 # For example, the '{' here is non-structural: ${xxx}
6271 # Removed '::' to fix c074
6272 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6274 ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6275 $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6276 || substr( $last_nonblank_token, 0, 2 ) eq '->'
6278 # or if we follow a hash or array closing curly brace or bracket
6279 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6280 # because the first '}' would have been given type 'R'
6281 ##|| $last_nonblank_type =~ /^([R\]])$/
6282 || $is_R_closing_sb{$last_nonblank_type}
6284 } ## end sub is_non_structural_brace
6286 #######################################################################
6287 # Tokenizer routines for tracking container nesting depths
6288 #######################################################################
6290 # The following routines keep track of nesting depths of the nesting
6291 # types, ( [ { and ?. This is necessary for determining the indentation
6292 # level, and also for debugging programs. Not only do they keep track of
6293 # nesting depths of the individual brace types, but they check that each
6294 # of the other brace types is balanced within matching pairs. For
6295 # example, if the program sees this sequence:
6299 # then it can determine that there is an extra left paren somewhere
6300 # between the { and the }. And so on with every other possible
6301 # combination of outer and inner brace types. For another
6306 # which has an extra ] within the parens.
6308 # The brace types have indexes 0 .. 3 which are indexes into
6311 # The pair ? : are treated as just another nesting type, with ? acting
6312 # as the opening brace and : acting as the closing brace.
6316 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6318 # saves the nesting depth of brace type $b (where $b is either of the other
6319 # nesting types) when brace type $a enters a new depth. When this depth
6320 # decreases, a check is made that the current depth of brace types $b is
6321 # unchanged, or otherwise there must have been an error. This can
6322 # be very useful for localizing errors, particularly when perl runs to
6323 # the end of a large file (such as this one) and announces that there
6324 # is a problem somewhere.
6326 # A numerical sequence number is maintained for every nesting type,
6327 # so that each matching pair can be uniquely identified in a simple
6330 sub increase_nesting_depth {
6331 my ( $aa, $pos ) = @_;
6333 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6334 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6336 $current_depth[$aa]++;
6338 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6339 my $input_line_number = $tokenizer_self->[_last_line_number_];
6340 my $input_line = $tokenizer_self->[_line_of_text_];
6342 # Sequence numbers increment by number of items. This keeps
6343 # a unique set of numbers but still allows the relative location
6344 # of any type to be determined.
6346 # make a new unique sequence number
6347 my $seqno = $next_sequence_number++;
6349 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6351 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6352 [ $input_line_number, $input_line, $pos ];
6354 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6355 next if ( $bb == $aa );
6356 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6359 # set a flag for indenting a nested ternary statement
6361 if ( $aa == QUESTION_COLON ) {
6362 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6363 if ( $current_depth[$aa] > 1 ) {
6364 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6365 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6366 if ( $pdepth == $total_depth - 1 ) {
6368 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6374 # Fix part #1 for git82: save last token type for propagation of type 'Z'
6375 $nested_statement_type[$aa][ $current_depth[$aa] ] =
6376 [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6377 $statement_type = EMPTY_STRING;
6378 return ( $seqno, $indent );
6379 } ## end sub increase_nesting_depth
6381 sub is_balanced_closing_container {
6383 # Return true if a closing container can go here without error
6384 # Return false if not
6387 # cannot close if there was no opening
6388 return unless ( $current_depth[$aa] > 0 );
6390 # check that any other brace types $bb contained within would be balanced
6391 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6392 next if ( $bb == $aa );
6394 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6395 $current_depth[$bb] );
6398 # OK, everything will be balanced
6400 } ## end sub is_balanced_closing_container
6402 sub decrease_nesting_depth {
6404 my ( $aa, $pos ) = @_;
6406 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6407 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6410 my $input_line_number = $tokenizer_self->[_last_line_number_];
6411 my $input_line = $tokenizer_self->[_line_of_text_];
6415 if ( $current_depth[$aa] > 0 ) {
6417 # set a flag for un-indenting after seeing a nested ternary statement
6418 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6419 if ( $aa == QUESTION_COLON ) {
6420 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6423 # Fix part #2 for git82: use saved type for propagation of type 'Z'
6424 # through type L-R braces. Perl seems to allow ${bareword}
6425 # as an indirect object, but nothing much more complex than that.
6426 ( $statement_type, my $saved_type, my $saved_token ) =
6427 @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6429 && $saved_type eq 'Z'
6430 && $last_nonblank_type eq 'w'
6431 && $brace_structural_type[$brace_depth] eq 'L' )
6433 $last_nonblank_type = $saved_type;
6436 # check that any brace types $bb contained within are balanced
6437 for my $bb ( 0 .. @closing_brace_names - 1 ) {
6438 next if ( $bb == $aa );
6440 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6441 $current_depth[$bb] )
6444 $current_depth[$bb] -
6445 $depth_array[$aa][$bb][ $current_depth[$aa] ];
6447 # don't whine too many times
6448 my $saw_brace_error = get_saw_brace_error();
6450 $saw_brace_error <= MAX_NAG_MESSAGES
6452 # if too many closing types have occurred, we probably
6453 # already caught this error
6454 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6457 interrupt_logfile();
6459 $starting_line_of_current_depth[$aa]
6460 [ $current_depth[$aa] ];
6462 my $rel = [ $input_line_number, $input_line, $pos ];
6466 if ( $diff == 1 || $diff == -1 ) {
6467 $ess = EMPTY_STRING;
6474 ? $opening_brace_names[$bb]
6475 : $closing_brace_names[$bb];
6476 write_error_indicator_pair( @{$rsl}, '^' );
6478 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6483 $starting_line_of_current_depth[$bb]
6484 [ $current_depth[$bb] ];
6487 " The most recent un-matched $bname is on line $ml\n";
6488 write_error_indicator_pair( @{$rml}, '^' );
6490 write_error_indicator_pair( @{$rel}, '^' );
6494 increment_brace_error();
6497 $current_depth[$aa]--;
6501 my $saw_brace_error = get_saw_brace_error();
6502 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6504 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6506 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6508 increment_brace_error();
6510 # keep track of errors in braces alone (ignoring ternary nesting errors)
6511 $tokenizer_self->[_true_brace_error_count_]++
6512 if ( $closing_brace_names[$aa] ne "':'" );
6514 return ( $seqno, $outdent );
6515 } ## end sub decrease_nesting_depth
6517 sub check_final_nesting_depths {
6519 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6521 for my $aa ( 0 .. @closing_brace_names - 1 ) {
6523 if ( $current_depth[$aa] ) {
6525 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6528 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6529 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6531 indicate_error( $msg, @{$rsl}, '^' );
6532 increment_brace_error();
6536 } ## end sub check_final_nesting_depths
6538 #######################################################################
6539 # Tokenizer routines for looking ahead in input stream
6540 #######################################################################
6542 sub peek_ahead_for_n_nonblank_pre_tokens {
6544 # returns next n pretokens if they exist
6545 # returns undef's if hits eof without seeing any pretokens
6546 # USES GLOBAL VARIABLES: $tokenizer_self
6547 my $max_pretokens = shift;
6550 my ( $rpre_tokens, $rmap, $rpre_types );
6553 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6555 $line =~ s/^\s*//; # trim leading blanks
6556 next if ( length($line) <= 0 ); # skip blank
6557 next if ( $line =~ /^#/ ); # skip comment
6558 ( $rpre_tokens, $rmap, $rpre_types ) =
6559 pre_tokenize( $line, $max_pretokens );
6562 return ( $rpre_tokens, $rpre_types );
6563 } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6565 # look ahead for next non-blank, non-comment line of code
6566 sub peek_ahead_for_nonblank_token {
6568 # USES GLOBAL VARIABLES: $tokenizer_self
6569 my ( $rtokens, $max_token_index ) = @_;
6574 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6576 $line =~ s/^\s*//; # trim leading blanks
6577 next if ( length($line) <= 0 ); # skip blank
6578 next if ( $line =~ /^#/ ); # skip comment
6580 # Updated from 2 to 3 to get trigraphs, added for case b1175
6581 my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6582 my $j = $max_token_index + 1;
6584 foreach my $tok ( @{$rtok} ) {
6585 last if ( $tok =~ "\n" );
6586 $rtokens->[ ++$j ] = $tok;
6591 } ## end sub peek_ahead_for_nonblank_token
6593 #######################################################################
6594 # Tokenizer guessing routines for ambiguous situations
6595 #######################################################################
6597 sub guess_if_pattern_or_conditional {
6599 # this routine is called when we have encountered a ? following an
6600 # unknown bareword, and we must decide if it starts a pattern or not
6602 # $i - token index of the ? starting possible pattern
6603 # output parameters:
6604 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
6605 # msg = a warning or diagnostic message
6606 # USES GLOBAL VARIABLES: $last_nonblank_token
6608 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6610 my $msg = "guessing that ? after $last_nonblank_token starts a ";
6612 if ( $i >= $max_token_index ) {
6613 $msg .= "conditional (no end to pattern found on the line)\n";
6618 my $next_token = $rtokens->[$i]; # first token after ?
6620 # look for a possible ending ? on this line..
6622 my $quote_depth = 0;
6623 my $quote_character = EMPTY_STRING;
6635 ) = follow_quoted_string(
6649 # we didn't find an ending ? on this line,
6650 # so we bias towards conditional
6652 $msg .= "conditional (no ending ? on this line)\n";
6654 # we found an ending ?, so we bias towards a pattern
6658 # Watch out for an ending ? in quotes, like this
6659 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6663 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6664 my $tok = $rtokens->[$ii];
6665 if ( $tok eq ":" ) { $colons++ }
6666 if ( $tok eq "'" ) { $s_quote++ }
6667 if ( $tok eq '"' ) { $d_quote++ }
6669 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6671 $msg .= "found ending ? but unbalanced quote chars\n";
6673 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6675 $msg .= "pattern (found ending ? and pattern expected)\n";
6678 $msg .= "pattern (uncertain, but found ending ?)\n";
6682 return ( $is_pattern, $msg );
6683 } ## end sub guess_if_pattern_or_conditional
6685 my %is_known_constant;
6686 my %is_known_function;
6690 # Constants like 'pi' in Trig.pm are common
6691 my @q = qw(pi pi2 pi4 pip2 pip4);
6692 @{is_known_constant}{@q} = (1) x scalar(@q);
6694 # parenless calls of 'ok' are common
6696 @{is_known_function}{@q} = (1) x scalar(@q);
6699 sub guess_if_pattern_or_division {
6701 # this routine is called when we have encountered a / following an
6702 # unknown bareword, and we must decide if it starts a pattern or is a
6705 # $i - token index of the / starting possible pattern
6706 # output parameters:
6707 # $is_pattern = 0 if probably division, =1 if probably a pattern
6708 # msg = a warning or diagnostic message
6709 # USES GLOBAL VARIABLES: $last_nonblank_token
6710 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6712 my $msg = "guessing that / after $last_nonblank_token starts a ";
6714 if ( $i >= $max_token_index ) {
6715 $msg .= "division (no end to pattern found on the line)\n";
6719 my $divide_possible =
6720 is_possible_numerator( $i, $rtokens, $max_token_index );
6722 if ( $divide_possible < 0 ) {
6723 $msg = "pattern (division not possible here)\n";
6725 return ( $is_pattern, $msg );
6729 my $next_token = $rtokens->[$i]; # first token after slash
6731 # One of the things we can look at is the spacing around the slash.
6732 # There # are four possible spacings around the first slash:
6734 # return pi/two;#/; -/-
6735 # return pi/ two;#/; -/+
6736 # return pi / two;#/; +/+
6737 # return pi /two;#/; +/- <-- possible pattern
6739 # Spacing rule: a space before the slash but not after the slash
6740 # usually indicates a pattern. We can use this to break ties.
6742 my $is_pattern_by_spacing =
6743 ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6745 # look for a possible ending / on this line..
6747 my $quote_depth = 0;
6748 my $quote_character = EMPTY_STRING;
6752 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6755 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6756 $quote_pos, $quote_depth, $max_token_index );
6760 # we didn't find an ending / on this line, so we bias towards
6762 if ( $divide_possible >= 0 ) {
6764 $msg .= "division (no ending / on this line)\n";
6768 # assuming a multi-line pattern ... this is risky, but division
6769 # does not seem possible. If this fails, it would either be due
6770 # to a syntax error in the code, or the division_expected logic
6771 # needs to be fixed.
6772 $msg = "multi-line pattern (division not possible)\n";
6777 # we found an ending /, so we bias slightly towards a pattern
6780 my $pattern_expected =
6781 pattern_expected( $i, $rtokens, $max_token_index );
6783 if ( $pattern_expected >= 0 ) {
6785 # pattern looks possible...
6786 if ( $divide_possible >= 0 ) {
6788 # Both pattern and divide can work here...
6790 # Increase weight of divide if a pure number follows
6791 $divide_possible += $next_token =~ /^\d+$/;
6793 # Check for known constants in the numerator, like 'pi'
6794 if ( $is_known_constant{$last_nonblank_token} ) {
6796 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6800 # A very common bare word in pattern expressions is 'ok'
6801 elsif ( $is_known_function{$last_nonblank_token} ) {
6803 "pattern (division works too but saw '$last_nonblank_token')\n";
6807 # If one rule is more definite, use it
6808 elsif ( $divide_possible > $pattern_expected ) {
6810 "division (more likely based on following tokens)\n";
6814 # otherwise, use the spacing rule
6815 elsif ($is_pattern_by_spacing) {
6817 "pattern (guess on spacing, but division possible too)\n";
6822 "division (guess on spacing, but pattern is possible too)\n";
6827 # divide_possible < 0 means divide can not work here
6830 $msg .= "pattern (division not possible)\n";
6834 # pattern does not look possible...
6837 if ( $divide_possible >= 0 ) {
6839 $msg .= "division (pattern not possible)\n";
6842 # Neither pattern nor divide look possible...go by spacing
6844 if ($is_pattern_by_spacing) {
6845 $msg .= "pattern (guess on spacing)\n";
6849 $msg .= "division (guess on spacing)\n";
6856 return ( $is_pattern, $msg );
6857 } ## end sub guess_if_pattern_or_division
6859 # try to resolve here-doc vs. shift by looking ahead for
6860 # non-code or the end token (currently only looks for end token)
6861 # returns 1 if it is probably a here doc, 0 if not
6862 sub guess_if_here_doc {
6864 # This is how many lines we will search for a target as part of the
6865 # guessing strategy. It is a constant because there is probably
6866 # little reason to change it.
6867 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6869 my $HERE_DOC_WINDOW = 40;
6871 my $next_token = shift;
6872 my $here_doc_expected = 0;
6875 my $msg = "checking <<";
6878 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6882 if ( $line =~ /^$next_token$/ ) {
6883 $msg .= " -- found target $next_token ahead $k lines\n";
6884 $here_doc_expected = 1; # got it
6887 last if ( $k >= $HERE_DOC_WINDOW );
6890 unless ($here_doc_expected) {
6892 if ( !defined($line) ) {
6893 $here_doc_expected = -1; # hit eof without seeing target
6894 $msg .= " -- must be shift; target $next_token not in file\n";
6897 else { # still unsure..taking a wild guess
6899 if ( !$is_constant{$current_package}{$next_token} ) {
6900 $here_doc_expected = 1;
6902 " -- guessing it's a here-doc ($next_token not a constant)\n";
6906 " -- guessing it's a shift ($next_token is a constant)\n";
6910 write_logfile_entry($msg);
6911 return $here_doc_expected;
6912 } ## end sub guess_if_here_doc
6914 #######################################################################
6915 # Tokenizer Routines for scanning identifiers and related items
6916 #######################################################################
6918 sub scan_bare_identifier_do {
6920 # this routine is called to scan a token starting with an alphanumeric
6921 # variable or package separator, :: or '.
6922 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6923 # $last_nonblank_type,@paren_type, $paren_depth
6925 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
6929 my $package = undef;
6933 # we have to back up one pretoken at a :: since each : is one pretoken
6934 if ( $tok eq '::' ) { $i_beg-- }
6935 if ( $tok eq '->' ) { $i_beg-- }
6936 my $pos_beg = $rtoken_map->[$i_beg];
6937 pos($input_line) = $pos_beg;
6944 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
6946 my $pos = pos($input_line);
6947 my $numc = $pos - $pos_beg;
6948 $tok = substr( $input_line, $pos_beg, $numc );
6950 # type 'w' includes anything without leading type info
6951 # ($,%,@,*) including something like abc::def::ghi
6954 my $sub_name = EMPTY_STRING;
6955 if ( defined($2) ) { $sub_name = $2; }
6956 if ( defined($1) ) {
6959 # patch: don't allow isolated package name which just ends
6960 # in the old style package separator (single quote). Example:
6962 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
6966 $package =~ s/\'/::/g;
6967 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6968 $package =~ s/::$//;
6971 $package = $current_package;
6973 # patched for c043, part 1: keyword does not follow '->'
6974 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
6979 # if it is a bareword.. patched for c043, part 2: not following '->'
6980 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
6982 # check for v-string with leading 'v' type character
6983 # (This seems to have precedence over filehandle, type 'Y')
6984 if ( $tok =~ /^v\d[_\d]*$/ ) {
6986 # we only have the first part - something like 'v101' -
6988 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
6989 $pos = pos($input_line);
6990 $numc = $pos - $pos_beg;
6991 $tok = substr( $input_line, $pos_beg, $numc );
6995 # warn if this version can't handle v-strings
6996 report_v_string($tok);
6999 elsif ( $is_constant{$package}{$sub_name} ) {
7003 # bareword after sort has implied empty prototype; for example:
7004 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7005 # This has priority over whatever the user has specified.
7006 elsif ($last_nonblank_token eq 'sort'
7007 && $last_nonblank_type eq 'k' )
7012 # Note: strangely, perl does not seem to really let you create
7013 # functions which act like eval and do, in the sense that eval
7014 # and do may have operators following the final }, but any operators
7015 # that you create with prototype (&) apparently do not allow
7016 # trailing operators, only terms. This seems strange.
7017 # If this ever changes, here is the update
7018 # to make perltidy behave accordingly:
7020 # elsif ( $is_block_function{$package}{$tok} ) {
7021 # $tok='eval'; # patch to do braces like eval - doesn't work
7024 # TODO: This could become a separate type to allow for different
7026 elsif ( $is_block_function{$package}{$sub_name} ) {
7029 elsif ( $is_block_list_function{$package}{$sub_name} ) {
7032 elsif ( $is_user_function{$package}{$sub_name} ) {
7034 $prototype = $user_function_prototype{$package}{$sub_name};
7037 # check for indirect object
7040 # added 2001-03-27: must not be followed immediately by '('
7042 ( $input_line !~ m/\G\(/gc )
7047 # preceded by keyword like 'print', 'printf' and friends
7048 $is_indirect_object_taker{$last_nonblank_token}
7050 # or preceded by something like 'print(' or 'printf('
7052 ( $last_nonblank_token eq '(' )
7053 && $is_indirect_object_taker{ $paren_type[$paren_depth]
7061 # may not be indirect object unless followed by a space;
7062 # updated 2021-01-16 to consider newline to be a space.
7063 # updated for case b990 to look for either ';' or space
7064 if ( pos($input_line) == length($input_line)
7065 || $input_line =~ m/\G[;\s]/gc )
7070 # Perl's indirect object notation is a very bad
7071 # thing and can cause subtle bugs, especially for
7072 # beginning programmers. And I haven't even been
7073 # able to figure out a sane warning scheme which
7074 # doesn't get in the way of good scripts.
7076 # Complain if a filehandle has any lower case
7077 # letters. This is suggested good practice.
7078 # Use 'sub_name' because something like
7079 # main::MYHANDLE is ok for filehandle
7080 if ( $sub_name =~ /[a-z]/ ) {
7082 # could be bug caused by older perltidy if
7084 if ( $input_line =~ m/\G\s*\(/gc ) {
7086 "Caution: unknown word '$tok' in indirect object slot\n"
7092 # bareword not followed by a space -- may not be filehandle
7093 # (may be function call defined in a 'use' statement)
7100 # Now we must convert back from character position
7101 # to pre_token index.
7102 # I don't think an error flag can occur here ..but who knows
7105 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7107 warning("scan_bare_identifier: Possibly invalid tokenization\n");
7111 # no match but line not blank - could be syntax error
7112 # perl will take '::' alone without complaint
7116 # change this warning to log message if it becomes annoying
7117 warning("didn't find identifier after leading ::\n");
7119 return ( $i, $tok, $type, $prototype );
7120 } ## end sub scan_bare_identifier_do
7124 # This is the new scanner and will eventually replace scan_identifier.
7125 # Only type 'sub' and 'package' are implemented.
7126 # Token types $ * % @ & -> are not yet implemented.
7128 # Scan identifier following a type token.
7129 # The type of call depends on $id_scan_state: $id_scan_state = ''
7130 # for starting call, in which case $tok must be the token defining
7133 # If the type token is the last nonblank token on the line, a value
7134 # of $id_scan_state = $tok is returned, indicating that further
7135 # calls must be made to get the identifier. If the type token is
7136 # not the last nonblank token on the line, the identifier is
7137 # scanned and handled and a value of '' is returned.
7138 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
7139 # $statement_type, $tokenizer_self
7141 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7144 use constant DEBUG_NSCAN => 0;
7145 my $type = EMPTY_STRING;
7146 my ( $i_beg, $pos_beg );
7148 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7149 #my ($a,$b,$c) = caller;
7150 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7152 # on re-entry, start scanning at first token on the line
7153 if ($id_scan_state) {
7155 $type = EMPTY_STRING;
7158 # on initial entry, start scanning just after type token
7161 $id_scan_state = $tok;
7165 # find $i_beg = index of next nonblank token,
7166 # and handle empty lines
7168 my $next_nonblank_token = $rtokens->[$i_beg];
7169 if ( $i_beg > $max_token_index ) {
7174 # only a '#' immediately after a '$' is not a comment
7175 if ( $next_nonblank_token eq '#' ) {
7176 unless ( $tok eq '$' ) {
7181 if ( $next_nonblank_token =~ /^\s/ ) {
7182 ( $next_nonblank_token, $i_beg ) =
7183 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7185 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7191 # handle non-blank line; identifier, if any, must follow
7192 unless ($blank_line) {
7194 if ( $is_sub{$id_scan_state} ) {
7195 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
7197 input_line => $input_line,
7202 rtokens => $rtokens,
7203 rtoken_map => $rtoken_map,
7204 id_scan_state => $id_scan_state,
7205 max_token_index => $max_token_index,
7210 elsif ( $is_package{$id_scan_state} ) {
7211 ( $i, $tok, $type ) =
7212 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
7213 $rtoken_map, $max_token_index );
7214 $id_scan_state = EMPTY_STRING;
7218 warning("invalid token in scan_id: $tok\n");
7219 $id_scan_state = EMPTY_STRING;
7223 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
7228 Program bug in scan_id: undefined type but scan_state=$id_scan_state
7232 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7234 report_definite_bug();
7239 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7241 return ( $i, $tok, $type, $id_scan_state );
7242 } ## end sub scan_id_do
7244 sub check_prototype {
7245 my ( $proto, $package, $subname ) = @_;
7246 return unless ( defined($package) && defined($subname) );
7247 if ( defined($proto) ) {
7248 $proto =~ s/^\s*\(\s*//;
7249 $proto =~ s/\s*\)$//;
7251 $is_user_function{$package}{$subname} = 1;
7252 $user_function_prototype{$package}{$subname} = "($proto)";
7254 # prototypes containing '&' must be treated specially..
7255 if ( $proto =~ /\&/ ) {
7257 # right curly braces of prototypes ending in
7258 # '&' may be followed by an operator
7259 if ( $proto =~ /\&$/ ) {
7260 $is_block_function{$package}{$subname} = 1;
7263 # right curly braces of prototypes NOT ending in
7264 # '&' may NOT be followed by an operator
7265 elsif ( $proto !~ /\&$/ ) {
7266 $is_block_list_function{$package}{$subname} = 1;
7271 $is_constant{$package}{$subname} = 1;
7275 $is_user_function{$package}{$subname} = 1;
7278 } ## end sub check_prototype
7280 sub do_scan_package {
7282 # do_scan_package parses a package name
7283 # it is called with $i_beg equal to the index of the first nonblank
7284 # token following a 'package' token.
7285 # USES GLOBAL VARIABLES: $current_package,
7288 # package NAMESPACE VERSION
7289 # package NAMESPACE BLOCK
7290 # package NAMESPACE VERSION BLOCK
7292 # If VERSION is provided, package sets the $VERSION variable in the given
7293 # namespace to a version object with the VERSION provided. VERSION must be
7294 # a "strict" style version number as defined by the version module: a
7295 # positive decimal number (integer or decimal-fraction) without
7296 # exponentiation or else a dotted-decimal v-string with a leading 'v'
7297 # character and at least three components.
7298 # reference http://perldoc.perl.org/functions/package.html
7300 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7303 my $package = undef;
7304 my $pos_beg = $rtoken_map->[$i_beg];
7305 pos($input_line) = $pos_beg;
7307 # handle non-blank line; package name, if any, must follow
7308 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7310 $package = ( defined($1) && $1 ) ? $1 : 'main';
7311 $package =~ s/\'/::/g;
7312 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7313 $package =~ s/::$//;
7314 my $pos = pos($input_line);
7315 my $numc = $pos - $pos_beg;
7316 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
7319 # Now we must convert back from character position
7320 # to pre_token index.
7321 # I don't think an error flag can occur here ..but ?
7324 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7325 if ($error) { warning("Possibly invalid package\n") }
7326 $current_package = $package;
7328 # we should now have package NAMESPACE
7329 # now expecting VERSION, BLOCK, or ; to follow ...
7330 # package NAMESPACE VERSION
7331 # package NAMESPACE BLOCK
7332 # package NAMESPACE VERSION BLOCK
7333 my ( $next_nonblank_token, $i_next ) =
7334 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7336 # check that something recognizable follows, but do not parse.
7337 # A VERSION number will be parsed later as a number or v-string in the
7338 # normal way. What is important is to set the statement type if
7339 # everything looks okay so that the operator_expected() routine
7340 # knows that the number is in a package statement.
7341 # Examples of valid primitive tokens that might follow are:
7343 # FIX: added a '#' since a side comment may also follow
7344 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
7345 $statement_type = $tok;
7349 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7354 # no match but line not blank --
7355 # could be a label with name package, like package: , for example.
7360 return ( $i, $tok, $type );
7361 } ## end sub do_scan_package
7363 my %is_special_variable_char;
7367 # These are the only characters which can (currently) form special
7368 # variables, like $^W: (issue c066).
7370 qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7371 @{is_special_variable_char}{@q} = (1) x scalar(@q);
7374 { ## begin closure for sub scan_complex_identifier
7376 use constant DEBUG_SCAN_ID => 0;
7378 # These are the possible states for this scanner:
7379 my $scan_state_SIGIL = '$';
7380 my $scan_state_ALPHA = 'A';
7381 my $scan_state_COLON = ':';
7382 my $scan_state_LPAREN = '(';
7383 my $scan_state_RPAREN = ')';
7384 my $scan_state_AMPERSAND = '&';
7385 my $scan_state_SPLIT = '^';
7387 # Only these non-blank states may be returned to caller:
7388 my %is_returnable_scan_state = (
7389 $scan_state_SIGIL => 1,
7390 $scan_state_AMPERSAND => 1,
7393 # USES GLOBAL VARIABLES:
7394 # $context, $last_nonblank_token, $last_nonblank_type
7399 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7400 $expecting, $container_type );
7402 #-------------------------------------------
7403 # my variables, re-initialized on each call:
7404 #-------------------------------------------
7405 my $i_begin; # starting index $i
7406 my $type; # returned identifier type
7407 my $tok_begin; # starting token
7408 my $tok; # returned token
7409 my $id_scan_state_begin; # starting scan state
7410 my $identifier_begin; # starting identifier
7411 my $i_save; # a last good index, in case of error
7412 my $message; # hold error message for log file
7414 my $last_tok_is_blank;
7415 my $in_prototype_or_signature;
7420 sub initialize_my_scan_id_vars {
7422 # Initialize all 'my' vars on entry
7424 $type = EMPTY_STRING;
7425 $tok_begin = $rtokens->[$i_begin];
7427 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7428 $id_scan_state_begin = $id_scan_state;
7429 $identifier_begin = $identifier;
7432 $message = EMPTY_STRING;
7433 $tok_is_blank = undef; # a flag to speed things up
7434 $last_tok_is_blank = undef;
7436 $in_prototype_or_signature =
7437 $container_type && $container_type =~ /^sub\b/;
7439 # these flags will be used to help figure out the type:
7443 # allow old package separator (') except in 'use' statement
7444 $allow_tick = ( $last_nonblank_token ne 'use' );
7446 } ## end sub initialize_my_scan_id_vars
7448 #----------------------------------
7449 # Routines for handling scan states
7450 #----------------------------------
7451 sub do_id_scan_state_dollar {
7453 # We saw a sigil, now looking to start a variable name
7454 if ( $tok eq '$' ) {
7456 $identifier .= $tok;
7458 # we've got a punctuation variable if end of line (punct.t)
7459 if ( $i == $max_token_index ) {
7461 $id_scan_state = EMPTY_STRING;
7464 elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
7466 $id_scan_state = $scan_state_COLON; # now need ::
7467 $identifier .= $tok;
7469 elsif ( $tok eq '::' ) {
7470 $id_scan_state = $scan_state_ALPHA;
7471 $identifier .= $tok;
7474 # POSTDEFREF ->@ ->% ->& ->*
7475 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7476 $identifier .= $tok;
7478 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7480 $id_scan_state = $scan_state_COLON; # now need ::
7481 $identifier .= $tok;
7483 # Perl will accept leading digits in identifiers,
7484 # although they may not always produce useful results.
7485 # Something like $main::0 is ok. But this also works:
7487 # sub howdy::123::bubba{ print "bubba $54321!\n" }
7488 # howdy::123::bubba();
7491 elsif ( $tok eq '#' ) {
7493 my $is_punct_var = $identifier eq '$$';
7495 # side comment or identifier?
7498 # A '#' starts a comment if it follows a space. For example,
7499 # the following is equivalent to $ans=40.
7504 # a # inside a prototype or signature can only start a
7506 && !$in_prototype_or_signature
7508 # these are valid punctuation vars: *# %# @# $#
7509 # May also be '$#array' or POSTDEFREF ->$#
7510 && ( $identifier =~ /^[\%\@\$\*]$/
7511 || $identifier =~ /\$$/ )
7513 # but a '#' after '$$' is a side comment; see c147
7518 $identifier .= $tok; # keep same state, a $ could follow
7522 # otherwise it is a side comment
7523 if ( $identifier eq '->' ) { }
7524 elsif ($is_punct_var) { $type = 'i' }
7525 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7526 else { $type = 'i' }
7528 $id_scan_state = EMPTY_STRING;
7532 elsif ( $tok eq '{' ) {
7534 # check for something like ${#} or ${?}, where ? is a special char
7538 || $identifier eq '@'
7539 || $identifier eq '$#'
7541 && $i + 2 <= $max_token_index
7542 && $rtokens->[ $i + 2 ] eq '}'
7543 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7546 my $next2 = $rtokens->[ $i + 2 ];
7547 my $next1 = $rtokens->[ $i + 1 ];
7548 $identifier .= $tok . $next1 . $next2;
7550 $id_scan_state = EMPTY_STRING;
7554 # skip something like ${xxx} or ->{
7555 $id_scan_state = EMPTY_STRING;
7557 # if this is the first token of a line, any tokens for this
7558 # identifier have already been accumulated
7559 if ( $identifier eq '$' || $i == 0 ) {
7560 $identifier = EMPTY_STRING;
7566 # space ok after leading $ % * & @
7567 elsif ( $tok =~ /^\s*$/ ) {
7571 # note: an id with a leading '&' does not actually come this way
7572 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7574 if ( length($identifier) > 1 ) {
7575 $id_scan_state = EMPTY_STRING;
7577 $type = 'i'; # probably punctuation variable
7581 # fix c139: trim line-ending type 't'
7582 if ( $i == $max_token_index ) {
7587 # spaces after $'s are common, and space after @
7588 # is harmless, so only complain about space
7589 # after other type characters. Space after $ and
7590 # @ will be removed in formatting. Report space
7591 # after % and * because they might indicate a
7592 # parsing error. In other words '% ' might be a
7593 # modulo operator. Delete this warning if it
7595 elsif ( $identifier !~ /^[\@\$]$/ ) {
7597 "Space in identifier, following $identifier\n";
7600 ## ok: silently accept space after '$' and '@' sigils
7605 elsif ( $identifier eq '->' ) {
7607 # space after '->' is ok except at line end ..
7608 # so trim line-ending in type '->' (fixes c139)
7609 if ( $i == $max_token_index ) {
7615 # stop at space after something other than -> or sigil
7616 # Example of what can arrive here:
7617 # eval { $MyClass->$$ };
7619 $id_scan_state = EMPTY_STRING;
7624 elsif ( $tok eq '^' ) {
7626 # check for some special variables like $^ $^W
7627 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7628 $identifier .= $tok;
7631 # There may be one more character, not a space, after the ^
7632 my $next1 = $rtokens->[ $i + 1 ];
7633 my $chr = substr( $next1, 0, 1 );
7634 if ( $is_special_variable_char{$chr} ) {
7636 # It is something like $^W
7637 # Test case (c066) : $^Oeq'linux'
7639 $identifier .= $next1;
7641 # If pretoken $next1 is more than one character long,
7642 # set a flag indicating that it needs to be split.
7644 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7649 # Simple test case (c065): '$aa=$^if($bb)';
7650 $id_scan_state = EMPTY_STRING;
7654 $id_scan_state = EMPTY_STRING;
7658 else { # something else
7660 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7662 # We might be in an extrusion of
7663 # sub foo2 ( $first, $, $third ) {
7664 # looking at a line starting with a comma, like
7667 # in this case the comma ends the signature variable
7668 # '$' which will have been previously marked type 't'
7670 if ( $i == $i_begin ) {
7671 $identifier = EMPTY_STRING;
7672 $type = EMPTY_STRING;
7675 # at a # we have to mark as type 't' because more may
7676 # follow, otherwise, in a signature we can let '$' be an
7677 # identifier here for better formatting.
7678 # See 'mangle4.in' for a test case.
7681 if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
7686 $id_scan_state = EMPTY_STRING;
7689 # check for various punctuation variables
7690 elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
7691 $identifier .= $tok;
7694 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
7696 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7698 $identifier .= $tok;
7701 elsif ( $identifier eq '$#' ) {
7703 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7705 # perl seems to allow just these: $#: $#- $#+
7706 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7708 $identifier .= $tok;
7712 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7715 elsif ( $identifier eq '$$' ) {
7717 # perl does not allow references to punctuation
7718 # variables without braces. For example, this
7722 # You would have to use
7725 # '$$' alone is punctuation variable for PID
7727 if ( $tok eq '{' ) { $type = 't' }
7728 else { $type = 'i' }
7730 elsif ( $identifier eq '->' ) {
7735 if ( length($identifier) == 1 ) {
7736 $identifier = EMPTY_STRING;
7739 $id_scan_state = EMPTY_STRING;
7742 } ## end sub do_id_scan_state_dollar
7744 sub do_id_scan_state_alpha {
7746 # looking for alphanumeric after ::
7747 $tok_is_blank = $tok =~ /^\s*$/;
7749 if ( $tok =~ /^\w/ ) { # found it
7750 $identifier .= $tok;
7751 $id_scan_state = $scan_state_COLON; # now need ::
7754 elsif ( $tok eq "'" && $allow_tick ) {
7755 $identifier .= $tok;
7756 $id_scan_state = $scan_state_COLON; # now need ::
7759 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7760 $id_scan_state = $scan_state_LPAREN;
7761 $identifier .= $tok;
7763 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7764 $id_scan_state = $scan_state_RPAREN;
7765 $identifier .= $tok;
7768 $id_scan_state = EMPTY_STRING;
7772 } ## end sub do_id_scan_state_alpha
7774 sub do_id_scan_state_colon {
7776 # looking for possible :: after alphanumeric
7778 $tok_is_blank = $tok =~ /^\s*$/;
7780 if ( $tok eq '::' ) { # got it
7781 $identifier .= $tok;
7782 $id_scan_state = $scan_state_ALPHA; # now require alpha
7784 elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
7785 $identifier .= $tok;
7786 $id_scan_state = $scan_state_COLON; # now need ::
7789 elsif ( $tok eq "'" && $allow_tick ) { # tick
7791 if ( $is_keyword{$identifier} ) {
7792 $id_scan_state = EMPTY_STRING; # that's all
7796 $identifier .= $tok;
7799 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7800 $id_scan_state = $scan_state_LPAREN;
7801 $identifier .= $tok;
7803 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7804 $id_scan_state = $scan_state_RPAREN;
7805 $identifier .= $tok;
7808 $id_scan_state = EMPTY_STRING; # that's all
7812 } ## end sub do_id_scan_state_colon
7814 sub do_id_scan_state_left_paren {
7816 # looking for possible '(' of a prototype
7818 if ( $tok eq '(' ) { # got it
7819 $identifier .= $tok;
7820 $id_scan_state = $scan_state_RPAREN; # now find the end of it
7822 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
7823 $identifier .= $tok;
7827 $id_scan_state = EMPTY_STRING; # that's all - no prototype
7831 } ## end sub do_id_scan_state_left_paren
7833 sub do_id_scan_state_right_paren {
7835 # looking for a ')' of prototype to close a '('
7837 $tok_is_blank = $tok =~ /^\s*$/;
7839 if ( $tok eq ')' ) { # got it
7840 $identifier .= $tok;
7841 $id_scan_state = EMPTY_STRING; # all done
7843 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7844 $identifier .= $tok;
7846 else { # probable error in script, but keep going
7847 warning("Unexpected '$tok' while seeking end of prototype\n");
7848 $identifier .= $tok;
7851 } ## end sub do_id_scan_state_right_paren
7853 sub do_id_scan_state_ampersand {
7855 # Starting sub call after seeing an '&'
7857 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
7858 $id_scan_state = $scan_state_COLON; # now need ::
7860 $identifier .= $tok;
7862 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7863 $id_scan_state = $scan_state_COLON; # now need ::
7865 $identifier .= $tok;
7867 elsif ( $tok =~ /^\s*$/ ) { # allow space
7870 # fix c139: trim line-ending type 't'
7871 if ( length($identifier) == 1 && $i == $max_token_index ) {
7876 elsif ( $tok eq '::' ) { # leading ::
7877 $id_scan_state = $scan_state_ALPHA; # accept alpha next
7878 $identifier .= $tok;
7880 elsif ( $tok eq '{' ) {
7881 if ( $identifier eq '&' || $i == 0 ) {
7882 $identifier = EMPTY_STRING;
7885 $id_scan_state = EMPTY_STRING;
7887 elsif ( $tok eq '^' ) {
7888 if ( $identifier eq '&' ) {
7890 # Special variable (c066)
7891 $identifier .= $tok;
7894 # There may be one more character, not a space, after the ^
7895 my $next1 = $rtokens->[ $i + 1 ];
7896 my $chr = substr( $next1, 0, 1 );
7897 if ( $is_special_variable_char{$chr} ) {
7899 # It is something like &^O
7901 $identifier .= $next1;
7903 # If pretoken $next1 is more than one character long,
7904 # set a flag indicating that it needs to be split.
7906 ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7911 $id_scan_state = EMPTY_STRING;
7915 $identifier = EMPTY_STRING;
7921 # punctuation variable?
7922 # testfile: cunningham4.pl
7924 # We have to be careful here. If we are in an unknown state,
7925 # we will reject the punctuation variable. In the following
7926 # example the '&' is a binary operator but we are in an unknown
7927 # state because there is no sigil on 'Prima', so we don't
7928 # know what it is. But it is a bad guess that
7929 # '&~' is a function variable.
7930 # $self->{text}->{colorMap}->[
7931 # Prima::PodView::COLOR_CODE_FOREGROUND
7932 # & ~tb::COLOR_INDEX ] =
7935 # Fix for case c033: a '#' here starts a side comment
7936 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
7937 $identifier .= $tok;
7940 $identifier = EMPTY_STRING;
7944 $id_scan_state = EMPTY_STRING;
7947 } ## end sub do_id_scan_state_ampersand
7949 #-------------------
7950 # hash of scanner subs
7951 #-------------------
7952 my $scan_identifier_code = {
7953 $scan_state_SIGIL => \&do_id_scan_state_dollar,
7954 $scan_state_ALPHA => \&do_id_scan_state_alpha,
7955 $scan_state_COLON => \&do_id_scan_state_colon,
7956 $scan_state_LPAREN => \&do_id_scan_state_left_paren,
7957 $scan_state_RPAREN => \&do_id_scan_state_right_paren,
7958 $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
7961 sub scan_complex_identifier {
7963 # This routine assembles tokens into identifiers. It maintains a
7964 # scan state, id_scan_state. It updates id_scan_state based upon
7965 # current id_scan_state and token, and returns an updated
7966 # id_scan_state and the next index after the identifier.
7968 # This routine now serves a a backup for sub scan_simple_identifier
7969 # which handles most identifiers.
7972 $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7973 $expecting, $container_type
7976 # return flag telling caller to split the pretoken
7977 my $split_pretoken_flag;
7979 #-------------------
7980 # Initialize my vars
7981 #-------------------
7983 initialize_my_scan_id_vars();
7985 #--------------------------------------------------------
7986 # get started by defining a type and a state if necessary
7987 #--------------------------------------------------------
7989 if ( !$id_scan_state ) {
7990 $context = UNKNOWN_CONTEXT;
7993 if ( $tok eq '>' ) {
7999 if ( $last_nonblank_token eq '->' ) {
8000 $identifier = '->' . $identifier;
8001 $id_scan_state = $scan_state_SIGIL;
8003 elsif ( $tok eq '$' || $tok eq '*' ) {
8004 $id_scan_state = $scan_state_SIGIL;
8005 $context = SCALAR_CONTEXT;
8007 elsif ( $tok eq '%' || $tok eq '@' ) {
8008 $id_scan_state = $scan_state_SIGIL;
8009 $context = LIST_CONTEXT;
8011 elsif ( $tok eq '&' ) {
8012 $id_scan_state = $scan_state_AMPERSAND;
8014 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8015 $saw_alpha = 0; # 'sub' is considered type info here
8016 $id_scan_state = $scan_state_SIGIL;
8018 SPACE; # need a space to separate sub from sub name
8020 elsif ( $tok eq '::' ) {
8021 $id_scan_state = $scan_state_ALPHA;
8023 elsif ( $tok =~ /^\w/ ) {
8024 $id_scan_state = $scan_state_COLON;
8027 elsif ( $tok eq '->' ) {
8028 $id_scan_state = $scan_state_SIGIL;
8032 # shouldn't happen: bad call parameter
8034 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
8035 if (DEVEL_MODE) { Fault($msg) }
8036 if ( !$tokenizer_self->[_in_error_] ) {
8038 $tokenizer_self->[_in_error_] = 1;
8040 $id_scan_state = EMPTY_STRING;
8045 $saw_type = !$saw_alpha;
8049 $saw_alpha = ( $tok =~ /^\w/ );
8050 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
8052 # check for a valid starting state
8053 if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8055 Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8060 #------------------------------
8061 # loop to gather the identifier
8062 #------------------------------
8066 while ( $i < $max_token_index && $id_scan_state ) {
8068 # Be sure we have code to handle this state before we proceed
8069 my $code = $scan_identifier_code->{$id_scan_state};
8072 if ( $id_scan_state eq $scan_state_SPLIT ) {
8073 ## OK: this is the signal to exit and split the pretoken
8076 # unknown state - should not happen
8080 Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8081 Scan state at sub entry was '$id_scan_state_begin'
8084 $id_scan_state = EMPTY_STRING;
8090 # Remember the starting index for progress check below
8091 my $i_start_loop = $i;
8093 $last_tok_is_blank = $tok_is_blank;
8094 if ($tok_is_blank) { $tok_is_blank = undef }
8095 else { $i_save = $i }
8097 $tok = $rtokens->[ ++$i ];
8099 # patch to make digraph :: if necessary
8100 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8107 # check for forward progress: a decrease in the index $i
8108 # implies that scanning has finished
8109 last if ( $i <= $i_start_loop );
8111 } ## end of main loop
8117 # Be sure a valid state is returned
8118 if ($id_scan_state) {
8120 if ( !$is_returnable_scan_state{$id_scan_state} ) {
8122 if ( $id_scan_state eq $scan_state_SPLIT ) {
8123 $split_pretoken_flag = 1;
8126 if ( $id_scan_state eq $scan_state_RPAREN ) {
8128 "Hit end of line while seeking ) to end prototype\n");
8131 $id_scan_state = EMPTY_STRING;
8134 # Patch: the deprecated variable $# does not combine with anything
8136 if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
8139 # Be sure the token index is valid
8140 if ( $i < 0 ) { $i = 0 }
8142 # Be sure a token type is defined
8149 # The type without the -> should be the same as with the -> so
8150 # that if they get separated we get the same bond strengths,
8152 if ( $identifier =~ /^->/
8153 && $last_nonblank_type eq 'w'
8154 && substr( $identifier, 2, 1 ) =~ /^\w/ )
8158 else { $type = 'i' }
8160 elsif ( $identifier eq '->' ) {
8164 ( length($identifier) > 1 )
8166 # In something like '@$=' we have an identifier '@$'
8167 # In something like '$${' we have type '$$' (and only
8168 # part of an identifier)
8169 && !( $identifier =~ /\$$/ && $tok eq '{' )
8171 ## && ( $identifier !~ /^(sub |package )$/ )
8172 && $identifier ne 'sub '
8173 && $identifier ne 'package '
8178 else { $type = 't' }
8180 elsif ($saw_alpha) {
8182 # type 'w' includes anything without leading type info
8183 # ($,%,@,*) including something like abc::def::ghi
8186 # Fix for b1337, if restarting scan after line break between
8187 # '->' or sigil and identifier name, use type 'i'
8188 if ( $id_scan_state_begin
8189 && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8195 $type = EMPTY_STRING;
8196 } # this can happen on a restart
8199 # See if we formed an identifier...
8202 if ($message) { write_logfile_entry($message) }
8205 # did not find an identifier, back up
8213 DEBUG_SCAN_ID && do {
8214 my ( $a, $b, $c ) = caller;
8216 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8218 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8220 return ( $i, $tok, $type, $id_scan_state, $identifier,
8221 $split_pretoken_flag );
8222 } ## end sub scan_complex_identifier
8223 } ## end closure for sub scan_complex_identifier
8225 { ## closure for sub do_scan_sub
8227 my %warn_if_lexical;
8231 # lexical subs with these names can cause parsing errors in this version
8232 my @q = qw( m q qq qr qw qx s tr y );
8233 @{warn_if_lexical}{@q} = (1) x scalar(@q);
8236 # saved package and subnames in case prototype is on separate line
8237 my ( $package_saved, $subname_saved );
8239 # initialize subname each time a new 'sub' keyword is encountered
8240 sub initialize_subname {
8241 $package_saved = EMPTY_STRING;
8242 $subname_saved = EMPTY_STRING;
8249 PROTOTYPE_CALL => 3,
8254 # do_scan_sub parses a sub name and prototype.
8256 # At present there are three basic CALL TYPES which are
8257 # distinguished by the starting value of '$tok':
8258 # 1. $tok='sub', id_scan_state='sub'
8259 # it is called with $i_beg equal to the index of the first nonblank
8260 # token following a 'sub' token.
8261 # 2. $tok='(', id_scan_state='sub',
8262 # it is called with $i_beg equal to the index of a '(' which may
8263 # start a prototype.
8264 # 3. $tok='prototype', id_scan_state='prototype'
8265 # it is called with $i_beg equal to the index of a '(' which is
8266 # preceded by ': prototype' and has $id_scan_state eq 'prototype'
8270 # A single type 1 call will get both the sub and prototype
8271 # sub foo1 ( $$ ) { }
8274 # The subname will be obtained with a 'sub' call
8275 # The prototype on line 2 will be obtained with a '(' call
8281 # The subname will be obtained with a 'sub' call
8282 # The prototype will be obtained with a 'prototype' call
8283 # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8284 # ^ <---type 1 ^ <---type 3
8286 # TODO: add future error checks to be sure we have a valid
8287 # sub name. For example, 'sub &doit' is wrong. Also, be sure
8288 # a name is given if and only if a non-anonymous sub is
8290 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8291 # $in_attribute_list, %saw_function_definition,
8294 my ($rinput_hash) = @_;
8296 my $input_line = $rinput_hash->{input_line};
8297 my $i = $rinput_hash->{i};
8298 my $i_beg = $rinput_hash->{i_beg};
8299 my $tok = $rinput_hash->{tok};
8300 my $type = $rinput_hash->{type};
8301 my $rtokens = $rinput_hash->{rtokens};
8302 my $rtoken_map = $rinput_hash->{rtoken_map};
8303 my $id_scan_state = $rinput_hash->{id_scan_state};
8304 my $max_token_index = $rinput_hash->{max_token_index};
8308 # Determine the CALL TYPE
8313 $tok eq 'prototype' ? PROTOTYPE_CALL
8314 : $tok eq '(' ? PAREN_CALL
8317 $id_scan_state = EMPTY_STRING; # normally we get everything in one call
8318 my $subname = $subname_saved;
8319 my $package = $package_saved;
8324 my $pos_beg = $rtoken_map->[$i_beg];
8325 pos($input_line) = $pos_beg;
8327 # Look for the sub NAME if this is a SUB call
8329 $call_type == SUB_CALL
8330 && $input_line =~ m/\G\s*
8331 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
8332 (\w+) # NAME - required
8339 my $is_lexical_sub =
8340 $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8341 if ( $is_lexical_sub && $1 ) {
8342 warning("'my' sub $subname cannot be in package '$1'\n");
8343 $is_lexical_sub = 0;
8346 if ($is_lexical_sub) {
8348 # lexical subs use the block sequence number as a package name
8350 $current_sequence_number[BRACE][ $current_depth[BRACE] ];
8351 $seqno = 1 unless ( defined($seqno) );
8353 if ( $warn_if_lexical{$subname} ) {
8355 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8360 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8361 $package =~ s/\'/::/g;
8362 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
8363 $package =~ s/::$//;
8366 my $pos = pos($input_line);
8367 my $numc = $pos - $pos_beg;
8368 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
8371 # remember the sub name in case another call is needed to
8373 $package_saved = $package;
8374 $subname_saved = $subname;
8377 # Now look for PROTO ATTRS for all call types
8378 # Look for prototype/attributes which are usually on the same
8379 # line as the sub name but which might be on a separate line.
8380 # For example, we might have an anonymous sub with attributes,
8381 # or a prototype on a separate line from its sub name
8383 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8384 # does not look like a prototype, we assume it is a SIGNATURE and we
8385 # will stop and let the the standard tokenizer handle it. In
8386 # particular, we stop if we see any nested parens, braces, or commas.
8387 # Also note, a valid prototype cannot contain any alphabetic character
8388 # -- see https://perldoc.perl.org/perlsub
8389 # But it appears that an underscore is valid in a prototype, so the
8390 # regex below uses [A-Za-z] rather than \w
8391 # This is the old regex which has been replaced:
8392 # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
8393 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8395 $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
8396 (\s*:)? # ATTRS leading ':'
8404 # Append the prototype to the starting token if it is 'sub' or
8405 # 'prototype'. This is not necessary but for compatibility with
8406 # previous versions when the -csc flag is used:
8407 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
8411 # If we just entered the sub at an opening paren on this call, not
8412 # a following :prototype, label it with the previous token. This is
8413 # necessary to propagate the sub name to its opening block.
8414 elsif ( $call_type == PAREN_CALL ) {
8415 $tok = $last_nonblank_token;
8420 # Patch part #1 to fixes cases b994 and b1053:
8421 # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8422 # 'sub : lvalue { ...'
8424 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
8429 # ATTRS: if there are attributes, back up and let the ':' be
8430 # found later by the scanner.
8431 my $pos = pos($input_line);
8433 $pos -= length($attrs);
8436 my $next_nonblank_token = $tok;
8438 # catch case of line with leading ATTR ':' after anonymous sub
8439 if ( $pos == $pos_beg && $tok eq ':' ) {
8441 $in_attribute_list = 1;
8444 # Otherwise, if we found a match we must convert back from
8445 # string position to the pre_token index for continued parsing.
8448 # I don't think an error flag can occur here ..but ?
8450 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8452 if ($error) { warning("Possibly invalid sub\n") }
8454 # Patch part #2 to fixes cases b994 and b1053:
8455 # Do not let spaces be part of the token of an anonymous sub
8456 # keyword which we marked as type 'k' above...i.e. for
8458 # 'sub : lvalue { ...'
8459 # Back up and let it be parsed as a blank
8463 && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8468 # check for multiple definitions of a sub
8469 ( $next_nonblank_token, my $i_next ) =
8470 find_next_nonblank_token_on_this_line( $i, $rtokens,
8474 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8475 { # skip blank or side comment
8476 my ( $rpre_tokens, $rpre_types ) =
8477 peek_ahead_for_n_nonblank_pre_tokens(1);
8478 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8479 $next_nonblank_token = $rpre_tokens->[0];
8482 $next_nonblank_token = '}';
8486 # See what's next...
8487 if ( $next_nonblank_token eq '{' ) {
8490 # Check for multiple definitions of a sub, but
8491 # it is ok to have multiple sub BEGIN, etc,
8492 # so we do not complain if name is all caps
8493 if ( $saw_function_definition{$subname}{$package}
8494 && $subname !~ /^[A-Z]+$/ )
8496 my $lno = $saw_function_definition{$subname}{$package};
8497 if ( $package =~ /^\d/ ) {
8499 "already saw definition of lexical 'sub $subname' at line $lno\n"
8505 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8506 ) unless (DEVEL_MODE);
8509 $saw_function_definition{$subname}{$package} =
8510 $tokenizer_self->[_last_line_number_];
8513 elsif ( $next_nonblank_token eq ';' ) {
8515 elsif ( $next_nonblank_token eq '}' ) {
8518 # ATTRS - if an attribute list follows, remember the name
8519 # of the sub so the next opening brace can be labeled.
8520 # Setting 'statement_type' causes any ':'s to introduce
8522 elsif ( $next_nonblank_token eq ':' ) {
8523 if ( $call_type == SUB_CALL ) {
8525 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8529 # if we stopped before an open paren ...
8530 elsif ( $next_nonblank_token eq '(' ) {
8532 # If we DID NOT see this paren above then it must be on the
8533 # next line so we will set a flag to come back here and see if
8536 # Otherwise, we assume it is a SIGNATURE rather than a
8537 # PROTOTYPE and let the normal tokenizer handle it as a list
8538 if ( !$saw_opening_paren ) {
8539 $id_scan_state = 'sub'; # we must come back to get proto
8541 if ( $call_type == SUB_CALL ) {
8543 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8546 elsif ($next_nonblank_token) { # EOF technically ok
8547 $subname = EMPTY_STRING unless defined($subname);
8549 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8552 check_prototype( $proto, $package, $subname );
8555 # no match to either sub name or prototype, but line not blank
8559 return ( $i, $tok, $type, $id_scan_state );
8560 } ## end sub do_scan_sub
8563 #########################################################################
8564 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8565 #########################################################################
8567 sub find_next_nonblank_token {
8568 my ( $i, $rtokens, $max_token_index ) = @_;
8570 # Returns the next nonblank token after the token at index $i
8571 # To skip past a side comment, and any subsequent block comments
8572 # and blank lines, call with i=$max_token_index
8574 if ( $i >= $max_token_index ) {
8575 if ( !peeked_ahead() ) {
8577 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8581 my $next_nonblank_token = $rtokens->[ ++$i ];
8582 return ( SPACE, $i )
8583 unless ( defined($next_nonblank_token) && length($next_nonblank_token) );
8585 # Quick test for nonblank ascii char. Note that we just have to
8586 # examine the first character here.
8587 my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
8588 if ( $ord >= ORD_PRINTABLE_MIN
8589 && $ord <= ORD_PRINTABLE_MAX )
8591 return ( $next_nonblank_token, $i );
8594 # Quick test to skip over an ascii space or tab
8595 elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
8596 $next_nonblank_token = $rtokens->[ ++$i ];
8597 return ( SPACE, $i ) unless defined($next_nonblank_token);
8600 # Slow test to skip over something else identified as whitespace
8601 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8602 $next_nonblank_token = $rtokens->[ ++$i ];
8603 return ( SPACE, $i ) unless defined($next_nonblank_token);
8606 # We should be at a nonblank now
8607 return ( $next_nonblank_token, $i );
8608 } ## end sub find_next_nonblank_token
8610 sub find_next_noncomment_type {
8611 my ( $i, $rtokens, $max_token_index ) = @_;
8613 # Given the current character position, look ahead past any comments
8614 # and blank lines and return the next token, including digraphs and
8617 my ( $next_nonblank_token, $i_next ) =
8618 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8620 # skip past any side comment
8621 if ( $next_nonblank_token eq '#' ) {
8622 ( $next_nonblank_token, $i_next ) =
8623 find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8626 # check for a digraph
8627 if ( $next_nonblank_token
8628 && $next_nonblank_token ne SPACE
8629 && defined( $rtokens->[ $i_next + 1 ] ) )
8631 my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8632 if ( $is_digraph{$test2} ) {
8633 $next_nonblank_token = $test2;
8634 $i_next = $i_next + 1;
8636 # check for a trigraph
8637 if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
8638 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8639 if ( $is_trigraph{$test3} ) {
8640 $next_nonblank_token = $test3;
8641 $i_next = $i_next + 1;
8647 return ( $next_nonblank_token, $i_next );
8648 } ## end sub find_next_noncomment_type
8650 sub is_possible_numerator {
8652 # Look at the next non-comment character and decide if it could be a
8658 my ( $i, $rtokens, $max_token_index ) = @_;
8659 my $is_possible_numerator = 0;
8661 my $next_token = $rtokens->[ $i + 1 ];
8662 if ( $next_token eq '=' ) { $i++; } # handle /=
8663 my ( $next_nonblank_token, $i_next ) =
8664 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8666 if ( $next_nonblank_token eq '#' ) {
8667 ( $next_nonblank_token, $i_next ) =
8668 find_next_nonblank_token( $max_token_index, $rtokens,
8672 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8673 $is_possible_numerator = 1;
8675 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8676 $is_possible_numerator = 0;
8679 $is_possible_numerator = -1;
8682 return $is_possible_numerator;
8683 } ## end sub is_possible_numerator
8685 { ## closure for sub pattern_expected
8690 # List of tokens which may follow a pattern. Note that we will not
8691 # have formed digraphs at this point, so we will see '&' instead of
8692 # '&&' and '|' instead of '||'
8694 # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8695 my @q = qw( & && | || ? : + - * and or while if unless);
8696 push @q, ')', '}', ']', '>', ',', ';';
8697 @{pattern_test}{@q} = (1) x scalar(@q);
8700 sub pattern_expected {
8702 # This a filter for a possible pattern.
8703 # It looks at the token after a possible pattern and tries to
8704 # determine if that token could end a pattern.
8709 my ( $i, $rtokens, $max_token_index ) = @_;
8712 my $next_token = $rtokens->[ $i + 1 ];
8713 if ( $next_token =~ /^[msixpodualgc]/ ) {
8715 } # skip possible modifier
8716 my ( $next_nonblank_token, $i_next ) =
8717 find_next_nonblank_token( $i, $rtokens, $max_token_index );
8719 if ( $pattern_test{$next_nonblank_token} ) {
8724 # Added '#' to fix issue c044
8725 if ( $next_nonblank_token =~ /^\s*$/
8726 || $next_nonblank_token eq '#' )
8735 } ## end sub pattern_expected
8738 sub find_next_nonblank_token_on_this_line {
8739 my ( $i, $rtokens, $max_token_index ) = @_;
8740 my $next_nonblank_token;
8742 if ( $i < $max_token_index ) {
8743 $next_nonblank_token = $rtokens->[ ++$i ];
8745 if ( $next_nonblank_token =~ /^\s*$/ ) {
8747 if ( $i < $max_token_index ) {
8748 $next_nonblank_token = $rtokens->[ ++$i ];
8753 $next_nonblank_token = EMPTY_STRING;
8755 return ( $next_nonblank_token, $i );
8756 } ## end sub find_next_nonblank_token_on_this_line
8758 sub find_angle_operator_termination {
8760 # We are looking at a '<' and want to know if it is an angle operator.
8762 # $i = pretoken index of ending '>' if found, current $i otherwise
8763 # $type = 'Q' if found, '>' otherwise
8764 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
8767 pos($input_line) = 1 + $rtoken_map->[$i];
8771 # we just have to find the next '>' if a term is expected
8772 if ( $expecting == TERM ) { $filter = '[\>]' }
8774 # we have to guess if we don't know what is expected
8775 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
8777 # shouldn't happen - we shouldn't be here if operator is expected
8781 Bad call to find_angle_operator_termination
8784 return ( $i, $type );
8787 # To illustrate what we might be looking at, in case we are
8788 # guessing, here are some examples of valid angle operators
8795 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
8796 # <${PREFIX}*img*.$IMAGE_TYPE>
8797 # <img*.$IMAGE_TYPE>
8798 # <Timg*.$IMAGE_TYPE>
8799 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
8801 # Here are some examples of lines which do not have angle operators:
8802 # return unless $self->[2]++ < $#{$self->[1]};
8805 # the following line from dlister.pl caused trouble:
8806 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
8808 # If the '<' starts an angle operator, it must end on this line and
8809 # it must not have certain characters like ';' and '=' in it. I use
8810 # this to limit the testing. This filter should be improved if
8813 if ( $input_line =~ /($filter)/g ) {
8817 # We MAY have found an angle operator termination if we get
8818 # here, but we need to do more to be sure we haven't been
8820 my $pos = pos($input_line);
8822 my $pos_beg = $rtoken_map->[$i];
8823 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
8825 # Test for '<' after possible filehandle, issue c103
8826 # print $fh <>; # syntax error
8827 # print $fh <DATA>; # ok
8828 # print $fh < DATA>; # syntax error at '>'
8829 # print STDERR < DATA>; # ok, prints word 'DATA'
8830 # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
8831 if ( $last_nonblank_type eq 'Z' ) {
8833 # $str includes brackets; something like '<DATA>'
8834 if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
8835 && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
8837 return ( $i, $type );
8841 # Reject if the closing '>' follows a '-' as in:
8842 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
8843 if ( $expecting eq UNKNOWN ) {
8844 my $check = substr( $input_line, $pos - 2, 1 );
8845 if ( $check eq '-' ) {
8846 return ( $i, $type );
8850 ######################################debug#####
8851 #write_diagnostics( "ANGLE? :$str\n");
8852 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
8853 ######################################debug#####
8857 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8859 # It may be possible that a quote ends midway in a pretoken.
8860 # If this happens, it may be necessary to split the pretoken.
8864 unexpected error condition returned by inverse_pretoken_map
8868 "Possible tokinization error..please check this line\n");
8871 # count blanks on inside of brackets
8872 my $blank_count = 0;
8873 $blank_count++ if ( $str =~ /<\s+/ );
8874 $blank_count++ if ( $str =~ /\s+>/ );
8876 # Now let's see where we stand....
8877 # OK if math op not possible
8878 if ( $expecting == TERM ) {
8881 # OK if there are no more than 2 non-blank pre-tokens inside
8882 # (not possible to write 2 token math between < and >)
8883 # This catches most common cases
8884 elsif ( $i <= $i_beg + 3 + $blank_count ) {
8886 # No longer any need to document this common case
8887 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
8890 # OK if there is some kind of identifier inside
8891 # print $fh <tvg::INPUT>;
8892 elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
8893 write_diagnostics("ANGLE (contains identifier): $str\n");
8899 # Let's try a Brace Test: any braces inside must balance
8901 while ( $str =~ /\{/g ) { $br++ }
8902 while ( $str =~ /\}/g ) { $br-- }
8904 while ( $str =~ /\[/g ) { $sb++ }
8905 while ( $str =~ /\]/g ) { $sb-- }
8907 while ( $str =~ /\(/g ) { $pr++ }
8908 while ( $str =~ /\)/g ) { $pr-- }
8910 # if braces do not balance - not angle operator
8911 if ( $br || $sb || $pr ) {
8915 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
8918 # we should keep doing more checks here...to be continued
8919 # Tentatively accepting this as a valid angle operator.
8920 # There are lots more things that can be checked.
8923 "ANGLE-Guessing yes: $str expecting=$expecting\n");
8924 write_logfile_entry("Guessing angle operator here: $str\n");
8929 # didn't find ending >
8931 if ( $expecting == TERM ) {
8932 warning("No ending > for angle operator\n");
8936 return ( $i, $type );
8937 } ## end sub find_angle_operator_termination
8939 sub scan_number_do {
8941 # scan a number in any of the formats that Perl accepts
8942 # Underbars (_) are allowed in decimal numbers.
8943 # input parameters -
8944 # $input_line - the string to scan
8945 # $i - pre_token index to start scanning
8946 # $rtoken_map - reference to the pre_token map giving starting
8947 # character position in $input_line of token $i
8948 # output parameters -
8949 # $i - last pre_token index of the number just scanned
8950 # number - the number (characters); or undef if not a number
8952 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
8953 my $pos_beg = $rtoken_map->[$i];
8957 my $type = $input_type;
8959 my $first_char = substr( $input_line, $pos_beg, 1 );
8961 # Look for bad starting characters; Shouldn't happen..
8962 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
8965 Program bug - scan_number given bad first character = '$first_char'
8968 return ( $i, $type, $number );
8971 # handle v-string without leading 'v' character ('Two Dot' rule)
8973 # Here is the format prior to including underscores:
8974 ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
8975 pos($input_line) = $pos_beg;
8976 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
8977 $pos = pos($input_line);
8978 my $numc = $pos - $pos_beg;
8979 $number = substr( $input_line, $pos_beg, $numc );
8981 report_v_string($number);
8984 # handle octal, hex, binary
8985 if ( !defined($number) ) {
8986 pos($input_line) = $pos_beg;
8988 # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
8989 # For reference, the format prior to hex floating point is:
8990 # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
8991 # (hex) (octal) (binary)
8995 /\G[+-]?0( # leading [signed] 0
8997 # a hex float, i.e. '0x0.b17217f7d1cf78p0'
8998 ([xX][0-9a-fA-F_]* # X and optional leading digits
8999 (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
9000 [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
9001 [0-9a-fA-F_]*) # optional Additional exponent digits
9004 |([xX][0-9a-fA-F_]+)
9007 |([oO]?[0-7_]+ # string of octal digits
9008 (\.([0-7][0-7_]*)?)? # optional decimal and fraction
9009 [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
9010 [0-7_]*) # Additional exponent digits with underscores
9013 |([oO]?[0-7_]+) # string of octal digits
9016 |([bB][01_]* # 'b' with string of binary digits
9017 (\.([01][01_]*)?)? # optional decimal and fraction
9018 [Pp][+-]?[01] # Required exponent indicator, no underscore
9019 [01_]*) # additional exponent bits
9022 |([bB][01_]+) # 'b' with string of binary digits
9027 $pos = pos($input_line);
9028 my $numc = $pos - $pos_beg;
9029 $number = substr( $input_line, $pos_beg, $numc );
9035 if ( !defined($number) ) {
9036 pos($input_line) = $pos_beg;
9038 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9039 $pos = pos($input_line);
9041 # watch out for things like 0..40 which would give 0. by this;
9042 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9043 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9047 my $numc = $pos - $pos_beg;
9048 $number = substr( $input_line, $pos_beg, $numc );
9053 # filter out non-numbers like e + - . e2 .e3 +e6
9054 # the rule: at least one digit, and any 'e' must be preceded by a digit
9056 $number !~ /\d/ # no digits
9057 || ( $number =~ /^(.*)[eE]/
9058 && $1 !~ /\d/ ) # or no digits before the 'e'
9062 $type = $input_type;
9063 return ( $i, $type, $number );
9066 # Found a number; now we must convert back from character position
9067 # to pre_token index. An error here implies user syntax error.
9068 # An example would be an invalid octal number like '009'.
9071 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9072 if ($error) { warning("Possibly invalid number\n") }
9074 return ( $i, $type, $number );
9075 } ## end sub scan_number_do
9077 sub inverse_pretoken_map {
9079 # Starting with the current pre_token index $i, scan forward until
9080 # finding the index of the next pre_token whose position is $pos.
9081 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9084 while ( ++$i <= $max_token_index ) {
9086 if ( $pos <= $rtoken_map->[$i] ) {
9088 # Let the calling routine handle errors in which we do not
9089 # land on a pre-token boundary. It can happen by running
9090 # perltidy on some non-perl scripts, for example.
9091 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
9096 return ( $i, $error );
9097 } ## end sub inverse_pretoken_map
9101 # find the target of a here document, if any
9103 # $i - token index of the second < of <<
9104 # ($i must be less than the last token index if this is called)
9105 # output parameters:
9106 # $found_target = 0 didn't find target; =1 found target
9107 # HERE_TARGET - the target string (may be empty string)
9108 # $i - unchanged if not here doc,
9109 # or index of the last token of the here target
9110 # $saw_error - flag noting unbalanced quote on here target
9111 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9113 my $found_target = 0;
9114 my $here_doc_target = EMPTY_STRING;
9115 my $here_quote_character = EMPTY_STRING;
9117 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9118 $next_token = $rtokens->[ $i + 1 ];
9120 # perl allows a backslash before the target string (heredoc.t)
9122 if ( $next_token eq '\\' ) {
9124 $next_token = $rtokens->[ $i + 2 ];
9127 ( $next_nonblank_token, $i_next_nonblank ) =
9128 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9130 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
9133 my $quote_depth = 0;
9138 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9141 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9142 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9144 if ($in_quote) { # didn't find end of quote, so no target found
9146 if ( $expecting == TERM ) {
9148 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9153 else { # found ending quote
9157 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9158 $tokj = $rtokens->[$j];
9160 # we have to remove any backslash before the quote character
9161 # so that the here-doc-target exactly matches this string
9165 && $rtokens->[ $j + 1 ] eq $here_quote_character );
9166 $here_doc_target .= $tokj;
9171 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9173 write_logfile_entry(
9174 "found blank here-target after <<; suggest using \"\"\n");
9177 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
9179 my $here_doc_expected;
9180 if ( $expecting == UNKNOWN ) {
9181 $here_doc_expected = guess_if_here_doc($next_token);
9184 $here_doc_expected = 1;
9187 if ($here_doc_expected) {
9189 $here_doc_target = $next_token;
9196 if ( $expecting == TERM ) {
9198 write_logfile_entry("Note: bare here-doc operator <<\n");
9205 # patch to neglect any prepended backslash
9206 if ( $found_target && $backslash ) { $i++ }
9208 return ( $found_target, $here_doc_target, $here_quote_character, $i,
9210 } ## end sub find_here_doc
9214 # follow (or continue following) quoted string(s)
9215 # $in_quote return code:
9217 # 1 - still must find end of quote whose target is $quote_character
9218 # 2 - still looking for end of first of two quotes
9220 # Returns updated strings:
9221 # $quoted_string_1 = quoted string seen while in_quote=1
9222 # $quoted_string_2 = quoted string seen while in_quote=2
9238 my $in_quote_starting = $in_quote;
9241 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
9244 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9247 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
9248 $quote_pos, $quote_depth, $max_token_index );
9249 $quoted_string_2 .= $quoted_string;
9250 if ( $in_quote == 1 ) {
9251 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
9252 $quote_character = EMPTY_STRING;
9255 $quoted_string_2 .= "\n";
9259 if ( $in_quote == 1 ) { # one (more) quote to follow
9262 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9265 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9266 $quote_pos, $quote_depth, $max_token_index );
9267 $quoted_string_1 .= $quoted_string;
9268 if ( $in_quote == 1 ) {
9269 $quoted_string_1 .= "\n";
9283 } ## end sub do_quote
9285 sub follow_quoted_string {
9287 # scan for a specific token, skipping escaped characters
9288 # if the quote character is blank, use the first non-blank character
9290 # $rtokens = reference to the array of tokens
9291 # $i = the token index of the first character to search
9292 # $in_quote = number of quoted strings being followed
9293 # $beginning_tok = the starting quote character
9294 # $quote_pos = index to check next for alphanumeric delimiter
9295 # output parameters:
9296 # $i = the token index of the ending quote character
9297 # $in_quote = decremented if found end, unchanged if not
9298 # $beginning_tok = the starting quote character
9299 # $quote_pos = index to check next for alphanumeric delimiter
9300 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9301 # $quoted_string = the text of the quote (without quotation tokens)
9314 my ( $tok, $end_tok );
9316 my $quoted_string = EMPTY_STRING;
9320 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9323 # get the corresponding end token
9324 if ( $beginning_tok !~ /^\s*$/ ) {
9325 $end_tok = matching_end_token($beginning_tok);
9328 # a blank token means we must find and use the first non-blank one
9330 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9332 while ( $i < $max_token_index ) {
9333 $tok = $rtokens->[ ++$i ];
9335 if ( $tok !~ /^\s*$/ ) {
9337 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9338 $i = $max_token_index;
9342 if ( length($tok) > 1 ) {
9343 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
9344 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9347 $beginning_tok = $tok;
9350 $end_tok = matching_end_token($beginning_tok);
9356 $allow_quote_comments = 1;
9361 # There are two different loops which search for the ending quote
9362 # character. In the rare case of an alphanumeric quote delimiter, we
9363 # have to look through alphanumeric tokens character-by-character, since
9364 # the pre-tokenization process combines multiple alphanumeric
9365 # characters, whereas for a non-alphanumeric delimiter, only tokens of
9366 # length 1 can match.
9368 #----------------------------------------------------------------
9369 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9370 # "quote_pos" is the position the current word to begin searching
9371 #----------------------------------------------------------------
9372 if ( $beginning_tok =~ /\w/ ) {
9374 # Note this because it is not recommended practice except
9375 # for obfuscated perl contests
9376 if ( $in_quote == 1 ) {
9377 write_logfile_entry(
9378 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9381 # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9382 while ( $i <= $max_token_index ) {
9384 if ( $quote_pos == 0 || ( $i < 0 ) ) {
9385 $tok = $rtokens->[ ++$i ];
9387 if ( $tok eq '\\' ) {
9389 # retain backslash unless it hides the end token
9390 $quoted_string .= $tok
9391 unless $rtokens->[ $i + 1 ] eq $end_tok;
9393 last if ( $i >= $max_token_index );
9394 $tok = $rtokens->[ ++$i ];
9397 my $old_pos = $quote_pos;
9399 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
9403 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9405 if ( $quote_pos > 0 ) {
9408 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9410 # NOTE: any quote modifiers will be at the end of '$tok'. If we
9411 # wanted to check them, this is the place to get them. But
9412 # this quote form is rarely used in practice, so it isn't
9417 if ( $quote_depth == 0 ) {
9423 if ( $old_pos <= length($tok) ) {
9424 $quoted_string .= substr( $tok, $old_pos );
9430 #-----------------------------------------------------------------------
9431 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9432 #-----------------------------------------------------------------------
9435 while ( $i < $max_token_index ) {
9436 $tok = $rtokens->[ ++$i ];
9438 if ( $tok eq $end_tok ) {
9441 if ( $quote_depth == 0 ) {
9446 elsif ( $tok eq $beginning_tok ) {
9449 elsif ( $tok eq '\\' ) {
9451 # retain backslash unless it hides the beginning or end token
9452 $tok = $rtokens->[ ++$i ];
9453 $quoted_string .= '\\'
9454 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
9456 $quoted_string .= $tok;
9459 if ( $i > $max_token_index ) { $i = $max_token_index }
9470 } ## end sub follow_quoted_string
9472 sub indicate_error {
9473 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9474 interrupt_logfile();
9476 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
9481 sub write_error_indicator_pair {
9482 my ( $line_number, $input_line, $pos, $carrat ) = @_;
9483 my ( $offset, $numbered_line, $underline ) =
9484 make_numbered_line( $line_number, $input_line, $pos );
9485 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9486 warning( $numbered_line . "\n" );
9487 $underline =~ s/\s*$//;
9488 warning( $underline . "\n" );
9490 } ## end sub write_error_indicator_pair
9492 sub make_numbered_line {
9494 # Given an input line, its line number, and a character position of
9495 # interest, create a string not longer than 80 characters of the form
9496 # $lineno: sub_string
9497 # such that the sub_string of $str contains the position of interest
9499 # Here is an example of what we want, in this case we add trailing
9500 # '...' because the line is long.
9502 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9504 # Here is another example, this time in which we used leading '...'
9505 # because of excessive length:
9507 # 2: ... er of the World Wide Web Consortium's
9509 # input parameters are:
9510 # $lineno = line number
9511 # $str = the text of the line
9512 # $pos = position of interest (the error) : 0 = first character
9515 # - $offset = an offset which corrects the position in case we only
9516 # display part of a line, such that $pos-$offset is the effective
9517 # position from the start of the displayed line.
9518 # - $numbered_line = the numbered line as above,
9519 # - $underline = a blank 'underline' which is all spaces with the same
9520 # number of characters as the numbered line.
9522 my ( $lineno, $str, $pos ) = @_;
9523 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9524 my $excess = length($str) - $offset - 68;
9525 my $numc = ( $excess > 0 ) ? 68 : undef;
9527 if ( defined($numc) ) {
9528 if ( $offset == 0 ) {
9529 $str = substr( $str, $offset, $numc - 4 ) . " ...";
9532 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9537 if ( $offset == 0 ) {
9540 $str = "... " . substr( $str, $offset + 4 );
9544 my $numbered_line = sprintf( "%d: ", $lineno );
9545 $offset -= length($numbered_line);
9546 $numbered_line .= $str;
9547 my $underline = SPACE x length($numbered_line);
9548 return ( $offset, $numbered_line, $underline );
9549 } ## end sub make_numbered_line
9551 sub write_on_underline {
9553 # The "underline" is a string that shows where an error is; it starts
9554 # out as a string of blanks with the same length as the numbered line of
9555 # code above it, and we have to add marking to show where an error is.
9556 # In the example below, we want to write the string '--^' just below
9557 # the line of bad code:
9559 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9561 # We are given the current underline string, plus a position and a
9562 # string to write on it.
9564 # In the above example, there will be 2 calls to do this:
9565 # First call: $pos=19, pos_chr=^
9566 # Second call: $pos=16, pos_chr=---
9568 # This is a trivial thing to do with substr, but there is some
9571 my ( $underline, $pos, $pos_chr ) = @_;
9573 # check for error..shouldn't happen
9574 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9577 my $excess = length($pos_chr) + $pos - length($underline);
9578 if ( $excess > 0 ) {
9579 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9581 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9582 return ($underline);
9583 } ## end sub write_on_underline
9587 my ( $str, $max_tokens_wanted ) = @_;
9590 # $max_tokens_wanted > 0 to stop on reaching this many tokens.
9591 # = 0 means get all tokens
9593 # Break a string, $str, into a sequence of preliminary tokens. We
9594 # are interested in these types of tokens:
9595 # words (type='w'), example: 'max_tokens_wanted'
9596 # digits (type = 'd'), example: '0755'
9597 # whitespace (type = 'b'), example: ' '
9598 # any other single character (i.e. punct; type = the character itself).
9599 # We cannot do better than this yet because we might be in a quoted
9600 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
9603 # An advantage of doing this pre-tokenization step is that it keeps almost
9604 # all of the regex work highly localized. A disadvantage is that in some
9605 # very rare instances we will have to go back and split a pre-token.
9607 # Return parameters:
9608 my @tokens = (); # array of the tokens themselves
9609 my @token_map = (0); # string position of start of each token
9610 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9615 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9618 # note that this must come before words!
9619 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9622 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9624 # single-character punctuation
9625 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9629 return ( \@tokens, \@token_map, \@type );
9633 push @token_map, pos($str);
9635 } while ( --$max_tokens_wanted != 0 );
9637 return ( \@tokens, \@token_map, \@type );
9638 } ## end sub pre_tokenize
9642 # this is an old debug routine
9643 # not called, but saved for reference
9644 my ( $rtokens, $rtoken_map ) = @_;
9645 my $num = scalar( @{$rtokens} );
9647 foreach my $i ( 0 .. $num - 1 ) {
9648 my $len = length( $rtokens->[$i] );
9649 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9652 } ## end sub show_tokens
9654 { ## closure for sub matching end token
9655 my %matching_end_token;
9658 %matching_end_token = (
9666 sub matching_end_token {
9668 # return closing character for a pattern
9669 my $beginning_token = shift;
9670 if ( $matching_end_token{$beginning_token} ) {
9671 return $matching_end_token{$beginning_token};
9673 return ($beginning_token);
9677 sub dump_token_types {
9678 my ( $class, $fh ) = @_;
9680 # This should be the latest list of token types in use
9681 # adding NEW_TOKENS: add a comment here
9682 $fh->print(<<'END_OF_LIST');
9684 Here is a list of the token types currently used for lines of type 'CODE'.
9685 For the following tokens, the "type" of a token is just the token itself.
9687 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9688 ( ) <= >= == =~ !~ != ++ -- /= x=
9689 ... **= <<= >>= &&= ||= //= <=>
9690 , + - / * | % ! x ~ = \ ? : . < > ^ &
9692 The following additional token types are defined:
9695 b blank (white space)
9696 { indent: opening structural curly brace or square bracket or paren
9697 (code block, anonymous hash reference, or anonymous array reference)
9698 } outdent: right structural curly brace or square bracket or paren
9699 [ left non-structural square bracket (enclosing an array index)
9700 ] right non-structural square bracket
9701 ( left non-structural paren (all but a list right of an =)
9702 ) right non-structural paren
9703 L left non-structural curly brace (enclosing a key)
9704 R right non-structural curly brace
9705 ; terminal semicolon
9706 f indicates a semicolon in a "for" statement
9707 h here_doc operator <<
9709 Q indicates a quote or pattern
9710 q indicates a qw quote block
9712 C user-defined constant or constant function (with void prototype = ())
9713 U user-defined function taking parameters
9714 G user-defined function taking block parameter (like grep/map/eval)
9715 M (unused, but reserved for subroutine definition name)
9716 P (unused, but -html uses it to label pod text)
9717 t type indicater such as %,$,@,*,&,sub
9718 w bare word (perhaps a subroutine call)
9719 i identifier of some type (with leading %, $, @, *, &, sub, -> )
9722 F a file test operator (like -e)
9724 Z identifier in indirect object slot: may be file handle, object
9725 J LABEL: code block label
9726 j LABEL after next, last, redo, goto
9729 pp pre-increment operator ++
9730 mm pre-decrement operator --
9731 A : used as attribute separator
9733 Here are the '_line_type' codes used internally:
9734 SYSTEM - system-specific code before hash-bang line
9735 CODE - line of perl code (including comments)
9736 POD_START - line starting pod, such as '=head'
9737 POD - pod documentation text
9738 POD_END - last line of pod section, '=cut'
9739 HERE - text of here-document
9740 HERE_END - last line of here-doc (target word)
9741 FORMAT - format section
9742 FORMAT_END - last line of format section, '.'
9743 SKIP - code skipping section
9744 SKIP_END - last line of code skipping section, '#>>V'
9745 DATA_START - __DATA__ line
9746 DATA - unidentified text following __DATA__
9747 END_START - __END__ line
9748 END - unidentified text following __END__
9749 ERROR - we are in big trouble, probably not a perl script
9753 } ## end sub dump_token_types
9757 # These names are used in error messages
9758 @opening_brace_names = qw# '{' '[' '(' '?' #;
9759 @closing_brace_names = qw# '}' ']' ')' ':' #;
9764 .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
9765 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
9767 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
9770 . : < > * & | / - = + - % ^ ! x ~
9772 @can_start_digraph{@q} = (1) x scalar(@q);
9774 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
9775 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
9777 my @tetragraphs = qw( <<>> );
9778 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
9780 # make a hash of all valid token types for self-checking the tokenizer
9781 # (adding NEW_TOKENS : select a new character and add to this list)
9782 my @valid_token_types = qw#
9783 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
9784 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
9786 push( @valid_token_types, @digraphs );
9787 push( @valid_token_types, @trigraphs );
9788 push( @valid_token_types, @tetragraphs );
9789 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
9790 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
9792 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
9793 my @file_test_operators =
9794 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
9795 @is_file_test_operator{@file_test_operators} =
9796 (1) x scalar(@file_test_operators);
9798 # these functions have prototypes of the form (&), so when they are
9799 # followed by a block, that block MAY BE followed by an operator.
9800 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
9802 @is_block_operator{@q} = (1) x scalar(@q);
9804 # these functions allow an identifier in the indirect object slot
9805 @q = qw( print printf sort exec system say);
9806 @is_indirect_object_taker{@q} = (1) x scalar(@q);
9808 # These tokens may precede a code block
9809 # patched for SWITCH/CASE/CATCH. Actually these could be removed
9810 # now and we could let the extended-syntax coding handle them.
9811 # Added 'default' for Switch::Plain.
9813 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
9814 unless do while until eval for foreach map grep sort
9815 switch case given when default catch try finally);
9816 @is_code_block_token{@q} = (1) x scalar(@q);
9818 # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
9819 # to contrast it with the block types in '%is_zero_continuation_block_type'
9820 @q = qw( sort map grep eval do );
9821 @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
9823 @q = qw( sort map grep );
9824 @is_sort_map_grep{@q} = (1) x scalar(@q);
9826 %is_grep_alias = ();
9828 # I'll build the list of keywords incrementally
9831 # keywords and tokens after which a value or pattern is expected,
9832 # but not an operator. In other words, these should consume terms
9833 # to their right, or at least they are not expected to be followed
9834 # immediately by operators.
9835 my @value_requestor = qw(
10063 # patched above for SWITCH/CASE given/when err say
10064 # 'err' is a fairly safe addition.
10065 # Added 'default' for Switch::Plain. Note that we could also have
10066 # a separate set of keywords to include if we see 'use Switch::Plain'
10067 push( @Keywords, @value_requestor );
10069 # These are treated the same but are not keywords:
10074 push( @value_requestor, @extra_vr );
10076 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10078 # this list contains keywords which do not look for arguments,
10079 # so that they might be followed by an operator, or at least
10081 my @operator_requestor = qw(
10105 push( @Keywords, @operator_requestor );
10107 # These are treated the same but are not considered keywords:
10114 push( @operator_requestor, @extra_or );
10116 @expecting_operator_token{@operator_requestor} =
10117 (1) x scalar(@operator_requestor);
10119 # these token TYPES expect trailing operator but not a term
10120 # note: ++ and -- are post-increment and decrement, 'C' = constant
10121 my @operator_requestor_types = qw( ++ -- C <> q );
10122 @expecting_operator_types{@operator_requestor_types} =
10123 (1) x scalar(@operator_requestor_types);
10125 # these token TYPES consume values (terms)
10126 # note: pp and mm are pre-increment and decrement
10127 # f=semicolon in for, F=file test operator
10128 my @value_requestor_type = qw#
10129 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10130 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10131 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10132 f F pp mm Y p m U J G j >> << ^ t
10133 ~. ^. |. &. ^.= |.= &.=
10135 push( @value_requestor_type, ',' )
10136 ; # (perl doesn't like a ',' in a qw block)
10137 @expecting_term_types{@value_requestor_type} =
10138 (1) x scalar(@value_requestor_type);
10140 # Note: the following valid token types are not assigned here to
10141 # hashes requesting to be followed by values or terms, but are
10142 # instead currently hard-coded into sub operator_expected:
10143 # ) -> :: Q R Z ] b h i k n v w } #
10145 # For simple syntax checking, it is nice to have a list of operators which
10146 # will really be unhappy if not followed by a term. This includes most
10148 %really_want_term = %expecting_term_types;
10150 # with these exceptions...
10151 delete $really_want_term{'U'}; # user sub, depends on prototype
10152 delete $really_want_term{'F'}; # file test works on $_ if no following term
10153 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10155 @q = qw(q qq qx qr s y tr m);
10156 @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10158 # Note added 'qw' here
10159 @q = qw(q qq qw qx qr s y tr m);
10160 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10163 @is_package{@q} = (1) x scalar(@q);
10167 @is_comma_question_colon{@q} = (1) x scalar(@q);
10169 @q = qw( if elsif unless );
10170 @is_if_elsif_unless{@q} = (1) x scalar(@q);
10173 @is_semicolon_or_t{@q} = (1) x scalar(@q);
10175 @q = qw( if elsif unless case when );
10176 @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10178 # Hash of other possible line endings which may occur.
10179 # Keep these coordinated with the regex where this is used.
10180 # Note: chr(13) = chr(015)="\r".
10181 @q = ( chr(13), chr(29), chr(26) );
10182 @other_line_endings{@q} = (1) x scalar(@q);
10184 # These keywords are handled specially in the tokenizer code:
10185 my @special_keywords = qw(
10201 push( @Keywords, @special_keywords );
10203 # Keywords after which list formatting may be used
10204 # WARNING: do not include |map|grep|eval or perl may die on
10205 # syntax errors (map1.t).
10206 my @keyword_taking_list = qw(
10281 @is_keyword_taking_list{@keyword_taking_list} =
10282 (1) x scalar(@keyword_taking_list);
10284 # perl functions which may be unary operators.
10286 # This list is used to decide if a pattern delimited by slashes, /pattern/,
10287 # can follow one of these keywords.
10289 chomp eof eval fc lc pop shift uc undef
10292 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10295 # These are keywords for which an arg may optionally be omitted. They are
10296 # currently only used to disambiguate a ? used as a ternary from one used
10297 # as a (deprecated) pattern delimiter. In the future, they might be used
10298 # to give a warning about ambiguous syntax before a /.
10299 # Note: split has been omitted (see not below).
10300 my @keywords_taking_optional_arg = qw(
10369 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10370 (1) x scalar(@keywords_taking_optional_arg);
10372 # This list is used to decide if a pattern delimited by question marks,
10373 # ?pattern?, can follow one of these keywords. Note that from perl 5.22
10374 # on, a ?pattern? is not recognized, so we can be much more strict than
10375 # with a /pattern/. Note that 'split' is not in this list. In current
10376 # versions of perl a question following split must be a ternary, but
10377 # in older versions it could be a pattern. The guessing algorithm will
10378 # decide. We are combining two lists here to simplify the test.
10379 @q = ( @keywords_taking_optional_arg, @operator_requestor );
10380 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10383 # These are not used in any way yet
10384 # my @unused_keywords = qw(
10390 # The list of keywords was originally extracted from function 'keyword' in
10391 # perl file toke.c version 5.005.03, using this utility, plus a
10392 # little editing: (file getkwd.pl):
10393 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10394 # Add 'get' prefix where necessary, then split into the above lists.
10395 # This list should be updated as necessary.
10396 # The list should not contain these special variables:
10397 # ARGV DATA ENV SIG STDERR STDIN STDOUT
10400 @is_keyword{@Keywords} = (1) x scalar(@Keywords);