1 #####################################################################
3 # The Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 our $VERSION = '20210717';
26 use Perl::Tidy::LineBuffer;
29 # PACKAGE VARIABLES for processing an entire FILE.
30 # These must be package variables because most may get localized during
31 # processing. Most are initialized in sub prepare_for_a_new_file.
37 $last_nonblank_block_type
45 %user_function_prototype
47 %is_block_list_function
48 %saw_function_definition
58 @nesting_sequence_number
59 @current_sequence_number
61 @paren_semicolon_count
62 @paren_structural_type
64 @brace_structural_type
68 @square_bracket_structural_type
71 @nested_statement_type
72 @starting_line_of_current_depth
75 # GLOBAL CONSTANTS for routines in this package,
76 # Initialized in a BEGIN block.
78 %is_indirect_object_taker
80 %expecting_operator_token
81 %expecting_operator_types
85 %is_file_test_operator
94 %is_keyword_taking_list
95 %is_keyword_taking_optional_arg
96 %is_keyword_rejecting_slash_as_pattern_delimiter
97 %is_keyword_rejecting_question_as_pattern_delimiter
98 %is_q_qq_qw_qx_qr_s_y_tr_m
101 %is_comma_question_colon
103 $code_skipping_pattern_begin
104 $code_skipping_pattern_end
107 # GLOBAL VARIABLES which are constant after being configured by user-supplied
108 # parameters. They remain constant as a file is being processed.
111 $rOpts_code_skipping,
112 $code_skipping_pattern_begin,
113 $code_skipping_pattern_end,
116 # possible values of operator_expected()
117 use constant TERM => -1;
118 use constant UNKNOWN => 0;
119 use constant OPERATOR => 1;
121 # possible values of context
122 use constant SCALAR_CONTEXT => -1;
123 use constant UNKNOWN_CONTEXT => 0;
124 use constant LIST_CONTEXT => 1;
126 # Maximum number of little messages; probably need not be changed.
127 use constant MAX_NAG_MESSAGES => 6;
131 # Array index names for $self
134 _rhere_target_list_ => $i++,
135 _in_here_doc_ => $i++,
136 _here_doc_target_ => $i++,
137 _here_quote_character_ => $i++,
143 _in_skipped_ => $i++,
144 _in_attribute_list_ => $i++,
146 _quote_target_ => $i++,
147 _line_start_quote_ => $i++,
148 _starting_level_ => $i++,
149 _know_starting_level_ => $i++,
151 _indent_columns_ => $i++,
152 _look_for_hash_bang_ => $i++,
154 _continuation_indentation_ => $i++,
155 _outdent_labels_ => $i++,
156 _last_line_number_ => $i++,
157 _saw_perl_dash_P_ => $i++,
158 _saw_perl_dash_w_ => $i++,
159 _saw_use_strict_ => $i++,
160 _saw_v_string_ => $i++,
162 _look_for_autoloader_ => $i++,
163 _look_for_selfloader_ => $i++,
164 _saw_autoloader_ => $i++,
165 _saw_selfloader_ => $i++,
166 _saw_hash_bang_ => $i++,
169 _saw_negative_indentation_ => $i++,
170 _started_tokenizing_ => $i++,
171 _line_buffer_object_ => $i++,
172 _debugger_object_ => $i++,
173 _diagnostics_object_ => $i++,
174 _logger_object_ => $i++,
175 _unexpected_error_count_ => $i++,
176 _started_looking_for_here_target_at_ => $i++,
177 _nearly_matched_here_target_at_ => $i++,
178 _line_of_text_ => $i++,
179 _rlower_case_labels_at_ => $i++,
180 _extended_syntax_ => $i++,
181 _maximum_level_ => $i++,
182 _true_brace_error_count_ => $i++,
183 _rOpts_maximum_level_errors_ => $i++,
184 _rOpts_maximum_unexpected_errors_ => $i++,
185 _rOpts_logfile_ => $i++,
190 { ## closure for subs to count instances
192 # methods to count instances
194 sub get_count { return $_count; }
195 sub _increment_count { return ++$_count }
196 sub _decrement_count { return --$_count }
201 $self->_decrement_count();
207 # Catch any undefined sub calls so that we are sure to get
208 # some diagnostic information. This sub should never be called
209 # except for a programming error.
211 return if ( $AUTOLOAD =~ /\bDESTROY$/ );
212 my ( $pkg, $fname, $lno ) = caller();
213 my $my_package = __PACKAGE__;
215 ======================================================================
216 Error detected in package '$my_package', version $VERSION
217 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
218 Called from package: '$pkg'
219 Called from File '$fname' at line '$lno'
220 This error is probably due to a recent programming change
221 ======================================================================
228 Perl::Tidy::Die($msg);
229 croak "unexpected return from Perl::Tidy::Die";
234 # See if a pattern will compile. We have to use a string eval here,
235 # but it should be safe because the pattern has been constructed
238 eval "'##'=~/$pattern/";
242 sub make_code_skipping_pattern {
243 my ( $rOpts, $opt_name, $default ) = @_;
244 my $param = $rOpts->{$opt_name};
245 unless ($param) { $param = $default }
246 $param =~ s/^\s*//; # allow leading spaces to be like format-skipping
247 if ( $param !~ /^#/ ) {
248 Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
250 my $pattern = '^\s*' . $param . '\b';
251 if ( bad_pattern($pattern) ) {
253 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
261 # Check Tokenizer parameters
267 # Install any aliases to 'sub'
268 if ( $rOpts->{'sub-alias-list'} ) {
270 # Note that any 'sub-alias-list' has been preprocessed to
271 # be a trimmed, space-separated list which includes 'sub'
272 # for example, it might be 'sub method fun'
273 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
274 foreach my $word (@sub_alias_list) {
279 $rOpts_code_skipping = $rOpts->{'code-skipping'};
280 $code_skipping_pattern_begin =
281 make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
282 $code_skipping_pattern_end =
283 make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
289 my ( $class, @args ) = @_;
291 # Note: 'tabs' and 'indent_columns' are temporary and should be
294 source_object => undef,
295 debugger_object => undef,
296 diagnostics_object => undef,
297 logger_object => undef,
298 starting_level => undef,
301 look_for_hash_bang => 0,
303 look_for_autoloader => 1,
304 look_for_selfloader => 1,
305 starting_line_number => 1,
306 extended_syntax => 0,
309 my %args = ( %defaults, @args );
311 # we are given an object with a get_line() method to supply source lines
312 my $source_object = $args{source_object};
313 my $rOpts = $args{rOpts};
315 # we create another object with a get_line() and peek_ahead() method
316 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
318 # Tokenizer state data is as follows:
319 # _rhere_target_list_ reference to list of here-doc targets
320 # _here_doc_target_ the target string for a here document
321 # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
322 # to determine if interpolation is done
323 # _quote_target_ character we seek if chasing a quote
324 # _line_start_quote_ line where we started looking for a long quote
325 # _in_here_doc_ flag indicating if we are in a here-doc
326 # _in_pod_ flag set if we are in pod documentation
327 # _in_skipped_ flag set if we are in a skipped section
328 # _in_error_ flag set if we saw severe error (binary in script)
329 # _in_data_ flag set if we are in __DATA__ section
330 # _in_end_ flag set if we are in __END__ section
331 # _in_format_ flag set if we are in a format description
332 # _in_attribute_list_ flag telling if we are looking for attributes
333 # _in_quote_ flag telling if we are chasing a quote
334 # _starting_level_ indentation level of first line
335 # _line_buffer_object_ object with get_line() method to supply source code
336 # _diagnostics_object_ place to write debugging information
337 # _unexpected_error_count_ error count used to limit output
338 # _lower_case_labels_at_ line numbers where lower case labels seen
339 # _hit_bug_ program bug detected
342 $self->[_rhere_target_list_] = [];
343 $self->[_in_here_doc_] = 0;
344 $self->[_here_doc_target_] = "";
345 $self->[_here_quote_character_] = "";
346 $self->[_in_data_] = 0;
347 $self->[_in_end_] = 0;
348 $self->[_in_format_] = 0;
349 $self->[_in_error_] = 0;
350 $self->[_in_pod_] = 0;
351 $self->[_in_skipped_] = 0;
352 $self->[_in_attribute_list_] = 0;
353 $self->[_in_quote_] = 0;
354 $self->[_quote_target_] = "";
355 $self->[_line_start_quote_] = -1;
356 $self->[_starting_level_] = $args{starting_level};
357 $self->[_know_starting_level_] = defined( $args{starting_level} );
358 $self->[_tabsize_] = $args{tabsize};
359 $self->[_indent_columns_] = $args{indent_columns};
360 $self->[_look_for_hash_bang_] = $args{look_for_hash_bang};
361 $self->[_trim_qw_] = $args{trim_qw};
362 $self->[_continuation_indentation_] = $args{continuation_indentation};
363 $self->[_outdent_labels_] = $args{outdent_labels};
364 $self->[_last_line_number_] = $args{starting_line_number} - 1;
365 $self->[_saw_perl_dash_P_] = 0;
366 $self->[_saw_perl_dash_w_] = 0;
367 $self->[_saw_use_strict_] = 0;
368 $self->[_saw_v_string_] = 0;
369 $self->[_hit_bug_] = 0;
370 $self->[_look_for_autoloader_] = $args{look_for_autoloader};
371 $self->[_look_for_selfloader_] = $args{look_for_selfloader};
372 $self->[_saw_autoloader_] = 0;
373 $self->[_saw_selfloader_] = 0;
374 $self->[_saw_hash_bang_] = 0;
375 $self->[_saw_end_] = 0;
376 $self->[_saw_data_] = 0;
377 $self->[_saw_negative_indentation_] = 0;
378 $self->[_started_tokenizing_] = 0;
379 $self->[_line_buffer_object_] = $line_buffer_object;
380 $self->[_debugger_object_] = $args{debugger_object};
381 $self->[_diagnostics_object_] = $args{diagnostics_object};
382 $self->[_logger_object_] = $args{logger_object};
383 $self->[_unexpected_error_count_] = 0;
384 $self->[_started_looking_for_here_target_at_] = 0;
385 $self->[_nearly_matched_here_target_at_] = undef;
386 $self->[_line_of_text_] = "";
387 $self->[_rlower_case_labels_at_] = undef;
388 $self->[_extended_syntax_] = $args{extended_syntax};
389 $self->[_maximum_level_] = 0;
390 $self->[_true_brace_error_count_] = 0;
391 $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
392 $self->[_rOpts_maximum_unexpected_errors_] =
393 $rOpts->{'maximum-unexpected-errors'};
394 $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
395 $self->[_rOpts_] = $rOpts;
398 $tokenizer_self = $self;
400 prepare_for_a_new_file();
401 find_starting_indentation_level();
403 # This is not a full class yet, so die if an attempt is made to
404 # create more than one object.
406 if ( _increment_count() > 1 ) {
408 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
415 # interface to Perl::Tidy::Logger routines
418 my $logger_object = $tokenizer_self->[_logger_object_];
419 if ($logger_object) {
420 $logger_object->warning($msg);
427 my $logger_object = $tokenizer_self->[_logger_object_];
428 if ($logger_object) {
429 $logger_object->complain($msg);
434 sub write_logfile_entry {
436 my $logger_object = $tokenizer_self->[_logger_object_];
437 if ($logger_object) {
438 $logger_object->write_logfile_entry($msg);
443 sub interrupt_logfile {
444 my $logger_object = $tokenizer_self->[_logger_object_];
445 if ($logger_object) {
446 $logger_object->interrupt_logfile();
452 my $logger_object = $tokenizer_self->[_logger_object_];
453 if ($logger_object) {
454 $logger_object->resume_logfile();
459 sub increment_brace_error {
460 my $logger_object = $tokenizer_self->[_logger_object_];
461 if ($logger_object) {
462 $logger_object->increment_brace_error();
467 sub report_definite_bug {
468 $tokenizer_self->[_hit_bug_] = 1;
469 my $logger_object = $tokenizer_self->[_logger_object_];
470 if ($logger_object) {
471 $logger_object->report_definite_bug();
478 my $logger_object = $tokenizer_self->[_logger_object_];
479 if ($logger_object) {
480 $logger_object->brace_warning($msg);
485 sub get_saw_brace_error {
486 my $logger_object = $tokenizer_self->[_logger_object_];
487 if ($logger_object) {
488 return $logger_object->get_saw_brace_error();
495 sub get_unexpected_error_count {
497 return $self->[_unexpected_error_count_];
500 # interface to Perl::Tidy::Diagnostics routines
501 sub write_diagnostics {
503 if ( $tokenizer_self->[_diagnostics_object_] ) {
504 $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
509 sub get_maximum_level {
510 return $tokenizer_self->[_maximum_level_];
513 sub report_tokenization_errors {
517 # Report any tokenization errors and return a flag '$severe_error'.
518 # Set $severe_error = 1 if the tokenizations errors are so severe that
519 # the formatter should not attempt to format the file. Instead, it will
520 # just output the file verbatim.
522 # set severe error flag if tokenizer has encountered file reading problems
523 # (i.e. unexpected binary characters)
524 my $severe_error = $self->[_in_error_];
526 my $maxle = $self->[_rOpts_maximum_level_errors_];
527 my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
528 $maxle = 1 unless defined($maxle);
529 $maxue = 0 unless defined($maxue);
531 my $level = get_indentation_level();
532 if ( $level != $tokenizer_self->[_starting_level_] ) {
533 warning("final indentation level: $level\n");
534 my $level_diff = $tokenizer_self->[_starting_level_] - $level;
535 if ( $level_diff < 0 ) { $level_diff = -$level_diff }
537 # Set severe error flag if the level error is greater than 1.
538 # The formatter can function for any level error but it is probably
539 # best not to attempt formatting for a high level error.
540 if ( $maxle >= 0 && $level_diff > $maxle ) {
543 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
548 check_final_nesting_depths();
550 # Likewise, large numbers of brace errors usually indicate non-perl
551 # scirpts, so set the severe error flag at a low number. This is similar
552 # to the level check, but different because braces may balance but be
553 # incorrectly interlaced.
554 if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
558 if ( $tokenizer_self->[_look_for_hash_bang_]
559 && !$tokenizer_self->[_saw_hash_bang_] )
562 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
565 if ( $tokenizer_self->[_in_format_] ) {
566 warning("hit EOF while in format description\n");
569 if ( $tokenizer_self->[_in_skipped_] ) {
571 "hit EOF while in lines skipped with --code-skipping\n");
574 if ( $tokenizer_self->[_in_pod_] ) {
576 # Just write log entry if this is after __END__ or __DATA__
577 # because this happens to often, and it is not likely to be
579 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
581 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
587 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
593 if ( $tokenizer_self->[_in_here_doc_] ) {
595 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
596 my $started_looking_for_here_target_at =
597 $tokenizer_self->[_started_looking_for_here_target_at_];
598 if ($here_doc_target) {
600 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
605 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
608 my $nearly_matched_here_target_at =
609 $tokenizer_self->[_nearly_matched_here_target_at_];
610 if ($nearly_matched_here_target_at) {
612 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
617 # Something is seriously wrong if we ended inside a quote
618 if ( $tokenizer_self->[_in_quote_] ) {
620 my $line_start_quote = $tokenizer_self->[_line_start_quote_];
621 my $quote_target = $tokenizer_self->[_quote_target_];
623 ( $tokenizer_self->[_in_attribute_list_] )
627 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
631 if ( $tokenizer_self->[_hit_bug_] ) {
635 # Multiple "unexpected" type tokenization errors usually indicate parsing
636 # non-perl scripts, or that something is seriously wrong, so we should
637 # avoid formatting them. This can happen for example if we run perltidy on
638 # a shell script or an html file. But unfortunately this check can
639 # interfere with some extended syntaxes, such as RPerl, so it has to be off
641 my $ue_count = $tokenizer_self->[_unexpected_error_count_];
642 if ( $maxue > 0 && $ue_count > $maxue ) {
644 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
649 unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
651 write_logfile_entry("Suggest including '-w parameter'\n");
654 write_logfile_entry("Suggest including 'use warnings;'\n");
658 if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
659 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
662 unless ( $tokenizer_self->[_saw_use_strict_] ) {
663 write_logfile_entry("Suggest including 'use strict;'\n");
666 # it is suggested that labels have at least one upper case character
667 # for legibility and to avoid code breakage as new keywords are introduced
668 if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
669 my @lower_case_labels_at =
670 @{ $tokenizer_self->[_rlower_case_labels_at_] };
672 "Suggest using upper case characters in label(s)\n");
674 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
676 return $severe_error;
679 sub report_v_string {
681 # warn if this version can't handle v-strings
683 unless ( $tokenizer_self->[_saw_v_string_] ) {
684 $tokenizer_self->[_saw_v_string_] =
685 $tokenizer_self->[_last_line_number_];
689 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
695 sub get_input_line_number {
696 return $tokenizer_self->[_last_line_number_];
699 # returns the next tokenized line
704 # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
705 # $square_bracket_depth, $paren_depth
707 my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
708 $tokenizer_self->[_line_of_text_] = $input_line;
710 return unless ($input_line);
712 my $input_line_number = ++$tokenizer_self->[_last_line_number_];
714 my $write_logfile_entry = sub {
716 write_logfile_entry("Line $input_line_number: $msg");
719 # Find and remove what characters terminate this line, including any
721 my $input_line_separator = "";
722 if ( chomp($input_line) ) { $input_line_separator = $/ }
724 # The first test here very significantly speeds things up, but be sure to
725 # keep the regex and hash %other_line_endings the same.
726 if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
727 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
728 $input_line_separator = $2 . $input_line_separator;
732 # for backwards compatibility we keep the line text terminated with
733 # a newline character
735 $tokenizer_self->[_line_of_text_] = $input_line; # update
737 # create a data structure describing this line which will be
738 # returned to the caller.
740 # _line_type codes are:
741 # SYSTEM - system-specific code before hash-bang line
742 # CODE - line of perl code (including comments)
743 # POD_START - line starting pod, such as '=head'
744 # POD - pod documentation text
745 # POD_END - last line of pod section, '=cut'
746 # HERE - text of here-document
747 # HERE_END - last line of here-doc (target word)
748 # FORMAT - format section
749 # FORMAT_END - last line of format section, '.'
750 # DATA_START - __DATA__ line
751 # DATA - unidentified text following __DATA__
752 # END_START - __END__ line
753 # END - unidentified text following __END__
754 # ERROR - we are in big trouble, probably not a perl script
757 # _curly_brace_depth - depth of curly braces at start of line
758 # _square_bracket_depth - depth of square brackets at start of line
759 # _paren_depth - depth of parens at start of line
760 # _starting_in_quote - this line continues a multi-line quote
761 # (so don't trim leading blanks!)
762 # _ending_in_quote - this line ends in a multi-line quote
763 # (so don't trim trailing blanks!)
764 my $line_of_tokens = {
766 _line_text => $input_line,
767 _line_number => $input_line_number,
768 _guessed_indentation_level => 0,
769 _curly_brace_depth => $brace_depth,
770 _square_bracket_depth => $square_bracket_depth,
771 _paren_depth => $paren_depth,
772 _quote_character => '',
773 ## _rtoken_type => undef,
774 ## _rtokens => undef,
775 ## _rlevels => undef,
776 ## _rslevels => undef,
777 ## _rblock_type => undef,
778 ## _rcontainer_type => undef,
779 ## _rcontainer_environment => undef,
780 ## _rtype_sequence => undef,
781 ## _rnesting_tokens => undef,
782 ## _rci_levels => undef,
783 ## _rnesting_blocks => undef,
784 ## _starting_in_quote => 0,
785 ## _ending_in_quote => 0,
788 # must print line unchanged if we are in a here document
789 if ( $tokenizer_self->[_in_here_doc_] ) {
791 $line_of_tokens->{_line_type} = 'HERE';
792 my $here_doc_target = $tokenizer_self->[_here_doc_target_];
793 my $here_quote_character = $tokenizer_self->[_here_quote_character_];
794 my $candidate_target = $input_line;
795 chomp $candidate_target;
797 # Handle <<~ targets, which are indicated here by a leading space on
798 # the here quote character
799 if ( $here_quote_character =~ /^\s/ ) {
800 $candidate_target =~ s/^\s*//;
802 if ( $candidate_target eq $here_doc_target ) {
803 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
804 $line_of_tokens->{_line_type} = 'HERE_END';
805 $write_logfile_entry->("Exiting HERE document $here_doc_target\n");
807 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
808 if ( @{$rhere_target_list} ) { # there can be multiple here targets
809 ( $here_doc_target, $here_quote_character ) =
810 @{ shift @{$rhere_target_list} };
811 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
812 $tokenizer_self->[_here_quote_character_] =
813 $here_quote_character;
814 $write_logfile_entry->(
815 "Entering HERE document $here_doc_target\n");
816 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
817 $tokenizer_self->[_started_looking_for_here_target_at_] =
821 $tokenizer_self->[_in_here_doc_] = 0;
822 $tokenizer_self->[_here_doc_target_] = "";
823 $tokenizer_self->[_here_quote_character_] = "";
827 # check for error of extra whitespace
828 # note for PERL6: leading whitespace is allowed
830 $candidate_target =~ s/\s*$//;
831 $candidate_target =~ s/^\s*//;
832 if ( $candidate_target eq $here_doc_target ) {
833 $tokenizer_self->[_nearly_matched_here_target_at_] =
837 return $line_of_tokens;
840 # Print line unchanged if we are in a format section
841 elsif ( $tokenizer_self->[_in_format_] ) {
843 if ( $input_line =~ /^\.[\s#]*$/ ) {
845 # Decrement format depth count at a '.' after a 'format'
846 $tokenizer_self->[_in_format_]--;
848 # This is the end when count reaches 0
849 if ( !$tokenizer_self->[_in_format_] ) {
850 $write_logfile_entry->("Exiting format section\n");
851 $line_of_tokens->{_line_type} = 'FORMAT_END';
855 $line_of_tokens->{_line_type} = 'FORMAT';
856 if ( $input_line =~ /^\s*format\s+\w+/ ) {
858 # Increment format depth count at a 'format' within a 'format'
859 # This is a simple way to handle nested formats (issue c019).
860 $tokenizer_self->[_in_format_]++;
863 return $line_of_tokens;
866 # must print line unchanged if we are in pod documentation
867 elsif ( $tokenizer_self->[_in_pod_] ) {
869 $line_of_tokens->{_line_type} = 'POD';
870 if ( $input_line =~ /^=cut/ ) {
871 $line_of_tokens->{_line_type} = 'POD_END';
872 $write_logfile_entry->("Exiting POD section\n");
873 $tokenizer_self->[_in_pod_] = 0;
875 if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
877 "Hash-bang in pod can cause older versions of perl to fail! \n"
881 return $line_of_tokens;
884 # print line unchanged if in skipped section
885 elsif ( $tokenizer_self->[_in_skipped_] ) {
887 # NOTE: marked as the existing type 'FORMAT' to keep html working
888 $line_of_tokens->{_line_type} = 'FORMAT';
889 if ( $input_line =~ /$code_skipping_pattern_end/ ) {
890 $write_logfile_entry->("Exiting code-skipping section\n");
891 $tokenizer_self->[_in_skipped_] = 0;
893 return $line_of_tokens;
896 # must print line unchanged if we have seen a severe error (i.e., we
897 # are seeing illegal tokens and cannot continue. Syntax errors do
898 # not pass this route). Calling routine can decide what to do, but
899 # the default can be to just pass all lines as if they were after __END__
900 elsif ( $tokenizer_self->[_in_error_] ) {
901 $line_of_tokens->{_line_type} = 'ERROR';
902 return $line_of_tokens;
905 # print line unchanged if we are __DATA__ section
906 elsif ( $tokenizer_self->[_in_data_] ) {
908 # ...but look for POD
909 # Note that the _in_data and _in_end flags remain set
910 # so that we return to that state after seeing the
911 # end of a pod section
912 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
913 $line_of_tokens->{_line_type} = 'POD_START';
914 $write_logfile_entry->("Entering POD section\n");
915 $tokenizer_self->[_in_pod_] = 1;
916 return $line_of_tokens;
919 $line_of_tokens->{_line_type} = 'DATA';
920 return $line_of_tokens;
924 # print line unchanged if we are in __END__ section
925 elsif ( $tokenizer_self->[_in_end_] ) {
927 # ...but look for POD
928 # Note that the _in_data and _in_end flags remain set
929 # so that we return to that state after seeing the
930 # end of a pod section
931 if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
932 $line_of_tokens->{_line_type} = 'POD_START';
933 $write_logfile_entry->("Entering POD section\n");
934 $tokenizer_self->[_in_pod_] = 1;
935 return $line_of_tokens;
938 $line_of_tokens->{_line_type} = 'END';
939 return $line_of_tokens;
943 # check for a hash-bang line if we haven't seen one
944 if ( !$tokenizer_self->[_saw_hash_bang_] ) {
945 if ( $input_line =~ /^\#\!.*perl\b/ ) {
946 $tokenizer_self->[_saw_hash_bang_] = $input_line_number;
948 # check for -w and -P flags
949 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
950 $tokenizer_self->[_saw_perl_dash_P_] = 1;
953 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
954 $tokenizer_self->[_saw_perl_dash_w_] = 1;
958 $input_line_number > 1
960 # leave any hash bang in a BEGIN block alone
961 # i.e. see 'debugger-duck_type.t'
963 $last_nonblank_block_type
964 && $last_nonblank_block_type eq 'BEGIN'
966 && !$tokenizer_self->[_look_for_hash_bang_]
968 # Try to avoid giving a false alarm at a simple comment.
969 # These look like valid hash-bang lines:
973 #!c:\perl\bin\perl.exe
975 # These are comments:
977 #! sunos does not yet provide a /usr/bin/perl
979 # Comments typically have multiple spaces, which suggests
981 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
985 # this is helpful for VMS systems; we may have accidentally
986 # tokenized some DCL commands
987 if ( $tokenizer_self->[_started_tokenizing_] ) {
989 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
993 complain("Useless hash-bang after line 1\n");
997 # Report the leading hash-bang as a system line
998 # This will prevent -dac from deleting it
1000 $line_of_tokens->{_line_type} = 'SYSTEM';
1001 return $line_of_tokens;
1006 # wait for a hash-bang before parsing if the user invoked us with -x
1007 if ( $tokenizer_self->[_look_for_hash_bang_]
1008 && !$tokenizer_self->[_saw_hash_bang_] )
1010 $line_of_tokens->{_line_type} = 'SYSTEM';
1011 return $line_of_tokens;
1014 # a first line of the form ': #' will be marked as SYSTEM
1015 # since lines of this form may be used by tcsh
1016 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1017 $line_of_tokens->{_line_type} = 'SYSTEM';
1018 return $line_of_tokens;
1021 # now we know that it is ok to tokenize the line...
1022 # the line tokenizer will modify any of these private variables:
1023 # _rhere_target_list_
1031 my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
1032 tokenize_this_line($line_of_tokens);
1034 # Now finish defining the return structure and return it
1035 $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
1037 # handle severe error (binary data in script)
1038 if ( $tokenizer_self->[_in_error_] ) {
1039 $tokenizer_self->[_in_quote_] = 0; # to avoid any more messages
1040 warning("Giving up after error\n");
1041 $line_of_tokens->{_line_type} = 'ERROR';
1042 reset_indentation_level(0); # avoid error messages
1043 return $line_of_tokens;
1046 # handle start of pod documentation
1047 if ( $tokenizer_self->[_in_pod_] ) {
1049 # This gets tricky..above a __DATA__ or __END__ section, perl
1050 # accepts '=cut' as the start of pod section. But afterwards,
1051 # only pod utilities see it and they may ignore an =cut without
1052 # leading =head. In any case, this isn't good.
1053 if ( $input_line =~ /^=cut\b/ ) {
1054 if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
1056 complain("=cut while not in pod ignored\n");
1057 $tokenizer_self->[_in_pod_] = 0;
1058 $line_of_tokens->{_line_type} = 'POD_END';
1061 $line_of_tokens->{_line_type} = 'POD_START';
1063 "=cut starts a pod section .. this can fool pod utilities.\n"
1065 $write_logfile_entry->("Entering POD section\n");
1070 $line_of_tokens->{_line_type} = 'POD_START';
1071 $write_logfile_entry->("Entering POD section\n");
1074 return $line_of_tokens;
1077 # handle start of skipped section
1078 if ( $tokenizer_self->[_in_skipped_] ) {
1080 # NOTE: marked as the existing type 'FORMAT' to keep html working
1081 $line_of_tokens->{_line_type} = 'FORMAT';
1082 $write_logfile_entry->("Entering code-skipping section\n");
1083 return $line_of_tokens;
1086 # Update indentation levels for log messages.
1087 # Skip blank lines and also block comments, unless a logfile is requested.
1088 # Note that _line_of_text_ is the input line but trimmed from left to right.
1089 my $lot = $tokenizer_self->[_line_of_text_];
1090 if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) {
1091 my $rlevels = $line_of_tokens->{_rlevels};
1092 $line_of_tokens->{_guessed_indentation_level} =
1093 guess_old_indentation_level($input_line);
1096 # see if this line contains here doc targets
1097 my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
1098 if ( @{$rhere_target_list} ) {
1100 my ( $here_doc_target, $here_quote_character ) =
1101 @{ shift @{$rhere_target_list} };
1102 $tokenizer_self->[_in_here_doc_] = 1;
1103 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
1104 $tokenizer_self->[_here_quote_character_] = $here_quote_character;
1105 $write_logfile_entry->("Entering HERE document $here_doc_target\n");
1106 $tokenizer_self->[_started_looking_for_here_target_at_] =
1110 # NOTE: __END__ and __DATA__ statements are written unformatted
1111 # because they can theoretically contain additional characters
1112 # which are not tokenized (and cannot be read with <DATA> either!).
1113 if ( $tokenizer_self->[_in_data_] ) {
1114 $line_of_tokens->{_line_type} = 'DATA_START';
1115 $write_logfile_entry->("Starting __DATA__ section\n");
1116 $tokenizer_self->[_saw_data_] = 1;
1118 # keep parsing after __DATA__ if use SelfLoader was seen
1119 if ( $tokenizer_self->[_saw_selfloader_] ) {
1120 $tokenizer_self->[_in_data_] = 0;
1121 $write_logfile_entry->(
1122 "SelfLoader seen, continuing; -nlsl deactivates\n");
1125 return $line_of_tokens;
1128 elsif ( $tokenizer_self->[_in_end_] ) {
1129 $line_of_tokens->{_line_type} = 'END_START';
1130 $write_logfile_entry->("Starting __END__ section\n");
1131 $tokenizer_self->[_saw_end_] = 1;
1133 # keep parsing after __END__ if use AutoLoader was seen
1134 if ( $tokenizer_self->[_saw_autoloader_] ) {
1135 $tokenizer_self->[_in_end_] = 0;
1136 $write_logfile_entry->(
1137 "AutoLoader seen, continuing; -nlal deactivates\n");
1139 return $line_of_tokens;
1142 # now, finally, we know that this line is type 'CODE'
1143 $line_of_tokens->{_line_type} = 'CODE';
1145 # remember if we have seen any real code
1146 if ( !$tokenizer_self->[_started_tokenizing_]
1147 && $input_line !~ /^\s*$/
1148 && $input_line !~ /^\s*#/ )
1150 $tokenizer_self->[_started_tokenizing_] = 1;
1153 if ( $tokenizer_self->[_debugger_object_] ) {
1154 $tokenizer_self->[_debugger_object_]
1155 ->write_debug_entry($line_of_tokens);
1158 # Note: if keyword 'format' occurs in this line code, it is still CODE
1159 # (keyword 'format' need not start a line)
1160 if ( $tokenizer_self->[_in_format_] ) {
1161 $write_logfile_entry->("Entering format section\n");
1164 if ( $tokenizer_self->[_in_quote_]
1165 and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
1168 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1169 if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
1172 $tokenizer_self->[_line_start_quote_] = $input_line_number;
1173 $write_logfile_entry->(
1174 "Start multi-line quote or pattern ending in $quote_target\n");
1177 elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
1178 && !$tokenizer_self->[_in_quote_] )
1180 $tokenizer_self->[_line_start_quote_] = -1;
1181 $write_logfile_entry->("End of multi-line quote or pattern\n");
1184 # we are returning a line of CODE
1185 return $line_of_tokens;
1188 sub find_starting_indentation_level {
1190 # We need to find the indentation level of the first line of the
1191 # script being formatted. Often it will be zero for an entire file,
1192 # but if we are formatting a local block of code (within an editor for
1193 # example) it may not be zero. The user may specify this with the
1194 # -sil=n parameter but normally doesn't so we have to guess.
1196 # USES GLOBAL VARIABLES: $tokenizer_self
1197 my $starting_level = 0;
1199 # use value if given as parameter
1200 if ( $tokenizer_self->[_know_starting_level_] ) {
1201 $starting_level = $tokenizer_self->[_starting_level_];
1204 # if we know there is a hash_bang line, the level must be zero
1205 elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
1206 $tokenizer_self->[_know_starting_level_] = 1;
1209 # otherwise figure it out from the input file
1214 # keep looking at lines until we find a hash bang or piece of code
1217 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
1220 # if first line is #! then assume starting level is zero
1221 if ( $i == 1 && $line =~ /^\#\!/ ) {
1222 $starting_level = 0;
1225 next if ( $line =~ /^\s*#/ ); # skip past comments
1226 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1227 $starting_level = guess_old_indentation_level($line);
1230 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1231 write_logfile_entry("$msg");
1233 $tokenizer_self->[_starting_level_] = $starting_level;
1234 reset_indentation_level($starting_level);
1238 sub guess_old_indentation_level {
1241 # Guess the indentation level of an input line.
1243 # For the first line of code this result will define the starting
1244 # indentation level. It will mainly be non-zero when perltidy is applied
1245 # within an editor to a local block of code.
1247 # This is an impossible task in general because we can't know what tabs
1248 # meant for the old script and how many spaces were used for one
1249 # indentation level in the given input script. For example it may have
1250 # been previously formatted with -i=7 -et=3. But we can at least try to
1251 # make sure that perltidy guesses correctly if it is applied repeatedly to
1252 # a block of code within an editor, so that the block stays at the same
1253 # level when perltidy is applied repeatedly.
1255 # USES GLOBAL VARIABLES: $tokenizer_self
1258 # find leading tabs, spaces, and any statement label
1260 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1262 # If there are leading tabs, we use the tab scheme for this run, if
1263 # any, so that the code will remain stable when editing.
1264 if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1266 if ($2) { $spaces += length($2) }
1268 # correct for outdented labels
1269 if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1270 $spaces += $tokenizer_self->[_continuation_indentation_];
1274 # compute indentation using the value of -i for this run.
1275 # If -i=0 is used for this run (which is possible) it doesn't matter
1276 # what we do here but we'll guess that the old run used 4 spaces per level.
1277 my $indent_columns = $tokenizer_self->[_indent_columns_];
1278 $indent_columns = 4 if ( !$indent_columns );
1279 $level = int( $spaces / $indent_columns );
1283 # This is a currently unused debug routine
1284 sub dump_functions {
1287 foreach my $pkg ( keys %is_user_function ) {
1288 $fh->print("\nnon-constant subs in package $pkg\n");
1290 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1292 if ( $is_block_list_function{$pkg}{$sub} ) {
1293 $msg = 'block_list';
1296 if ( $is_block_function{$pkg}{$sub} ) {
1299 $fh->print("$sub $msg\n");
1303 foreach my $pkg ( keys %is_constant ) {
1304 $fh->print("\nconstants and constant subs in package $pkg\n");
1306 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1307 $fh->print("$sub\n");
1313 sub prepare_for_a_new_file {
1315 # previous tokens needed to determine what to expect next
1316 $last_nonblank_token = ';'; # the only possible starting state which
1317 $last_nonblank_type = ';'; # will make a leading brace a code block
1318 $last_nonblank_block_type = '';
1320 # scalars for remembering statement types across multiple lines
1321 $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
1322 $in_attribute_list = 0;
1324 # scalars for remembering where we are in the file
1325 $current_package = "main";
1326 $context = UNKNOWN_CONTEXT;
1328 # hashes used to remember function information
1329 %is_constant = (); # user-defined constants
1330 %is_user_function = (); # user-defined functions
1331 %user_function_prototype = (); # their prototypes
1332 %is_block_function = ();
1333 %is_block_list_function = ();
1334 %saw_function_definition = ();
1335 %saw_use_module = ();
1337 # variables used to track depths of various containers
1338 # and report nesting errors
1341 $square_bracket_depth = 0;
1342 @current_depth = (0) x scalar @closing_brace_names;
1345 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1346 @current_sequence_number = ();
1349 @paren_semicolon_count = ();
1350 @paren_structural_type = ();
1352 @brace_structural_type = ();
1353 @brace_context = ();
1354 @brace_package = ();
1355 @square_bracket_type = ();
1356 @square_bracket_structural_type = ();
1358 @nested_ternary_flag = ();
1359 @nested_statement_type = ();
1360 @starting_line_of_current_depth = ();
1362 $paren_type[$paren_depth] = '';
1363 $paren_semicolon_count[$paren_depth] = 0;
1364 $paren_structural_type[$brace_depth] = '';
1365 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1366 $brace_structural_type[$brace_depth] = '';
1367 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1368 $brace_package[$paren_depth] = $current_package;
1369 $square_bracket_type[$square_bracket_depth] = '';
1370 $square_bracket_structural_type[$square_bracket_depth] = '';
1372 initialize_tokenizer_state();
1376 { ## closure for sub tokenize_this_line
1378 use constant BRACE => 0;
1379 use constant SQUARE_BRACKET => 1;
1380 use constant PAREN => 2;
1381 use constant QUESTION_COLON => 3;
1383 # TV1: scalars for processing one LINE.
1384 # Re-initialized on each entry to sub tokenize_this_line.
1386 $block_type, $container_type, $expecting,
1387 $i, $i_tok, $input_line,
1388 $input_line_number, $last_nonblank_i, $max_token_index,
1389 $next_tok, $next_type, $peeked_ahead,
1390 $prototype, $rhere_target_list, $rtoken_map,
1391 $rtoken_type, $rtokens, $tok,
1392 $type, $type_sequence, $indent_flag,
1395 # TV2: refs to ARRAYS for processing one LINE
1396 # Re-initialized on each call.
1397 my $routput_token_list = []; # stack of output token indexes
1398 my $routput_token_type = []; # token types
1399 my $routput_block_type = []; # types of code block
1400 my $routput_container_type = []; # paren types, such as if, elsif, ..
1401 my $routput_type_sequence = []; # nesting sequential number
1402 my $routput_indent_flag = []; #
1404 # TV3: SCALARS for quote variables. These are initialized with a
1405 # subroutine call and continually updated as lines are processed.
1406 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1407 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1409 # TV4: SCALARS for multi-line identifiers and
1410 # statements. These are initialized with a subroutine call
1411 # and continually updated as lines are processed.
1412 my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1414 # TV5: SCALARS for tracking indentation level.
1415 # Initialized once and continually updated as lines are
1418 $nesting_token_string, $nesting_type_string,
1419 $nesting_block_string, $nesting_block_flag,
1420 $nesting_list_string, $nesting_list_flag,
1421 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1422 $in_statement_continuation, $level_in_tokenizer,
1423 $slevel_in_tokenizer, $rslevel_stack,
1426 # TV6: SCALARS for remembering several previous
1427 # tokens. Initialized once and continually updated as
1428 # lines are processed.
1430 $last_nonblank_container_type, $last_nonblank_type_sequence,
1431 $last_last_nonblank_token, $last_last_nonblank_type,
1432 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1433 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1436 # ----------------------------------------------------------------
1437 # beginning of tokenizer variable access and manipulation routines
1438 # ----------------------------------------------------------------
1440 sub initialize_tokenizer_state {
1442 # TV1: initialized on each call
1443 # TV2: initialized on each call
1447 $quote_character = "";
1450 $quoted_string_1 = "";
1451 $quoted_string_2 = "";
1452 $allowed_quote_modifiers = "";
1455 $id_scan_state = '';
1458 $indented_if_level = 0;
1461 $nesting_token_string = "";
1462 $nesting_type_string = "";
1463 $nesting_block_string = '1'; # initially in a block
1464 $nesting_block_flag = 1;
1465 $nesting_list_string = '0'; # initially not in a list
1466 $nesting_list_flag = 0; # initially not in a list
1467 $ci_string_in_tokenizer = "";
1468 $continuation_string_in_tokenizer = "0";
1469 $in_statement_continuation = 0;
1470 $level_in_tokenizer = 0;
1471 $slevel_in_tokenizer = 0;
1472 $rslevel_stack = [];
1475 $last_nonblank_container_type = '';
1476 $last_nonblank_type_sequence = '';
1477 $last_last_nonblank_token = ';';
1478 $last_last_nonblank_type = ';';
1479 $last_last_nonblank_block_type = '';
1480 $last_last_nonblank_container_type = '';
1481 $last_last_nonblank_type_sequence = '';
1482 $last_nonblank_prototype = "";
1486 sub save_tokenizer_state {
1489 $block_type, $container_type, $expecting,
1490 $i, $i_tok, $input_line,
1491 $input_line_number, $last_nonblank_i, $max_token_index,
1492 $next_tok, $next_type, $peeked_ahead,
1493 $prototype, $rhere_target_list, $rtoken_map,
1494 $rtoken_type, $rtokens, $tok,
1495 $type, $type_sequence, $indent_flag,
1499 $routput_token_list, $routput_token_type,
1500 $routput_block_type, $routput_container_type,
1501 $routput_type_sequence, $routput_indent_flag,
1505 $in_quote, $quote_type,
1506 $quote_character, $quote_pos,
1507 $quote_depth, $quoted_string_1,
1508 $quoted_string_2, $allowed_quote_modifiers,
1512 [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1515 $nesting_token_string, $nesting_type_string,
1516 $nesting_block_string, $nesting_block_flag,
1517 $nesting_list_string, $nesting_list_flag,
1518 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1519 $in_statement_continuation, $level_in_tokenizer,
1520 $slevel_in_tokenizer, $rslevel_stack,
1524 $last_nonblank_container_type,
1525 $last_nonblank_type_sequence,
1526 $last_last_nonblank_token,
1527 $last_last_nonblank_type,
1528 $last_last_nonblank_block_type,
1529 $last_last_nonblank_container_type,
1530 $last_last_nonblank_type_sequence,
1531 $last_nonblank_prototype,
1533 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1536 sub restore_tokenizer_state {
1538 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1540 $block_type, $container_type, $expecting,
1541 $i, $i_tok, $input_line,
1542 $input_line_number, $last_nonblank_i, $max_token_index,
1543 $next_tok, $next_type, $peeked_ahead,
1544 $prototype, $rhere_target_list, $rtoken_map,
1545 $rtoken_type, $rtokens, $tok,
1546 $type, $type_sequence, $indent_flag,
1550 $routput_token_list, $routput_token_type,
1551 $routput_block_type, $routput_container_type,
1552 $routput_type_sequence, $routput_type_sequence,
1556 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1557 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1560 ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1564 $nesting_token_string, $nesting_type_string,
1565 $nesting_block_string, $nesting_block_flag,
1566 $nesting_list_string, $nesting_list_flag,
1567 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1568 $in_statement_continuation, $level_in_tokenizer,
1569 $slevel_in_tokenizer, $rslevel_stack,
1573 $last_nonblank_container_type,
1574 $last_nonblank_type_sequence,
1575 $last_last_nonblank_token,
1576 $last_last_nonblank_type,
1577 $last_last_nonblank_block_type,
1578 $last_last_nonblank_container_type,
1579 $last_last_nonblank_type_sequence,
1580 $last_nonblank_prototype,
1585 sub get_indentation_level {
1587 # patch to avoid reporting error if indented if is not terminated
1588 if ($indented_if_level) { return $level_in_tokenizer - 1 }
1589 return $level_in_tokenizer;
1592 sub reset_indentation_level {
1593 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1594 push @{$rslevel_stack}, $slevel_in_tokenizer;
1600 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1601 return $peeked_ahead;
1604 # ------------------------------------------------------------
1605 # end of tokenizer variable access and manipulation routines
1606 # ------------------------------------------------------------
1608 # ------------------------------------------------------------
1609 # beginning of various scanner interface routines
1610 # ------------------------------------------------------------
1611 sub scan_replacement_text {
1613 # check for here-docs in replacement text invoked by
1614 # a substitution operator with executable modifier 'e'.
1619 # $rht = reference to any here-doc targets
1620 my ($replacement_text) = @_;
1623 return unless ( $replacement_text =~ /<</ );
1625 write_logfile_entry("scanning replacement text for here-doc targets\n");
1627 # save the logger object for error messages
1628 my $logger_object = $tokenizer_self->[_logger_object_];
1630 # localize all package variables
1632 $tokenizer_self, $last_nonblank_token,
1633 $last_nonblank_type, $last_nonblank_block_type,
1634 $statement_type, $in_attribute_list,
1635 $current_package, $context,
1636 %is_constant, %is_user_function,
1637 %user_function_prototype, %is_block_function,
1638 %is_block_list_function, %saw_function_definition,
1639 $brace_depth, $paren_depth,
1640 $square_bracket_depth, @current_depth,
1641 @total_depth, $total_depth,
1642 @nesting_sequence_number, @current_sequence_number,
1643 @paren_type, @paren_semicolon_count,
1644 @paren_structural_type, @brace_type,
1645 @brace_structural_type, @brace_context,
1646 @brace_package, @square_bracket_type,
1647 @square_bracket_structural_type, @depth_array,
1648 @starting_line_of_current_depth, @nested_ternary_flag,
1649 @nested_statement_type,
1652 # save all lexical variables
1653 my $rstate = save_tokenizer_state();
1654 _decrement_count(); # avoid error check for multiple tokenizers
1656 # make a new tokenizer
1658 my $rpending_logfile_message;
1659 my $source_object = Perl::Tidy::LineSource->new(
1660 input_file => \$replacement_text,
1662 rpending_logfile_message => $rpending_logfile_message,
1664 my $tokenizer = Perl::Tidy::Tokenizer->new(
1665 source_object => $source_object,
1666 logger_object => $logger_object,
1667 starting_line_number => $input_line_number,
1670 # scan the replacement text
1671 1 while ( $tokenizer->get_line() );
1673 # remove any here doc targets
1675 if ( $tokenizer_self->[_in_here_doc_] ) {
1679 $tokenizer_self->[_here_doc_target_],
1680 $tokenizer_self->[_here_quote_character_]
1682 if ( $tokenizer_self->[_rhere_target_list_] ) {
1683 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1684 $tokenizer_self->[_rhere_target_list_] = undef;
1686 $tokenizer_self->[_in_here_doc_] = undef;
1689 # now its safe to report errors
1690 my $severe_error = $tokenizer->report_tokenization_errors();
1692 # TODO: Could propagate a severe error up
1694 # restore all tokenizer lexical variables
1695 restore_tokenizer_state($rstate);
1697 # return the here doc targets
1701 sub scan_bare_identifier {
1702 ( $i, $tok, $type, $prototype ) =
1703 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1704 $rtoken_map, $max_token_index );
1708 sub scan_identifier {
1709 ( $i, $tok, $type, $id_scan_state, $identifier ) =
1710 scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
1711 $max_token_index, $expecting, $paren_type[$paren_depth] );
1715 use constant VERIFY_FASTSCAN => 0;
1716 my %fast_scan_context;
1719 %fast_scan_context = (
1720 '$' => SCALAR_CONTEXT,
1721 '*' => SCALAR_CONTEXT,
1722 '@' => LIST_CONTEXT,
1723 '%' => LIST_CONTEXT,
1724 '&' => UNKNOWN_CONTEXT,
1728 sub scan_identifier_fast {
1730 # This is a wrapper for sub scan_identifier. It does a fast preliminary
1731 # scan for certain common identifiers:
1732 # '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
1733 # If it does not find one of these, or this is a restart, it calls the
1734 # original scanner directly.
1736 # This gives the same results as the full scanner in about 1/4 the
1737 # total runtime for a typical input stream.
1740 my $tok_begin = $tok;
1743 ###############################
1744 # quick scan with leading sigil
1745 ###############################
1746 if ( !$id_scan_state
1747 && $i + 1 <= $max_token_index
1748 && $fast_scan_context{$tok} )
1750 $context = $fast_scan_context{$tok};
1752 # look for $var, @var, ...
1753 if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
1754 my $pretype_next = "";
1755 my $i_next = $i + 2;
1756 if ( $i_next <= $max_token_index ) {
1757 if ( $rtoken_type->[$i_next] eq 'b'
1758 && $i_next < $max_token_index )
1762 $pretype_next = $rtoken_type->[$i_next];
1764 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
1766 # Found type 'i' like '$var', '@var', or '%var'
1767 $identifier = $tok . $rtokens->[ $i + 1 ];
1771 $fast_scan_type = $type;
1775 # Look for @{ or %{ .
1776 # But we must let the full scanner handle things ${ because it may
1777 # keep going to get a complete identifier like '${#}' .
1779 $rtoken_type->[ $i + 1 ] eq '{'
1780 && ( $tok_begin eq '@'
1781 || $tok_begin eq '%' )
1787 $fast_scan_type = $type;
1791 ############################
1792 # Quick scan with leading ->
1793 # Look for ->[ and ->{
1794 ############################
1797 && $i < $max_token_index
1798 && ( $rtokens->[ $i + 1 ] eq '{'
1799 || $rtokens->[ $i + 1 ] eq '[' )
1803 $fast_scan_type = $type;
1805 $context = UNKNOWN_CONTEXT;
1808 #######################################
1809 # Verify correctness during development
1810 #######################################
1811 if ( VERIFY_FASTSCAN && $fast_scan_type ) {
1813 # We will call the full method
1814 my $identifier_simple = $identifier;
1815 my $tok_simple = $tok;
1816 my $fast_scan_type = $type;
1818 my $context_simple = $context;
1824 if ( $tok ne $tok_simple
1825 || $type ne $fast_scan_type
1827 || $identifier ne $identifier_simple
1829 || $context ne $context_simple )
1832 scan_identifier_fast differs from scan_identifier:
1833 simple: i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
1834 full: i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
1839 ###################################################
1840 # call full scanner if fast method did not succeed
1841 ###################################################
1842 if ( !$fast_scan_type ) {
1849 ( $i, $tok, $type, $id_scan_state ) =
1850 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
1851 $id_scan_state, $max_token_index );
1857 ( $i, $type, $number ) =
1858 scan_number_do( $input_line, $i, $rtoken_map, $type,
1863 use constant VERIFY_FASTNUM => 0;
1865 sub scan_number_fast {
1867 # This is a wrapper for sub scan_number. It does a fast preliminary
1868 # scan for a simple integer. It calls the original scan_number if it
1869 # does not find one.
1872 my $tok_begin = $tok;
1875 ##################################
1876 # Quick check for (signed) integer
1877 ##################################
1879 # This will be the string of digits:
1882 my $typ_d = $rtoken_type->[$i_d];
1884 # check for signed integer
1887 && ( $typ_d eq '+' || $typ_d eq '-' )
1888 && $i_d < $max_token_index )
1892 $tok_d = $rtokens->[$i_d];
1893 $typ_d = $rtoken_type->[$i_d];
1900 $i_d == $max_token_index
1901 || ( $i_d < $max_token_index
1902 && $rtoken_type->[ $i_d + 1 ] ne '.'
1903 && $rtoken_type->[ $i_d + 1 ] ne 'w' )
1907 # Let let full scanner handle multi-digit integers beginning with
1908 # '0' because there could be error messages. For example, '009' is
1909 # not a valid number.
1911 if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
1912 $number = $sign . $tok_d;
1918 #######################################
1919 # Verify correctness during development
1920 #######################################
1921 if ( VERIFY_FASTNUM && defined($number) ) {
1923 # We will call the full method
1924 my $type_simple = $type;
1926 my $number_simple = $number;
1930 $number = scan_number();
1932 if ( $type ne $type_simple
1933 || ( $i != $i_simple && $i <= $max_token_index )
1934 || $number ne $number_simple )
1937 scan_number_fast differs from scan_number:
1938 simple: i=$i_simple, type=$type_simple, number=$number_simple
1939 full: i=$i, type=$type, number=$number
1944 #########################################
1945 # call full scanner if may not be integer
1946 #########################################
1947 if ( !defined($number) ) {
1948 $number = scan_number();
1953 # a sub to warn if token found where term expected
1954 sub error_if_expecting_TERM {
1955 if ( $expecting == TERM ) {
1956 if ( $really_want_term{$last_nonblank_type} ) {
1957 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
1958 $rtoken_map, $rtoken_type, $input_line );
1965 # a sub to warn if token found where operator expected
1966 sub error_if_expecting_OPERATOR {
1968 if ( $expecting == OPERATOR ) {
1969 if ( !defined($thing) ) { $thing = $tok }
1970 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
1971 $rtoken_map, $rtoken_type, $input_line );
1972 if ( $i_tok == 0 ) {
1973 interrupt_logfile();
1974 warning("Missing ';' or ',' above?\n");
1982 # ------------------------------------------------------------
1983 # end scanner interfaces
1984 # ------------------------------------------------------------
1987 @_ = qw(for foreach);
1988 @is_for_foreach{@_} = (1) x scalar(@_);
1990 my %is_my_our_state;
1991 @_ = qw(my our state);
1992 @is_my_our_state{@_} = (1) x scalar(@_);
1994 # These keywords may introduce blocks after parenthesized expressions,
1996 # keyword ( .... ) { BLOCK }
1997 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1998 my %is_blocktype_with_paren;
2000 qw(if elsif unless while until for foreach switch case given when catch);
2001 @is_blocktype_with_paren{@_} = (1) x scalar(@_);
2003 my %is_case_default;
2004 @_ = qw(case default);
2005 @is_case_default{@_} = (1) x scalar(@_);
2007 # ------------------------------------------------------------
2008 # begin hash of code for handling most token types
2009 # ------------------------------------------------------------
2010 my $tokenization_code = {
2012 # no special code for these types yet, but syntax checks
2047 error_if_expecting_TERM()
2048 if ( $expecting == TERM );
2051 error_if_expecting_TERM()
2052 if ( $expecting == TERM );
2056 # start looking for a scalar
2057 error_if_expecting_OPERATOR("Scalar")
2058 if ( $expecting == OPERATOR );
2059 scan_identifier_fast();
2061 if ( $identifier eq '$^W' ) {
2062 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2065 # Check for identifier in indirect object slot
2066 # (vorboard.pl, sort.t). Something like:
2067 # /^(print|printf|sort|exec|system)$/
2069 $is_indirect_object_taker{$last_nonblank_token}
2070 || ( ( $last_nonblank_token eq '(' )
2071 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2072 || ( $last_nonblank_type eq 'w'
2073 || $last_nonblank_type eq 'U' ) # possible object
2082 $paren_semicolon_count[$paren_depth] = 0;
2084 $container_type = $want_paren;
2087 elsif ( $statement_type =~ /^sub\b/ ) {
2088 $container_type = $statement_type;
2091 $container_type = $last_nonblank_token;
2093 # We can check for a syntax error here of unexpected '(',
2094 # but this is going to get messy...
2096 $expecting == OPERATOR
2098 # Be sure this is not a method call of the form
2099 # &method(...), $method->(..), &{method}(...),
2100 # $ref[2](list) is ok & short for $ref[2]->(list)
2101 # NOTE: at present, braces in something like &{ xxx }
2102 # are not marked as a block, we might have a method call.
2103 # Added ')' to fix case c017, something like ()()()
2104 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2109 # ref: camel 3 p 703.
2110 if ( $last_last_nonblank_token eq 'do' ) {
2112 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2117 # if this is an empty list, (), then it is not an
2118 # error; for example, we might have a constant pi and
2119 # invoke it with pi() or just pi;
2120 my ( $next_nonblank_token, $i_next ) =
2121 find_next_nonblank_token( $i, $rtokens,
2124 # Patch for c029: give up error check if
2125 # a side comment follows
2126 if ( $next_nonblank_token ne ')'
2127 && $next_nonblank_token ne '#' )
2131 error_if_expecting_OPERATOR('(');
2133 if ( $last_nonblank_type eq 'C' ) {
2135 "$last_nonblank_token has a void prototype\n";
2137 elsif ( $last_nonblank_type eq 'i' ) {
2139 && $last_nonblank_token =~ /^\$/ )
2142 "Do you mean '$last_nonblank_token->(' ?\n";
2146 interrupt_logfile();
2150 } ## end if ( $next_nonblank_token...
2151 } ## end else [ if ( $last_last_nonblank_token...
2152 } ## end if ( $expecting == OPERATOR...
2154 $paren_type[$paren_depth] = $container_type;
2155 ( $type_sequence, $indent_flag ) =
2156 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2158 # propagate types down through nested parens
2159 # for example: the second paren in 'if ((' would be structural
2160 # since the first is.
2162 if ( $last_nonblank_token eq '(' ) {
2163 $type = $last_nonblank_type;
2166 # We exclude parens as structural after a ',' because it
2167 # causes subtle problems with continuation indentation for
2168 # something like this, where the first 'or' will not get
2173 # ( not defined $check )
2175 # or $check eq "new"
2176 # or $check eq "old",
2179 # Likewise, we exclude parens where a statement can start
2180 # because of problems with continuation indentation, like
2183 # ($firstline =~ /^#\!.*perl/)
2184 # and (print $File::Find::name, "\n")
2187 # (ref($usage_fref) =~ /CODE/)
2189 # : (&blast_usage, &blast_params, &blast_general_params);
2195 if ( $last_nonblank_type eq ')' ) {
2197 "Syntax error? found token '$last_nonblank_type' then '('\n"
2200 $paren_structural_type[$paren_depth] = $type;
2204 ( $type_sequence, $indent_flag ) =
2205 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2207 if ( $paren_structural_type[$paren_depth] eq '{' ) {
2211 $container_type = $paren_type[$paren_depth];
2213 # restore statement type as 'sub' at closing paren of a signature
2214 # so that a subsequent ':' is identified as an attribute
2215 if ( $container_type =~ /^sub\b/ ) {
2216 $statement_type = $container_type;
2220 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2221 my $num_sc = $paren_semicolon_count[$paren_depth];
2222 if ( $num_sc > 0 && $num_sc != 2 ) {
2223 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2227 if ( $paren_depth > 0 ) { $paren_depth-- }
2230 if ( $last_nonblank_type eq ',' ) {
2231 complain("Repeated ','s \n");
2234 # Note that we have to check both token and type here because a
2235 # comma following a qw list can have last token='(' but type = 'q'
2236 elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' )
2238 warning("Unexpected leading ',' after a '('\n");
2241 # patch for operator_expected: note if we are in the list (use.t)
2242 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2246 $context = UNKNOWN_CONTEXT;
2247 $statement_type = '';
2251 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2252 { # mark ; in for loop
2254 # Be careful: we do not want a semicolon such as the
2255 # following to be included:
2257 # for (sort {strcoll($a,$b);} keys %investments) {
2259 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2260 && $square_bracket_depth ==
2261 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2265 $paren_semicolon_count[$paren_depth]++;
2271 error_if_expecting_OPERATOR("String")
2272 if ( $expecting == OPERATOR );
2275 $allowed_quote_modifiers = "";
2278 error_if_expecting_OPERATOR("String")
2279 if ( $expecting == OPERATOR );
2282 $allowed_quote_modifiers = "";
2285 error_if_expecting_OPERATOR("String")
2286 if ( $expecting == OPERATOR );
2289 $allowed_quote_modifiers = "";
2294 # a pattern cannot follow certain keywords which take optional
2295 # arguments, like 'shift' and 'pop'. See also '?'.
2297 $last_nonblank_type eq 'k'
2298 && $is_keyword_rejecting_slash_as_pattern_delimiter{
2299 $last_nonblank_token}
2304 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
2306 ( $is_pattern, $msg ) =
2307 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2311 write_diagnostics("DIVIDE:$msg\n");
2312 write_logfile_entry($msg);
2315 else { $is_pattern = ( $expecting == TERM ) }
2320 $allowed_quote_modifiers = '[msixpodualngc]';
2322 else { # not a pattern; check for a /= token
2324 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
2330 #DEBUG - collecting info on what tokens follow a divide
2331 # for development of guessing algorithm
2332 #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2333 # #write_diagnostics( "DIVIDE? $input_line\n" );
2339 # if we just saw a ')', we will label this block with
2340 # its type. We need to do this to allow sub
2341 # code_block_type to determine if this brace starts a
2342 # code block or anonymous hash. (The type of a paren
2343 # pair is the preceding token, such as 'if', 'else',
2345 $container_type = "";
2347 # ATTRS: for a '{' following an attribute list, reset
2348 # things to look like we just saw the sub name
2349 if ( $statement_type =~ /^sub\b/ ) {
2350 $last_nonblank_token = $statement_type;
2351 $last_nonblank_type = 'i';
2352 $statement_type = "";
2355 # patch for SWITCH/CASE: hide these keywords from an immediately
2356 # following opening brace
2357 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2358 && $statement_type eq $last_nonblank_token )
2360 $last_nonblank_token = ";";
2363 elsif ( $last_nonblank_token eq ')' ) {
2364 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2366 # defensive move in case of a nesting error (pbug.t)
2367 # in which this ')' had no previous '('
2368 # this nesting error will have been caught
2369 if ( !defined($last_nonblank_token) ) {
2370 $last_nonblank_token = 'if';
2373 # check for syntax error here;
2374 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2375 if ( $tokenizer_self->[_extended_syntax_] ) {
2377 # we append a trailing () to mark this as an unknown
2378 # block type. This allows perltidy to format some
2379 # common extensions of perl syntax.
2380 # This is used by sub code_block_type
2381 $last_nonblank_token .= '()';
2385 join( ' ', sort keys %is_blocktype_with_paren );
2387 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2393 # patch for paren-less for/foreach glitch, part 2.
2394 # see note below under 'qw'
2395 elsif ($last_nonblank_token eq 'qw'
2396 && $is_for_foreach{$want_paren} )
2398 $last_nonblank_token = $want_paren;
2399 if ( $last_last_nonblank_token eq $want_paren ) {
2401 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2408 # now identify which of the three possible types of
2409 # curly braces we have: hash index container, anonymous
2410 # hash reference, or code block.
2412 # non-structural (hash index) curly brace pair
2413 # get marked 'L' and 'R'
2414 if ( is_non_structural_brace() ) {
2417 # patch for SWITCH/CASE:
2418 # allow paren-less identifier after 'when'
2419 # if the brace is preceded by a space
2420 if ( $statement_type eq 'when'
2421 && $last_nonblank_type eq 'i'
2422 && $last_last_nonblank_type eq 'k'
2423 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2426 $block_type = $statement_type;
2430 # code and anonymous hash have the same type, '{', but are
2431 # distinguished by 'block_type',
2432 # which will be blank for an anonymous hash
2435 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2438 # patch to promote bareword type to function taking block
2440 && $last_nonblank_type eq 'w'
2441 && $last_nonblank_i >= 0 )
2443 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2444 $routput_token_type->[$last_nonblank_i] = 'G';
2448 # patch for SWITCH/CASE: if we find a stray opening block brace
2449 # where we might accept a 'case' or 'when' block, then take it
2450 if ( $statement_type eq 'case'
2451 || $statement_type eq 'when' )
2453 if ( !$block_type || $block_type eq '}' ) {
2454 $block_type = $statement_type;
2459 $brace_type[ ++$brace_depth ] = $block_type;
2460 $brace_package[$brace_depth] = $current_package;
2461 $brace_structural_type[$brace_depth] = $type;
2462 $brace_context[$brace_depth] = $context;
2463 ( $type_sequence, $indent_flag ) =
2464 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2467 $block_type = $brace_type[$brace_depth];
2468 if ($block_type) { $statement_type = '' }
2469 if ( defined( $brace_package[$brace_depth] ) ) {
2470 $current_package = $brace_package[$brace_depth];
2473 # can happen on brace error (caught elsewhere)
2476 ( $type_sequence, $indent_flag ) =
2477 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2479 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2483 # propagate type information for 'do' and 'eval' blocks, and also
2484 # for smartmatch operator. This is necessary to enable us to know
2485 # if an operator or term is expected next.
2486 if ( $is_block_operator{$block_type} ) {
2490 $context = $brace_context[$brace_depth];
2491 if ( $brace_depth > 0 ) { $brace_depth--; }
2493 '&' => sub { # maybe sub call? start looking
2495 # We have to check for sub call unless we are sure we
2496 # are expecting an operator. This example from s2p
2497 # got mistaken as a q operator in an early version:
2498 # print BODY &q(<<'EOT');
2499 if ( $expecting != OPERATOR ) {
2501 # But only look for a sub call if we are expecting a term or
2502 # if there is no existing space after the &.
2503 # For example we probably don't want & as sub call here:
2504 # Fcntl::S_IRUSR & $mode;
2505 if ( $expecting == TERM || $next_type ne 'b' ) {
2506 scan_identifier_fast();
2512 '<' => sub { # angle operator or less than?
2514 if ( $expecting != OPERATOR ) {
2516 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2517 $expecting, $max_token_index );
2519 ## This message is not very helpful and quite confusing if the above
2520 ## routine decided not to write a message with the line number.
2521 ## if ( $type eq '<' && $expecting == TERM ) {
2522 ## error_if_expecting_TERM();
2523 ## interrupt_logfile();
2524 ## warning("Unterminated <> operator?\n");
2525 ## resume_logfile();
2532 '?' => sub { # ?: conditional or starting pattern?
2536 # Patch for rt #126965
2537 # a pattern cannot follow certain keywords which take optional
2538 # arguments, like 'shift' and 'pop'. See also '/'.
2540 $last_nonblank_type eq 'k'
2541 && $is_keyword_rejecting_question_as_pattern_delimiter{
2542 $last_nonblank_token}
2548 # patch for RT#131288, user constant function without prototype
2549 # last type is 'U' followed by ?.
2550 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2553 elsif ( $expecting == UNKNOWN ) {
2555 # In older versions of Perl, a bare ? can be a pattern
2556 # delimiter. In perl version 5.22 this was
2557 # dropped, but we have to support it in order to format
2558 # older programs. See:
2559 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2560 # For example, the following line worked
2562 # ?(.*)? && (print $1,"\n");
2563 # In current versions it would have to be written with slashes:
2564 # /(.*)/ && (print $1,"\n");
2566 ( $is_pattern, $msg ) =
2567 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2570 if ($msg) { write_logfile_entry($msg) }
2572 else { $is_pattern = ( $expecting == TERM ) }
2577 $allowed_quote_modifiers = '[msixpodualngc]';
2580 ( $type_sequence, $indent_flag ) =
2581 increase_nesting_depth( QUESTION_COLON,
2582 $rtoken_map->[$i_tok] );
2585 '*' => sub { # typeglob, or multiply?
2587 if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2588 if ( $next_type ne 'b'
2589 && $next_type ne '('
2590 && $next_type ne '#' ) # Fix c036
2595 if ( $expecting == TERM ) {
2596 scan_identifier_fast();
2600 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2605 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2609 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2617 '.' => sub { # what kind of . ?
2619 if ( $expecting != OPERATOR ) {
2621 if ( $type eq '.' ) {
2622 error_if_expecting_TERM()
2623 if ( $expecting == TERM );
2631 # if this is the first nonblank character, call it a label
2632 # since perl seems to just swallow it
2633 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2637 # ATTRS: check for a ':' which introduces an attribute list
2638 # either after a 'sub' keyword or within a paren list
2639 elsif ( $statement_type =~ /^sub\b/ ) {
2641 $in_attribute_list = 1;
2644 # Within a signature, unless we are in a ternary. For example,
2645 # from 't/filter_example.t':
2646 # method foo4 ( $class: $bar ) { $class->bar($bar) }
2647 elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2648 && !is_balanced_closing_container(QUESTION_COLON) )
2651 $in_attribute_list = 1;
2654 # check for scalar attribute, such as
2655 # my $foo : shared = 1;
2656 elsif ($is_my_our_state{$statement_type}
2657 && $current_depth[QUESTION_COLON] == 0 )
2660 $in_attribute_list = 1;
2663 # Look for Switch::Plain syntax if an error would otherwise occur
2664 # here. Note that we do not need to check if the extended syntax
2665 # flag is set because otherwise an error would occur, and we would
2666 # then have to output a message telling the user to set the
2667 # extended syntax flag to avoid the error.
2671 # Note that the line 'default:' will be parsed as a label elsewhere.
2672 elsif ( $is_case_default{$statement_type}
2673 && !is_balanced_closing_container(QUESTION_COLON) )
2675 # mark it as a perltidy label type
2679 # otherwise, it should be part of a ?/: operator
2681 ( $type_sequence, $indent_flag ) =
2682 decrease_nesting_depth( QUESTION_COLON,
2683 $rtoken_map->[$i_tok] );
2684 if ( $last_nonblank_token eq '?' ) {
2685 warning("Syntax error near ? :\n");
2689 '+' => sub { # what kind of plus?
2691 if ( $expecting == TERM ) {
2692 my $number = scan_number_fast();
2694 # unary plus is safest assumption if not a number
2695 if ( !defined($number) ) { $type = 'p'; }
2697 elsif ( $expecting == OPERATOR ) {
2700 if ( $next_type eq 'w' ) { $type = 'p' }
2705 error_if_expecting_OPERATOR("Array")
2706 if ( $expecting == OPERATOR );
2707 scan_identifier_fast();
2709 '%' => sub { # hash or modulo?
2711 # first guess is hash if no following blank or paren
2712 if ( $expecting == UNKNOWN ) {
2713 if ( $next_type ne 'b' && $next_type ne '(' ) {
2717 if ( $expecting == TERM ) {
2718 scan_identifier_fast();
2722 $square_bracket_type[ ++$square_bracket_depth ] =
2723 $last_nonblank_token;
2724 ( $type_sequence, $indent_flag ) =
2725 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2727 # It may seem odd, but structural square brackets have
2728 # type '{' and '}'. This simplifies the indentation logic.
2729 if ( !is_non_structural_brace() ) {
2732 $square_bracket_structural_type[$square_bracket_depth] = $type;
2735 ( $type_sequence, $indent_flag ) =
2736 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2738 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
2743 # propagate type information for smartmatch operator. This is
2744 # necessary to enable us to know if an operator or term is expected
2746 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
2747 $tok = $square_bracket_type[$square_bracket_depth];
2750 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
2752 '-' => sub { # what kind of minus?
2754 if ( ( $expecting != OPERATOR )
2755 && $is_file_test_operator{$next_tok} )
2757 my ( $next_nonblank_token, $i_next ) =
2758 find_next_nonblank_token( $i + 1, $rtokens,
2761 # check for a quoted word like "-w=>xx";
2762 # it is sufficient to just check for a following '='
2763 if ( $next_nonblank_token eq '=' ) {
2772 elsif ( $expecting == TERM ) {
2773 my $number = scan_number_fast();
2775 # maybe part of bareword token? unary is safest
2776 if ( !defined($number) ) { $type = 'm'; }
2779 elsif ( $expecting == OPERATOR ) {
2783 if ( $next_type eq 'w' ) {
2791 # check for special variables like ${^WARNING_BITS}
2792 if ( $expecting == TERM ) {
2794 # FIXME: this should work but will not catch errors
2795 # because we also have to be sure that previous token is
2796 # a type character ($,@,%).
2797 if ( $last_nonblank_token eq '{'
2798 && ( $next_tok !~ /^\d/ )
2799 && ( $next_tok =~ /^\w/ ) )
2802 if ( $next_tok eq 'W' ) {
2803 $tokenizer_self->[_saw_perl_dash_w_] = 1;
2805 $tok = $tok . $next_tok;
2811 unless ( error_if_expecting_TERM() ) {
2813 # Something like this is valid but strange:
2815 complain("The '^' seems unusual here\n");
2821 '::' => sub { # probably a sub call
2822 scan_bare_identifier();
2824 '<<' => sub { # maybe a here-doc?
2826 ## This check removed because it could be a deprecated here-doc with
2827 ## no specified target. See example in log 16 Sep 2020.
2829 ## unless ( $i < $max_token_index )
2830 ## ; # here-doc not possible if end of line
2832 if ( $expecting != OPERATOR ) {
2833 my ( $found_target, $here_doc_target, $here_quote_character,
2836 $found_target, $here_doc_target, $here_quote_character, $i,
2839 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2842 if ($found_target) {
2843 push @{$rhere_target_list},
2844 [ $here_doc_target, $here_quote_character ];
2846 if ( length($here_doc_target) > 80 ) {
2847 my $truncated = substr( $here_doc_target, 0, 80 );
2848 complain("Long here-target: '$truncated' ...\n");
2850 elsif ( !$here_doc_target ) {
2852 'Use of bare << to mean <<"" is deprecated' . "\n" )
2853 unless ($here_quote_character);
2855 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2857 "Unconventional here-target: '$here_doc_target'\n");
2860 elsif ( $expecting == TERM ) {
2861 unless ($saw_error) {
2863 # shouldn't happen..
2864 warning("Program bug; didn't find here doc target\n");
2865 report_definite_bug();
2872 '<<~' => sub { # a here-doc, new type added in v26
2874 unless ( $i < $max_token_index )
2875 ; # here-doc not possible if end of line
2876 if ( $expecting != OPERATOR ) {
2877 my ( $found_target, $here_doc_target, $here_quote_character,
2880 $found_target, $here_doc_target, $here_quote_character, $i,
2883 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2886 if ($found_target) {
2888 if ( length($here_doc_target) > 80 ) {
2889 my $truncated = substr( $here_doc_target, 0, 80 );
2890 complain("Long here-target: '$truncated' ...\n");
2892 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2894 "Unconventional here-target: '$here_doc_target'\n");
2897 # Note that we put a leading space on the here quote
2898 # character indicate that it may be preceded by spaces
2899 $here_quote_character = " " . $here_quote_character;
2900 push @{$rhere_target_list},
2901 [ $here_doc_target, $here_quote_character ];
2904 elsif ( $expecting == TERM ) {
2905 unless ($saw_error) {
2907 # shouldn't happen..
2908 warning("Program bug; didn't find here doc target\n");
2909 report_definite_bug();
2918 # if -> points to a bare word, we must scan for an identifier,
2919 # otherwise something like ->y would look like the y operator
2921 # NOTE: this will currently allow things like
2922 # '->@array' '->*VAR' '->%hash'
2923 # to get parsed as identifiers, even though these are not currently
2924 # allowed syntax. To catch syntax errors like this we could first
2925 # check that the next character and skip this call if it is one of
2926 # ' @ % * '. A disadvantage with doing this is that this would
2927 # have to be fixed if the perltidy syntax is ever extended to make
2928 # any of these valid. So for now this check is not done.
2929 scan_identifier_fast();
2932 # type = 'pp' for pre-increment, '++' for post-increment
2934 if ( $expecting == TERM ) { $type = 'pp' }
2935 elsif ( $expecting == UNKNOWN ) {
2937 my ( $next_nonblank_token, $i_next ) =
2938 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2940 # Fix for c042: look past a side comment
2941 if ( $next_nonblank_token eq '#' ) {
2942 ( $next_nonblank_token, $i_next ) =
2943 find_next_nonblank_token( $max_token_index,
2944 $rtokens, $max_token_index );
2947 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
2952 if ( $last_nonblank_type eq $tok ) {
2953 complain("Repeated '=>'s \n");
2956 # patch for operator_expected: note if we are in the list (use.t)
2957 # TODO: make version numbers a new token type
2958 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2961 # type = 'mm' for pre-decrement, '--' for post-decrement
2964 if ( $expecting == TERM ) { $type = 'mm' }
2965 elsif ( $expecting == UNKNOWN ) {
2966 my ( $next_nonblank_token, $i_next ) =
2967 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2969 # Fix for c042: look past a side comment
2970 if ( $next_nonblank_token eq '#' ) {
2971 ( $next_nonblank_token, $i_next ) =
2972 find_next_nonblank_token( $max_token_index,
2973 $rtokens, $max_token_index );
2976 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
2981 error_if_expecting_TERM()
2982 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
2986 error_if_expecting_TERM()
2987 if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015
2991 error_if_expecting_TERM()
2992 if ( $expecting == TERM );
2996 # ------------------------------------------------------------
2997 # end hash of code for handling individual token types
2998 # ------------------------------------------------------------
3000 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
3002 # These block types terminate statements and do not need a trailing
3004 # patched for SWITCH/CASE/
3005 my %is_zero_continuation_block_type;
3006 @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
3007 if elsif else unless while until for foreach switch case given when);
3008 @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
3010 my %is_not_zero_continuation_block_type;
3011 @_ = qw(sort grep map do eval);
3012 @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
3014 my %is_logical_container;
3015 @_ = qw(if elsif unless while and or err not && ! || for foreach);
3016 @is_logical_container{@_} = (1) x scalar(@_);
3020 @is_binary_type{@_} = (1) x scalar(@_);
3022 my %is_binary_keyword;
3023 @_ = qw(and or err eq ne cmp);
3024 @is_binary_keyword{@_} = (1) x scalar(@_);
3026 # 'L' is token for opening { at hash key
3027 my %is_opening_type;
3029 @is_opening_type{@_} = (1) x scalar(@_);
3031 # 'R' is token for closing } at hash key
3032 my %is_closing_type;
3034 @is_closing_type{@_} = (1) x scalar(@_);
3036 my %is_redo_last_next_goto;
3037 @_ = qw(redo last next goto);
3038 @is_redo_last_next_goto{@_} = (1) x scalar(@_);
3041 @_ = qw(use require);
3042 @is_use_require{@_} = (1) x scalar(@_);
3044 # This hash holds the array index in $tokenizer_self for these keywords:
3045 # Fix for issue c035: removed 'format' from this hash
3047 '__END__' => _in_end_,
3048 '__DATA__' => _in_data_,
3051 # original ref: camel 3 p 147,
3052 # but perl may accept undocumented flags
3053 # perl 5.10 adds 'p' (preserve)
3054 # Perl version 5.22 added 'n'
3055 # From http://perldoc.perl.org/perlop.html we have
3056 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
3057 # s/PATTERN/REPLACEMENT/msixpodualngcer
3058 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
3059 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
3060 # qr/STRING/msixpodualn
3061 my %quote_modifiers = (
3062 's' => '[msixpodualngcer]',
3065 'm' => '[msixpodualngc]',
3066 'qr' => '[msixpodualn]',
3073 # table showing how many quoted things to look for after quote operator..
3074 # s, y, tr have 2 (pattern and replacement)
3075 # others have 1 (pattern only)
3088 use constant DEBUG_TOKENIZE => 0;
3090 sub tokenize_this_line {
3092 # This routine breaks a line of perl code into tokens which are of use in
3093 # indentation and reformatting. One of my goals has been to define tokens
3094 # such that a newline may be inserted between any pair of tokens without
3095 # changing or invalidating the program. This version comes close to this,
3096 # although there are necessarily a few exceptions which must be caught by
3097 # the formatter. Many of these involve the treatment of bare words.
3099 # The tokens and their types are returned in arrays. See previous
3100 # routine for their names.
3102 # See also the array "valid_token_types" in the BEGIN section for an
3105 # To simplify things, token types are either a single character, or they
3106 # are identical to the tokens themselves.
3108 # As a debugging aid, the -D flag creates a file containing a side-by-side
3109 # comparison of the input string and its tokenization for each line of a file.
3110 # This is an invaluable debugging aid.
3112 # In addition to tokens, and some associated quantities, the tokenizer
3113 # also returns flags indication any special line types. These include
3114 # quotes, here_docs, formats.
3116 # -----------------------------------------------------------------------
3118 # How to add NEW_TOKENS:
3120 # New token types will undoubtedly be needed in the future both to keep up
3121 # with changes in perl and to help adapt the tokenizer to other applications.
3123 # Here are some notes on the minimal steps. I wrote these notes while
3124 # adding the 'v' token type for v-strings, which are things like version
3125 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
3126 # can use your editor to search for the string "NEW_TOKENS" to find the
3127 # appropriate sections to change):
3129 # *. Try to talk somebody else into doing it! If not, ..
3131 # *. Make a backup of your current version in case things don't work out!
3133 # *. Think of a new, unused character for the token type, and add to
3134 # the array @valid_token_types in the BEGIN section of this package.
3135 # For example, I used 'v' for v-strings.
3137 # *. Implement coding to recognize the $type of the token in this routine.
3138 # This is the hardest part, and is best done by imitating or modifying
3139 # some of the existing coding. For example, to recognize v-strings, I
3140 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
3141 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
3143 # *. Update sub operator_expected. This update is critically important but
3144 # the coding is trivial. Look at the comments in that routine for help.
3145 # For v-strings, which should behave like numbers, I just added 'v' to the
3146 # regex used to handle numbers and strings (types 'n' and 'Q').
3148 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
3149 # Perl::Tidy::Formatter for breaking lines around this token type. You can
3150 # skip this step and take the default at first, then adjust later to get
3151 # desired results. For adding type 'v', I looked at sub bond_strength and
3152 # saw that number type 'n' was using default strengths, so I didn't do
3153 # anything. I may tune it up someday if I don't like the way line
3154 # breaks with v-strings look.
3156 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
3157 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
3158 # and saw that type 'n' used spaces on both sides, so I just added 'v'
3159 # to the array @spaces_both_sides.
3161 # *. Update HtmlWriter package so that users can colorize the token as
3162 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
3163 # that package. For v-strings, I initially chose to use a default color
3164 # equal to the default for numbers, but it might be nice to change that
3167 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
3169 # *. Run lots and lots of debug tests. Start with special files designed
3170 # to test the new token type. Run with the -D flag to create a .DEBUG
3171 # file which shows the tokenization. When these work ok, test as many old
3172 # scripts as possible. Start with all of the '.t' files in the 'test'
3173 # directory of the distribution file. Compare .tdy output with previous
3174 # version and updated version to see the differences. Then include as
3175 # many more files as possible. My own technique has been to collect a huge
3176 # number of perl scripts (thousands!) into one directory and run perltidy
3177 # *, then run diff between the output of the previous version and the
3180 # *. For another example, search for the smartmatch operator '~~'
3181 # with your editor to see where updates were made for it.
3183 # -----------------------------------------------------------------------
3185 my $line_of_tokens = shift;
3186 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
3188 # patch while coding change is underway
3189 # make callers private data to allow access
3190 # $tokenizer_self = $caller_tokenizer_self;
3192 # extract line number for use in error messages
3193 $input_line_number = $line_of_tokens->{_line_number};
3195 # reinitialize for multi-line quote
3196 $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
3198 # check for pod documentation
3199 if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
3200 && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
3203 # must not be in multi-line quote
3204 # and must not be in an equation
3206 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
3208 $tokenizer_self->[_in_pod_] = 1;
3213 $input_line = $untrimmed_input_line;
3217 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
3218 # This will be used below to avoid quoting a bare word followed by
3222 # trim start of this line unless we are continuing a quoted line
3223 # do not trim end because we might end in a quote (test: deken4.pl)
3224 # Perl::Tidy::Formatter will delete needless trailing blanks
3225 unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
3226 $input_line =~ s/^\s+//; # trim left end
3228 $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
3229 && $input_line =~ /^\s*__(END|DATA)__\s*$/;
3232 # update the copy of the line for use in error messages
3233 # This must be exactly what we give the pre_tokenizer
3234 $tokenizer_self->[_line_of_text_] = $input_line;
3236 # re-initialize for the main loop
3237 $routput_token_list = []; # stack of output token indexes
3238 $routput_token_type = []; # token types
3239 $routput_block_type = []; # types of code block
3240 $routput_container_type = []; # paren types, such as if, elsif, ..
3241 $routput_type_sequence = []; # nesting sequential number
3243 $rhere_target_list = [];
3245 $tok = $last_nonblank_token;
3246 $type = $last_nonblank_type;
3247 $prototype = $last_nonblank_prototype;
3248 $last_nonblank_i = -1;
3249 $block_type = $last_nonblank_block_type;
3250 $container_type = $last_nonblank_container_type;
3251 $type_sequence = $last_nonblank_type_sequence;
3255 # tokenization is done in two stages..
3256 # stage 1 is a very simple pre-tokenization
3257 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
3259 # optimize for a full-line comment
3260 if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
3261 $max_tokens_wanted = 1; # no use tokenizing a comment
3263 # and check for skipped section
3264 if ( $rOpts_code_skipping
3265 && $input_line =~ /$code_skipping_pattern_begin/ )
3267 $tokenizer_self->[_in_skipped_] = 1;
3272 # start by breaking the line into pre-tokens
3273 ( $rtokens, $rtoken_map, $rtoken_type ) =
3274 pre_tokenize( $input_line, $max_tokens_wanted );
3276 $max_token_index = scalar( @{$rtokens} ) - 1;
3277 push( @{$rtokens}, ' ', ' ', ' ' ); # extra whitespace simplifies logic
3278 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
3279 push( @{$rtoken_type}, 'b', 'b', 'b' );
3281 # initialize for main loop
3282 foreach my $ii ( 0 .. $max_token_index + 3 ) {
3283 $routput_token_type->[$ii] = "";
3284 $routput_block_type->[$ii] = "";
3285 $routput_container_type->[$ii] = "";
3286 $routput_type_sequence->[$ii] = "";
3287 $routput_indent_flag->[$ii] = 0;
3292 # ------------------------------------------------------------
3293 # begin main tokenization loop
3294 # ------------------------------------------------------------
3296 # we are looking at each pre-token of one line and combining them
3298 while ( ++$i <= $max_token_index ) {
3300 if ($in_quote) { # continue looking for end of a quote
3301 $type = $quote_type;
3303 unless ( @{$routput_token_list} )
3304 { # initialize if continuation line
3305 push( @{$routput_token_list}, $i );
3306 $routput_token_type->[$i] = $type;
3309 $tok = $quote_character if ($quote_character);
3311 # scan for the end of the quote or pattern
3313 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
3314 $quoted_string_1, $quoted_string_2
3317 $i, $in_quote, $quote_character,
3318 $quote_pos, $quote_depth, $quoted_string_1,
3319 $quoted_string_2, $rtokens, $rtoken_map,
3323 # all done if we didn't find it
3324 last if ($in_quote);
3326 # save pattern and replacement text for rescanning
3327 my $qs1 = $quoted_string_1;
3328 my $qs2 = $quoted_string_2;
3330 # re-initialize for next search
3331 $quote_character = '';
3334 $quoted_string_1 = "";
3335 $quoted_string_2 = "";
3336 last if ( ++$i > $max_token_index );
3338 # look for any modifiers
3339 if ($allowed_quote_modifiers) {
3341 # check for exact quote modifiers
3342 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
3343 my $str = $rtokens->[$i];
3345 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
3346 my $pos = pos($str);
3347 my $char = substr( $str, $pos - 1, 1 );
3348 $saw_modifier_e ||= ( $char eq 'e' );
3351 # For an 'e' quote modifier we must scan the replacement
3352 # text for here-doc targets...
3353 # but if the modifier starts a new line we can skip
3354 # this because either the here doc will be fully
3355 # contained in the replacement text (so we can
3356 # ignore it) or Perl will not find it.
3357 # See test 'here2.in'.
3358 if ( $saw_modifier_e && $i_tok >= 0 ) {
3360 my $rht = scan_replacement_text($qs1);
3362 # Change type from 'Q' to 'h' for quotes with
3363 # here-doc targets so that the formatter (see sub
3364 # process_line_of_CODE) will not make any line
3365 # breaks after this point.
3367 push @{$rhere_target_list}, @{$rht};
3370 my $ilast = $routput_token_list->[-1];
3371 $routput_token_type->[$ilast] = $type;
3376 if ( defined( pos($str) ) ) {
3379 if ( pos($str) == length($str) ) {
3380 last if ( ++$i > $max_token_index );
3383 # Looks like a joined quote modifier
3384 # and keyword, maybe something like
3385 # s/xxx/yyy/gefor @k=...
3386 # Example is "galgen.pl". Would have to split
3387 # the word and insert a new token in the
3388 # pre-token list. This is so rare that I haven't
3389 # done it. Will just issue a warning citation.
3391 # This error might also be triggered if my quote
3392 # modifier characters are incomplete
3396 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
3397 Please put a space between quote modifiers and trailing keywords.
3400 # print "token $rtokens->[$i]\n";
3401 # my $num = length($str) - pos($str);
3402 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
3403 # print "continuing with new token $rtokens->[$i]\n";
3405 # skipping past this token does least damage
3406 last if ( ++$i > $max_token_index );
3411 # example file: rokicki4.pl
3412 # This error might also be triggered if my quote
3413 # modifier characters are incomplete
3414 write_logfile_entry(
3415 "Note: found word $str at quote modifier location\n"
3421 $allowed_quote_modifiers = "";
3425 unless ( $type eq 'b' || $tok eq 'CORE::' ) {
3427 # try to catch some common errors
3428 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
3430 if ( $last_nonblank_token eq 'eq' ) {
3431 complain("Should 'eq' be '==' here ?\n");
3433 elsif ( $last_nonblank_token eq 'ne' ) {
3434 complain("Should 'ne' be '!=' here ?\n");
3438 $last_last_nonblank_token = $last_nonblank_token;
3439 $last_last_nonblank_type = $last_nonblank_type;
3440 $last_last_nonblank_block_type = $last_nonblank_block_type;
3441 $last_last_nonblank_container_type =
3442 $last_nonblank_container_type;
3443 $last_last_nonblank_type_sequence =
3444 $last_nonblank_type_sequence;
3445 $last_nonblank_token = $tok;
3446 $last_nonblank_type = $type;
3447 $last_nonblank_prototype = $prototype;
3448 $last_nonblank_block_type = $block_type;
3449 $last_nonblank_container_type = $container_type;
3450 $last_nonblank_type_sequence = $type_sequence;
3451 $last_nonblank_i = $i_tok;
3453 # Patch for c030: Fix things in case a '->' got separated from
3454 # the subsequent identifier by a side comment. We need the
3455 # last_nonblank_token to have a leading -> to avoid triggering
3456 # an operator expected error message at the next '('. See also
3458 if ( $last_last_nonblank_token eq '->' ) {
3459 if ( $last_nonblank_type eq 'w'
3460 || $last_nonblank_type eq 'i'
3461 && substr( $last_nonblank_token, 0, 1 ) eq '$' )
3463 $last_nonblank_token = '->' . $last_nonblank_token;
3464 $last_nonblank_type = 'i';
3469 # store previous token type
3470 if ( $i_tok >= 0 ) {
3471 $routput_token_type->[$i_tok] = $type;
3472 $routput_block_type->[$i_tok] = $block_type;
3473 $routput_container_type->[$i_tok] = $container_type;
3474 $routput_type_sequence->[$i_tok] = $type_sequence;
3475 $routput_indent_flag->[$i_tok] = $indent_flag;
3477 my $pre_tok = $rtokens->[$i]; # get the next pre-token
3478 my $pre_type = $rtoken_type->[$i]; # and type
3480 $type = $pre_type; # to be modified as necessary
3481 $block_type = ""; # blank for all tokens except code block braces
3482 $container_type = ""; # blank for all tokens except some parens
3483 $type_sequence = ""; # blank for all tokens except ?/:
3485 $prototype = ""; # blank for all tokens except user defined subs
3488 # this pre-token will start an output token
3489 push( @{$routput_token_list}, $i_tok );
3491 # continue gathering identifier if necessary
3492 # but do not start on blanks and comments
3493 if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {
3495 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
3502 last if ($id_scan_state);
3503 next if ( ( $i > 0 ) || $type );
3505 # didn't find any token; start over
3510 # handle whitespace tokens..
3511 next if ( $type eq 'b' );
3512 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : ' ';
3513 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
3515 # Build larger tokens where possible, since we are not in a quote.
3517 # First try to assemble digraphs. The following tokens are
3518 # excluded and handled specially:
3519 # '/=' is excluded because the / might start a pattern.
3520 # 'x=' is excluded since it might be $x=, with $ on previous line
3521 # '**' and *= might be typeglobs of punctuation variables
3522 # I have allowed tokens starting with <, such as <=,
3523 # because I don't think these could be valid angle operators.
3524 # test file: storrs4.pl
3525 my $test_tok = $tok . $rtokens->[ $i + 1 ];
3526 my $combine_ok = $is_digraph{$test_tok};
3528 # check for special cases which cannot be combined
3531 # '//' must be defined_or operator if an operator is expected.
3532 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
3533 # could be migrated here for clarity
3535 # Patch for RT#102371, misparsing a // in the following snippet:
3536 # state $b //= ccc();
3537 # The solution is to always accept the digraph (or trigraph) after
3538 # token type 'Z' (possible file handle). The reason is that
3539 # sub operator_expected gives TERM expected here, which is
3540 # wrong in this case.
3541 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
3542 my $next_type = $rtokens->[ $i + 1 ];
3544 operator_expected( [ $prev_type, $tok, $next_type ] );
3546 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
3547 $combine_ok = 0 if ( $expecting == TERM );
3550 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
3551 # Accept the digraphs '**' only after type 'Z'
3552 # Otherwise postpone the decision.
3553 if ( $test_tok eq '**' ) {
3554 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
3561 && ( $test_tok ne '/=' ) # might be pattern
3562 && ( $test_tok ne 'x=' ) # might be $x
3563 && ( $test_tok ne '*=' ) # typeglob?
3565 # Moved above as part of fix for
3566 # RT #114359: Missparsing of "print $x ** 0.5;
3567 # && ( $test_tok ne '**' ) # typeglob?
3573 # Now try to assemble trigraphs. Note that all possible
3574 # perl trigraphs can be constructed by appending a character
3576 $test_tok = $tok . $rtokens->[ $i + 1 ];
3578 if ( $is_trigraph{$test_tok} ) {
3583 # The only current tetragraph is the double diamond operator
3584 # and its first three characters are not a trigraph, so
3585 # we do can do a special test for it
3586 elsif ( $test_tok eq '<<>' ) {
3587 $test_tok .= $rtokens->[ $i + 2 ];
3588 if ( $is_tetragraph{$test_tok} ) {
3596 $next_tok = $rtokens->[ $i + 1 ];
3597 $next_type = $rtoken_type->[ $i + 1 ];
3599 DEBUG_TOKENIZE && do {
3602 $last_nonblank_token, $tok,
3603 $next_tok, $brace_depth,
3604 $brace_type[$brace_depth], $paren_depth,
3605 $paren_type[$paren_depth]
3607 print STDOUT "TOKENIZE:(@debug_list)\n";
3610 # Turn off attribute list on first non-blank, non-bareword.
3611 # Added '#' to fix c038.
3612 if ( $pre_type ne 'w' && $pre_type ne '#' ) {
3613 $in_attribute_list = 0;
3616 ###############################################################
3617 # We have the next token, $tok.
3618 # Now we have to examine this token and decide what it is
3619 # and define its $type
3621 # section 1: bare words
3622 ###############################################################
3624 if ( $pre_type eq 'w' ) {
3626 operator_expected( [ $prev_type, $tok, $next_type ] );
3628 # Patch for c043, part 3: A bareword after '->' expects a TERM
3629 # FIXME: It would be cleaner to give method calls a new type 'M'
3630 # and update sub operator_expected to handle this.
3631 if ( $last_nonblank_type eq '->' ) {
3635 my ( $next_nonblank_token, $i_next ) =
3636 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3638 # ATTRS: handle sub and variable attributes
3639 if ($in_attribute_list) {
3641 # treat bare word followed by open paren like qw(
3642 if ( $next_nonblank_token eq '(' ) {
3644 # For something like:
3646 # we should let do_scan_sub see it so that it can see
3647 # the prototype. All other attributes get parsed as a
3649 if ( $tok eq 'prototype' ) {
3650 $id_scan_state = 'prototype';
3652 # start just after the word 'prototype'
3654 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3656 input_line => $input_line,
3661 rtokens => $rtokens,
3662 rtoken_map => $rtoken_map,
3663 id_scan_state => $id_scan_state,
3664 max_token_index => $max_token_index
3668 # If successful, mark as type 'q' to be consistent with other
3669 # attributes. Note that type 'w' would also work.
3670 if ( $i > $i_beg ) {
3675 # If not successful, continue and parse as a quote.
3678 # All other attribute lists must be parsed as quotes
3679 # (see 'signatures.t' for good examples)
3680 $in_quote = $quote_items{'q'};
3681 $allowed_quote_modifiers = $quote_modifiers{'q'};
3687 # handle bareword not followed by open paren
3694 # quote a word followed by => operator
3695 # unless the word __END__ or __DATA__ and the only word on
3697 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
3699 if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
3700 if ( $is_constant{$current_package}{$tok} ) {
3703 elsif ( $is_user_function{$current_package}{$tok} ) {
3706 $user_function_prototype{$current_package}{$tok};
3708 elsif ( $tok =~ /^v\d+$/ ) {
3710 report_v_string($tok);
3714 # Bareword followed by a fat comma ... see 'git18.in'
3715 # If tok is something like 'x17' then it could
3716 # actually be operator x followed by number 17.
3717 # For example, here:
3718 # 123x17 => [ 792, 1224 ],
3719 # (a key of 123 repeated 17 times, perhaps not
3720 # what was intended). We will mark x17 as type
3721 # 'n' and it will be split. If the previous token
3722 # was also a bareword then it is not very clear is
3723 # going on. In this case we will not be sure that
3724 # an operator is expected, so we just mark it as a
3725 # bareword. Perl is a little murky in what it does
3726 # with stuff like this, and its behavior can change
3727 # over time. Something like
3728 # a x18 => [792, 1224], will compile as
3729 # a key with 18 a's. But something like
3730 # push @array, a x18;
3731 # is a syntax error.
3732 if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
3739 error_if_expecting_OPERATOR();
3747 # quote a bare word within braces..like xxx->{s}; note that we
3748 # must be sure this is not a structural brace, to avoid
3749 # mistaking {s} in the following for a quoted bare word:
3750 # for(@[){s}bla}BLA}
3751 # Also treat q in something like var{-q} as a bare word, not qoute operator
3753 $next_nonblank_token eq '}'
3755 $last_nonblank_type eq 'L'
3756 || ( $last_nonblank_type eq 'm'
3757 && $last_last_nonblank_type eq 'L' )
3765 # Scan a bare word following a -> as an identifir; it could
3766 # have a long package name. Fixes c037, c041.
3767 if ( $last_nonblank_token eq '->' ) {
3768 scan_bare_identifier();
3770 # Patch for c043, part 4; use type 'w' after a '->'.
3771 # This is just a safety check on sub scan_bare_identifier,
3772 # which should get this case correct.
3777 # a bare word immediately followed by :: is not a keyword;
3778 # use $tok_kw when testing for keywords to avoid a mistake
3780 if ( $rtokens->[ $i + 1 ] eq ':'
3781 && $rtokens->[ $i + 2 ] eq ':' )
3786 # Decide if 'sub :' can be the start of a sub attribute list.
3787 # We will decide based on if the colon is followed by a
3788 # bareword which is not a keyword.
3789 my $sub_attribute_ok_here;
3790 if ( $is_sub{$tok_kw}
3791 && $expecting != OPERATOR
3792 && $next_nonblank_token eq ':' )
3794 my ( $nn_nonblank_token, $i_nn ) =
3795 find_next_nonblank_token( $i_next + 1,
3796 $rtokens, $max_token_index );
3797 $sub_attribute_ok_here =
3798 $nn_nonblank_token =~ /^\w/
3799 && $nn_nonblank_token !~ /^\d/
3800 && !$is_keyword{$nn_nonblank_token};
3803 # handle operator x (now we know it isn't $x=)
3804 if ( $expecting == OPERATOR
3805 && substr( $tok, 0, 1 ) eq 'x'
3806 && $tok =~ /^x\d*$/ )
3808 if ( $tok eq 'x' ) {
3810 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3820 # NOTE: mark something like x4 as an integer for now
3821 # It gets fixed downstream. This is easier than
3822 # splitting the pretoken.
3827 elsif ( $tok_kw eq 'CORE::' ) {
3828 $type = $tok = $tok_kw;
3831 elsif ( ( $tok eq 'strict' )
3832 and ( $last_nonblank_token eq 'use' ) )
3834 $tokenizer_self->[_saw_use_strict_] = 1;
3835 scan_bare_identifier();
3838 elsif ( ( $tok eq 'warnings' )
3839 and ( $last_nonblank_token eq 'use' ) )
3841 $tokenizer_self->[_saw_perl_dash_w_] = 1;
3843 # scan as identifier, so that we pick up something like:
3844 # use warnings::register
3845 scan_bare_identifier();
3849 $tok eq 'AutoLoader'
3850 && $tokenizer_self->[_look_for_autoloader_]
3852 $last_nonblank_token eq 'use'
3854 # these regexes are from AutoSplit.pm, which we want
3856 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3857 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3861 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3862 $tokenizer_self->[_saw_autoloader_] = 1;
3863 $tokenizer_self->[_look_for_autoloader_] = 0;
3864 scan_bare_identifier();
3868 $tok eq 'SelfLoader'
3869 && $tokenizer_self->[_look_for_selfloader_]
3870 && ( $last_nonblank_token eq 'use'
3871 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
3872 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
3875 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
3876 $tokenizer_self->[_saw_selfloader_] = 1;
3877 $tokenizer_self->[_look_for_selfloader_] = 0;
3878 scan_bare_identifier();
3881 elsif ( ( $tok eq 'constant' )
3882 and ( $last_nonblank_token eq 'use' ) )
3884 scan_bare_identifier();
3885 my ( $next_nonblank_token, $i_next ) =
3886 find_next_nonblank_token( $i, $rtokens,
3889 if ($next_nonblank_token) {
3891 if ( $is_keyword{$next_nonblank_token} ) {
3893 # Assume qw is used as a quote and okay, as in:
3894 # use constant qw{ DEBUG 0 };
3895 # Not worth trying to parse for just a warning
3897 # NOTE: This warning is deactivated because recent
3898 # versions of perl do not complain here, but
3899 # the coding is retained for reference.
3900 if ( 0 && $next_nonblank_token ne 'qw' ) {
3902 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
3908 $is_constant{$current_package}{$next_nonblank_token}
3914 # various quote operators
3915 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
3917 if ( $expecting == OPERATOR ) {
3919 # Be careful not to call an error for a qw quote
3920 # where a parenthesized list is allowed. For example,
3921 # it could also be a for/foreach construct such as
3923 # foreach my $key qw\Uno Due Tres Quadro\ {
3924 # print "Set $key\n";
3928 # Or it could be a function call.
3929 # NOTE: Braces in something like &{ xxx } are not
3930 # marked as a block, we might have a method call.
3931 # &method(...), $method->(..), &{method}(...),
3932 # $ref[2](list) is ok & short for $ref[2]->(list)
3934 # See notes in 'sub code_block_type' and
3935 # 'sub is_non_structural_brace'
3939 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3940 || $is_for_foreach{$want_paren} )
3943 error_if_expecting_OPERATOR();
3946 $in_quote = $quote_items{$tok};
3947 $allowed_quote_modifiers = $quote_modifiers{$tok};
3949 # All quote types are 'Q' except possibly qw quotes.
3950 # qw quotes are special in that they may generally be trimmed
3951 # of leading and trailing whitespace. So they are given a
3952 # separate type, 'q', unless requested otherwise.
3954 ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3957 $quote_type = $type;
3960 # check for a statement label
3962 ( $next_nonblank_token eq ':' )
3963 && ( $rtokens->[ $i_next + 1 ] ne ':' )
3964 && ( $i_next <= $max_token_index ) # colon on same line
3965 && !$sub_attribute_ok_here # like 'sub : lvalue' ?
3969 if ( $tok !~ /[A-Z]/ ) {
3970 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
3980 elsif ( $is_sub{$tok_kw} ) {
3981 error_if_expecting_OPERATOR()
3982 if ( $expecting == OPERATOR );
3983 initialize_subname();
3988 elsif ( $is_package{$tok_kw} ) {
3989 error_if_expecting_OPERATOR()
3990 if ( $expecting == OPERATOR );
3994 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
3995 # more restrictive. Require a new statement to be ok here.
3996 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
3997 $type = ';'; # make tokenizer look for TERM next
3998 $tokenizer_self->[_in_format_] = 1;
4002 # Note on token types for format, __DATA__, __END__:
4003 # It simplifies things to give these type ';', so that when we
4004 # start rescanning we will be expecting a token of type TERM.
4005 # We will switch to type 'k' before outputting the tokens.
4006 elsif ( $is_END_DATA{$tok_kw} ) {
4007 $type = ';'; # make tokenizer look for TERM next
4009 # Remember that we are in one of these three sections
4010 $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4014 elsif ( $is_keyword{$tok_kw} ) {
4017 # Since for and foreach may not be followed immediately
4018 # by an opening paren, we have to remember which keyword
4019 # is associated with the next '('
4020 if ( $is_for_foreach{$tok} ) {
4021 if ( new_statement_ok() ) {
4026 # recognize 'use' statements, which are special
4027 elsif ( $is_use_require{$tok} ) {
4028 $statement_type = $tok;
4029 error_if_expecting_OPERATOR()
4030 if ( $expecting == OPERATOR );
4033 # remember my and our to check for trailing ": shared"
4034 elsif ( $is_my_our_state{$tok} ) {
4035 $statement_type = $tok;
4038 # Check for misplaced 'elsif' and 'else', but allow isolated
4039 # else or elsif blocks to be formatted. This is indicated
4040 # by a last noblank token of ';'
4041 elsif ( $tok eq 'elsif' ) {
4042 if ( $last_nonblank_token ne ';'
4043 && $last_nonblank_block_type !~
4044 /^(if|elsif|unless)$/ )
4047 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
4051 elsif ( $tok eq 'else' ) {
4053 # patched for SWITCH/CASE
4055 $last_nonblank_token ne ';'
4056 && $last_nonblank_block_type !~
4057 /^(if|elsif|unless|case|when)$/
4059 # patch to avoid an unwanted error message for
4060 # the case of a parenless 'case' (RT 105484):
4061 # switch ( 1 ) { case x { 2 } else { } }
4062 && $statement_type !~
4063 /^(if|elsif|unless|case|when)$/
4067 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
4071 elsif ( $tok eq 'continue' ) {
4072 if ( $last_nonblank_token ne ';'
4073 && $last_nonblank_block_type !~
4074 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
4077 # note: ';' '{' and '}' in list above
4078 # because continues can follow bare blocks;
4079 # ':' is labeled block
4081 ############################################
4082 # NOTE: This check has been deactivated because
4083 # continue has an alternative usage for given/when
4084 # blocks in perl 5.10
4085 ## warning("'$tok' should follow a block\n");
4086 ############################################
4090 # patch for SWITCH/CASE if 'case' and 'when are
4091 # treated as keywords. Also 'default' for Switch::Plain
4092 elsif ($tok eq 'when'
4094 || $tok eq 'default' )
4096 $statement_type = $tok; # next '{' is block
4100 # indent trailing if/unless/while/until
4101 # outdenting will be handled by later indentation loop
4102 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
4114 ## if ( $tok =~ /^(if|unless|while|until)$/
4115 ## && $next_nonblank_token ne '(' )
4117 ## $indent_flag = 1;
4121 # check for inline label following
4122 # /^(redo|last|next|goto)$/
4123 elsif (( $last_nonblank_type eq 'k' )
4124 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4133 scan_bare_identifier();
4135 if ( $statement_type eq 'use'
4136 && $last_nonblank_token eq 'use' )
4138 $saw_use_module{$current_package}->{$tok} = 1;
4141 if ( $type eq 'w' ) {
4143 if ( $expecting == OPERATOR ) {
4145 # Patch to avoid error message for RPerl overloaded
4146 # operator functions: use overload
4151 # FIXME: this should eventually be generalized
4152 if ( $saw_use_module{$current_package}->{'RPerl'}
4153 && $tok =~ /^sse_(mul|div|add|sub)$/ )
4158 # Fix part 1 for git #63 in which a comment falls
4159 # between an -> and the following word. An
4160 # alternate fix would be to change operator_expected
4161 # to return an UNKNOWN for this type.
4162 elsif ( $last_nonblank_type eq '->' ) {
4166 # don't complain about possible indirect object
4170 # sub new($) { ... }
4171 # $b = new A::; # calls A::new
4172 # $c = new A; # same thing but suspicious
4173 # This will call A::new but we have a 'new' in
4174 # main:: which looks like a constant.
4176 elsif ( $last_nonblank_type eq 'C' ) {
4177 if ( $tok !~ /::$/ ) {
4179 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
4180 Maybe indirectet object notation?
4185 error_if_expecting_OPERATOR("bareword");
4189 # mark bare words immediately followed by a paren as
4191 $next_tok = $rtokens->[ $i + 1 ];
4192 if ( $next_tok eq '(' ) {
4194 # Fix part 2 for git #63. Leave type as 'w' to keep
4195 # the type the same as if the -> were not separated
4196 $type = 'U' unless ( $last_nonblank_type eq '->' );
4199 # underscore after file test operator is file handle
4200 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
4204 # patch for SWITCH/CASE if 'case' and 'when are
4205 # not treated as keywords:
4209 && $brace_type[$brace_depth] eq 'switch'
4212 && $brace_type[$brace_depth] eq 'given' )
4215 $statement_type = $tok; # next '{' is block
4216 $type = 'k'; # for keyword syntax coloring
4219 # patch for SWITCH/CASE if switch and given not keywords
4220 # Switch is not a perl 5 keyword, but we will gamble
4221 # and mark switch followed by paren as a keyword. This
4222 # is only necessary to get html syntax coloring nice,
4223 # and does not commit this as being a switch/case.
4224 if ( $next_nonblank_token eq '('
4225 && ( $tok eq 'switch' || $tok eq 'given' ) )
4227 $type = 'k'; # for keyword syntax coloring
4233 ###############################################################
4234 # section 2: strings of digits
4235 ###############################################################
4236 elsif ( $pre_type eq 'd' ) {
4238 operator_expected( [ $prev_type, $tok, $next_type ] );
4239 error_if_expecting_OPERATOR("Number")
4240 if ( $expecting == OPERATOR );
4242 my $number = scan_number_fast();
4243 if ( !defined($number) ) {
4245 # shouldn't happen - we should always get a number
4246 warning("non-number beginning with digit--program bug\n");
4247 report_definite_bug();
4251 ###############################################################
4252 # section 3: all other tokens
4253 ###############################################################
4256 last if ( $tok eq '#' );
4257 my $code = $tokenization_code->{$tok};
4260 operator_expected( [ $prev_type, $tok, $next_type ] );
4267 # -----------------------------
4268 # end of main tokenization loop
4269 # -----------------------------
4271 if ( $i_tok >= 0 ) {
4272 $routput_token_type->[$i_tok] = $type;
4273 $routput_block_type->[$i_tok] = $block_type;
4274 $routput_container_type->[$i_tok] = $container_type;
4275 $routput_type_sequence->[$i_tok] = $type_sequence;
4276 $routput_indent_flag->[$i_tok] = $indent_flag;
4279 unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
4280 $last_last_nonblank_token = $last_nonblank_token;
4281 $last_last_nonblank_type = $last_nonblank_type;
4282 $last_last_nonblank_block_type = $last_nonblank_block_type;
4283 $last_last_nonblank_container_type = $last_nonblank_container_type;
4284 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
4285 $last_nonblank_token = $tok;
4286 $last_nonblank_type = $type;
4287 $last_nonblank_block_type = $block_type;
4288 $last_nonblank_container_type = $container_type;
4289 $last_nonblank_type_sequence = $type_sequence;
4290 $last_nonblank_prototype = $prototype;
4293 # reset indentation level if necessary at a sub or package
4294 # in an attempt to recover from a nesting error
4295 if ( $level_in_tokenizer < 0 ) {
4296 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4297 reset_indentation_level(0);
4298 brace_warning("resetting level to 0 at $1 $2\n");
4302 # all done tokenizing this line ...
4303 # now prepare the final list of tokens and types
4305 my @token_type = (); # stack of output token types
4306 my @block_type = (); # stack of output code block types
4307 my @container_type = (); # stack of output code container types
4308 my @type_sequence = (); # stack of output type sequence numbers
4309 my @tokens = (); # output tokens
4310 my @levels = (); # structural brace levels of output tokens
4311 my @slevels = (); # secondary nesting levels of output tokens
4312 my @nesting_tokens = (); # string of tokens leading to this depth
4313 my @nesting_types = (); # string of token types leading to this depth
4314 my @nesting_blocks = (); # string of block types leading to this depth
4315 my @nesting_lists = (); # string of list types leading to this depth
4316 my @ci_string = (); # string needed to compute continuation indentation
4317 my @container_environment = (); # BLOCK or LIST
4318 my $container_environment = '';
4319 my $im = -1; # previous $i value
4322 # Count the number of '1's in the string (previously sub ones_count)
4323 my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4325 # Computing Token Indentation
4327 # The final section of the tokenizer forms tokens and also computes
4328 # parameters needed to find indentation. It is much easier to do it
4329 # in the tokenizer than elsewhere. Here is a brief description of how
4330 # indentation is computed. Perl::Tidy computes indentation as the sum
4333 # (1) structural indentation, such as if/else/elsif blocks
4334 # (2) continuation indentation, such as long parameter call lists.
4336 # These are occasionally called primary and secondary indentation.
4338 # Structural indentation is introduced by tokens of type '{', although
4339 # the actual tokens might be '{', '(', or '['. Structural indentation
4340 # is of two types: BLOCK and non-BLOCK. Default structural indentation
4341 # is 4 characters if the standard indentation scheme is used.
4343 # Continuation indentation is introduced whenever a line at BLOCK level
4344 # is broken before its termination. Default continuation indentation
4345 # is 2 characters in the standard indentation scheme.
4347 # Both types of indentation may be nested arbitrarily deep and
4348 # interlaced. The distinction between the two is somewhat arbitrary.
4350 # For each token, we will define two variables which would apply if
4351 # the current statement were broken just before that token, so that
4352 # that token started a new line:
4354 # $level = the structural indentation level,
4355 # $ci_level = the continuation indentation level
4357 # The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
4358 # assuming defaults. However, in some special cases it is customary
4359 # to modify $ci_level from this strict value.
4361 # The total structural indentation is easy to compute by adding and
4362 # subtracting 1 from a saved value as types '{' and '}' are seen. The
4363 # running value of this variable is $level_in_tokenizer.
4365 # The total continuation is much more difficult to compute, and requires
4366 # several variables. These variables are:
4368 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4369 # each indentation level, if there are intervening open secondary
4370 # structures just prior to that level.
4371 # $continuation_string_in_tokenizer = a string of 1's and 0's indicating
4372 # if the last token at that level is "continued", meaning that it
4373 # is not the first token of an expression.
4374 # $nesting_block_string = a string of 1's and 0's indicating, for each
4375 # indentation level, if the level is of type BLOCK or not.
4376 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
4377 # $nesting_list_string = a string of 1's and 0's indicating, for each
4378 # indentation level, if it is appropriate for list formatting.
4379 # If so, continuation indentation is used to indent long list items.
4380 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
4381 # @{$rslevel_stack} = a stack of total nesting depths at each
4382 # structural indentation level, where "total nesting depth" means
4383 # the nesting depth that would occur if every nesting token -- '{', '[',
4384 # and '(' -- , regardless of context, is used to compute a nesting
4387 #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
4388 #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
4390 my ( $ci_string_i, $level_i, $nesting_block_string_i,
4391 $nesting_list_string_i, $nesting_token_string_i,
4392 $nesting_type_string_i, );
4394 foreach my $i ( @{$routput_token_list} )
4395 { # scan the list of pre-tokens indexes
4397 # self-checking for valid token types
4398 my $type = $routput_token_type->[$i];
4399 my $forced_indentation_flag = $routput_indent_flag->[$i];
4401 # See if we should undo the $forced_indentation_flag.
4402 # Forced indentation after 'if', 'unless', 'while' and 'until'
4403 # expressions without trailing parens is optional and doesn't
4404 # always look good. It is usually okay for a trailing logical
4405 # expression, but if the expression is a function call, code block,
4406 # or some kind of list it puts in an unwanted extra indentation
4407 # level which is hard to remove.
4409 # Example where extra indentation looks ok:
4411 # if $det_a < 0 and $det_b > 0
4412 # or $det_a > 0 and $det_b < 0;
4414 # Example where extra indentation is not needed because
4415 # the eval brace also provides indentation:
4416 # print "not " if defined eval {
4417 # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
4420 # The following rule works fairly well:
4421 # Undo the flag if the end of this line, or start of the next
4422 # line, is an opening container token or a comma.
4423 # This almost always works, but if not after another pass it will
4425 if ( $forced_indentation_flag && $type eq 'k' ) {
4427 my $ilast = $routput_token_list->[$ixlast];
4428 my $toklast = $routput_token_type->[$ilast];
4429 if ( $toklast eq '#' ) {
4431 $ilast = $routput_token_list->[$ixlast];
4432 $toklast = $routput_token_type->[$ilast];
4434 if ( $toklast eq 'b' ) {
4436 $ilast = $routput_token_list->[$ixlast];
4437 $toklast = $routput_token_type->[$ilast];
4439 if ( $toklast =~ /^[\{,]$/ ) {
4440 $forced_indentation_flag = 0;
4443 ( $toklast, my $i_next ) =
4444 find_next_nonblank_token( $max_token_index, $rtokens,
4446 if ( $toklast =~ /^[\{,]$/ ) {
4447 $forced_indentation_flag = 0;
4452 # if we are already in an indented if, see if we should outdent
4453 if ($indented_if_level) {
4455 # don't try to nest trailing if's - shouldn't happen
4456 if ( $type eq 'k' ) {
4457 $forced_indentation_flag = 0;
4460 # check for the normal case - outdenting at next ';'
4461 elsif ( $type eq ';' ) {
4462 if ( $level_in_tokenizer == $indented_if_level ) {
4463 $forced_indentation_flag = -1;
4464 $indented_if_level = 0;
4468 # handle case of missing semicolon
4469 elsif ( $type eq '}' ) {
4470 if ( $level_in_tokenizer == $indented_if_level ) {
4471 $indented_if_level = 0;
4473 # TBD: This could be a subroutine call
4474 $level_in_tokenizer--;
4475 if ( @{$rslevel_stack} > 1 ) {
4476 pop( @{$rslevel_stack} );
4478 if ( length($nesting_block_string) > 1 )
4479 { # true for valid script
4480 chop $nesting_block_string;
4481 chop $nesting_list_string;
4488 my $tok = $rtokens->[$i]; # the token, but ONLY if same as pretoken
4489 $level_i = $level_in_tokenizer;
4491 # This can happen by running perltidy on non-scripts
4492 # although it could also be bug introduced by programming change.
4493 # Perl silently accepts a 032 (^Z) and takes it as the end
4494 if ( !$is_valid_token_type{$type} ) {
4495 my $val = ord($type);
4497 "unexpected character decimal $val ($type) in script\n");
4498 $tokenizer_self->[_in_error_] = 1;
4501 # ----------------------------------------------------------------
4502 # TOKEN TYPE PATCHES
4503 # output __END__, __DATA__, and format as type 'k' instead of ';'
4504 # to make html colors correct, etc.
4505 my $fix_type = $type;
4506 if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
4508 # output anonymous 'sub' as keyword
4509 if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
4511 # -----------------------------------------------------------------
4513 $nesting_token_string_i = $nesting_token_string;
4514 $nesting_type_string_i = $nesting_type_string;
4515 $nesting_block_string_i = $nesting_block_string;
4516 $nesting_list_string_i = $nesting_list_string;
4518 # set primary indentation levels based on structural braces
4519 # Note: these are set so that the leading braces have a HIGHER
4520 # level than their CONTENTS, which is convenient for indentation
4521 # Also, define continuation indentation for each token.
4522 if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
4525 # use environment before updating
4526 $container_environment =
4527 $nesting_block_flag ? 'BLOCK'
4528 : $nesting_list_flag ? 'LIST'
4531 # if the difference between total nesting levels is not 1,
4532 # there are intervening non-structural nesting types between
4533 # this '{' and the previous unclosed '{'
4534 my $intervening_secondary_structure = 0;
4535 if ( @{$rslevel_stack} ) {
4536 $intervening_secondary_structure =
4537 $slevel_in_tokenizer - $rslevel_stack->[-1];
4540 # Continuation Indentation
4542 # Having tried setting continuation indentation both in the formatter and
4543 # in the tokenizer, I can say that setting it in the tokenizer is much,
4544 # much easier. The formatter already has too much to do, and can't
4545 # make decisions on line breaks without knowing what 'ci' will be at
4546 # arbitrary locations.
4548 # But a problem with setting the continuation indentation (ci) here
4549 # in the tokenizer is that we do not know where line breaks will actually
4550 # be. As a result, we don't know if we should propagate continuation
4551 # indentation to higher levels of structure.
4553 # For nesting of only structural indentation, we never need to do this.
4554 # For example, in a long if statement, like this
4556 # if ( !$output_block_type[$i]
4557 # && ($in_statement_continuation) )
4562 # the second line has ci but we do normally give the lines within the BLOCK
4563 # any ci. This would be true if we had blocks nested arbitrarily deeply.
4565 # But consider something like this, where we have created a break after
4566 # an opening paren on line 1, and the paren is not (currently) a
4567 # structural indentation token:
4569 # my $file = $menubar->Menubutton(
4570 # qw/-text File -underline 0 -menuitems/ => [
4572 # Cascade => '~View',
4576 # The second line has ci, so it would seem reasonable to propagate it
4577 # down, giving the third line 1 ci + 1 indentation. This suggests the
4578 # following rule, which is currently used to propagating ci down: if there
4579 # are any non-structural opening parens (or brackets, or braces), before
4580 # an opening structural brace, then ci is propagated down, and otherwise
4581 # not. The variable $intervening_secondary_structure contains this
4582 # information for the current token, and the string
4583 # "$ci_string_in_tokenizer" is a stack of previous values of this
4586 # save the current states
4587 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
4588 $level_in_tokenizer++;
4590 if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] )
4592 $tokenizer_self->[_maximum_level_] = $level_in_tokenizer;
4595 if ($forced_indentation_flag) {
4597 # break BEFORE '?' when there is forced indentation
4598 if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
4599 if ( $type eq 'k' ) {
4600 $indented_if_level = $level_in_tokenizer;
4603 # do not change container environment here if we are not
4604 # at a real list. Adding this check prevents "blinkers"
4605 # often near 'unless" clauses, such as in the following
4610 ## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
4613 $nesting_block_string .= "$nesting_block_flag";
4617 if ( $routput_block_type->[$i] ) {
4618 $nesting_block_flag = 1;
4619 $nesting_block_string .= '1';
4622 $nesting_block_flag = 0;
4623 $nesting_block_string .= '0';
4627 # we will use continuation indentation within containers
4628 # which are not blocks and not logical expressions
4630 if ( !$routput_block_type->[$i] ) {
4632 # propagate flag down at nested open parens
4633 if ( $routput_container_type->[$i] eq '(' ) {
4634 $bit = 1 if $nesting_list_flag;
4637 # use list continuation if not a logical grouping
4638 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
4642 $is_logical_container{ $routput_container_type->[$i]
4646 $nesting_list_string .= $bit;
4647 $nesting_list_flag = $bit;
4649 $ci_string_in_tokenizer .=
4650 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
4652 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4653 $continuation_string_in_tokenizer .=
4654 ( $in_statement_continuation > 0 ) ? '1' : '0';
4656 # Sometimes we want to give an opening brace continuation indentation,
4657 # and sometimes not. For code blocks, we don't do it, so that the leading
4658 # '{' gets outdented, like this:
4660 # if ( !$output_block_type[$i]
4661 # && ($in_statement_continuation) )
4664 # For other types, we will give them continuation indentation. For example,
4665 # here is how a list looks with the opening paren indented:
4668 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
4669 # [ "homer", "marge", "bart" ], );
4671 # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
4673 my $total_ci = $ci_string_sum;
4675 !$routput_block_type->[$i] # patch: skip for BLOCK
4676 && ($in_statement_continuation)
4677 && !( $forced_indentation_flag && $type eq ':' )
4680 $total_ci += $in_statement_continuation
4681 unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
4684 $ci_string_i = $total_ci;
4685 $in_statement_continuation = 0;
4690 || $forced_indentation_flag < 0 )
4693 # only a nesting error in the script would prevent popping here
4694 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
4696 $level_i = --$level_in_tokenizer;
4698 # restore previous level values
4699 if ( length($nesting_block_string) > 1 )
4700 { # true for valid script
4701 chop $nesting_block_string;
4702 $nesting_block_flag =
4703 substr( $nesting_block_string, -1 ) eq '1';
4704 chop $nesting_list_string;
4705 $nesting_list_flag =
4706 substr( $nesting_list_string, -1 ) eq '1';
4708 chop $ci_string_in_tokenizer;
4710 ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4712 $in_statement_continuation =
4713 chop $continuation_string_in_tokenizer;
4715 # zero continuation flag at terminal BLOCK '}' which
4717 if ( $routput_block_type->[$i] ) {
4719 # ...These include non-anonymous subs
4720 # note: could be sub ::abc { or sub 'abc
4721 if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
4723 # note: older versions of perl require the /gc modifier
4724 # here or else the \G does not work.
4725 if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
4727 $in_statement_continuation = 0;
4731 # ...and include all block types except user subs with
4732 # block prototypes and these: (sort|grep|map|do|eval)
4733 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
4735 $is_zero_continuation_block_type{
4736 $routput_block_type->[$i]
4740 $in_statement_continuation = 0;
4743 # ..but these are not terminal types:
4744 # /^(sort|grep|map|do|eval)$/ )
4746 $is_not_zero_continuation_block_type{
4747 $routput_block_type->[$i]
4753 # ..and a block introduced by a label
4755 elsif ( $routput_block_type->[$i] =~ /:$/ ) {
4756 $in_statement_continuation = 0;
4759 # user function with block prototype
4761 $in_statement_continuation = 0;
4765 # If we are in a list, then
4766 # we must set continuation indentation at the closing
4767 # paren of something like this (paren after $check):
4770 # ( not defined $check )
4772 # or $check eq "new"
4773 # or $check eq "old",
4775 elsif ( $tok eq ')' ) {
4776 $in_statement_continuation = 1
4777 if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
4780 elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
4783 # use environment after updating
4784 $container_environment =
4785 $nesting_block_flag ? 'BLOCK'
4786 : $nesting_list_flag ? 'LIST'
4788 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4789 $nesting_block_string_i = $nesting_block_string;
4790 $nesting_list_string_i = $nesting_list_string;
4793 # not a structural indentation type..
4796 $container_environment =
4797 $nesting_block_flag ? 'BLOCK'
4798 : $nesting_list_flag ? 'LIST'
4801 # zero the continuation indentation at certain tokens so
4802 # that they will be at the same level as its container. For
4803 # commas, this simplifies the -lp indentation logic, which
4804 # counts commas. For ?: it makes them stand out.
4805 if ($nesting_list_flag) {
4806 ## $type =~ /^[,\?\:]$/
4807 if ( $is_comma_question_colon{$type} ) {
4808 $in_statement_continuation = 0;
4812 # be sure binary operators get continuation indentation
4814 $container_environment
4815 && ( $type eq 'k' && $is_binary_keyword{$tok}
4816 || $is_binary_type{$type} )
4819 $in_statement_continuation = 1;
4822 # continuation indentation is sum of any open ci from previous
4823 # levels plus the current level
4824 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4826 # update continuation flag ...
4827 # if this isn't a blank or comment..
4828 if ( $type ne 'b' && $type ne '#' ) {
4830 # and we are in a BLOCK
4831 if ($nesting_block_flag) {
4833 # the next token after a ';' and label starts a new stmt
4834 if ( $type eq ';' || $type eq 'J' ) {
4835 $in_statement_continuation = 0;
4838 # otherwise, we are continuing the current statement
4840 $in_statement_continuation = 1;
4844 # if we are not in a BLOCK..
4847 # do not use continuation indentation if not list
4848 # environment (could be within if/elsif clause)
4849 if ( !$nesting_list_flag ) {
4850 $in_statement_continuation = 0;
4853 # otherwise, the token after a ',' starts a new term
4855 # Patch FOR RT#99961; no continuation after a ';'
4856 # This is needed because perltidy currently marks
4857 # a block preceded by a type character like % or @
4858 # as a non block, to simplify formatting. But these
4859 # are actually blocks and can have semicolons.
4860 # See code_block_type() and is_non_structural_brace().
4861 elsif ( $type eq ',' || $type eq ';' ) {
4862 $in_statement_continuation = 0;
4865 # otherwise, we are continuing the current term
4867 $in_statement_continuation = 1;
4873 if ( $level_in_tokenizer < 0 ) {
4874 unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
4875 $tokenizer_self->[_saw_negative_indentation_] = 1;
4876 warning("Starting negative indentation\n");
4880 # set secondary nesting levels based on all containment token types
4881 # Note: these are set so that the nesting depth is the depth
4882 # of the PREVIOUS TOKEN, which is convenient for setting
4883 # the strength of token bonds
4884 my $slevel_i = $slevel_in_tokenizer;
4887 if ( $is_opening_type{$type} ) {
4888 $slevel_in_tokenizer++;
4889 $nesting_token_string .= $tok;
4890 $nesting_type_string .= $type;
4894 elsif ( $is_closing_type{$type} ) {
4895 $slevel_in_tokenizer--;
4896 my $char = chop $nesting_token_string;
4898 if ( $char ne $matching_start_token{$tok} ) {
4899 $nesting_token_string .= $char . $tok;
4900 $nesting_type_string .= $type;
4903 chop $nesting_type_string;
4907 push( @block_type, $routput_block_type->[$i] );
4908 push( @ci_string, $ci_string_i );
4909 push( @container_environment, $container_environment );
4910 push( @container_type, $routput_container_type->[$i] );
4911 push( @levels, $level_i );
4912 push( @nesting_tokens, $nesting_token_string_i );
4913 push( @nesting_types, $nesting_type_string_i );
4914 push( @slevels, $slevel_i );
4915 push( @token_type, $fix_type );
4916 push( @type_sequence, $routput_type_sequence->[$i] );
4917 push( @nesting_blocks, $nesting_block_string );
4918 push( @nesting_lists, $nesting_list_string );
4920 # now form the previous token
4923 $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
4927 substr( $input_line, $rtoken_map->[$im], $num ) );
4933 $num = length($input_line) - $rtoken_map->[$im]; # make the last token
4935 push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
4938 $tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
4939 $tokenizer_self->[_in_quote_] = $in_quote;
4940 $tokenizer_self->[_quote_target_] =
4941 $in_quote ? matching_end_token($quote_character) : "";
4942 $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
4944 $line_of_tokens->{_rtoken_type} = \@token_type;
4945 $line_of_tokens->{_rtokens} = \@tokens;
4946 $line_of_tokens->{_rblock_type} = \@block_type;
4947 $line_of_tokens->{_rcontainer_type} = \@container_type;
4948 $line_of_tokens->{_rcontainer_environment} = \@container_environment;
4949 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
4950 $line_of_tokens->{_rlevels} = \@levels;
4951 $line_of_tokens->{_rslevels} = \@slevels;
4952 $line_of_tokens->{_rnesting_tokens} = \@nesting_tokens;
4953 $line_of_tokens->{_rci_levels} = \@ci_string;
4954 $line_of_tokens->{_rnesting_blocks} = \@nesting_blocks;
4958 } # end tokenize_this_line
4960 #########i#############################################################
4961 # Tokenizer routines which assist in identifying token types
4962 #######################################################################
4964 # hash lookup table of operator expected values
4965 my %op_expected_table;
4967 # exceptions to perl's weird parsing rules after type 'Z'
4968 my %is_weird_parsing_rule_exception;
4972 # Always expecting TERM following these types:
4973 # note: this is identical to '@value_requestor_type' defined later.
4975 ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
4976 || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
4977 &= // >> ~. &. |. ^.
4978 ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
4981 push @q, '('; # for completeness, not currently a token type
4982 @{op_expected_table}{@q} = (TERM) x scalar(@q);
4984 # Always UNKNOWN following these types:
4985 # Fix for c030: added '->' to this list
4987 @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
4989 # Always expecting OPERATOR ...
4990 # 'n' and 'v' are currently excluded because they might be VERSION numbers
4991 # 'i' is currently excluded because it might be a package
4992 # 'q' is currently excluded because it might be a prototype
4993 # Fix for c030: removed '->' from this list:
4994 @q = qw( -- C h R ++ ] Q <> ); ## n v q i );
4996 @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
4998 # Fix for git #62: added '*' and '%'
5000 @{is_weird_parsing_rule_exception}{@q} = (OPERATOR) x scalar(@q);
5004 use constant DEBUG_OPERATOR_EXPECTED => 0;
5006 sub operator_expected {
5008 # Returns a parameter indicating what types of tokens can occur next
5011 # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5013 # $prev_type is the type of the previous token (blank or not)
5014 # $tok is the current token
5015 # $next_type is the type of the next token (blank or not)
5017 # Many perl symbols have two or more meanings. For example, '<<'
5018 # can be a shift operator or a here-doc operator. The
5019 # interpretation of these symbols depends on the current state of
5020 # the tokenizer, which may either be expecting a term or an
5021 # operator. For this example, a << would be a shift if an OPERATOR
5022 # is expected, and a here-doc if a TERM is expected. This routine
5023 # is called to make this decision for any current token. It returns
5024 # one of three possible values:
5026 # OPERATOR - operator expected (or at least, not a term)
5027 # UNKNOWN - can't tell
5028 # TERM - a term is expected (or at least, not an operator)
5030 # The decision is based on what has been seen so far. This
5031 # information is stored in the "$last_nonblank_type" and
5032 # "$last_nonblank_token" variables. For example, if the
5033 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5034 # if $last_nonblank_type is 'n' (numeric), we are expecting an
5037 # If a UNKNOWN is returned, the calling routine must guess. A major
5038 # goal of this tokenizer is to minimize the possibility of returning
5039 # UNKNOWN, because a wrong guess can spoil the formatting of a
5042 # Adding NEW_TOKENS: it is critically important that this routine be
5043 # updated to allow it to determine if an operator or term is to be
5044 # expected after the new token. Doing this simply involves adding
5045 # the new token character to one of the regexes in this routine or
5046 # to one of the hash lists
5047 # that it uses, which are initialized in the BEGIN section.
5048 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5051 # When possible, token types should be selected such that we can determine
5052 # the 'operator_expected' value by a simple hash lookup. If there are
5053 # exceptions, that is an indication that a new type is needed.
5063 # Many types are can be obtained by a table lookup given the previous type.
5064 # This typically handles half or more of the calls.
5065 my $op_expected = $op_expected_table{$last_nonblank_type};
5066 if ( defined($op_expected) ) {
5067 $msg = "Table lookup";
5071 ######################
5072 # Handle special cases
5073 ######################
5075 $op_expected = UNKNOWN;
5076 my ( $prev_type, $tok, $next_type ) = @{$rarg};
5078 # Types 'k', '}' and 'Z' depend on context
5079 # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
5080 # context but that dependence could eventually be eliminated with better
5081 # token type definition
5084 if ( $last_nonblank_type eq 'i' ) {
5085 $op_expected = OPERATOR;
5087 # FIXME: it would be cleaner to make this a special type
5088 # expecting VERSION or {} after package NAMESPACE
5089 # TODO: maybe mark these words as type 'Y'?
5090 if ( $statement_type =~ /^package\b/
5091 && $last_nonblank_token =~ /^package\b/ )
5093 $op_expected = TERM;
5098 elsif ( $last_nonblank_type eq 'k' ) {
5099 $op_expected = TERM;
5100 if ( $expecting_operator_token{$last_nonblank_token} ) {
5101 $op_expected = OPERATOR;
5103 elsif ( $expecting_term_token{$last_nonblank_token} ) {
5105 # Exceptions from TERM:
5107 # // may follow perl functions which may be unary operators
5108 # see test file dor.t (defined or);
5111 && $next_type eq '/'
5112 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5113 $last_nonblank_token}
5116 $op_expected = OPERATOR;
5119 # Patch to allow a ? following 'split' to be a depricated pattern
5120 # delimiter. This patch is coordinated with the omission of split
5122 # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5123 # will force perltidy to guess.
5125 && $last_nonblank_token eq 'split' )
5127 $op_expected = UNKNOWN;
5132 # closing container token...
5134 # Note that the actual token for type '}' may also be a ')'.
5136 # Also note that $last_nonblank_token is not the token corresponding to
5137 # $last_nonblank_type when the type is a closing container. In that
5138 # case it is the token before the corresponding opening container token.
5139 # So for example, for this snippet
5140 # $a = do { BLOCK } / 2;
5141 # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5143 elsif ( $last_nonblank_type eq '}' ) {
5144 $op_expected = UNKNOWN;
5146 # handle something after 'do' and 'eval'
5147 if ( $is_block_operator{$last_nonblank_token} ) {
5149 # something like $a = do { BLOCK } / 2;
5150 $op_expected = OPERATOR; # block mode following }
5153 elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
5154 $op_expected = OPERATOR;
5155 if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5159 # Check for smartmatch operator before preceding brace or square
5160 # bracket. For example, at the ? after the ] in the following
5161 # expressions we are expecting an operator:
5163 # qr/3/ ~~ ['1234'] ? 1 : 0;
5164 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5165 elsif ( $last_nonblank_token eq '~~' ) {
5166 $op_expected = OPERATOR;
5169 # A right brace here indicates the end of a simple block. All
5170 # non-structural right braces have type 'R' all braces associated with
5171 # block operator keywords have been given those keywords as
5172 # "last_nonblank_token" and caught above. (This statement is order
5173 # dependent, and must come after checking $last_nonblank_token).
5176 # patch for dor.t (defined or).
5178 && $next_type eq '/'
5179 && $last_nonblank_token eq ']' )
5181 $op_expected = OPERATOR;
5184 # Patch for RT #116344: misparse a ternary operator after an
5185 # anonymous hash, like this:
5186 # return ref {} ? 1 : 0;
5187 # The right brace should really be marked type 'R' in this case,
5188 # and it is safest to return an UNKNOWN here. Expecting a TERM will
5189 # cause the '?' to always be interpreted as a pattern delimiter
5190 # rather than introducing a ternary operator.
5191 elsif ( $tok eq '?' ) {
5192 $op_expected = UNKNOWN;
5195 $op_expected = TERM;
5200 # number or v-string...
5201 # An exception is for VERSION numbers a 'use' statement. It has the format
5202 # use Module VERSION LIST
5203 # We could avoid this exception by writing a special sub to parse 'use'
5204 # statements and perhaps mark these numbers with a new type V (for VERSION)
5205 elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5206 $op_expected = OPERATOR;
5207 if ( $statement_type eq 'use' ) {
5208 $op_expected = UNKNOWN;
5213 # FIXME: labeled prototype words should probably be given type 'A' or maybe
5214 # 'J'; not 'q'; or maybe mark as type 'Y'
5215 elsif ( $last_nonblank_type eq 'q' ) {
5216 $op_expected = OPERATOR;
5217 if ( $last_nonblank_token eq 'prototype' )
5218 ##|| $last_nonblank_token eq 'switch' )
5220 $op_expected = TERM;
5224 # file handle or similar
5225 elsif ( $last_nonblank_type eq 'Z' ) {
5227 $op_expected = UNKNOWN;
5230 if ( $last_nonblank_token =~ /^\w/ ) {
5231 $op_expected = UNKNOWN;
5234 # The 'weird parsing rules' of next section do not work for '<' and '?'
5235 # It is best to mark them as unknown. Test case:
5237 elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5238 $op_expected = UNKNOWN;
5241 # For possible file handle like "$a", Perl uses weird parsing rules.
5243 # print $a/2,"/hi"; - division
5244 # print $a / 2,"/hi"; - division
5245 # print $a/ 2,"/hi"; - division
5246 # print $a /2,"/hi"; - pattern (and error)!
5247 # Some examples where this logic works okay, for '&','*','+':
5248 # print $fh &xsi_protos(@mods);
5249 # my $x = new $CompressClass *FH;
5250 # print $OUT +( $count % 15 ? ", " : "\n\t" );
5251 elsif ($prev_type eq 'b'
5252 && $next_type ne 'b' )
5254 $op_expected = TERM;
5257 # Note that '?' and '<' have been moved above
5258 # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5259 elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5261 # Do not complain in 'use' statements, which have special syntax.
5262 # For example, from RT#130344:
5263 # use lib $FindBin::Bin . '/lib';
5264 if ( $statement_type ne 'use' ) {
5266 "operator in possible indirect object location not recommended\n"
5269 $op_expected = OPERATOR;
5275 $op_expected = UNKNOWN;
5280 DEBUG_OPERATOR_EXPECTED && do {
5282 "OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5285 return $op_expected;
5287 } ## end of sub operator_expected
5289 sub new_statement_ok {
5291 # return true if the current token can start a new statement
5292 # USES GLOBAL VARIABLES: $last_nonblank_type
5294 return label_ok() # a label would be ok here
5296 || $last_nonblank_type eq 'J'; # or we follow a label
5302 # Decide if a bare word followed by a colon here is a label
5303 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5304 # $brace_depth, @brace_type
5306 # if it follows an opening or closing code block curly brace..
5307 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5308 && $last_nonblank_type eq $last_nonblank_token )
5311 # it is a label if and only if the curly encloses a code block
5312 return $brace_type[$brace_depth];
5315 # otherwise, it is a label if and only if it follows a ';' (real or fake)
5318 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5322 sub code_block_type {
5324 # Decide if this is a block of code, and its type.
5325 # Must be called only when $type = $token = '{'
5326 # The problem is to distinguish between the start of a block of code
5327 # and the start of an anonymous hash reference
5328 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5329 # to indicate the type of code block. (For example, 'last_nonblank_token'
5330 # might be 'if' for an if block, 'else' for an else block, etc).
5331 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5332 # $last_nonblank_block_type, $brace_depth, @brace_type
5334 # handle case of multiple '{'s
5336 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5338 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5339 if ( $last_nonblank_token eq '{'
5340 && $last_nonblank_type eq $last_nonblank_token )
5343 # opening brace where a statement may appear is probably
5344 # a code block but might be and anonymous hash reference
5345 if ( $brace_type[$brace_depth] ) {
5346 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5350 # cannot start a code block within an anonymous hash
5356 elsif ( $last_nonblank_token eq ';' ) {
5358 # an opening brace where a statement may appear is probably
5359 # a code block but might be and anonymous hash reference
5360 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5364 # handle case of '}{'
5365 elsif ($last_nonblank_token eq '}'
5366 && $last_nonblank_type eq $last_nonblank_token )
5369 # a } { situation ...
5370 # could be hash reference after code block..(blktype1.t)
5371 if ($last_nonblank_block_type) {
5372 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5376 # must be a block if it follows a closing hash reference
5378 return $last_nonblank_token;
5382 ################################################################
5383 # NOTE: braces after type characters start code blocks, but for
5384 # simplicity these are not identified as such. See also
5385 # sub is_non_structural_brace.
5386 ################################################################
5388 ## elsif ( $last_nonblank_type eq 't' ) {
5389 ## return $last_nonblank_token;
5392 # brace after label:
5393 elsif ( $last_nonblank_type eq 'J' ) {
5394 return $last_nonblank_token;
5397 # otherwise, look at previous token. This must be a code block if
5398 # it follows any of these:
5399 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
5400 elsif ( $is_code_block_token{$last_nonblank_token} ) {
5402 # Bug Patch: Note that the opening brace after the 'if' in the following
5403 # snippet is an anonymous hash ref and not a code block!
5404 # print 'hi' if { x => 1, }->{x};
5405 # We can identify this situation because the last nonblank type
5406 # will be a keyword (instead of a closing peren)
5407 if ( $last_nonblank_token =~ /^(if|unless)$/
5408 && $last_nonblank_type eq 'k' )
5413 return $last_nonblank_token;
5417 # or a sub or package BLOCK
5418 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5419 && $last_nonblank_token =~ /^(sub|package)\b/ )
5421 return $last_nonblank_token;
5425 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5426 && ( $is_sub{$last_nonblank_token} ) )
5431 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
5432 return $statement_type;
5435 # user-defined subs with block parameters (like grep/map/eval)
5436 elsif ( $last_nonblank_type eq 'G' ) {
5437 return $last_nonblank_token;
5441 elsif ( $last_nonblank_type eq 'w' ) {
5443 # check for syntax 'use MODULE LIST'
5444 # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
5445 return "" if ( $statement_type eq 'use' );
5447 return decide_if_code_block( $i, $rtokens, $rtoken_type,
5451 # Patch for bug # RT #94338 reported by Daniel Trizen
5452 # for-loop in a parenthesized block-map triggering an error message:
5453 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
5454 # Check for a code block within a parenthesized function call
5455 elsif ( $last_nonblank_token eq '(' ) {
5456 my $paren_type = $paren_type[$paren_depth];
5457 if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
5459 # We will mark this as a code block but use type 't' instead
5460 # of the name of the contining function. This will allow for
5461 # correct parsing but will usually produce better formatting.
5462 # Braces with block type 't' are not broken open automatically
5463 # in the formatter as are other code block types, and this usually
5465 return 't'; # (Not $paren_type)
5472 # handle unknown syntax ') {'
5473 # we previously appended a '()' to mark this case
5474 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
5475 return $last_nonblank_token;
5478 # anything else must be anonymous hash reference
5484 sub decide_if_code_block {
5486 # USES GLOBAL VARIABLES: $last_nonblank_token
5487 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5489 my ( $next_nonblank_token, $i_next ) =
5490 find_next_nonblank_token( $i, $rtokens, $max_token_index );
5492 # we are at a '{' where a statement may appear.
5493 # We must decide if this brace starts an anonymous hash or a code
5495 # return "" if anonymous hash, and $last_nonblank_token otherwise
5497 # initialize to be code BLOCK
5498 my $code_block_type = $last_nonblank_token;
5500 # Check for the common case of an empty anonymous hash reference:
5501 # Maybe something like sub { { } }
5502 if ( $next_nonblank_token eq '}' ) {
5503 $code_block_type = "";
5508 # To guess if this '{' is an anonymous hash reference, look ahead
5509 # and test as follows:
5511 # it is a hash reference if next come:
5512 # - a string or digit followed by a comma or =>
5513 # - bareword followed by =>
5514 # otherwise it is a code block
5516 # Examples of anonymous hash ref:
5520 # Examples of code blocks:
5521 # {1; print "hello\n", 1;}
5524 # We are only going to look ahead one more (nonblank/comment) line.
5525 # Strange formatting could cause a bad guess, but that's unlikely.
5529 # Ignore the rest of this line if it is a side comment
5530 if ( $next_nonblank_token ne '#' ) {
5531 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
5532 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
5534 my ( $rpre_tokens, $rpre_types ) =
5535 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
5536 # generous, and prevents
5538 # time in mangled files
5539 if ( defined($rpre_types) && @{$rpre_types} ) {
5540 push @pre_types, @{$rpre_types};
5541 push @pre_tokens, @{$rpre_tokens};
5544 # put a sentinel token to simplify stopping the search
5545 push @pre_types, '}';
5546 push @pre_types, '}';
5549 $jbeg = 1 if $pre_types[0] eq 'b';
5551 # first look for one of these
5553 # - bareword with leading -
5557 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
5559 # find the closing quote; don't worry about escapes
5560 my $quote_mark = $pre_types[$j];
5561 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
5562 if ( $pre_types[$k] eq $quote_mark ) {
5564 my $next = $pre_types[$j];
5569 elsif ( $pre_types[$j] eq 'd' ) {
5572 elsif ( $pre_types[$j] eq 'w' ) {
5575 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
5580 $j++ if $pre_types[$j] eq 'b';
5582 # Patched for RT #95708
5585 # it is a comma which is not a pattern delimeter except for qw
5587 $pre_types[$j] eq ','
5588 && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
5592 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
5595 $code_block_type = "";
5599 if ($code_block_type) {
5601 # Patch for cases b1085 b1128: It is uncertain if this is a block.
5602 # If this brace follows a bareword, then append a space as a signal
5603 # to the formatter that this may not be a block brace. To find the
5604 # corresponding code in Formatter.pm search for 'b1085'.
5605 $code_block_type .= " " if ( $code_block_type =~ /^\w/ );
5609 return $code_block_type;
5612 sub report_unexpected {
5614 # report unexpected token type and show where it is
5615 # USES GLOBAL VARIABLES: $tokenizer_self
5616 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
5617 $rpretoken_type, $input_line )
5620 if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
5621 my $msg = "found $found where $expecting expected";
5622 my $pos = $rpretoken_map->[$i_tok];
5623 interrupt_logfile();
5624 my $input_line_number = $tokenizer_self->[_last_line_number_];
5625 my ( $offset, $numbered_line, $underline ) =
5626 make_numbered_line( $input_line_number, $input_line, $pos );
5627 $underline = write_on_underline( $underline, $pos - $offset, '^' );
5630 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
5631 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
5633 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
5634 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
5637 $num = $pos - $pos_prev;
5639 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
5642 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
5643 $trailer = " (previous token underlined)";
5645 $underline =~ s/\s+$//;
5646 warning( $numbered_line . "\n" );
5647 warning( $underline . "\n" );
5648 warning( $msg . $trailer . "\n" );
5654 sub is_non_structural_brace {
5656 # Decide if a brace or bracket is structural or non-structural
5657 # by looking at the previous token and type
5658 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
5660 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
5661 # Tentatively deactivated because it caused the wrong operator expectation
5663 # $user = @vars[1] / 100;
5664 # Must update sub operator_expected before re-implementing.
5665 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
5669 ################################################################
5670 # NOTE: braces after type characters start code blocks, but for
5671 # simplicity these are not identified as such. See also
5672 # sub code_block_type
5673 ################################################################
5675 ##if ($last_nonblank_type eq 't') {return 0}
5677 # otherwise, it is non-structural if it is decorated
5678 # by type information.
5679 # For example, the '{' here is non-structural: ${xxx}
5681 $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
5683 # or if we follow a hash or array closing curly brace or bracket
5684 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
5685 # because the first '}' would have been given type 'R'
5686 || $last_nonblank_type =~ /^([R\]])$/
5690 #########i#############################################################
5691 # Tokenizer routines for tracking container nesting depths
5692 #######################################################################
5694 # The following routines keep track of nesting depths of the nesting
5695 # types, ( [ { and ?. This is necessary for determining the indentation
5696 # level, and also for debugging programs. Not only do they keep track of
5697 # nesting depths of the individual brace types, but they check that each
5698 # of the other brace types is balanced within matching pairs. For
5699 # example, if the program sees this sequence:
5703 # then it can determine that there is an extra left paren somewhere
5704 # between the { and the }. And so on with every other possible
5705 # combination of outer and inner brace types. For another
5710 # which has an extra ] within the parens.
5712 # The brace types have indexes 0 .. 3 which are indexes into
5715 # The pair ? : are treated as just another nesting type, with ? acting
5716 # as the opening brace and : acting as the closing brace.
5720 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
5722 # saves the nesting depth of brace type $b (where $b is either of the other
5723 # nesting types) when brace type $a enters a new depth. When this depth
5724 # decreases, a check is made that the current depth of brace types $b is
5725 # unchanged, or otherwise there must have been an error. This can
5726 # be very useful for localizing errors, particularly when perl runs to
5727 # the end of a large file (such as this one) and announces that there
5728 # is a problem somewhere.
5730 # A numerical sequence number is maintained for every nesting type,
5731 # so that each matching pair can be uniquely identified in a simple
5734 sub increase_nesting_depth {
5735 my ( $aa, $pos ) = @_;
5737 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
5738 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
5740 $current_depth[$aa]++;
5742 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
5743 my $input_line_number = $tokenizer_self->[_last_line_number_];
5744 my $input_line = $tokenizer_self->[_line_of_text_];
5746 # Sequence numbers increment by number of items. This keeps
5747 # a unique set of numbers but still allows the relative location
5748 # of any type to be determined.
5749 $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
5750 my $seqno = $nesting_sequence_number[$aa];
5751 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
5753 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
5754 [ $input_line_number, $input_line, $pos ];
5756 for my $bb ( 0 .. @closing_brace_names - 1 ) {
5757 next if ( $bb == $aa );
5758 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
5761 # set a flag for indenting a nested ternary statement
5763 if ( $aa == QUESTION_COLON ) {
5764 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
5765 if ( $current_depth[$aa] > 1 ) {
5766 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
5767 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
5768 if ( $pdepth == $total_depth - 1 ) {
5770 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
5775 $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
5776 $statement_type = "";
5777 return ( $seqno, $indent );
5780 sub is_balanced_closing_container {
5782 # Return true if a closing container can go here without error
5783 # Return false if not
5786 # cannot close if there was no opening
5787 return unless ( $current_depth[$aa] > 0 );
5789 # check that any other brace types $bb contained within would be balanced
5790 for my $bb ( 0 .. @closing_brace_names - 1 ) {
5791 next if ( $bb == $aa );
5793 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
5794 $current_depth[$bb] );
5797 # OK, everything will be balanced
5801 sub decrease_nesting_depth {
5803 my ( $aa, $pos ) = @_;
5805 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
5806 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
5809 my $input_line_number = $tokenizer_self->[_last_line_number_];
5810 my $input_line = $tokenizer_self->[_line_of_text_];
5814 if ( $current_depth[$aa] > 0 ) {
5816 # set a flag for un-indenting after seeing a nested ternary statement
5817 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
5818 if ( $aa == QUESTION_COLON ) {
5819 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
5821 $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
5823 # check that any brace types $bb contained within are balanced
5824 for my $bb ( 0 .. @closing_brace_names - 1 ) {
5825 next if ( $bb == $aa );
5827 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
5828 $current_depth[$bb] )
5831 $current_depth[$bb] -
5832 $depth_array[$aa][$bb][ $current_depth[$aa] ];
5834 # don't whine too many times
5835 my $saw_brace_error = get_saw_brace_error();
5837 $saw_brace_error <= MAX_NAG_MESSAGES
5839 # if too many closing types have occurred, we probably
5840 # already caught this error
5841 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
5844 interrupt_logfile();
5846 $starting_line_of_current_depth[$aa]
5847 [ $current_depth[$aa] ];
5849 my $rel = [ $input_line_number, $input_line, $pos ];
5853 if ( $diff == 1 || $diff == -1 ) {
5861 ? $opening_brace_names[$bb]
5862 : $closing_brace_names[$bb];
5863 write_error_indicator_pair( @{$rsl}, '^' );
5865 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
5870 $starting_line_of_current_depth[$bb]
5871 [ $current_depth[$bb] ];
5874 " The most recent un-matched $bname is on line $ml\n";
5875 write_error_indicator_pair( @{$rml}, '^' );
5877 write_error_indicator_pair( @{$rel}, '^' );
5881 increment_brace_error();
5884 $current_depth[$aa]--;
5888 my $saw_brace_error = get_saw_brace_error();
5889 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
5891 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
5893 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
5895 increment_brace_error();
5897 # keep track of errors in braces alone (ignoring ternary nesting errors)
5898 $tokenizer_self->[_true_brace_error_count_]++
5899 if ( $closing_brace_names[$aa] ne "':'" );
5901 return ( $seqno, $outdent );
5904 sub check_final_nesting_depths {
5906 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
5908 for my $aa ( 0 .. @closing_brace_names - 1 ) {
5910 if ( $current_depth[$aa] ) {
5912 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
5915 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
5916 The most recent un-matched $opening_brace_names[$aa] is on line $sl
5918 indicate_error( $msg, @{$rsl}, '^' );
5919 increment_brace_error();
5925 #########i#############################################################
5926 # Tokenizer routines for looking ahead in input stream
5927 #######################################################################
5929 sub peek_ahead_for_n_nonblank_pre_tokens {
5931 # returns next n pretokens if they exist
5932 # returns undef's if hits eof without seeing any pretokens
5933 # USES GLOBAL VARIABLES: $tokenizer_self
5934 my $max_pretokens = shift;
5937 my ( $rpre_tokens, $rmap, $rpre_types );
5940 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
5942 $line =~ s/^\s*//; # trim leading blanks
5943 next if ( length($line) <= 0 ); # skip blank
5944 next if ( $line =~ /^#/ ); # skip comment
5945 ( $rpre_tokens, $rmap, $rpre_types ) =
5946 pre_tokenize( $line, $max_pretokens );
5949 return ( $rpre_tokens, $rpre_types );
5952 # look ahead for next non-blank, non-comment line of code
5953 sub peek_ahead_for_nonblank_token {
5955 # USES GLOBAL VARIABLES: $tokenizer_self
5956 my ( $rtokens, $max_token_index ) = @_;
5961 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
5963 $line =~ s/^\s*//; # trim leading blanks
5964 next if ( length($line) <= 0 ); # skip blank
5965 next if ( $line =~ /^#/ ); # skip comment
5966 my ( $rtok, $rmap, $rtype ) =
5967 pre_tokenize( $line, 2 ); # only need 2 pre-tokens
5968 my $j = $max_token_index + 1;
5970 foreach my $tok ( @{$rtok} ) {
5971 last if ( $tok =~ "\n" );
5972 $rtokens->[ ++$j ] = $tok;
5979 #########i#############################################################
5980 # Tokenizer guessing routines for ambiguous situations
5981 #######################################################################
5983 sub guess_if_pattern_or_conditional {
5985 # this routine is called when we have encountered a ? following an
5986 # unknown bareword, and we must decide if it starts a pattern or not
5988 # $i - token index of the ? starting possible pattern
5989 # output parameters:
5990 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
5991 # msg = a warning or diagnostic message
5992 # USES GLOBAL VARIABLES: $last_nonblank_token
5994 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5996 my $msg = "guessing that ? after $last_nonblank_token starts a ";
5998 if ( $i >= $max_token_index ) {
5999 $msg .= "conditional (no end to pattern found on the line)\n";
6004 my $next_token = $rtokens->[$i]; # first token after ?
6006 # look for a possible ending ? on this line..
6008 my $quote_depth = 0;
6009 my $quote_character = '';
6013 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6016 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6017 $quote_pos, $quote_depth, $max_token_index );
6021 # we didn't find an ending ? on this line,
6022 # so we bias towards conditional
6024 $msg .= "conditional (no ending ? on this line)\n";
6026 # we found an ending ?, so we bias towards a pattern
6030 # Watch out for an ending ? in quotes, like this
6031 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6035 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6036 my $tok = $rtokens->[$ii];
6037 if ( $tok eq ":" ) { $colons++ }
6038 if ( $tok eq "'" ) { $s_quote++ }
6039 if ( $tok eq '"' ) { $d_quote++ }
6041 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6043 $msg .= "found ending ? but unbalanced quote chars\n";
6045 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6047 $msg .= "pattern (found ending ? and pattern expected)\n";
6050 $msg .= "pattern (uncertain, but found ending ?)\n";
6054 return ( $is_pattern, $msg );
6057 my %is_known_constant;
6058 my %is_known_function;
6062 # Constants like 'pi' in Trig.pm are common
6063 my @q = qw(pi pi2 pi4 pip2 pip4);
6064 @{is_known_constant}{@q} = (1) x scalar(@q);
6066 # parenless calls of 'ok' are common
6068 @{is_known_function}{@q} = (1) x scalar(@q);
6071 sub guess_if_pattern_or_division {
6073 # this routine is called when we have encountered a / following an
6074 # unknown bareword, and we must decide if it starts a pattern or is a
6077 # $i - token index of the / starting possible pattern
6078 # output parameters:
6079 # $is_pattern = 0 if probably division, =1 if probably a pattern
6080 # msg = a warning or diagnostic message
6081 # USES GLOBAL VARIABLES: $last_nonblank_token
6082 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6084 my $msg = "guessing that / after $last_nonblank_token starts a ";
6086 if ( $i >= $max_token_index ) {
6087 $msg .= "division (no end to pattern found on the line)\n";
6091 my $divide_possible =
6092 is_possible_numerator( $i, $rtokens, $max_token_index );
6094 if ( $divide_possible < 0 ) {
6095 $msg = "pattern (division not possible here)\n";
6101 my $next_token = $rtokens->[$i]; # first token after slash
6103 # One of the things we can look at is the spacing around the slash.
6104 # There # are four possible spacings around the first slash:
6106 # return pi/two;#/; -/-
6107 # return pi/ two;#/; -/+
6108 # return pi / two;#/; +/+
6109 # return pi /two;#/; +/- <-- possible pattern
6111 # Spacing rule: a space before the slash but not after the slash
6112 # usually indicates a pattern. We can use this to break ties.
6114 my $is_pattern_by_spacing =
6115 ( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' );
6117 # look for a possible ending / on this line..
6119 my $quote_depth = 0;
6120 my $quote_character = '';
6124 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6127 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6128 $quote_pos, $quote_depth, $max_token_index );
6132 # we didn't find an ending / on this line, so we bias towards
6134 if ( $divide_possible >= 0 ) {
6136 $msg .= "division (no ending / on this line)\n";
6140 # assuming a multi-line pattern ... this is risky, but division
6141 # does not seem possible. If this fails, it would either be due
6142 # to a syntax error in the code, or the division_expected logic
6143 # needs to be fixed.
6144 $msg = "multi-line pattern (division not possible)\n";
6149 # we found an ending /, so we bias slightly towards a pattern
6152 my $pattern_expected =
6153 pattern_expected( $i, $rtokens, $max_token_index );
6155 if ( $pattern_expected >= 0 ) {
6157 # pattern looks possible...
6158 if ( $divide_possible >= 0 ) {
6160 # Both pattern and divide can work here...
6162 # Increase weight of divide if a pure number follows
6163 $divide_possible += $next_token =~ /^\d+$/;
6165 # Check for known constants in the numerator, like 'pi'
6166 if ( $is_known_constant{$last_nonblank_token} ) {
6168 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6172 # A very common bare word in pattern expressions is 'ok'
6173 elsif ( $is_known_function{$last_nonblank_token} ) {
6175 "pattern (division works too but saw '$last_nonblank_token')\n";
6179 # If one rule is more definite, use it
6180 elsif ( $divide_possible > $pattern_expected ) {
6182 "division (more likely based on following tokens)\n";
6186 # otherwise, use the spacing rule
6187 elsif ($is_pattern_by_spacing) {
6189 "pattern (guess on spacing, but division possible too)\n";
6194 "division (guess on spacing, but pattern is possible too)\n";
6199 # divide_possible < 0 means divide can not work here
6202 $msg .= "pattern (division not possible)\n";
6206 # pattern does not look possible...
6209 if ( $divide_possible >= 0 ) {
6211 $msg .= "division (pattern not possible)\n";
6214 # Neither pattern nor divide look possible...go by spacing
6216 if ($is_pattern_by_spacing) {
6217 $msg .= "pattern (guess on spacing)\n";
6221 $msg .= "division (guess on spacing)\n";
6230 return ( $is_pattern, $msg );
6233 # try to resolve here-doc vs. shift by looking ahead for
6234 # non-code or the end token (currently only looks for end token)
6235 # returns 1 if it is probably a here doc, 0 if not
6236 sub guess_if_here_doc {
6238 # This is how many lines we will search for a target as part of the
6239 # guessing strategy. It is a constant because there is probably
6240 # little reason to change it.
6241 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6243 my $HERE_DOC_WINDOW = 40;
6245 my $next_token = shift;
6246 my $here_doc_expected = 0;
6249 my $msg = "checking <<";
6252 $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6256 if ( $line =~ /^$next_token$/ ) {
6257 $msg .= " -- found target $next_token ahead $k lines\n";
6258 $here_doc_expected = 1; # got it
6261 last if ( $k >= $HERE_DOC_WINDOW );
6264 unless ($here_doc_expected) {
6266 if ( !defined($line) ) {
6267 $here_doc_expected = -1; # hit eof without seeing target
6268 $msg .= " -- must be shift; target $next_token not in file\n";
6271 else { # still unsure..taking a wild guess
6273 if ( !$is_constant{$current_package}{$next_token} ) {
6274 $here_doc_expected = 1;
6276 " -- guessing it's a here-doc ($next_token not a constant)\n";
6280 " -- guessing it's a shift ($next_token is a constant)\n";
6284 write_logfile_entry($msg);
6285 return $here_doc_expected;
6288 #########i#############################################################
6289 # Tokenizer Routines for scanning identifiers and related items
6290 #######################################################################
6292 sub scan_bare_identifier_do {
6294 # this routine is called to scan a token starting with an alphanumeric
6295 # variable or package separator, :: or '.
6296 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6297 # $last_nonblank_type,@paren_type, $paren_depth
6299 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
6303 my $package = undef;
6307 # we have to back up one pretoken at a :: since each : is one pretoken
6308 if ( $tok eq '::' ) { $i_beg-- }
6309 if ( $tok eq '->' ) { $i_beg-- }
6310 my $pos_beg = $rtoken_map->[$i_beg];
6311 pos($input_line) = $pos_beg;
6318 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
6320 my $pos = pos($input_line);
6321 my $numc = $pos - $pos_beg;
6322 $tok = substr( $input_line, $pos_beg, $numc );
6324 # type 'w' includes anything without leading type info
6325 # ($,%,@,*) including something like abc::def::ghi
6329 if ( defined($2) ) { $sub_name = $2; }
6330 if ( defined($1) ) {
6333 # patch: don't allow isolated package name which just ends
6334 # in the old style package separator (single quote). Example:
6336 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
6340 $package =~ s/\'/::/g;
6341 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6342 $package =~ s/::$//;
6345 $package = $current_package;
6347 # patched for c043, part 1: keyword does not follow '->'
6348 if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
6353 # if it is a bareword.. patched for c043, part 2: not following '->'
6354 if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
6356 # check for v-string with leading 'v' type character
6357 # (This seems to have precedence over filehandle, type 'Y')
6358 if ( $tok =~ /^v\d[_\d]*$/ ) {
6360 # we only have the first part - something like 'v101' -
6362 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
6363 $pos = pos($input_line);
6364 $numc = $pos - $pos_beg;
6365 $tok = substr( $input_line, $pos_beg, $numc );
6369 # warn if this version can't handle v-strings
6370 report_v_string($tok);
6373 elsif ( $is_constant{$package}{$sub_name} ) {
6377 # bareword after sort has implied empty prototype; for example:
6378 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
6379 # This has priority over whatever the user has specified.
6380 elsif ($last_nonblank_token eq 'sort'
6381 && $last_nonblank_type eq 'k' )
6386 # Note: strangely, perl does not seem to really let you create
6387 # functions which act like eval and do, in the sense that eval
6388 # and do may have operators following the final }, but any operators
6389 # that you create with prototype (&) apparently do not allow
6390 # trailing operators, only terms. This seems strange.
6391 # If this ever changes, here is the update
6392 # to make perltidy behave accordingly:
6394 # elsif ( $is_block_function{$package}{$tok} ) {
6395 # $tok='eval'; # patch to do braces like eval - doesn't work
6398 # FIXME: This could become a separate type to allow for different
6400 elsif ( $is_block_function{$package}{$sub_name} ) {
6403 elsif ( $is_block_list_function{$package}{$sub_name} ) {
6406 elsif ( $is_user_function{$package}{$sub_name} ) {
6408 $prototype = $user_function_prototype{$package}{$sub_name};
6411 # check for indirect object
6414 # added 2001-03-27: must not be followed immediately by '('
6416 ( $input_line !~ m/\G\(/gc )
6421 # preceded by keyword like 'print', 'printf' and friends
6422 $is_indirect_object_taker{$last_nonblank_token}
6424 # or preceded by something like 'print(' or 'printf('
6426 ( $last_nonblank_token eq '(' )
6427 && $is_indirect_object_taker{ $paren_type[$paren_depth]
6435 # may not be indirect object unless followed by a space;
6436 # updated 2021-01-16 to consider newline to be a space.
6437 # updated for case b990 to look for either ';' or space
6438 if ( pos($input_line) == length($input_line)
6439 || $input_line =~ m/\G[;\s]/gc )
6444 # Perl's indirect object notation is a very bad
6445 # thing and can cause subtle bugs, especially for
6446 # beginning programmers. And I haven't even been
6447 # able to figure out a sane warning scheme which
6448 # doesn't get in the way of good scripts.
6450 # Complain if a filehandle has any lower case
6451 # letters. This is suggested good practice.
6452 # Use 'sub_name' because something like
6453 # main::MYHANDLE is ok for filehandle
6454 if ( $sub_name =~ /[a-z]/ ) {
6456 # could be bug caused by older perltidy if
6458 if ( $input_line =~ m/\G\s*\(/gc ) {
6460 "Caution: unknown word '$tok' in indirect object slot\n"
6466 # bareword not followed by a space -- may not be filehandle
6467 # (may be function call defined in a 'use' statement)
6474 # Now we must convert back from character position
6475 # to pre_token index.
6476 # I don't think an error flag can occur here ..but who knows
6479 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6481 warning("scan_bare_identifier: Possibly invalid tokenization\n");
6485 # no match but line not blank - could be syntax error
6486 # perl will take '::' alone without complaint
6490 # change this warning to log message if it becomes annoying
6491 warning("didn't find identifier after leading ::\n");
6493 return ( $i, $tok, $type, $prototype );
6498 # This is the new scanner and will eventually replace scan_identifier.
6499 # Only type 'sub' and 'package' are implemented.
6500 # Token types $ * % @ & -> are not yet implemented.
6502 # Scan identifier following a type token.
6503 # The type of call depends on $id_scan_state: $id_scan_state = ''
6504 # for starting call, in which case $tok must be the token defining
6507 # If the type token is the last nonblank token on the line, a value
6508 # of $id_scan_state = $tok is returned, indicating that further
6509 # calls must be made to get the identifier. If the type token is
6510 # not the last nonblank token on the line, the identifier is
6511 # scanned and handled and a value of '' is returned.
6512 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
6513 # $statement_type, $tokenizer_self
6515 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
6518 use constant DEBUG_NSCAN => 0;
6520 my ( $i_beg, $pos_beg );
6522 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
6523 #my ($a,$b,$c) = caller;
6524 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
6526 # on re-entry, start scanning at first token on the line
6527 if ($id_scan_state) {
6532 # on initial entry, start scanning just after type token
6535 $id_scan_state = $tok;
6539 # find $i_beg = index of next nonblank token,
6540 # and handle empty lines
6542 my $next_nonblank_token = $rtokens->[$i_beg];
6543 if ( $i_beg > $max_token_index ) {
6548 # only a '#' immediately after a '$' is not a comment
6549 if ( $next_nonblank_token eq '#' ) {
6550 unless ( $tok eq '$' ) {
6555 if ( $next_nonblank_token =~ /^\s/ ) {
6556 ( $next_nonblank_token, $i_beg ) =
6557 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
6559 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
6565 # handle non-blank line; identifier, if any, must follow
6566 unless ($blank_line) {
6568 if ( $is_sub{$id_scan_state} ) {
6569 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
6571 input_line => $input_line,
6576 rtokens => $rtokens,
6577 rtoken_map => $rtoken_map,
6578 id_scan_state => $id_scan_state,
6579 max_token_index => $max_token_index
6584 elsif ( $is_package{$id_scan_state} ) {
6585 ( $i, $tok, $type ) =
6586 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
6587 $rtoken_map, $max_token_index );
6588 $id_scan_state = '';
6592 warning("invalid token in scan_id: $tok\n");
6593 $id_scan_state = '';
6597 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
6601 "Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
6603 report_definite_bug();
6608 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
6610 return ( $i, $tok, $type, $id_scan_state );
6613 sub check_prototype {
6614 my ( $proto, $package, $subname ) = @_;
6615 return unless ( defined($package) && defined($subname) );
6616 if ( defined($proto) ) {
6617 $proto =~ s/^\s*\(\s*//;
6618 $proto =~ s/\s*\)$//;
6620 $is_user_function{$package}{$subname} = 1;
6621 $user_function_prototype{$package}{$subname} = "($proto)";
6623 # prototypes containing '&' must be treated specially..
6624 if ( $proto =~ /\&/ ) {
6626 # right curly braces of prototypes ending in
6627 # '&' may be followed by an operator
6628 if ( $proto =~ /\&$/ ) {
6629 $is_block_function{$package}{$subname} = 1;
6632 # right curly braces of prototypes NOT ending in
6633 # '&' may NOT be followed by an operator
6634 elsif ( $proto !~ /\&$/ ) {
6635 $is_block_list_function{$package}{$subname} = 1;
6640 $is_constant{$package}{$subname} = 1;
6644 $is_user_function{$package}{$subname} = 1;
6649 sub do_scan_package {
6651 # do_scan_package parses a package name
6652 # it is called with $i_beg equal to the index of the first nonblank
6653 # token following a 'package' token.
6654 # USES GLOBAL VARIABLES: $current_package,
6657 # package NAMESPACE VERSION
6658 # package NAMESPACE BLOCK
6659 # package NAMESPACE VERSION BLOCK
6661 # If VERSION is provided, package sets the $VERSION variable in the given
6662 # namespace to a version object with the VERSION provided. VERSION must be
6663 # a "strict" style version number as defined by the version module: a
6664 # positive decimal number (integer or decimal-fraction) without
6665 # exponentiation or else a dotted-decimal v-string with a leading 'v'
6666 # character and at least three components.
6667 # reference http://perldoc.perl.org/functions/package.html
6669 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
6672 my $package = undef;
6673 my $pos_beg = $rtoken_map->[$i_beg];
6674 pos($input_line) = $pos_beg;
6676 # handle non-blank line; package name, if any, must follow
6677 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
6679 $package = ( defined($1) && $1 ) ? $1 : 'main';
6680 $package =~ s/\'/::/g;
6681 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6682 $package =~ s/::$//;
6683 my $pos = pos($input_line);
6684 my $numc = $pos - $pos_beg;
6685 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
6688 # Now we must convert back from character position
6689 # to pre_token index.
6690 # I don't think an error flag can occur here ..but ?
6693 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6694 if ($error) { warning("Possibly invalid package\n") }
6695 $current_package = $package;
6697 # we should now have package NAMESPACE
6698 # now expecting VERSION, BLOCK, or ; to follow ...
6699 # package NAMESPACE VERSION
6700 # package NAMESPACE BLOCK
6701 # package NAMESPACE VERSION BLOCK
6702 my ( $next_nonblank_token, $i_next ) =
6703 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6705 # check that something recognizable follows, but do not parse.
6706 # A VERSION number will be parsed later as a number or v-string in the
6707 # normal way. What is important is to set the statement type if
6708 # everything looks okay so that the operator_expected() routine
6709 # knows that the number is in a package statement.
6710 # Examples of valid primitive tokens that might follow are:
6712 # FIX: added a '#' since a side comment may also follow
6713 if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
6714 $statement_type = $tok;
6718 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
6723 # no match but line not blank --
6724 # could be a label with name package, like package: , for example.
6729 return ( $i, $tok, $type );
6732 sub scan_identifier_do {
6734 # This routine assembles tokens into identifiers. It maintains a
6735 # scan state, id_scan_state. It updates id_scan_state based upon
6736 # current id_scan_state and token, and returns an updated
6737 # id_scan_state and the next index after the identifier.
6739 # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
6740 # $last_nonblank_type
6742 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
6743 $expecting, $container_type )
6745 use constant DEBUG_SCAN_ID => 0;
6748 my $tok_begin = $rtokens->[$i_begin];
6749 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
6750 my $id_scan_state_begin = $id_scan_state;
6751 my $identifier_begin = $identifier;
6752 my $tok = $tok_begin;
6754 my $tok_is_blank; # a flag to speed things up
6756 my $in_prototype_or_signature =
6757 $container_type && $container_type =~ /^sub\b/;
6759 # these flags will be used to help figure out the type:
6763 # allow old package separator (') except in 'use' statement
6764 my $allow_tick = ( $last_nonblank_token ne 'use' );
6766 #########################################################
6767 # get started by defining a type and a state if necessary
6768 #########################################################
6770 if ( !$id_scan_state ) {
6771 $context = UNKNOWN_CONTEXT;
6774 if ( $tok eq '>' ) {
6780 if ( $tok eq '$' || $tok eq '*' ) {
6781 $id_scan_state = '$';
6782 $context = SCALAR_CONTEXT;
6784 elsif ( $tok eq '%' || $tok eq '@' ) {
6785 $id_scan_state = '$';
6786 $context = LIST_CONTEXT;
6788 elsif ( $tok eq '&' ) {
6789 $id_scan_state = '&';
6791 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
6792 $saw_alpha = 0; # 'sub' is considered type info here
6793 $id_scan_state = '$';
6794 $identifier .= ' '; # need a space to separate sub from sub name
6796 elsif ( $tok eq '::' ) {
6797 $id_scan_state = 'A';
6799 elsif ( $tok =~ /^\w/ ) {
6800 $id_scan_state = ':';
6803 elsif ( $tok eq '->' ) {
6804 $id_scan_state = '$';
6809 my ( $a, $b, $c ) = caller;
6810 warning("Program Bug: scan_identifier given bad token = $tok \n");
6811 warning(" called from sub $a line: $c\n");
6812 report_definite_bug();
6814 $saw_type = !$saw_alpha;
6818 $saw_alpha = ( $tok =~ /^\w/ );
6819 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
6822 ###############################
6823 # loop to gather the identifier
6824 ###############################
6828 while ( $i < $max_token_index ) {
6829 my $last_tok_is_blank = $tok_is_blank;
6830 if ($tok_is_blank) { $tok_is_blank = undef }
6831 else { $i_save = $i }
6833 $tok = $rtokens->[ ++$i ];
6835 # patch to make digraph :: if necessary
6836 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
6841 ########################
6842 # Starting variable name
6843 ########################
6845 if ( $id_scan_state eq '$' ) {
6847 if ( $tok eq '$' ) {
6849 $identifier .= $tok;
6851 # we've got a punctuation variable if end of line (punct.t)
6852 if ( $i == $max_token_index ) {
6854 $id_scan_state = '';
6858 elsif ( $tok =~ /^\w/ ) { # alphanumeric ..
6860 $id_scan_state = ':'; # now need ::
6861 $identifier .= $tok;
6863 elsif ( $tok eq '::' ) {
6864 $id_scan_state = 'A';
6865 $identifier .= $tok;
6868 # POSTDEFREF ->@ ->% ->& ->*
6869 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
6870 $identifier .= $tok;
6872 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
6874 $id_scan_state = ':'; # now need ::
6875 $identifier .= $tok;
6877 # Perl will accept leading digits in identifiers,
6878 # although they may not always produce useful results.
6879 # Something like $main::0 is ok. But this also works:
6881 # sub howdy::123::bubba{ print "bubba $54321!\n" }
6882 # howdy::123::bubba();
6885 elsif ( $tok eq '#' ) {
6887 # side comment or identifier?
6890 # A '#' starts a comment if it follows a space. For example,
6891 # the following is equivalent to $ans=40.
6896 # a # inside a prototype or signature can only start a
6898 && !$in_prototype_or_signature
6900 # these are valid punctuation vars: *# %# @# $#
6901 # May also be '$#array' or POSTDEFREF ->$#
6902 && ( $identifier =~ /^[\%\@\$\*]$/ || $identifier =~ /\$$/ )
6906 $identifier .= $tok; # keep same state, a $ could follow
6910 # otherwise it is a side comment
6911 if ( $identifier eq '->' ) { }
6912 elsif ( $id_scan_state eq '$' ) { $type = 't' }
6913 else { $type = 'i' }
6915 $id_scan_state = '';
6920 elsif ( $tok eq '{' ) {
6922 # check for something like ${#} or ${©}
6926 || $identifier eq '@'
6927 || $identifier eq '$#'
6929 && $i + 2 <= $max_token_index
6930 && $rtokens->[ $i + 2 ] eq '}'
6931 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
6934 my $next2 = $rtokens->[ $i + 2 ];
6935 my $next1 = $rtokens->[ $i + 1 ];
6936 $identifier .= $tok . $next1 . $next2;
6938 $id_scan_state = '';
6942 # skip something like ${xxx} or ->{
6943 $id_scan_state = '';
6945 # if this is the first token of a line, any tokens for this
6946 # identifier have already been accumulated
6947 if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
6952 # space ok after leading $ % * & @
6953 elsif ( $tok =~ /^\s*$/ ) {
6957 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
6959 if ( length($identifier) > 1 ) {
6960 $id_scan_state = '';
6962 $type = 'i'; # probably punctuation variable
6967 # spaces after $'s are common, and space after @
6968 # is harmless, so only complain about space
6969 # after other type characters. Space after $ and
6970 # @ will be removed in formatting. Report space
6971 # after % and * because they might indicate a
6972 # parsing error. In other words '% ' might be a
6973 # modulo operator. Delete this warning if it
6975 if ( $identifier !~ /^[\@\$]$/ ) {
6977 "Space in identifier, following $identifier\n";
6983 # space after '->' is ok
6985 elsif ( $tok eq '^' ) {
6987 # check for some special variables like $^W
6988 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
6989 $identifier .= $tok;
6990 $id_scan_state = 'A';
6992 # Perl accepts '$^]' or '@^]', but
6993 # there must not be a space before the ']'.
6994 my $next1 = $rtokens->[ $i + 1 ];
6995 if ( $next1 eq ']' ) {
6997 $identifier .= $next1;
6998 $id_scan_state = "";
7003 $id_scan_state = '';
7006 else { # something else
7008 if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7010 # We might be in an extrusion of
7011 # sub foo2 ( $first, $, $third ) {
7012 # looking at a line starting with a comma, like
7015 # in this case the comma ends the signature variable
7016 # '$' which will have been previously marked type 't'
7018 if ( $i == $i_begin ) {
7023 # at a # we have to mark as type 't' because more may
7024 # follow, otherwise, in a signature we can let '$' be an
7025 # identifier here for better formatting.
7026 # See 'mangle4.in' for a test case.
7029 if ( $id_scan_state eq '$' && $tok eq '#' ) {
7034 $id_scan_state = '';
7038 # check for various punctuation variables
7039 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7040 $identifier .= $tok;
7043 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
7045 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7047 $identifier .= $tok;
7050 elsif ( $identifier eq '$#' ) {
7052 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7054 # perl seems to allow just these: $#: $#- $#+
7055 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7057 $identifier .= $tok;
7061 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7064 elsif ( $identifier eq '$$' ) {
7066 # perl does not allow references to punctuation
7067 # variables without braces. For example, this
7071 # You would have to use
7074 # '$$' alone is punctuation variable for PID
7076 if ( $tok eq '{' ) { $type = 't' }
7077 else { $type = 'i' }
7079 elsif ( $identifier eq '->' ) {
7084 if ( length($identifier) == 1 ) { $identifier = ''; }
7086 $id_scan_state = '';
7091 ###################################
7092 # looking for alphanumeric after ::
7093 ###################################
7095 elsif ( $id_scan_state eq 'A' ) {
7097 $tok_is_blank = $tok =~ /^\s*$/;
7099 if ( $tok =~ /^\w/ ) { # found it
7100 $identifier .= $tok;
7101 $id_scan_state = ':'; # now need ::
7104 elsif ( $tok eq "'" && $allow_tick ) {
7105 $identifier .= $tok;
7106 $id_scan_state = ':'; # now need ::
7109 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7110 $id_scan_state = '(';
7111 $identifier .= $tok;
7113 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7114 $id_scan_state = ')';
7115 $identifier .= $tok;
7118 $id_scan_state = '';
7124 ###################################
7125 # looking for :: after alphanumeric
7126 ###################################
7128 elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
7130 $tok_is_blank = $tok =~ /^\s*$/;
7132 if ( $tok eq '::' ) { # got it
7133 $identifier .= $tok;
7134 $id_scan_state = 'A'; # now require alpha
7136 elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here
7137 $identifier .= $tok;
7138 $id_scan_state = ':'; # now need ::
7141 elsif ( $tok eq "'" && $allow_tick ) { # tick
7143 if ( $is_keyword{$identifier} ) {
7144 $id_scan_state = ''; # that's all
7148 $identifier .= $tok;
7151 elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7152 $id_scan_state = '(';
7153 $identifier .= $tok;
7155 elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7156 $id_scan_state = ')';
7157 $identifier .= $tok;
7160 $id_scan_state = ''; # that's all
7166 ##############################
7167 # looking for '(' of prototype
7168 ##############################
7170 elsif ( $id_scan_state eq '(' ) {
7172 if ( $tok eq '(' ) { # got it
7173 $identifier .= $tok;
7174 $id_scan_state = ')'; # now find the end of it
7176 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
7177 $identifier .= $tok;
7181 $id_scan_state = ''; # that's all - no prototype
7187 ##############################
7188 # looking for ')' of prototype
7189 ##############################
7191 elsif ( $id_scan_state eq ')' ) {
7193 $tok_is_blank = $tok =~ /^\s*$/;
7195 if ( $tok eq ')' ) { # got it
7196 $identifier .= $tok;
7197 $id_scan_state = ''; # all done
7200 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7201 $identifier .= $tok;
7203 else { # probable error in script, but keep going
7204 warning("Unexpected '$tok' while seeking end of prototype\n");
7205 $identifier .= $tok;
7213 elsif ( $id_scan_state eq '&' ) {
7215 if ( $tok =~ /^[\$\w]/ ) { # alphanumeric ..
7216 $id_scan_state = ':'; # now need ::
7218 $identifier .= $tok;
7220 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
7221 $id_scan_state = ':'; # now need ::
7223 $identifier .= $tok;
7225 elsif ( $tok =~ /^\s*$/ ) { # allow space
7228 elsif ( $tok eq '::' ) { # leading ::
7229 $id_scan_state = 'A'; # accept alpha next
7230 $identifier .= $tok;
7232 elsif ( $tok eq '{' ) {
7233 if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
7235 $id_scan_state = '';
7240 # punctuation variable?
7241 # testfile: cunningham4.pl
7243 # We have to be careful here. If we are in an unknown state,
7244 # we will reject the punctuation variable. In the following
7245 # example the '&' is a binary operator but we are in an unknown
7246 # state because there is no sigil on 'Prima', so we don't
7247 # know what it is. But it is a bad guess that
7248 # '&~' is a function variable.
7249 # $self->{text}->{colorMap}->[
7250 # Prima::PodView::COLOR_CODE_FOREGROUND
7251 # & ~tb::COLOR_INDEX ] =
7254 # Fix for case c033: a '#' here starts a side comment
7255 if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
7256 $identifier .= $tok;
7263 $id_scan_state = '';
7268 ######################
7269 # unknown state - quit
7270 ######################
7272 else { # can get here due to error in initialization
7273 $id_scan_state = '';
7277 } ## end of main loop
7279 if ( $id_scan_state eq ')' ) {
7280 warning("Hit end of line while seeking ) to end prototype\n");
7283 # once we enter the actual identifier, it may not extend beyond
7284 # the end of the current line
7285 if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
7286 $id_scan_state = '';
7289 # Patch: the deprecated variable $# does not combine with anything on the
7291 if ( $identifier eq '$#' ) { $id_scan_state = '' }
7293 if ( $i < 0 ) { $i = 0 }
7295 # Be sure a token type is defined
7301 if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
7304 else { $type = 'i' }
7306 elsif ( $identifier eq '->' ) {
7310 ( length($identifier) > 1 )
7312 # In something like '@$=' we have an identifier '@$'
7313 # In something like '$${' we have type '$$' (and only
7314 # part of an identifier)
7315 && !( $identifier =~ /\$$/ && $tok eq '{' )
7316 && ( $identifier !~ /^(sub |package )$/ )
7321 else { $type = 't' }
7323 elsif ($saw_alpha) {
7325 # type 'w' includes anything without leading type info
7326 # ($,%,@,*) including something like abc::def::ghi
7331 } # this can happen on a restart
7334 # See if we formed an identifier...
7337 if ($message) { write_logfile_entry($message) }
7340 # did not find an identifier, back up
7346 DEBUG_SCAN_ID && do {
7347 my ( $a, $b, $c ) = caller;
7349 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
7351 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
7353 return ( $i, $tok, $type, $id_scan_state, $identifier );
7356 { ## closure for sub do_scan_sub
7358 my %warn_if_lexical;
7362 # lexical subs with these names can cause parsing errors in this version
7363 my @q = qw( m q qq qr qw qx s tr y );
7364 @{warn_if_lexical}{@q} = (1) x scalar(@q);
7367 # saved package and subnames in case prototype is on separate line
7368 my ( $package_saved, $subname_saved );
7370 # initialize subname each time a new 'sub' keyword is encountered
7371 sub initialize_subname {
7372 $package_saved = "";
7373 $subname_saved = "";
7380 PROTOTYPE_CALL => 3,
7385 # do_scan_sub parses a sub name and prototype.
7387 # At present there are three basic CALL TYPES which are
7388 # distinguished by the starting value of '$tok':
7389 # 1. $tok='sub', id_scan_state='sub'
7390 # it is called with $i_beg equal to the index of the first nonblank
7391 # token following a 'sub' token.
7392 # 2. $tok='(', id_scan_state='sub',
7393 # it is called with $i_beg equal to the index of a '(' which may
7394 # start a prototype.
7395 # 3. $tok='prototype', id_scan_state='prototype'
7396 # it is called with $i_beg equal to the index of a '(' which is
7397 # preceded by ': prototype' and has $id_scan_state eq 'prototype'
7401 # A single type 1 call will get both the sub and prototype
7402 # sub foo1 ( $$ ) { }
7405 # The subname will be obtained with a 'sub' call
7406 # The prototype on line 2 will be obtained with a '(' call
7412 # The subname will be obtained with a 'sub' call
7413 # The prototype will be obtained with a 'prototype' call
7414 # sub foo1 ( $x, $y ) : prototype ( $$ ) { }
7415 # ^ <---type 1 ^ <---type 3
7417 # TODO: add future error checks to be sure we have a valid
7418 # sub name. For example, 'sub &doit' is wrong. Also, be sure
7419 # a name is given if and only if a non-anonymous sub is
7421 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
7422 # $in_attribute_list, %saw_function_definition,
7425 my ($rinput_hash) = @_;
7427 my $input_line = $rinput_hash->{input_line};
7428 my $i = $rinput_hash->{i};
7429 my $i_beg = $rinput_hash->{i_beg};
7430 my $tok = $rinput_hash->{tok};
7431 my $type = $rinput_hash->{type};
7432 my $rtokens = $rinput_hash->{rtokens};
7433 my $rtoken_map = $rinput_hash->{rtoken_map};
7434 my $id_scan_state = $rinput_hash->{id_scan_state};
7435 my $max_token_index = $rinput_hash->{max_token_index};
7439 # Determine the CALL TYPE
7444 $tok eq 'prototype' ? PROTOTYPE_CALL
7445 : $tok eq '(' ? PAREN_CALL
7448 $id_scan_state = ""; # normally we get everything in one call
7449 my $subname = $subname_saved;
7450 my $package = $package_saved;
7455 my $pos_beg = $rtoken_map->[$i_beg];
7456 pos($input_line) = $pos_beg;
7458 # Look for the sub NAME if this is a SUB call
7460 $call_type == SUB_CALL
7461 && $input_line =~ m/\G\s*
7462 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
7463 (\w+) # NAME - required
7470 my $is_lexical_sub =
7471 $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
7472 if ( $is_lexical_sub && $1 ) {
7473 warning("'my' sub $subname cannot be in package '$1'\n");
7474 $is_lexical_sub = 0;
7477 if ($is_lexical_sub) {
7479 # lexical subs use the block sequence number as a package name
7481 $current_sequence_number[BRACE][ $current_depth[BRACE] ];
7482 $seqno = 1 unless ( defined($seqno) );
7484 if ( $warn_if_lexical{$subname} ) {
7486 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
7491 $package = ( defined($1) && $1 ) ? $1 : $current_package;
7492 $package =~ s/\'/::/g;
7493 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7494 $package =~ s/::$//;
7497 my $pos = pos($input_line);
7498 my $numc = $pos - $pos_beg;
7499 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
7502 # remember the sub name in case another call is needed to
7504 $package_saved = $package;
7505 $subname_saved = $subname;
7508 # Now look for PROTO ATTRS for all call types
7509 # Look for prototype/attributes which are usually on the same
7510 # line as the sub name but which might be on a separate line.
7511 # For example, we might have an anonymous sub with attributes,
7512 # or a prototype on a separate line from its sub name
7514 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
7515 # does not look like a prototype, we assume it is a SIGNATURE and we
7516 # will stop and let the the standard tokenizer handle it. In
7517 # particular, we stop if we see any nested parens, braces, or commas.
7518 # Also note, a valid prototype cannot contain any alphabetic character
7519 # -- see https://perldoc.perl.org/perlsub
7520 # But it appears that an underscore is valid in a prototype, so the
7521 # regex below uses [A-Za-z] rather than \w
7522 # This is the old regex which has been replaced:
7523 # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))? # PROTO
7524 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
7526 $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))? # PROTO
7527 (\s*:)? # ATTRS leading ':'
7535 # Append the prototype to the starting token if it is 'sub' or
7536 # 'prototype'. This is not necessary but for compatibility with
7537 # previous versions when the -csc flag is used:
7538 if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
7542 # If we just entered the sub at an opening paren on this call, not
7543 # a following :prototype, label it with the previous token. This is
7544 # necessary to propagate the sub name to its opening block.
7545 elsif ( $call_type == PAREN_CALL ) {
7546 $tok = $last_nonblank_token;
7551 # Patch part #1 to fixes cases b994 and b1053:
7552 # Mark an anonymous sub keyword without prototype as type 'k', i.e.
7553 # 'sub : lvalue { ...'
7555 if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
7560 # ATTRS: if there are attributes, back up and let the ':' be
7561 # found later by the scanner.
7562 my $pos = pos($input_line);
7564 $pos -= length($attrs);
7567 my $next_nonblank_token = $tok;
7569 # catch case of line with leading ATTR ':' after anonymous sub
7570 if ( $pos == $pos_beg && $tok eq ':' ) {
7572 $in_attribute_list = 1;
7575 # Otherwise, if we found a match we must convert back from
7576 # string position to the pre_token index for continued parsing.
7579 # I don't think an error flag can occur here ..but ?
7581 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
7583 if ($error) { warning("Possibly invalid sub\n") }
7585 # Patch part #2 to fixes cases b994 and b1053:
7586 # Do not let spaces be part of the token of an anonymous sub keyword
7587 # which we marked as type 'k' above...i.e. for something like:
7588 # 'sub : lvalue { ...'
7589 # Back up and let it be parsed as a blank
7593 && substr( $rtokens->[$i], 0, 1 ) eq ' ' )
7598 # check for multiple definitions of a sub
7599 ( $next_nonblank_token, my $i_next ) =
7600 find_next_nonblank_token_on_this_line( $i, $rtokens,
7604 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
7605 { # skip blank or side comment
7606 my ( $rpre_tokens, $rpre_types ) =
7607 peek_ahead_for_n_nonblank_pre_tokens(1);
7608 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
7609 $next_nonblank_token = $rpre_tokens->[0];
7612 $next_nonblank_token = '}';
7616 # See what's next...
7617 if ( $next_nonblank_token eq '{' ) {
7620 # Check for multiple definitions of a sub, but
7621 # it is ok to have multiple sub BEGIN, etc,
7622 # so we do not complain if name is all caps
7623 if ( $saw_function_definition{$subname}{$package}
7624 && $subname !~ /^[A-Z]+$/ )
7626 my $lno = $saw_function_definition{$subname}{$package};
7627 if ( $package =~ /^\d/ ) {
7629 "already saw definition of lexical 'sub $subname' at line $lno\n"
7635 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
7639 $saw_function_definition{$subname}{$package} =
7640 $tokenizer_self->[_last_line_number_];
7643 elsif ( $next_nonblank_token eq ';' ) {
7645 elsif ( $next_nonblank_token eq '}' ) {
7648 # ATTRS - if an attribute list follows, remember the name
7649 # of the sub so the next opening brace can be labeled.
7650 # Setting 'statement_type' causes any ':'s to introduce
7652 elsif ( $next_nonblank_token eq ':' ) {
7653 if ( $call_type == SUB_CALL ) {
7655 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
7659 # if we stopped before an open paren ...
7660 elsif ( $next_nonblank_token eq '(' ) {
7662 # If we DID NOT see this paren above then it must be on the
7663 # next line so we will set a flag to come back here and see if
7666 # Otherwise, we assume it is a SIGNATURE rather than a
7667 # PROTOTYPE and let the normal tokenizer handle it as a list
7668 if ( !$saw_opening_paren ) {
7669 $id_scan_state = 'sub'; # we must come back to get proto
7671 if ( $call_type == SUB_CALL ) {
7673 substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
7676 elsif ($next_nonblank_token) { # EOF technically ok
7677 $subname = "" unless defined($subname);
7679 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
7682 check_prototype( $proto, $package, $subname );
7685 # no match to either sub name or prototype, but line not blank
7689 return ( $i, $tok, $type, $id_scan_state );
7693 #########i###############################################################
7694 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
7695 #########################################################################
7697 sub find_next_nonblank_token {
7698 my ( $i, $rtokens, $max_token_index ) = @_;
7700 # Returns the next nonblank token after the token at index $i
7701 # To skip past a side comment, and any subsequent block comments
7702 # and blank lines, call with i=$max_token_index
7704 if ( $i >= $max_token_index ) {
7705 if ( !peeked_ahead() ) {
7707 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
7711 my $next_nonblank_token = $rtokens->[ ++$i ];
7712 return ( " ", $i ) unless defined($next_nonblank_token);
7714 if ( $next_nonblank_token =~ /^\s*$/ ) {
7715 $next_nonblank_token = $rtokens->[ ++$i ];
7716 return ( " ", $i ) unless defined($next_nonblank_token);
7718 return ( $next_nonblank_token, $i );
7721 sub is_possible_numerator {
7723 # Look at the next non-comment character and decide if it could be a
7729 my ( $i, $rtokens, $max_token_index ) = @_;
7730 my $is_possible_numerator = 0;
7732 my $next_token = $rtokens->[ $i + 1 ];
7733 if ( $next_token eq '=' ) { $i++; } # handle /=
7734 my ( $next_nonblank_token, $i_next ) =
7735 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7737 if ( $next_nonblank_token eq '#' ) {
7738 ( $next_nonblank_token, $i_next ) =
7739 find_next_nonblank_token( $max_token_index, $rtokens,
7743 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
7744 $is_possible_numerator = 1;
7746 elsif ( $next_nonblank_token =~ /^\s*$/ ) {
7747 $is_possible_numerator = 0;
7750 $is_possible_numerator = -1;
7753 return $is_possible_numerator;
7756 { ## closure for sub pattern_expected
7761 # List of tokens which may follow a pattern. Note that we will not
7762 # have formed digraphs at this point, so we will see '&' instead of
7763 # '&&' and '|' instead of '||'
7765 # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
7766 my @q = qw( & && | || ? : + - * and or while if unless);
7767 push @q, ')', '}', ']', '>', ',', ';';
7768 @{pattern_test}{@q} = (1) x scalar(@q);
7771 sub pattern_expected {
7773 # This a filter for a possible pattern.
7774 # It looks at the token after a possible pattern and tries to
7775 # determine if that token could end a pattern.
7780 my ( $i, $rtokens, $max_token_index ) = @_;
7783 my $next_token = $rtokens->[ $i + 1 ];
7784 if ( $next_token =~ /^[msixpodualgc]/ ) {
7786 } # skip possible modifier
7787 my ( $next_nonblank_token, $i_next ) =
7788 find_next_nonblank_token( $i, $rtokens, $max_token_index );
7790 if ( $pattern_test{$next_nonblank_token} ) {
7795 # Added '#' to fix issue c044
7796 if ( $next_nonblank_token =~ /^\s*$/
7797 || $next_nonblank_token eq '#' )
7809 sub find_next_nonblank_token_on_this_line {
7810 my ( $i, $rtokens, $max_token_index ) = @_;
7811 my $next_nonblank_token;
7813 if ( $i < $max_token_index ) {
7814 $next_nonblank_token = $rtokens->[ ++$i ];
7816 if ( $next_nonblank_token =~ /^\s*$/ ) {
7818 if ( $i < $max_token_index ) {
7819 $next_nonblank_token = $rtokens->[ ++$i ];
7824 $next_nonblank_token = "";
7826 return ( $next_nonblank_token, $i );
7829 sub find_angle_operator_termination {
7831 # We are looking at a '<' and want to know if it is an angle operator.
7833 # $i = pretoken index of ending '>' if found, current $i otherwise
7834 # $type = 'Q' if found, '>' otherwise
7835 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
7838 pos($input_line) = 1 + $rtoken_map->[$i];
7842 # we just have to find the next '>' if a term is expected
7843 if ( $expecting == TERM ) { $filter = '[\>]' }
7845 # we have to guess if we don't know what is expected
7846 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
7848 # shouldn't happen - we shouldn't be here if operator is expected
7849 else { warning("Program Bug in find_angle_operator_termination\n") }
7851 # To illustrate what we might be looking at, in case we are
7852 # guessing, here are some examples of valid angle operators
7859 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
7860 # <${PREFIX}*img*.$IMAGE_TYPE>
7861 # <img*.$IMAGE_TYPE>
7862 # <Timg*.$IMAGE_TYPE>
7863 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
7865 # Here are some examples of lines which do not have angle operators:
7866 # return unless $self->[2]++ < $#{$self->[1]};
7869 # the following line from dlister.pl caused trouble:
7870 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
7872 # If the '<' starts an angle operator, it must end on this line and
7873 # it must not have certain characters like ';' and '=' in it. I use
7874 # this to limit the testing. This filter should be improved if
7877 if ( $input_line =~ /($filter)/g ) {
7881 # We MAY have found an angle operator termination if we get
7882 # here, but we need to do more to be sure we haven't been
7884 my $pos = pos($input_line);
7886 my $pos_beg = $rtoken_map->[$i];
7887 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
7889 # Reject if the closing '>' follows a '-' as in:
7890 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
7891 if ( $expecting eq UNKNOWN ) {
7892 my $check = substr( $input_line, $pos - 2, 1 );
7893 if ( $check eq '-' ) {
7894 return ( $i, $type );
7898 ######################################debug#####
7899 #write_diagnostics( "ANGLE? :$str\n");
7900 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
7901 ######################################debug#####
7905 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7907 # It may be possible that a quote ends midway in a pretoken.
7908 # If this happens, it may be necessary to split the pretoken.
7911 "Possible tokinization error..please check this line\n");
7912 report_possible_bug();
7915 # count blanks on inside of brackets
7916 my $blank_count = 0;
7917 $blank_count++ if ( $str =~ /<\s+/ );
7918 $blank_count++ if ( $str =~ /\s+>/ );
7920 # Now let's see where we stand....
7921 # OK if math op not possible
7922 if ( $expecting == TERM ) {
7925 # OK if there are no more than 2 non-blank pre-tokens inside
7926 # (not possible to write 2 token math between < and >)
7927 # This catches most common cases
7928 elsif ( $i <= $i_beg + 3 + $blank_count ) {
7930 # No longer any need to document this common case
7931 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
7934 # OK if there is some kind of identifier inside
7935 # print $fh <tvg::INPUT>;
7936 elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
7937 write_diagnostics("ANGLE (contains identifier): $str\n");
7943 # Let's try a Brace Test: any braces inside must balance
7945 while ( $str =~ /\{/g ) { $br++ }
7946 while ( $str =~ /\}/g ) { $br-- }
7948 while ( $str =~ /\[/g ) { $sb++ }
7949 while ( $str =~ /\]/g ) { $sb-- }
7951 while ( $str =~ /\(/g ) { $pr++ }
7952 while ( $str =~ /\)/g ) { $pr-- }
7954 # if braces do not balance - not angle operator
7955 if ( $br || $sb || $pr ) {
7959 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
7962 # we should keep doing more checks here...to be continued
7963 # Tentatively accepting this as a valid angle operator.
7964 # There are lots more things that can be checked.
7967 "ANGLE-Guessing yes: $str expecting=$expecting\n");
7968 write_logfile_entry("Guessing angle operator here: $str\n");
7973 # didn't find ending >
7975 if ( $expecting == TERM ) {
7976 warning("No ending > for angle operator\n");
7980 return ( $i, $type );
7983 sub scan_number_do {
7985 # scan a number in any of the formats that Perl accepts
7986 # Underbars (_) are allowed in decimal numbers.
7987 # input parameters -
7988 # $input_line - the string to scan
7989 # $i - pre_token index to start scanning
7990 # $rtoken_map - reference to the pre_token map giving starting
7991 # character position in $input_line of token $i
7992 # output parameters -
7993 # $i - last pre_token index of the number just scanned
7994 # number - the number (characters); or undef if not a number
7996 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
7997 my $pos_beg = $rtoken_map->[$i];
8001 my $type = $input_type;
8003 my $first_char = substr( $input_line, $pos_beg, 1 );
8005 # Look for bad starting characters; Shouldn't happen..
8006 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
8007 warning("Program bug - scan_number given character $first_char\n");
8008 report_definite_bug();
8009 return ( $i, $type, $number );
8012 # handle v-string without leading 'v' character ('Two Dot' rule)
8014 # Here is the format prior to including underscores:
8015 ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
8016 pos($input_line) = $pos_beg;
8017 if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
8018 $pos = pos($input_line);
8019 my $numc = $pos - $pos_beg;
8020 $number = substr( $input_line, $pos_beg, $numc );
8022 report_v_string($number);
8025 # handle octal, hex, binary
8026 if ( !defined($number) ) {
8027 pos($input_line) = $pos_beg;
8029 # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
8030 # For reference, the format prior to hex floating point is:
8031 # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
8032 # (hex) (octal) (binary)
8036 /\G[+-]?0( # leading [signed] 0
8038 # a hex float, i.e. '0x0.b17217f7d1cf78p0'
8039 ([xX][0-9a-fA-F_]* # X and optional leading digits
8040 (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction
8041 [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
8042 [0-9a-fA-F_]*) # optional Additional exponent digits
8045 |([xX][0-9a-fA-F_]+)
8048 |([oO]?[0-7_]+ # string of octal digits
8049 (\.([0-7][0-7_]*)?)? # optional decimal and fraction
8050 [Pp][+-]?[0-7] # REQUIRED exponent, no underscore
8051 [0-7_]*) # Additional exponent digits with underscores
8054 |([oO]?[0-7_]+) # string of octal digits
8057 |([bB][01_]* # 'b' with string of binary digits
8058 (\.([01][01_]*)?)? # optional decimal and fraction
8059 [Pp][+-]?[01] # Required exponent indicator, no underscore
8060 [01_]*) # additional exponent bits
8063 |([bB][01_]+) # 'b' with string of binary digits
8068 $pos = pos($input_line);
8069 my $numc = $pos - $pos_beg;
8070 $number = substr( $input_line, $pos_beg, $numc );
8076 if ( !defined($number) ) {
8077 pos($input_line) = $pos_beg;
8079 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
8080 $pos = pos($input_line);
8082 # watch out for things like 0..40 which would give 0. by this;
8083 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
8084 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
8088 my $numc = $pos - $pos_beg;
8089 $number = substr( $input_line, $pos_beg, $numc );
8094 # filter out non-numbers like e + - . e2 .e3 +e6
8095 # the rule: at least one digit, and any 'e' must be preceded by a digit
8097 $number !~ /\d/ # no digits
8098 || ( $number =~ /^(.*)[eE]/
8099 && $1 !~ /\d/ ) # or no digits before the 'e'
8103 $type = $input_type;
8104 return ( $i, $type, $number );
8107 # Found a number; now we must convert back from character position
8108 # to pre_token index. An error here implies user syntax error.
8109 # An example would be an invalid octal number like '009'.
8112 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8113 if ($error) { warning("Possibly invalid number\n") }
8115 return ( $i, $type, $number );
8118 sub inverse_pretoken_map {
8120 # Starting with the current pre_token index $i, scan forward until
8121 # finding the index of the next pre_token whose position is $pos.
8122 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
8125 while ( ++$i <= $max_token_index ) {
8127 if ( $pos <= $rtoken_map->[$i] ) {
8129 # Let the calling routine handle errors in which we do not
8130 # land on a pre-token boundary. It can happen by running
8131 # perltidy on some non-perl scripts, for example.
8132 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
8137 return ( $i, $error );
8142 # find the target of a here document, if any
8144 # $i - token index of the second < of <<
8145 # ($i must be less than the last token index if this is called)
8146 # output parameters:
8147 # $found_target = 0 didn't find target; =1 found target
8148 # HERE_TARGET - the target string (may be empty string)
8149 # $i - unchanged if not here doc,
8150 # or index of the last token of the here target
8151 # $saw_error - flag noting unbalanced quote on here target
8152 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
8154 my $found_target = 0;
8155 my $here_doc_target = '';
8156 my $here_quote_character = '';
8158 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
8159 $next_token = $rtokens->[ $i + 1 ];
8161 # perl allows a backslash before the target string (heredoc.t)
8163 if ( $next_token eq '\\' ) {
8165 $next_token = $rtokens->[ $i + 2 ];
8168 ( $next_nonblank_token, $i_next_nonblank ) =
8169 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
8171 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
8174 my $quote_depth = 0;
8179 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
8182 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
8183 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
8185 if ($in_quote) { # didn't find end of quote, so no target found
8187 if ( $expecting == TERM ) {
8189 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
8194 else { # found ending quote
8198 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
8199 $tokj = $rtokens->[$j];
8201 # we have to remove any backslash before the quote character
8202 # so that the here-doc-target exactly matches this string
8206 && $rtokens->[ $j + 1 ] eq $here_quote_character );
8207 $here_doc_target .= $tokj;
8212 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
8214 write_logfile_entry(
8215 "found blank here-target after <<; suggest using \"\"\n");
8218 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
8220 my $here_doc_expected;
8221 if ( $expecting == UNKNOWN ) {
8222 $here_doc_expected = guess_if_here_doc($next_token);
8225 $here_doc_expected = 1;
8228 if ($here_doc_expected) {
8230 $here_doc_target = $next_token;
8237 if ( $expecting == TERM ) {
8239 write_logfile_entry("Note: bare here-doc operator <<\n");
8246 # patch to neglect any prepended backslash
8247 if ( $found_target && $backslash ) { $i++ }
8249 return ( $found_target, $here_doc_target, $here_quote_character, $i,
8255 # follow (or continue following) quoted string(s)
8256 # $in_quote return code:
8258 # 1 - still must find end of quote whose target is $quote_character
8259 # 2 - still looking for end of first of two quotes
8261 # Returns updated strings:
8262 # $quoted_string_1 = quoted string seen while in_quote=1
8263 # $quoted_string_2 = quoted string seen while in_quote=2
8265 $i, $in_quote, $quote_character,
8266 $quote_pos, $quote_depth, $quoted_string_1,
8267 $quoted_string_2, $rtokens, $rtoken_map,
8271 my $in_quote_starting = $in_quote;
8274 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
8277 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8280 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
8281 $quote_pos, $quote_depth, $max_token_index );
8282 $quoted_string_2 .= $quoted_string;
8283 if ( $in_quote == 1 ) {
8284 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
8285 $quote_character = '';
8288 $quoted_string_2 .= "\n";
8292 if ( $in_quote == 1 ) { # one (more) quote to follow
8295 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8298 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
8299 $quote_pos, $quote_depth, $max_token_index );
8300 $quoted_string_1 .= $quoted_string;
8301 if ( $in_quote == 1 ) {
8302 $quoted_string_1 .= "\n";
8305 return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
8306 $quoted_string_1, $quoted_string_2 );
8309 sub follow_quoted_string {
8311 # scan for a specific token, skipping escaped characters
8312 # if the quote character is blank, use the first non-blank character
8314 # $rtokens = reference to the array of tokens
8315 # $i = the token index of the first character to search
8316 # $in_quote = number of quoted strings being followed
8317 # $beginning_tok = the starting quote character
8318 # $quote_pos = index to check next for alphanumeric delimiter
8319 # output parameters:
8320 # $i = the token index of the ending quote character
8321 # $in_quote = decremented if found end, unchanged if not
8322 # $beginning_tok = the starting quote character
8323 # $quote_pos = index to check next for alphanumeric delimiter
8324 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
8325 # $quoted_string = the text of the quote (without quotation tokens)
8326 my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
8329 my ( $tok, $end_tok );
8331 my $quoted_string = "";
8335 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
8338 # get the corresponding end token
8339 if ( $beginning_tok !~ /^\s*$/ ) {
8340 $end_tok = matching_end_token($beginning_tok);
8343 # a blank token means we must find and use the first non-blank one
8345 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
8347 while ( $i < $max_token_index ) {
8348 $tok = $rtokens->[ ++$i ];
8350 if ( $tok !~ /^\s*$/ ) {
8352 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
8353 $i = $max_token_index;
8357 if ( length($tok) > 1 ) {
8358 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
8359 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
8362 $beginning_tok = $tok;
8365 $end_tok = matching_end_token($beginning_tok);
8371 $allow_quote_comments = 1;
8376 # There are two different loops which search for the ending quote
8377 # character. In the rare case of an alphanumeric quote delimiter, we
8378 # have to look through alphanumeric tokens character-by-character, since
8379 # the pre-tokenization process combines multiple alphanumeric
8380 # characters, whereas for a non-alphanumeric delimiter, only tokens of
8381 # length 1 can match.
8383 ###################################################################
8384 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
8385 # "quote_pos" is the position the current word to begin searching
8386 ###################################################################
8387 if ( $beginning_tok =~ /\w/ ) {
8389 # Note this because it is not recommended practice except
8390 # for obfuscated perl contests
8391 if ( $in_quote == 1 ) {
8392 write_logfile_entry(
8393 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
8396 while ( $i < $max_token_index ) {
8398 if ( $quote_pos == 0 || ( $i < 0 ) ) {
8399 $tok = $rtokens->[ ++$i ];
8401 if ( $tok eq '\\' ) {
8403 # retain backslash unless it hides the end token
8404 $quoted_string .= $tok
8405 unless $rtokens->[ $i + 1 ] eq $end_tok;
8407 last if ( $i >= $max_token_index );
8408 $tok = $rtokens->[ ++$i ];
8411 my $old_pos = $quote_pos;
8413 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
8417 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
8419 if ( $quote_pos > 0 ) {
8422 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
8426 if ( $quote_depth == 0 ) {
8432 if ( $old_pos <= length($tok) ) {
8433 $quoted_string .= substr( $tok, $old_pos );
8439 ########################################################################
8440 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
8441 ########################################################################
8444 while ( $i < $max_token_index ) {
8445 $tok = $rtokens->[ ++$i ];
8447 if ( $tok eq $end_tok ) {
8450 if ( $quote_depth == 0 ) {
8455 elsif ( $tok eq $beginning_tok ) {
8458 elsif ( $tok eq '\\' ) {
8460 # retain backslash unless it hides the beginning or end token
8461 $tok = $rtokens->[ ++$i ];
8462 $quoted_string .= '\\'
8463 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
8465 $quoted_string .= $tok;
8468 if ( $i > $max_token_index ) { $i = $max_token_index }
8469 return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
8473 sub indicate_error {
8474 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
8475 interrupt_logfile();
8477 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
8482 sub write_error_indicator_pair {
8483 my ( $line_number, $input_line, $pos, $carrat ) = @_;
8484 my ( $offset, $numbered_line, $underline ) =
8485 make_numbered_line( $line_number, $input_line, $pos );
8486 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
8487 warning( $numbered_line . "\n" );
8488 $underline =~ s/\s*$//;
8489 warning( $underline . "\n" );
8493 sub make_numbered_line {
8495 # Given an input line, its line number, and a character position of
8496 # interest, create a string not longer than 80 characters of the form
8497 # $lineno: sub_string
8498 # such that the sub_string of $str contains the position of interest
8500 # Here is an example of what we want, in this case we add trailing
8501 # '...' because the line is long.
8503 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
8505 # Here is another example, this time in which we used leading '...'
8506 # because of excessive length:
8508 # 2: ... er of the World Wide Web Consortium's
8510 # input parameters are:
8511 # $lineno = line number
8512 # $str = the text of the line
8513 # $pos = position of interest (the error) : 0 = first character
8516 # - $offset = an offset which corrects the position in case we only
8517 # display part of a line, such that $pos-$offset is the effective
8518 # position from the start of the displayed line.
8519 # - $numbered_line = the numbered line as above,
8520 # - $underline = a blank 'underline' which is all spaces with the same
8521 # number of characters as the numbered line.
8523 my ( $lineno, $str, $pos ) = @_;
8524 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
8525 my $excess = length($str) - $offset - 68;
8526 my $numc = ( $excess > 0 ) ? 68 : undef;
8528 if ( defined($numc) ) {
8529 if ( $offset == 0 ) {
8530 $str = substr( $str, $offset, $numc - 4 ) . " ...";
8533 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
8538 if ( $offset == 0 ) {
8541 $str = "... " . substr( $str, $offset + 4 );
8545 my $numbered_line = sprintf( "%d: ", $lineno );
8546 $offset -= length($numbered_line);
8547 $numbered_line .= $str;
8548 my $underline = " " x length($numbered_line);
8549 return ( $offset, $numbered_line, $underline );
8552 sub write_on_underline {
8554 # The "underline" is a string that shows where an error is; it starts
8555 # out as a string of blanks with the same length as the numbered line of
8556 # code above it, and we have to add marking to show where an error is.
8557 # In the example below, we want to write the string '--^' just below
8558 # the line of bad code:
8560 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
8562 # We are given the current underline string, plus a position and a
8563 # string to write on it.
8565 # In the above example, there will be 2 calls to do this:
8566 # First call: $pos=19, pos_chr=^
8567 # Second call: $pos=16, pos_chr=---
8569 # This is a trivial thing to do with substr, but there is some
8572 my ( $underline, $pos, $pos_chr ) = @_;
8574 # check for error..shouldn't happen
8575 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
8578 my $excess = length($pos_chr) + $pos - length($underline);
8579 if ( $excess > 0 ) {
8580 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
8582 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
8583 return ($underline);
8588 # Break a string, $str, into a sequence of preliminary tokens. We
8589 # are interested in these types of tokens:
8590 # words (type='w'), example: 'max_tokens_wanted'
8591 # digits (type = 'd'), example: '0755'
8592 # whitespace (type = 'b'), example: ' '
8593 # any other single character (i.e. punct; type = the character itself).
8594 # We cannot do better than this yet because we might be in a quoted
8595 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
8597 my ( $str, $max_tokens_wanted ) = @_;
8599 # we return references to these 3 arrays:
8600 my @tokens = (); # array of the tokens themselves
8601 my @token_map = (0); # string position of start of each token
8602 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
8607 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
8610 # note that this must come before words!
8611 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
8614 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
8616 # single-character punctuation
8617 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
8621 return ( \@tokens, \@token_map, \@type );
8625 push @token_map, pos($str);
8627 } while ( --$max_tokens_wanted != 0 );
8629 return ( \@tokens, \@token_map, \@type );
8634 # this is an old debug routine
8635 # not called, but saved for reference
8636 my ( $rtokens, $rtoken_map ) = @_;
8637 my $num = scalar( @{$rtokens} );
8639 foreach my $i ( 0 .. $num - 1 ) {
8640 my $len = length( $rtokens->[$i] );
8641 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
8646 { ## closure for sub matching end token
8647 my %matching_end_token;
8650 %matching_end_token = (
8658 sub matching_end_token {
8660 # return closing character for a pattern
8661 my $beginning_token = shift;
8662 if ( $matching_end_token{$beginning_token} ) {
8663 return $matching_end_token{$beginning_token};
8665 return ($beginning_token);
8669 sub dump_token_types {
8670 my ( $class, $fh ) = @_;
8672 # This should be the latest list of token types in use
8673 # adding NEW_TOKENS: add a comment here
8674 $fh->print(<<'END_OF_LIST');
8676 Here is a list of the token types currently used for lines of type 'CODE'.
8677 For the following tokens, the "type" of a token is just the token itself.
8679 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
8680 ( ) <= >= == =~ !~ != ++ -- /= x=
8681 ... **= <<= >>= &&= ||= //= <=>
8682 , + - / * | % ! x ~ = \ ? : . < > ^ &
8684 The following additional token types are defined:
8687 b blank (white space)
8688 { indent: opening structural curly brace or square bracket or paren
8689 (code block, anonymous hash reference, or anonymous array reference)
8690 } outdent: right structural curly brace or square bracket or paren
8691 [ left non-structural square bracket (enclosing an array index)
8692 ] right non-structural square bracket
8693 ( left non-structural paren (all but a list right of an =)
8694 ) right non-structural paren
8695 L left non-structural curly brace (enclosing a key)
8696 R right non-structural curly brace
8697 ; terminal semicolon
8698 f indicates a semicolon in a "for" statement
8699 h here_doc operator <<
8701 Q indicates a quote or pattern
8702 q indicates a qw quote block
8704 C user-defined constant or constant function (with void prototype = ())
8705 U user-defined function taking parameters
8706 G user-defined function taking block parameter (like grep/map/eval)
8707 M (unused, but reserved for subroutine definition name)
8708 P (unused, but -html uses it to label pod text)
8709 t type indicater such as %,$,@,*,&,sub
8710 w bare word (perhaps a subroutine call)
8711 i identifier of some type (with leading %, $, @, *, &, sub, -> )
8714 F a file test operator (like -e)
8716 Z identifier in indirect object slot: may be file handle, object
8717 J LABEL: code block label
8718 j LABEL after next, last, redo, goto
8721 pp pre-increment operator ++
8722 mm pre-decrement operator --
8723 A : used as attribute separator
8725 Here are the '_line_type' codes used internally:
8726 SYSTEM - system-specific code before hash-bang line
8727 CODE - line of perl code (including comments)
8728 POD_START - line starting pod, such as '=head'
8729 POD - pod documentation text
8730 POD_END - last line of pod section, '=cut'
8731 HERE - text of here-document
8732 HERE_END - last line of here-doc (target word)
8733 FORMAT - format section
8734 FORMAT_END - last line of format section, '.'
8735 DATA_START - __DATA__ line
8736 DATA - unidentified text following __DATA__
8737 END_START - __END__ line
8738 END - unidentified text following __END__
8739 ERROR - we are in big trouble, probably not a perl script
8747 # These names are used in error messages
8748 @opening_brace_names = qw# '{' '[' '(' '?' #;
8749 @closing_brace_names = qw# '}' ']' ')' ':' #;
8754 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
8755 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
8757 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
8759 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
8760 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
8762 my @tetragraphs = qw( <<>> );
8763 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
8765 # make a hash of all valid token types for self-checking the tokenizer
8766 # (adding NEW_TOKENS : select a new character and add to this list)
8767 my @valid_token_types = qw#
8768 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
8769 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
8771 push( @valid_token_types, @digraphs );
8772 push( @valid_token_types, @trigraphs );
8773 push( @valid_token_types, @tetragraphs );
8774 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
8775 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
8777 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
8778 my @file_test_operators =
8779 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
8780 @is_file_test_operator{@file_test_operators} =
8781 (1) x scalar(@file_test_operators);
8783 # these functions have prototypes of the form (&), so when they are
8784 # followed by a block, that block MAY BE followed by an operator.
8785 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
8787 @is_block_operator{@q} = (1) x scalar(@q);
8789 # these functions allow an identifier in the indirect object slot
8790 @q = qw( print printf sort exec system say);
8791 @is_indirect_object_taker{@q} = (1) x scalar(@q);
8793 # These tokens may precede a code block
8794 # patched for SWITCH/CASE/CATCH. Actually these could be removed
8795 # now and we could let the extended-syntax coding handle them.
8796 # Added 'default' for Switch::Plain.
8798 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
8799 unless do while until eval for foreach map grep sort
8800 switch case given when default catch try finally);
8801 @is_code_block_token{@q} = (1) x scalar(@q);
8803 # I'll build the list of keywords incrementally
8806 # keywords and tokens after which a value or pattern is expected,
8807 # but not an operator. In other words, these should consume terms
8808 # to their right, or at least they are not expected to be followed
8809 # immediately by operators.
8810 my @value_requestor = qw(
9038 # patched above for SWITCH/CASE given/when err say
9039 # 'err' is a fairly safe addition.
9040 # Added 'default' for Switch::Plain. Note that we could also have
9041 # a separate set of keywords to include if we see 'use Switch::Plain'
9042 push( @Keywords, @value_requestor );
9044 # These are treated the same but are not keywords:
9049 push( @value_requestor, @extra_vr );
9051 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
9053 # this list contains keywords which do not look for arguments,
9054 # so that they might be followed by an operator, or at least
9056 my @operator_requestor = qw(
9080 push( @Keywords, @operator_requestor );
9082 # These are treated the same but are not considered keywords:
9089 push( @operator_requestor, @extra_or );
9091 @expecting_operator_token{@operator_requestor} =
9092 (1) x scalar(@operator_requestor);
9094 # these token TYPES expect trailing operator but not a term
9095 # note: ++ and -- are post-increment and decrement, 'C' = constant
9096 my @operator_requestor_types = qw( ++ -- C <> q );
9097 @expecting_operator_types{@operator_requestor_types} =
9098 (1) x scalar(@operator_requestor_types);
9100 # these token TYPES consume values (terms)
9101 # note: pp and mm are pre-increment and decrement
9102 # f=semicolon in for, F=file test operator
9103 my @value_requestor_type = qw#
9104 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
9105 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
9106 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
9107 f F pp mm Y p m U J G j >> << ^ t
9108 ~. ^. |. &. ^.= |.= &.=
9110 push( @value_requestor_type, ',' )
9111 ; # (perl doesn't like a ',' in a qw block)
9112 @expecting_term_types{@value_requestor_type} =
9113 (1) x scalar(@value_requestor_type);
9115 # Note: the following valid token types are not assigned here to
9116 # hashes requesting to be followed by values or terms, but are
9117 # instead currently hard-coded into sub operator_expected:
9118 # ) -> :: Q R Z ] b h i k n v w } #
9120 # For simple syntax checking, it is nice to have a list of operators which
9121 # will really be unhappy if not followed by a term. This includes most
9123 %really_want_term = %expecting_term_types;
9125 # with these exceptions...
9126 delete $really_want_term{'U'}; # user sub, depends on prototype
9127 delete $really_want_term{'F'}; # file test works on $_ if no following term
9128 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
9131 @q = qw(q qq qw qx qr s y tr m);
9132 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
9135 @is_package{@q} = (1) x scalar(@q);
9139 @is_comma_question_colon{@q} = (1) x scalar(@q);
9141 # Hash of other possible line endings which may occur.
9142 # Keep these coordinated with the regex where this is used.
9143 # Note: chr(13) = chr(015)="\r".
9144 @q = ( chr(13), chr(29), chr(26) );
9145 @other_line_endings{@q} = (1) x scalar(@q);
9147 # These keywords are handled specially in the tokenizer code:
9148 my @special_keywords = qw(
9164 push( @Keywords, @special_keywords );
9166 # Keywords after which list formatting may be used
9167 # WARNING: do not include |map|grep|eval or perl may die on
9168 # syntax errors (map1.t).
9169 my @keyword_taking_list = qw(
9244 @is_keyword_taking_list{@keyword_taking_list} =
9245 (1) x scalar(@keyword_taking_list);
9247 # perl functions which may be unary operators.
9249 # This list is used to decide if a pattern delimited by slashes, /pattern/,
9250 # can follow one of these keywords.
9252 chomp eof eval fc lc pop shift uc undef
9255 @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
9258 # These are keywords for which an arg may optionally be omitted. They are
9259 # currently only used to disambiguate a ? used as a ternary from one used
9260 # as a (depricated) pattern delimiter. In the future, they might be used
9261 # to give a warning about ambiguous syntax before a /.
9262 # Note: split has been omitted (see not below).
9263 my @keywords_taking_optional_arg = qw(
9332 @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
9333 (1) x scalar(@keywords_taking_optional_arg);
9335 # This list is used to decide if a pattern delmited by question marks,
9336 # ?pattern?, can follow one of these keywords. Note that from perl 5.22
9337 # on, a ?pattern? is not recognized, so we can be much more strict than
9338 # with a /pattern/. Note that 'split' is not in this list. In current
9339 # versions of perl a question following split must be a ternary, but
9340 # in older versions it could be a pattern. The guessing algorithm will
9341 # decide. We are combining two lists here to simplify the test.
9342 @q = ( @keywords_taking_optional_arg, @operator_requestor );
9343 @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
9346 # These are not used in any way yet
9347 # my @unused_keywords = qw(
9353 # The list of keywords was originally extracted from function 'keyword' in
9354 # perl file toke.c version 5.005.03, using this utility, plus a
9355 # little editing: (file getkwd.pl):
9356 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
9357 # Add 'get' prefix where necessary, then split into the above lists.
9358 # This list should be updated as necessary.
9359 # The list should not contain these special variables:
9360 # ARGV DATA ENV SIG STDERR STDIN STDOUT
9363 @is_keyword{@Keywords} = (1) x scalar(@Keywords);