1 ########################################################################
3 # the Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class yet. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 our $VERSION = '20181120';
26 use Perl::Tidy::LineBuffer;
30 # Caution: these debug flags produce a lot of output
31 # They should all be 0 except when debugging small scripts
33 use constant TOKENIZER_DEBUG_FLAG_EXPECT => 0;
34 use constant TOKENIZER_DEBUG_FLAG_NSCAN => 0;
35 use constant TOKENIZER_DEBUG_FLAG_QUOTE => 0;
36 use constant TOKENIZER_DEBUG_FLAG_SCAN_ID => 0;
37 use constant TOKENIZER_DEBUG_FLAG_TOKENIZE => 0;
39 my $debug_warning = sub {
40 print STDOUT "TOKENIZER_DEBUGGING with key $_[0]\n";
43 TOKENIZER_DEBUG_FLAG_EXPECT && $debug_warning->('EXPECT');
44 TOKENIZER_DEBUG_FLAG_NSCAN && $debug_warning->('NSCAN');
45 TOKENIZER_DEBUG_FLAG_QUOTE && $debug_warning->('QUOTE');
46 TOKENIZER_DEBUG_FLAG_SCAN_ID && $debug_warning->('SCAN_ID');
47 TOKENIZER_DEBUG_FLAG_TOKENIZE && $debug_warning->('TOKENIZE');
53 # PACKAGE VARIABLES for processing an entire FILE.
59 $last_nonblank_block_type
67 %user_function_prototype
69 %is_block_list_function
70 %saw_function_definition
79 @nesting_sequence_number
80 @current_sequence_number
82 @paren_semicolon_count
83 @paren_structural_type
85 @brace_structural_type
89 @square_bracket_structural_type
92 @nested_statement_type
93 @starting_line_of_current_depth
96 # GLOBAL CONSTANTS for routines in this package
98 %is_indirect_object_taker
100 %expecting_operator_token
101 %expecting_operator_types
102 %expecting_term_types
103 %expecting_term_token
105 %is_file_test_operator
114 %is_keyword_taking_list
115 %is_keyword_taking_optional_args
116 %is_q_qq_qw_qx_qr_s_y_tr_m
119 # possible values of operator_expected()
120 use constant TERM => -1;
121 use constant UNKNOWN => 0;
122 use constant OPERATOR => 1;
124 # possible values of context
125 use constant SCALAR_CONTEXT => -1;
126 use constant UNKNOWN_CONTEXT => 0;
127 use constant LIST_CONTEXT => 1;
129 # Maximum number of little messages; probably need not be changed.
130 use constant MAX_NAG_MESSAGES => 6;
134 # methods to count instances
136 sub get_count { return $_count; }
137 sub _increment_count { return ++$_count }
138 sub _decrement_count { return --$_count }
143 $self->_decrement_count();
149 my ( $class, @args ) = @_;
151 # Note: 'tabs' and 'indent_columns' are temporary and should be
154 source_object => undef,
155 debugger_object => undef,
156 diagnostics_object => undef,
157 logger_object => undef,
158 starting_level => undef,
161 look_for_hash_bang => 0,
163 look_for_autoloader => 1,
164 look_for_selfloader => 1,
165 starting_line_number => 1,
166 extended_syntax => 0,
168 my %args = ( %defaults, @args );
170 # we are given an object with a get_line() method to supply source lines
171 my $source_object = $args{source_object};
173 # we create another object with a get_line() and peek_ahead() method
174 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
176 # Tokenizer state data is as follows:
177 # _rhere_target_list reference to list of here-doc targets
178 # _here_doc_target the target string for a here document
179 # _here_quote_character the type of here-doc quoting (" ' ` or none)
180 # to determine if interpolation is done
181 # _quote_target character we seek if chasing a quote
182 # _line_start_quote line where we started looking for a long quote
183 # _in_here_doc flag indicating if we are in a here-doc
184 # _in_pod flag set if we are in pod documentation
185 # _in_error flag set if we saw severe error (binary in script)
186 # _in_data flag set if we are in __DATA__ section
187 # _in_end flag set if we are in __END__ section
188 # _in_format flag set if we are in a format description
189 # _in_attribute_list flag telling if we are looking for attributes
190 # _in_quote flag telling if we are chasing a quote
191 # _starting_level indentation level of first line
192 # _line_buffer_object object with get_line() method to supply source code
193 # _diagnostics_object place to write debugging information
194 # _unexpected_error_count error count used to limit output
195 # _lower_case_labels_at line numbers where lower case labels seen
196 # _hit_bug program bug detected
198 _rhere_target_list => [],
200 _here_doc_target => "",
201 _here_quote_character => "",
207 _in_attribute_list => 0,
210 _line_start_quote => -1,
211 _starting_level => $args{starting_level},
212 _know_starting_level => defined( $args{starting_level} ),
213 _tabsize => $args{tabsize},
214 _indent_columns => $args{indent_columns},
215 _look_for_hash_bang => $args{look_for_hash_bang},
216 _trim_qw => $args{trim_qw},
217 _continuation_indentation => $args{continuation_indentation},
218 _outdent_labels => $args{outdent_labels},
219 _last_line_number => $args{starting_line_number} - 1,
220 _saw_perl_dash_P => 0,
221 _saw_perl_dash_w => 0,
222 _saw_use_strict => 0,
225 _look_for_autoloader => $args{look_for_autoloader},
226 _look_for_selfloader => $args{look_for_selfloader},
227 _saw_autoloader => 0,
228 _saw_selfloader => 0,
232 _saw_negative_indentation => 0,
233 _started_tokenizing => 0,
234 _line_buffer_object => $line_buffer_object,
235 _debugger_object => $args{debugger_object},
236 _diagnostics_object => $args{diagnostics_object},
237 _logger_object => $args{logger_object},
238 _unexpected_error_count => 0,
239 _started_looking_for_here_target_at => 0,
240 _nearly_matched_here_target_at => undef,
242 _rlower_case_labels_at => undef,
243 _extended_syntax => $args{extended_syntax},
246 prepare_for_a_new_file();
247 find_starting_indentation_level();
249 bless $tokenizer_self, $class;
251 # This is not a full class yet, so die if an attempt is made to
252 # create more than one object.
254 if ( _increment_count() > 1 ) {
256 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
259 return $tokenizer_self;
263 # interface to Perl::Tidy::Logger routines
266 my $logger_object = $tokenizer_self->{_logger_object};
267 if ($logger_object) {
268 $logger_object->warning($msg);
275 my $logger_object = $tokenizer_self->{_logger_object};
276 if ($logger_object) {
277 $logger_object->complain($msg);
282 sub write_logfile_entry {
284 my $logger_object = $tokenizer_self->{_logger_object};
285 if ($logger_object) {
286 $logger_object->write_logfile_entry($msg);
291 sub interrupt_logfile {
292 my $logger_object = $tokenizer_self->{_logger_object};
293 if ($logger_object) {
294 $logger_object->interrupt_logfile();
300 my $logger_object = $tokenizer_self->{_logger_object};
301 if ($logger_object) {
302 $logger_object->resume_logfile();
307 sub increment_brace_error {
308 my $logger_object = $tokenizer_self->{_logger_object};
309 if ($logger_object) {
310 $logger_object->increment_brace_error();
315 sub report_definite_bug {
316 $tokenizer_self->{_hit_bug} = 1;
317 my $logger_object = $tokenizer_self->{_logger_object};
318 if ($logger_object) {
319 $logger_object->report_definite_bug();
326 my $logger_object = $tokenizer_self->{_logger_object};
327 if ($logger_object) {
328 $logger_object->brace_warning($msg);
333 sub get_saw_brace_error {
334 my $logger_object = $tokenizer_self->{_logger_object};
335 if ($logger_object) {
336 return $logger_object->get_saw_brace_error();
343 # interface to Perl::Tidy::Diagnostics routines
344 sub write_diagnostics {
346 if ( $tokenizer_self->{_diagnostics_object} ) {
347 $tokenizer_self->{_diagnostics_object}->write_diagnostics($msg);
352 sub report_tokenization_errors {
355 my $severe_error = $self->{_in_error};
357 my $level = get_indentation_level();
358 if ( $level != $tokenizer_self->{_starting_level} ) {
359 warning("final indentation level: $level\n");
362 check_final_nesting_depths();
364 if ( $tokenizer_self->{_look_for_hash_bang}
365 && !$tokenizer_self->{_saw_hash_bang} )
368 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
371 if ( $tokenizer_self->{_in_format} ) {
372 warning("hit EOF while in format description\n");
375 if ( $tokenizer_self->{_in_pod} ) {
377 # Just write log entry if this is after __END__ or __DATA__
378 # because this happens to often, and it is not likely to be
380 if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
382 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
388 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
394 if ( $tokenizer_self->{_in_here_doc} ) {
396 my $here_doc_target = $tokenizer_self->{_here_doc_target};
397 my $started_looking_for_here_target_at =
398 $tokenizer_self->{_started_looking_for_here_target_at};
399 if ($here_doc_target) {
401 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
406 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
409 my $nearly_matched_here_target_at =
410 $tokenizer_self->{_nearly_matched_here_target_at};
411 if ($nearly_matched_here_target_at) {
413 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
418 if ( $tokenizer_self->{_in_quote} ) {
420 my $line_start_quote = $tokenizer_self->{_line_start_quote};
421 my $quote_target = $tokenizer_self->{_quote_target};
423 ( $tokenizer_self->{_in_attribute_list} )
427 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
431 if ( $tokenizer_self->{_hit_bug} ) {
435 my $logger_object = $tokenizer_self->{_logger_object};
437 # TODO: eventually may want to activate this to cause file to be output verbatim
440 # Set the severe error for a fairly high warning count because
441 # some of the warnings do not harm formatting, such as duplicate
443 my $warning_count = $logger_object->{_warning_count};
444 if ( $warning_count > 50 ) {
448 # Brace errors are significant, so set the severe error flag at
450 my $saw_brace_error = $logger_object->{_saw_brace_error};
451 if ( $saw_brace_error > 2 ) {
456 unless ( $tokenizer_self->{_saw_perl_dash_w} ) {
458 write_logfile_entry("Suggest including '-w parameter'\n");
461 write_logfile_entry("Suggest including 'use warnings;'\n");
465 if ( $tokenizer_self->{_saw_perl_dash_P} ) {
466 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
469 unless ( $tokenizer_self->{_saw_use_strict} ) {
470 write_logfile_entry("Suggest including 'use strict;'\n");
473 # it is suggested that labels have at least one upper case character
474 # for legibility and to avoid code breakage as new keywords are introduced
475 if ( $tokenizer_self->{_rlower_case_labels_at} ) {
476 my @lower_case_labels_at =
477 @{ $tokenizer_self->{_rlower_case_labels_at} };
479 "Suggest using upper case characters in label(s)\n");
481 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
483 return $severe_error;
486 sub report_v_string {
488 # warn if this version can't handle v-strings
490 unless ( $tokenizer_self->{_saw_v_string} ) {
491 $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number};
495 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
501 sub get_input_line_number {
502 return $tokenizer_self->{_last_line_number};
505 # returns the next tokenized line
510 # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
511 # $square_bracket_depth, $paren_depth
513 my $input_line = $tokenizer_self->{_line_buffer_object}->get_line();
514 $tokenizer_self->{_line_text} = $input_line;
516 return unless ($input_line);
518 my $input_line_number = ++$tokenizer_self->{_last_line_number};
520 # Find and remove what characters terminate this line, including any
522 my $input_line_separator = "";
523 if ( chomp($input_line) ) { $input_line_separator = $/ }
525 # TODO: what other characters should be included here?
526 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
527 $input_line_separator = $2 . $input_line_separator;
530 # for backwards compatibility we keep the line text terminated with
531 # a newline character
533 $tokenizer_self->{_line_text} = $input_line; # update
535 # create a data structure describing this line which will be
536 # returned to the caller.
538 # _line_type codes are:
539 # SYSTEM - system-specific code before hash-bang line
540 # CODE - line of perl code (including comments)
541 # POD_START - line starting pod, such as '=head'
542 # POD - pod documentation text
543 # POD_END - last line of pod section, '=cut'
544 # HERE - text of here-document
545 # HERE_END - last line of here-doc (target word)
546 # FORMAT - format section
547 # FORMAT_END - last line of format section, '.'
548 # DATA_START - __DATA__ line
549 # DATA - unidentified text following __DATA__
550 # END_START - __END__ line
551 # END - unidentified text following __END__
552 # ERROR - we are in big trouble, probably not a perl script
555 # _curly_brace_depth - depth of curly braces at start of line
556 # _square_bracket_depth - depth of square brackets at start of line
557 # _paren_depth - depth of parens at start of line
558 # _starting_in_quote - this line continues a multi-line quote
559 # (so don't trim leading blanks!)
560 # _ending_in_quote - this line ends in a multi-line quote
561 # (so don't trim trailing blanks!)
562 my $line_of_tokens = {
564 _line_text => $input_line,
565 _line_number => $input_line_number,
566 _rtoken_type => undef,
570 _rblock_type => undef,
571 _rcontainer_type => undef,
572 _rcontainer_environment => undef,
573 _rtype_sequence => undef,
574 _rnesting_tokens => undef,
575 _rci_levels => undef,
576 _rnesting_blocks => undef,
577 _guessed_indentation_level => 0,
578 _starting_in_quote => 0, # to be set by subroutine
579 _ending_in_quote => 0,
580 _curly_brace_depth => $brace_depth,
581 _square_bracket_depth => $square_bracket_depth,
582 _paren_depth => $paren_depth,
583 _quote_character => '',
586 # must print line unchanged if we are in a here document
587 if ( $tokenizer_self->{_in_here_doc} ) {
589 $line_of_tokens->{_line_type} = 'HERE';
590 my $here_doc_target = $tokenizer_self->{_here_doc_target};
591 my $here_quote_character = $tokenizer_self->{_here_quote_character};
592 my $candidate_target = $input_line;
593 chomp $candidate_target;
595 # Handle <<~ targets, which are indicated here by a leading space on
596 # the here quote character
597 if ( $here_quote_character =~ /^\s/ ) {
598 $candidate_target =~ s/^\s*//;
600 if ( $candidate_target eq $here_doc_target ) {
601 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
602 $line_of_tokens->{_line_type} = 'HERE_END';
603 write_logfile_entry("Exiting HERE document $here_doc_target\n");
605 my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
606 if ( @{$rhere_target_list} ) { # there can be multiple here targets
607 ( $here_doc_target, $here_quote_character ) =
608 @{ shift @{$rhere_target_list} };
609 $tokenizer_self->{_here_doc_target} = $here_doc_target;
610 $tokenizer_self->{_here_quote_character} =
611 $here_quote_character;
613 "Entering HERE document $here_doc_target\n");
614 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
615 $tokenizer_self->{_started_looking_for_here_target_at} =
619 $tokenizer_self->{_in_here_doc} = 0;
620 $tokenizer_self->{_here_doc_target} = "";
621 $tokenizer_self->{_here_quote_character} = "";
625 # check for error of extra whitespace
626 # note for PERL6: leading whitespace is allowed
628 $candidate_target =~ s/\s*$//;
629 $candidate_target =~ s/^\s*//;
630 if ( $candidate_target eq $here_doc_target ) {
631 $tokenizer_self->{_nearly_matched_here_target_at} =
635 return $line_of_tokens;
638 # must print line unchanged if we are in a format section
639 elsif ( $tokenizer_self->{_in_format} ) {
641 if ( $input_line =~ /^\.[\s#]*$/ ) {
642 write_logfile_entry("Exiting format section\n");
643 $tokenizer_self->{_in_format} = 0;
644 $line_of_tokens->{_line_type} = 'FORMAT_END';
647 $line_of_tokens->{_line_type} = 'FORMAT';
649 return $line_of_tokens;
652 # must print line unchanged if we are in pod documentation
653 elsif ( $tokenizer_self->{_in_pod} ) {
655 $line_of_tokens->{_line_type} = 'POD';
656 if ( $input_line =~ /^=cut/ ) {
657 $line_of_tokens->{_line_type} = 'POD_END';
658 write_logfile_entry("Exiting POD section\n");
659 $tokenizer_self->{_in_pod} = 0;
661 if ( $input_line =~ /^\#\!.*perl\b/ ) {
663 "Hash-bang in pod can cause older versions of perl to fail! \n"
667 return $line_of_tokens;
670 # must print line unchanged if we have seen a severe error (i.e., we
671 # are seeing illegal tokens and cannot continue. Syntax errors do
672 # not pass this route). Calling routine can decide what to do, but
673 # the default can be to just pass all lines as if they were after __END__
674 elsif ( $tokenizer_self->{_in_error} ) {
675 $line_of_tokens->{_line_type} = 'ERROR';
676 return $line_of_tokens;
679 # print line unchanged if we are __DATA__ section
680 elsif ( $tokenizer_self->{_in_data} ) {
682 # ...but look for POD
683 # Note that the _in_data and _in_end flags remain set
684 # so that we return to that state after seeing the
685 # end of a pod section
686 if ( $input_line =~ /^=(?!cut)/ ) {
687 $line_of_tokens->{_line_type} = 'POD_START';
688 write_logfile_entry("Entering POD section\n");
689 $tokenizer_self->{_in_pod} = 1;
690 return $line_of_tokens;
693 $line_of_tokens->{_line_type} = 'DATA';
694 return $line_of_tokens;
698 # print line unchanged if we are in __END__ section
699 elsif ( $tokenizer_self->{_in_end} ) {
701 # ...but look for POD
702 # Note that the _in_data and _in_end flags remain set
703 # so that we return to that state after seeing the
704 # end of a pod section
705 if ( $input_line =~ /^=(?!cut)/ ) {
706 $line_of_tokens->{_line_type} = 'POD_START';
707 write_logfile_entry("Entering POD section\n");
708 $tokenizer_self->{_in_pod} = 1;
709 return $line_of_tokens;
712 $line_of_tokens->{_line_type} = 'END';
713 return $line_of_tokens;
717 # check for a hash-bang line if we haven't seen one
718 if ( !$tokenizer_self->{_saw_hash_bang} ) {
719 if ( $input_line =~ /^\#\!.*perl\b/ ) {
720 $tokenizer_self->{_saw_hash_bang} = $input_line_number;
722 # check for -w and -P flags
723 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
724 $tokenizer_self->{_saw_perl_dash_P} = 1;
727 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
728 $tokenizer_self->{_saw_perl_dash_w} = 1;
732 ( $input_line_number > 1 )
734 # leave any hash bang in a BEGIN block alone
735 # i.e. see 'debugger-duck_type.t'
737 $last_nonblank_block_type
738 && $last_nonblank_block_type eq 'BEGIN'
740 && ( !$tokenizer_self->{_look_for_hash_bang} )
744 # this is helpful for VMS systems; we may have accidentally
745 # tokenized some DCL commands
746 if ( $tokenizer_self->{_started_tokenizing} ) {
748 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
752 complain("Useless hash-bang after line 1\n");
756 # Report the leading hash-bang as a system line
757 # This will prevent -dac from deleting it
759 $line_of_tokens->{_line_type} = 'SYSTEM';
760 return $line_of_tokens;
765 # wait for a hash-bang before parsing if the user invoked us with -x
766 if ( $tokenizer_self->{_look_for_hash_bang}
767 && !$tokenizer_self->{_saw_hash_bang} )
769 $line_of_tokens->{_line_type} = 'SYSTEM';
770 return $line_of_tokens;
773 # a first line of the form ': #' will be marked as SYSTEM
774 # since lines of this form may be used by tcsh
775 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
776 $line_of_tokens->{_line_type} = 'SYSTEM';
777 return $line_of_tokens;
780 # now we know that it is ok to tokenize the line...
781 # the line tokenizer will modify any of these private variables:
789 my $ending_in_quote_last = $tokenizer_self->{_in_quote};
790 tokenize_this_line($line_of_tokens);
792 # Now finish defining the return structure and return it
793 $line_of_tokens->{_ending_in_quote} = $tokenizer_self->{_in_quote};
795 # handle severe error (binary data in script)
796 if ( $tokenizer_self->{_in_error} ) {
797 $tokenizer_self->{_in_quote} = 0; # to avoid any more messages
798 warning("Giving up after error\n");
799 $line_of_tokens->{_line_type} = 'ERROR';
800 reset_indentation_level(0); # avoid error messages
801 return $line_of_tokens;
804 # handle start of pod documentation
805 if ( $tokenizer_self->{_in_pod} ) {
807 # This gets tricky..above a __DATA__ or __END__ section, perl
808 # accepts '=cut' as the start of pod section. But afterwards,
809 # only pod utilities see it and they may ignore an =cut without
810 # leading =head. In any case, this isn't good.
811 if ( $input_line =~ /^=cut\b/ ) {
812 if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
813 complain("=cut while not in pod ignored\n");
814 $tokenizer_self->{_in_pod} = 0;
815 $line_of_tokens->{_line_type} = 'POD_END';
818 $line_of_tokens->{_line_type} = 'POD_START';
820 "=cut starts a pod section .. this can fool pod utilities.\n"
822 write_logfile_entry("Entering POD section\n");
827 $line_of_tokens->{_line_type} = 'POD_START';
828 write_logfile_entry("Entering POD section\n");
831 return $line_of_tokens;
834 # update indentation levels for log messages
835 if ( $input_line !~ /^\s*$/ ) {
836 my $rlevels = $line_of_tokens->{_rlevels};
837 $line_of_tokens->{_guessed_indentation_level} =
838 guess_old_indentation_level($input_line);
841 # see if this line contains here doc targets
842 my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
843 if ( @{$rhere_target_list} ) {
845 my ( $here_doc_target, $here_quote_character ) =
846 @{ shift @{$rhere_target_list} };
847 $tokenizer_self->{_in_here_doc} = 1;
848 $tokenizer_self->{_here_doc_target} = $here_doc_target;
849 $tokenizer_self->{_here_quote_character} = $here_quote_character;
850 write_logfile_entry("Entering HERE document $here_doc_target\n");
851 $tokenizer_self->{_started_looking_for_here_target_at} =
855 # NOTE: __END__ and __DATA__ statements are written unformatted
856 # because they can theoretically contain additional characters
857 # which are not tokenized (and cannot be read with <DATA> either!).
858 if ( $tokenizer_self->{_in_data} ) {
859 $line_of_tokens->{_line_type} = 'DATA_START';
860 write_logfile_entry("Starting __DATA__ section\n");
861 $tokenizer_self->{_saw_data} = 1;
863 # keep parsing after __DATA__ if use SelfLoader was seen
864 if ( $tokenizer_self->{_saw_selfloader} ) {
865 $tokenizer_self->{_in_data} = 0;
867 "SelfLoader seen, continuing; -nlsl deactivates\n");
870 return $line_of_tokens;
873 elsif ( $tokenizer_self->{_in_end} ) {
874 $line_of_tokens->{_line_type} = 'END_START';
875 write_logfile_entry("Starting __END__ section\n");
876 $tokenizer_self->{_saw_end} = 1;
878 # keep parsing after __END__ if use AutoLoader was seen
879 if ( $tokenizer_self->{_saw_autoloader} ) {
880 $tokenizer_self->{_in_end} = 0;
882 "AutoLoader seen, continuing; -nlal deactivates\n");
884 return $line_of_tokens;
887 # now, finally, we know that this line is type 'CODE'
888 $line_of_tokens->{_line_type} = 'CODE';
890 # remember if we have seen any real code
891 if ( !$tokenizer_self->{_started_tokenizing}
892 && $input_line !~ /^\s*$/
893 && $input_line !~ /^\s*#/ )
895 $tokenizer_self->{_started_tokenizing} = 1;
898 if ( $tokenizer_self->{_debugger_object} ) {
899 $tokenizer_self->{_debugger_object}->write_debug_entry($line_of_tokens);
902 # Note: if keyword 'format' occurs in this line code, it is still CODE
903 # (keyword 'format' need not start a line)
904 if ( $tokenizer_self->{_in_format} ) {
905 write_logfile_entry("Entering format section\n");
908 if ( $tokenizer_self->{_in_quote}
909 and ( $tokenizer_self->{_line_start_quote} < 0 ) )
912 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
914 ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ )
916 $tokenizer_self->{_line_start_quote} = $input_line_number;
918 "Start multi-line quote or pattern ending in $quote_target\n");
921 elsif ( ( $tokenizer_self->{_line_start_quote} >= 0 )
922 && !$tokenizer_self->{_in_quote} )
924 $tokenizer_self->{_line_start_quote} = -1;
925 write_logfile_entry("End of multi-line quote or pattern\n");
928 # we are returning a line of CODE
929 return $line_of_tokens;
932 sub find_starting_indentation_level {
934 # We need to find the indentation level of the first line of the
935 # script being formatted. Often it will be zero for an entire file,
936 # but if we are formatting a local block of code (within an editor for
937 # example) it may not be zero. The user may specify this with the
938 # -sil=n parameter but normally doesn't so we have to guess.
940 # USES GLOBAL VARIABLES: $tokenizer_self
941 my $starting_level = 0;
943 # use value if given as parameter
944 if ( $tokenizer_self->{_know_starting_level} ) {
945 $starting_level = $tokenizer_self->{_starting_level};
948 # if we know there is a hash_bang line, the level must be zero
949 elsif ( $tokenizer_self->{_look_for_hash_bang} ) {
950 $tokenizer_self->{_know_starting_level} = 1;
953 # otherwise figure it out from the input file
958 # keep looking at lines until we find a hash bang or piece of code
961 $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
964 # if first line is #! then assume starting level is zero
965 if ( $i == 1 && $line =~ /^\#\!/ ) {
969 next if ( $line =~ /^\s*#/ ); # skip past comments
970 next if ( $line =~ /^\s*$/ ); # skip past blank lines
971 $starting_level = guess_old_indentation_level($line);
974 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
975 write_logfile_entry("$msg");
977 $tokenizer_self->{_starting_level} = $starting_level;
978 reset_indentation_level($starting_level);
982 sub guess_old_indentation_level {
985 # Guess the indentation level of an input line.
987 # For the first line of code this result will define the starting
988 # indentation level. It will mainly be non-zero when perltidy is applied
989 # within an editor to a local block of code.
991 # This is an impossible task in general because we can't know what tabs
992 # meant for the old script and how many spaces were used for one
993 # indentation level in the given input script. For example it may have
994 # been previously formatted with -i=7 -et=3. But we can at least try to
995 # make sure that perltidy guesses correctly if it is applied repeatedly to
996 # a block of code within an editor, so that the block stays at the same
997 # level when perltidy is applied repeatedly.
999 # USES GLOBAL VARIABLES: $tokenizer_self
1002 # find leading tabs, spaces, and any statement label
1004 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1006 # If there are leading tabs, we use the tab scheme for this run, if
1007 # any, so that the code will remain stable when editing.
1008 if ($1) { $spaces += length($1) * $tokenizer_self->{_tabsize} }
1010 if ($2) { $spaces += length($2) }
1012 # correct for outdented labels
1013 if ( $3 && $tokenizer_self->{'_outdent_labels'} ) {
1014 $spaces += $tokenizer_self->{_continuation_indentation};
1018 # compute indentation using the value of -i for this run.
1019 # If -i=0 is used for this run (which is possible) it doesn't matter
1020 # what we do here but we'll guess that the old run used 4 spaces per level.
1021 my $indent_columns = $tokenizer_self->{_indent_columns};
1022 $indent_columns = 4 if ( !$indent_columns );
1023 $level = int( $spaces / $indent_columns );
1027 # This is a currently unused debug routine
1028 sub dump_functions {
1031 foreach my $pkg ( keys %is_user_function ) {
1032 print $fh "\nnon-constant subs in package $pkg\n";
1034 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1036 if ( $is_block_list_function{$pkg}{$sub} ) {
1037 $msg = 'block_list';
1040 if ( $is_block_function{$pkg}{$sub} ) {
1043 print $fh "$sub $msg\n";
1047 foreach my $pkg ( keys %is_constant ) {
1048 print $fh "\nconstants and constant subs in package $pkg\n";
1050 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1059 # count number of 1's in a string of 1's and 0's
1060 # example: ones_count("010101010101") gives 6
1062 return $str =~ tr/1/0/;
1065 sub prepare_for_a_new_file {
1067 # previous tokens needed to determine what to expect next
1068 $last_nonblank_token = ';'; # the only possible starting state which
1069 $last_nonblank_type = ';'; # will make a leading brace a code block
1070 $last_nonblank_block_type = '';
1072 # scalars for remembering statement types across multiple lines
1073 $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
1074 $in_attribute_list = 0;
1076 # scalars for remembering where we are in the file
1077 $current_package = "main";
1078 $context = UNKNOWN_CONTEXT;
1080 # hashes used to remember function information
1081 %is_constant = (); # user-defined constants
1082 %is_user_function = (); # user-defined functions
1083 %user_function_prototype = (); # their prototypes
1084 %is_block_function = ();
1085 %is_block_list_function = ();
1086 %saw_function_definition = ();
1088 # variables used to track depths of various containers
1089 # and report nesting errors
1092 $square_bracket_depth = 0;
1093 @current_depth[ 0 .. $#closing_brace_names ] =
1094 (0) x scalar @closing_brace_names;
1097 @nesting_sequence_number[ 0 .. $#closing_brace_names ] =
1098 ( 0 .. $#closing_brace_names );
1099 @current_sequence_number = ();
1100 $paren_type[$paren_depth] = '';
1101 $paren_semicolon_count[$paren_depth] = 0;
1102 $paren_structural_type[$brace_depth] = '';
1103 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1104 $brace_structural_type[$brace_depth] = '';
1105 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1106 $brace_package[$paren_depth] = $current_package;
1107 $square_bracket_type[$square_bracket_depth] = '';
1108 $square_bracket_structural_type[$square_bracket_depth] = '';
1110 initialize_tokenizer_state();
1114 { # begin tokenize_this_line
1116 use constant BRACE => 0;
1117 use constant SQUARE_BRACKET => 1;
1118 use constant PAREN => 2;
1119 use constant QUESTION_COLON => 3;
1121 # TV1: scalars for processing one LINE.
1122 # Re-initialized on each entry to sub tokenize_this_line.
1124 $block_type, $container_type, $expecting,
1125 $i, $i_tok, $input_line,
1126 $input_line_number, $last_nonblank_i, $max_token_index,
1127 $next_tok, $next_type, $peeked_ahead,
1128 $prototype, $rhere_target_list, $rtoken_map,
1129 $rtoken_type, $rtokens, $tok,
1130 $type, $type_sequence, $indent_flag,
1133 # TV2: refs to ARRAYS for processing one LINE
1134 # Re-initialized on each call.
1135 my $routput_token_list = []; # stack of output token indexes
1136 my $routput_token_type = []; # token types
1137 my $routput_block_type = []; # types of code block
1138 my $routput_container_type = []; # paren types, such as if, elsif, ..
1139 my $routput_type_sequence = []; # nesting sequential number
1140 my $routput_indent_flag = []; #
1142 # TV3: SCALARS for quote variables. These are initialized with a
1143 # subroutine call and continually updated as lines are processed.
1144 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1145 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1147 # TV4: SCALARS for multi-line identifiers and
1148 # statements. These are initialized with a subroutine call
1149 # and continually updated as lines are processed.
1150 my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1152 # TV5: SCALARS for tracking indentation level.
1153 # Initialized once and continually updated as lines are
1156 $nesting_token_string, $nesting_type_string,
1157 $nesting_block_string, $nesting_block_flag,
1158 $nesting_list_string, $nesting_list_flag,
1159 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1160 $in_statement_continuation, $level_in_tokenizer,
1161 $slevel_in_tokenizer, $rslevel_stack,
1164 # TV6: SCALARS for remembering several previous
1165 # tokens. Initialized once and continually updated as
1166 # lines are processed.
1168 $last_nonblank_container_type, $last_nonblank_type_sequence,
1169 $last_last_nonblank_token, $last_last_nonblank_type,
1170 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1171 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1174 # ----------------------------------------------------------------
1175 # beginning of tokenizer variable access and manipulation routines
1176 # ----------------------------------------------------------------
1178 sub initialize_tokenizer_state {
1180 # TV1: initialized on each call
1181 # TV2: initialized on each call
1185 $quote_character = "";
1188 $quoted_string_1 = "";
1189 $quoted_string_2 = "";
1190 $allowed_quote_modifiers = "";
1193 $id_scan_state = '';
1196 $indented_if_level = 0;
1199 $nesting_token_string = "";
1200 $nesting_type_string = "";
1201 $nesting_block_string = '1'; # initially in a block
1202 $nesting_block_flag = 1;
1203 $nesting_list_string = '0'; # initially not in a list
1204 $nesting_list_flag = 0; # initially not in a list
1205 $ci_string_in_tokenizer = "";
1206 $continuation_string_in_tokenizer = "0";
1207 $in_statement_continuation = 0;
1208 $level_in_tokenizer = 0;
1209 $slevel_in_tokenizer = 0;
1210 $rslevel_stack = [];
1213 $last_nonblank_container_type = '';
1214 $last_nonblank_type_sequence = '';
1215 $last_last_nonblank_token = ';';
1216 $last_last_nonblank_type = ';';
1217 $last_last_nonblank_block_type = '';
1218 $last_last_nonblank_container_type = '';
1219 $last_last_nonblank_type_sequence = '';
1220 $last_nonblank_prototype = "";
1224 sub save_tokenizer_state {
1227 $block_type, $container_type, $expecting,
1228 $i, $i_tok, $input_line,
1229 $input_line_number, $last_nonblank_i, $max_token_index,
1230 $next_tok, $next_type, $peeked_ahead,
1231 $prototype, $rhere_target_list, $rtoken_map,
1232 $rtoken_type, $rtokens, $tok,
1233 $type, $type_sequence, $indent_flag,
1237 $routput_token_list, $routput_token_type,
1238 $routput_block_type, $routput_container_type,
1239 $routput_type_sequence, $routput_indent_flag,
1243 $in_quote, $quote_type,
1244 $quote_character, $quote_pos,
1245 $quote_depth, $quoted_string_1,
1246 $quoted_string_2, $allowed_quote_modifiers,
1250 [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1253 $nesting_token_string, $nesting_type_string,
1254 $nesting_block_string, $nesting_block_flag,
1255 $nesting_list_string, $nesting_list_flag,
1256 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1257 $in_statement_continuation, $level_in_tokenizer,
1258 $slevel_in_tokenizer, $rslevel_stack,
1262 $last_nonblank_container_type,
1263 $last_nonblank_type_sequence,
1264 $last_last_nonblank_token,
1265 $last_last_nonblank_type,
1266 $last_last_nonblank_block_type,
1267 $last_last_nonblank_container_type,
1268 $last_last_nonblank_type_sequence,
1269 $last_nonblank_prototype,
1271 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1274 sub restore_tokenizer_state {
1276 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1278 $block_type, $container_type, $expecting,
1279 $i, $i_tok, $input_line,
1280 $input_line_number, $last_nonblank_i, $max_token_index,
1281 $next_tok, $next_type, $peeked_ahead,
1282 $prototype, $rhere_target_list, $rtoken_map,
1283 $rtoken_type, $rtokens, $tok,
1284 $type, $type_sequence, $indent_flag,
1288 $routput_token_list, $routput_token_type,
1289 $routput_block_type, $routput_container_type,
1290 $routput_type_sequence, $routput_type_sequence,
1294 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1295 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1298 ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1302 $nesting_token_string, $nesting_type_string,
1303 $nesting_block_string, $nesting_block_flag,
1304 $nesting_list_string, $nesting_list_flag,
1305 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1306 $in_statement_continuation, $level_in_tokenizer,
1307 $slevel_in_tokenizer, $rslevel_stack,
1311 $last_nonblank_container_type,
1312 $last_nonblank_type_sequence,
1313 $last_last_nonblank_token,
1314 $last_last_nonblank_type,
1315 $last_last_nonblank_block_type,
1316 $last_last_nonblank_container_type,
1317 $last_last_nonblank_type_sequence,
1318 $last_nonblank_prototype,
1323 sub get_indentation_level {
1325 # patch to avoid reporting error if indented if is not terminated
1326 if ($indented_if_level) { return $level_in_tokenizer - 1 }
1327 return $level_in_tokenizer;
1330 sub reset_indentation_level {
1331 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1332 push @{$rslevel_stack}, $slevel_in_tokenizer;
1338 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1339 return $peeked_ahead;
1342 # ------------------------------------------------------------
1343 # end of tokenizer variable access and manipulation routines
1344 # ------------------------------------------------------------
1346 # ------------------------------------------------------------
1347 # beginning of various scanner interface routines
1348 # ------------------------------------------------------------
1349 sub scan_replacement_text {
1351 # check for here-docs in replacement text invoked by
1352 # a substitution operator with executable modifier 'e'.
1357 # $rht = reference to any here-doc targets
1358 my ($replacement_text) = @_;
1361 return unless ( $replacement_text =~ /<</ );
1363 write_logfile_entry("scanning replacement text for here-doc targets\n");
1365 # save the logger object for error messages
1366 my $logger_object = $tokenizer_self->{_logger_object};
1368 # localize all package variables
1370 $tokenizer_self, $last_nonblank_token,
1371 $last_nonblank_type, $last_nonblank_block_type,
1372 $statement_type, $in_attribute_list,
1373 $current_package, $context,
1374 %is_constant, %is_user_function,
1375 %user_function_prototype, %is_block_function,
1376 %is_block_list_function, %saw_function_definition,
1377 $brace_depth, $paren_depth,
1378 $square_bracket_depth, @current_depth,
1379 @total_depth, $total_depth,
1380 @nesting_sequence_number, @current_sequence_number,
1381 @paren_type, @paren_semicolon_count,
1382 @paren_structural_type, @brace_type,
1383 @brace_structural_type, @brace_context,
1384 @brace_package, @square_bracket_type,
1385 @square_bracket_structural_type, @depth_array,
1386 @starting_line_of_current_depth, @nested_ternary_flag,
1387 @nested_statement_type,
1390 # save all lexical variables
1391 my $rstate = save_tokenizer_state();
1392 _decrement_count(); # avoid error check for multiple tokenizers
1394 # make a new tokenizer
1396 my $rpending_logfile_message;
1398 Perl::Tidy::LineSource->new( \$replacement_text, $rOpts,
1399 $rpending_logfile_message );
1400 my $tokenizer = Perl::Tidy::Tokenizer->new(
1401 source_object => $source_object,
1402 logger_object => $logger_object,
1403 starting_line_number => $input_line_number,
1406 # scan the replacement text
1407 1 while ( $tokenizer->get_line() );
1409 # remove any here doc targets
1411 if ( $tokenizer_self->{_in_here_doc} ) {
1415 $tokenizer_self->{_here_doc_target},
1416 $tokenizer_self->{_here_quote_character}
1418 if ( $tokenizer_self->{_rhere_target_list} ) {
1419 push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} };
1420 $tokenizer_self->{_rhere_target_list} = undef;
1422 $tokenizer_self->{_in_here_doc} = undef;
1425 # now its safe to report errors
1426 my $severe_error = $tokenizer->report_tokenization_errors();
1428 # TODO: Could propagate a severe error up
1430 # restore all tokenizer lexical variables
1431 restore_tokenizer_state($rstate);
1433 # return the here doc targets
1437 sub scan_bare_identifier {
1438 ( $i, $tok, $type, $prototype ) =
1439 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1440 $rtoken_map, $max_token_index );
1444 sub scan_identifier {
1445 ( $i, $tok, $type, $id_scan_state, $identifier ) =
1446 scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
1447 $max_token_index, $expecting, $paren_type[$paren_depth] );
1452 ( $i, $tok, $type, $id_scan_state ) =
1453 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
1454 $id_scan_state, $max_token_index );
1460 ( $i, $type, $number ) =
1461 scan_number_do( $input_line, $i, $rtoken_map, $type,
1466 # a sub to warn if token found where term expected
1467 sub error_if_expecting_TERM {
1468 if ( $expecting == TERM ) {
1469 if ( $really_want_term{$last_nonblank_type} ) {
1470 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
1471 $rtoken_map, $rtoken_type, $input_line );
1478 # a sub to warn if token found where operator expected
1479 sub error_if_expecting_OPERATOR {
1481 if ( $expecting == OPERATOR ) {
1482 if ( !defined($thing) ) { $thing = $tok }
1483 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
1484 $rtoken_map, $rtoken_type, $input_line );
1485 if ( $i_tok == 0 ) {
1486 interrupt_logfile();
1487 warning("Missing ';' above?\n");
1495 # ------------------------------------------------------------
1496 # end scanner interfaces
1497 # ------------------------------------------------------------
1500 @_ = qw(for foreach);
1501 @is_for_foreach{@_} = (1) x scalar(@_);
1505 @is_my_our{@_} = (1) x scalar(@_);
1507 # These keywords may introduce blocks after parenthesized expressions,
1509 # keyword ( .... ) { BLOCK }
1510 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1511 my %is_blocktype_with_paren;
1513 qw(if elsif unless while until for foreach switch case given when catch);
1514 @is_blocktype_with_paren{@_} = (1) x scalar(@_);
1516 # ------------------------------------------------------------
1517 # begin hash of code for handling most token types
1518 # ------------------------------------------------------------
1519 my $tokenization_code = {
1521 # no special code for these types yet, but syntax checks
1556 error_if_expecting_TERM()
1557 if ( $expecting == TERM );
1560 error_if_expecting_TERM()
1561 if ( $expecting == TERM );
1565 # start looking for a scalar
1566 error_if_expecting_OPERATOR("Scalar")
1567 if ( $expecting == OPERATOR );
1570 if ( $identifier eq '$^W' ) {
1571 $tokenizer_self->{_saw_perl_dash_w} = 1;
1574 # Check for identifier in indirect object slot
1575 # (vorboard.pl, sort.t). Something like:
1576 # /^(print|printf|sort|exec|system)$/
1578 $is_indirect_object_taker{$last_nonblank_token}
1580 || ( ( $last_nonblank_token eq '(' )
1581 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
1582 || ( $last_nonblank_type =~ /^[Uw]$/ ) # possible object
1591 $paren_semicolon_count[$paren_depth] = 0;
1593 $container_type = $want_paren;
1596 elsif ( $statement_type =~ /^sub\b/ ) {
1597 $container_type = $statement_type;
1600 $container_type = $last_nonblank_token;
1602 # We can check for a syntax error here of unexpected '(',
1603 # but this is going to get messy...
1605 $expecting == OPERATOR
1607 # be sure this is not a method call of the form
1608 # &method(...), $method->(..), &{method}(...),
1609 # $ref[2](list) is ok & short for $ref[2]->(list)
1610 # NOTE: at present, braces in something like &{ xxx }
1611 # are not marked as a block, we might have a method call
1612 && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/
1617 # ref: camel 3 p 703.
1618 if ( $last_last_nonblank_token eq 'do' ) {
1620 "do SUBROUTINE is deprecated; consider & or -> notation\n"
1625 # if this is an empty list, (), then it is not an
1626 # error; for example, we might have a constant pi and
1627 # invoke it with pi() or just pi;
1628 my ( $next_nonblank_token, $i_next ) =
1629 find_next_nonblank_token( $i, $rtokens,
1631 if ( $next_nonblank_token ne ')' ) {
1633 error_if_expecting_OPERATOR('(');
1635 if ( $last_nonblank_type eq 'C' ) {
1637 "$last_nonblank_token has a void prototype\n";
1639 elsif ( $last_nonblank_type eq 'i' ) {
1641 && $last_nonblank_token =~ /^\$/ )
1644 "Do you mean '$last_nonblank_token->(' ?\n";
1648 interrupt_logfile();
1652 } ## end if ( $next_nonblank_token...
1653 } ## end else [ if ( $last_last_nonblank_token...
1654 } ## end if ( $expecting == OPERATOR...
1656 $paren_type[$paren_depth] = $container_type;
1657 ( $type_sequence, $indent_flag ) =
1658 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1660 # propagate types down through nested parens
1661 # for example: the second paren in 'if ((' would be structural
1662 # since the first is.
1664 if ( $last_nonblank_token eq '(' ) {
1665 $type = $last_nonblank_type;
1668 # We exclude parens as structural after a ',' because it
1669 # causes subtle problems with continuation indentation for
1670 # something like this, where the first 'or' will not get
1675 # ( not defined $check )
1677 # or $check eq "new"
1678 # or $check eq "old",
1681 # Likewise, we exclude parens where a statement can start
1682 # because of problems with continuation indentation, like
1685 # ($firstline =~ /^#\!.*perl/)
1686 # and (print $File::Find::name, "\n")
1689 # (ref($usage_fref) =~ /CODE/)
1691 # : (&blast_usage, &blast_params, &blast_general_params);
1697 if ( $last_nonblank_type eq ')' ) {
1699 "Syntax error? found token '$last_nonblank_type' then '('\n"
1702 $paren_structural_type[$paren_depth] = $type;
1706 ( $type_sequence, $indent_flag ) =
1707 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1709 if ( $paren_structural_type[$paren_depth] eq '{' ) {
1713 $container_type = $paren_type[$paren_depth];
1715 # restore statement type as 'sub' at closing paren of a signature
1716 # so that a subsequent ':' is identified as an attribute
1717 if ( $container_type =~ /^sub\b/ ) {
1718 $statement_type = $container_type;
1722 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
1723 my $num_sc = $paren_semicolon_count[$paren_depth];
1724 if ( $num_sc > 0 && $num_sc != 2 ) {
1725 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
1729 if ( $paren_depth > 0 ) { $paren_depth-- }
1732 if ( $last_nonblank_type eq ',' ) {
1733 complain("Repeated ','s \n");
1736 # patch for operator_expected: note if we are in the list (use.t)
1737 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
1738 ## FIXME: need to move this elsewhere, perhaps check after a '('
1739 ## elsif ($last_nonblank_token eq '(') {
1740 ## warning("Leading ','s illegal in some versions of perl\n");
1744 $context = UNKNOWN_CONTEXT;
1745 $statement_type = '';
1749 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
1750 { # mark ; in for loop
1752 # Be careful: we do not want a semicolon such as the
1753 # following to be included:
1755 # for (sort {strcoll($a,$b);} keys %investments) {
1757 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
1758 && $square_bracket_depth ==
1759 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
1763 $paren_semicolon_count[$paren_depth]++;
1769 error_if_expecting_OPERATOR("String")
1770 if ( $expecting == OPERATOR );
1773 $allowed_quote_modifiers = "";
1776 error_if_expecting_OPERATOR("String")
1777 if ( $expecting == OPERATOR );
1780 $allowed_quote_modifiers = "";
1783 error_if_expecting_OPERATOR("String")
1784 if ( $expecting == OPERATOR );
1787 $allowed_quote_modifiers = "";
1792 # a pattern cannot follow certain keywords which take optional
1793 # arguments, like 'shift' and 'pop'. See also '?'.
1794 if ( $last_nonblank_type eq 'k'
1795 && $is_keyword_taking_optional_args{$last_nonblank_token} )
1799 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
1801 ( $is_pattern, $msg ) =
1802 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
1806 write_diagnostics("DIVIDE:$msg\n");
1807 write_logfile_entry($msg);
1810 else { $is_pattern = ( $expecting == TERM ) }
1815 $allowed_quote_modifiers = '[msixpodualngc]';
1817 else { # not a pattern; check for a /= token
1819 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
1825 #DEBUG - collecting info on what tokens follow a divide
1826 # for development of guessing algorithm
1827 #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
1828 # #write_diagnostics( "DIVIDE? $input_line\n" );
1834 # if we just saw a ')', we will label this block with
1835 # its type. We need to do this to allow sub
1836 # code_block_type to determine if this brace starts a
1837 # code block or anonymous hash. (The type of a paren
1838 # pair is the preceding token, such as 'if', 'else',
1840 $container_type = "";
1842 # ATTRS: for a '{' following an attribute list, reset
1843 # things to look like we just saw the sub name
1844 if ( $statement_type =~ /^sub/ ) {
1845 $last_nonblank_token = $statement_type;
1846 $last_nonblank_type = 'i';
1847 $statement_type = "";
1850 # patch for SWITCH/CASE: hide these keywords from an immediately
1851 # following opening brace
1852 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
1853 && $statement_type eq $last_nonblank_token )
1855 $last_nonblank_token = ";";
1858 elsif ( $last_nonblank_token eq ')' ) {
1859 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
1861 # defensive move in case of a nesting error (pbug.t)
1862 # in which this ')' had no previous '('
1863 # this nesting error will have been caught
1864 if ( !defined($last_nonblank_token) ) {
1865 $last_nonblank_token = 'if';
1868 # check for syntax error here;
1869 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
1870 if ( $tokenizer_self->{'_extended_syntax'} ) {
1872 # we append a trailing () to mark this as an unknown
1873 # block type. This allows perltidy to format some
1874 # common extensions of perl syntax.
1875 # This is used by sub code_block_type
1876 $last_nonblank_token .= '()';
1880 join( ' ', sort keys %is_blocktype_with_paren );
1882 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
1888 # patch for paren-less for/foreach glitch, part 2.
1889 # see note below under 'qw'
1890 elsif ($last_nonblank_token eq 'qw'
1891 && $is_for_foreach{$want_paren} )
1893 $last_nonblank_token = $want_paren;
1894 if ( $last_last_nonblank_token eq $want_paren ) {
1896 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
1903 # now identify which of the three possible types of
1904 # curly braces we have: hash index container, anonymous
1905 # hash reference, or code block.
1907 # non-structural (hash index) curly brace pair
1908 # get marked 'L' and 'R'
1909 if ( is_non_structural_brace() ) {
1912 # patch for SWITCH/CASE:
1913 # allow paren-less identifier after 'when'
1914 # if the brace is preceded by a space
1915 if ( $statement_type eq 'when'
1916 && $last_nonblank_type eq 'i'
1917 && $last_last_nonblank_type eq 'k'
1918 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
1921 $block_type = $statement_type;
1925 # code and anonymous hash have the same type, '{', but are
1926 # distinguished by 'block_type',
1927 # which will be blank for an anonymous hash
1930 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
1933 # patch to promote bareword type to function taking block
1935 && $last_nonblank_type eq 'w'
1936 && $last_nonblank_i >= 0 )
1938 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
1939 $routput_token_type->[$last_nonblank_i] = 'G';
1943 # patch for SWITCH/CASE: if we find a stray opening block brace
1944 # where we might accept a 'case' or 'when' block, then take it
1945 if ( $statement_type eq 'case'
1946 || $statement_type eq 'when' )
1948 if ( !$block_type || $block_type eq '}' ) {
1949 $block_type = $statement_type;
1954 $brace_type[ ++$brace_depth ] = $block_type;
1955 $brace_package[$brace_depth] = $current_package;
1956 $brace_structural_type[$brace_depth] = $type;
1957 $brace_context[$brace_depth] = $context;
1958 ( $type_sequence, $indent_flag ) =
1959 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
1962 $block_type = $brace_type[$brace_depth];
1963 if ($block_type) { $statement_type = '' }
1964 if ( defined( $brace_package[$brace_depth] ) ) {
1965 $current_package = $brace_package[$brace_depth];
1968 # can happen on brace error (caught elsewhere)
1971 ( $type_sequence, $indent_flag ) =
1972 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
1974 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
1978 # propagate type information for 'do' and 'eval' blocks, and also
1979 # for smartmatch operator. This is necessary to enable us to know
1980 # if an operator or term is expected next.
1981 if ( $is_block_operator{$block_type} ) {
1985 $context = $brace_context[$brace_depth];
1986 if ( $brace_depth > 0 ) { $brace_depth--; }
1988 '&' => sub { # maybe sub call? start looking
1990 # We have to check for sub call unless we are sure we
1991 # are expecting an operator. This example from s2p
1992 # got mistaken as a q operator in an early version:
1993 # print BODY &q(<<'EOT');
1994 if ( $expecting != OPERATOR ) {
1996 # But only look for a sub call if we are expecting a term or
1997 # if there is no existing space after the &.
1998 # For example we probably don't want & as sub call here:
1999 # Fcntl::S_IRUSR & $mode;
2000 if ( $expecting == TERM || $next_type ne 'b' ) {
2007 '<' => sub { # angle operator or less than?
2009 if ( $expecting != OPERATOR ) {
2011 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2012 $expecting, $max_token_index );
2014 if ( $type eq '<' && $expecting == TERM ) {
2015 error_if_expecting_TERM();
2016 interrupt_logfile();
2017 warning("Unterminated <> operator?\n");
2024 '?' => sub { # ?: conditional or starting pattern?
2028 # Patch for rt #126965
2029 # a pattern cannot follow certain keywords which take optional
2030 # arguments, like 'shift' and 'pop'. See also '/'.
2031 if ( $last_nonblank_type eq 'k'
2032 && $is_keyword_taking_optional_args{$last_nonblank_token} )
2036 elsif ( $expecting == UNKNOWN ) {
2039 ( $is_pattern, $msg ) =
2040 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2043 if ($msg) { write_logfile_entry($msg) }
2045 else { $is_pattern = ( $expecting == TERM ) }
2050 $allowed_quote_modifiers = '[msixpodualngc]';
2053 ( $type_sequence, $indent_flag ) =
2054 increase_nesting_depth( QUESTION_COLON,
2055 $rtoken_map->[$i_tok] );
2058 '*' => sub { # typeglob, or multiply?
2060 if ( $expecting == TERM ) {
2065 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2070 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2074 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2082 '.' => sub { # what kind of . ?
2084 if ( $expecting != OPERATOR ) {
2086 if ( $type eq '.' ) {
2087 error_if_expecting_TERM()
2088 if ( $expecting == TERM );
2096 # if this is the first nonblank character, call it a label
2097 # since perl seems to just swallow it
2098 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2102 # ATTRS: check for a ':' which introduces an attribute list
2103 # (this might eventually get its own token type)
2104 elsif ( $statement_type =~ /^sub\b/ ) {
2106 $in_attribute_list = 1;
2109 # check for scalar attribute, such as
2110 # my $foo : shared = 1;
2111 elsif ($is_my_our{$statement_type}
2112 && $current_depth[QUESTION_COLON] == 0 )
2115 $in_attribute_list = 1;
2118 # otherwise, it should be part of a ?/: operator
2120 ( $type_sequence, $indent_flag ) =
2121 decrease_nesting_depth( QUESTION_COLON,
2122 $rtoken_map->[$i_tok] );
2123 if ( $last_nonblank_token eq '?' ) {
2124 warning("Syntax error near ? :\n");
2128 '+' => sub { # what kind of plus?
2130 if ( $expecting == TERM ) {
2131 my $number = scan_number();
2133 # unary plus is safest assumption if not a number
2134 if ( !defined($number) ) { $type = 'p'; }
2136 elsif ( $expecting == OPERATOR ) {
2139 if ( $next_type eq 'w' ) { $type = 'p' }
2144 error_if_expecting_OPERATOR("Array")
2145 if ( $expecting == OPERATOR );
2148 '%' => sub { # hash or modulo?
2150 # first guess is hash if no following blank
2151 if ( $expecting == UNKNOWN ) {
2152 if ( $next_type ne 'b' ) { $expecting = TERM }
2154 if ( $expecting == TERM ) {
2159 $square_bracket_type[ ++$square_bracket_depth ] =
2160 $last_nonblank_token;
2161 ( $type_sequence, $indent_flag ) =
2162 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2164 # It may seem odd, but structural square brackets have
2165 # type '{' and '}'. This simplifies the indentation logic.
2166 if ( !is_non_structural_brace() ) {
2169 $square_bracket_structural_type[$square_bracket_depth] = $type;
2172 ( $type_sequence, $indent_flag ) =
2173 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2175 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
2180 # propagate type information for smartmatch operator. This is
2181 # necessary to enable us to know if an operator or term is expected
2183 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
2184 $tok = $square_bracket_type[$square_bracket_depth];
2187 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
2189 '-' => sub { # what kind of minus?
2191 if ( ( $expecting != OPERATOR )
2192 && $is_file_test_operator{$next_tok} )
2194 my ( $next_nonblank_token, $i_next ) =
2195 find_next_nonblank_token( $i + 1, $rtokens,
2198 # check for a quoted word like "-w=>xx";
2199 # it is sufficient to just check for a following '='
2200 if ( $next_nonblank_token eq '=' ) {
2209 elsif ( $expecting == TERM ) {
2210 my $number = scan_number();
2212 # maybe part of bareword token? unary is safest
2213 if ( !defined($number) ) { $type = 'm'; }
2216 elsif ( $expecting == OPERATOR ) {
2220 if ( $next_type eq 'w' ) {
2228 # check for special variables like ${^WARNING_BITS}
2229 if ( $expecting == TERM ) {
2231 # FIXME: this should work but will not catch errors
2232 # because we also have to be sure that previous token is
2233 # a type character ($,@,%).
2234 if ( $last_nonblank_token eq '{'
2235 && ( $next_tok =~ /^[A-Za-z_]/ ) )
2238 if ( $next_tok eq 'W' ) {
2239 $tokenizer_self->{_saw_perl_dash_w} = 1;
2241 $tok = $tok . $next_tok;
2247 unless ( error_if_expecting_TERM() ) {
2249 # Something like this is valid but strange:
2251 complain("The '^' seems unusual here\n");
2257 '::' => sub { # probably a sub call
2258 scan_bare_identifier();
2260 '<<' => sub { # maybe a here-doc?
2262 unless ( $i < $max_token_index )
2263 ; # here-doc not possible if end of line
2265 if ( $expecting != OPERATOR ) {
2266 my ( $found_target, $here_doc_target, $here_quote_character,
2269 $found_target, $here_doc_target, $here_quote_character, $i,
2272 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2275 if ($found_target) {
2276 push @{$rhere_target_list},
2277 [ $here_doc_target, $here_quote_character ];
2279 if ( length($here_doc_target) > 80 ) {
2280 my $truncated = substr( $here_doc_target, 0, 80 );
2281 complain("Long here-target: '$truncated' ...\n");
2283 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2285 "Unconventional here-target: '$here_doc_target'\n");
2288 elsif ( $expecting == TERM ) {
2289 unless ($saw_error) {
2291 # shouldn't happen..
2292 warning("Program bug; didn't find here doc target\n");
2293 report_definite_bug();
2300 '<<~' => sub { # a here-doc, new type added in v26
2302 unless ( $i < $max_token_index )
2303 ; # here-doc not possible if end of line
2304 if ( $expecting != OPERATOR ) {
2305 my ( $found_target, $here_doc_target, $here_quote_character,
2308 $found_target, $here_doc_target, $here_quote_character, $i,
2311 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2314 if ($found_target) {
2316 if ( length($here_doc_target) > 80 ) {
2317 my $truncated = substr( $here_doc_target, 0, 80 );
2318 complain("Long here-target: '$truncated' ...\n");
2320 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2322 "Unconventional here-target: '$here_doc_target'\n");
2325 # Note that we put a leading space on the here quote
2326 # character indicate that it may be preceded by spaces
2327 $here_quote_character = " " . $here_quote_character;
2328 push @{$rhere_target_list},
2329 [ $here_doc_target, $here_quote_character ];
2332 elsif ( $expecting == TERM ) {
2333 unless ($saw_error) {
2335 # shouldn't happen..
2336 warning("Program bug; didn't find here doc target\n");
2337 report_definite_bug();
2346 # if -> points to a bare word, we must scan for an identifier,
2347 # otherwise something like ->y would look like the y operator
2351 # type = 'pp' for pre-increment, '++' for post-increment
2353 if ( $expecting == TERM ) { $type = 'pp' }
2354 elsif ( $expecting == UNKNOWN ) {
2355 my ( $next_nonblank_token, $i_next ) =
2356 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2357 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
2362 if ( $last_nonblank_type eq $tok ) {
2363 complain("Repeated '=>'s \n");
2366 # patch for operator_expected: note if we are in the list (use.t)
2367 # TODO: make version numbers a new token type
2368 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2371 # type = 'mm' for pre-decrement, '--' for post-decrement
2374 if ( $expecting == TERM ) { $type = 'mm' }
2375 elsif ( $expecting == UNKNOWN ) {
2376 my ( $next_nonblank_token, $i_next ) =
2377 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2378 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
2383 error_if_expecting_TERM()
2384 if ( $expecting == TERM );
2388 error_if_expecting_TERM()
2389 if ( $expecting == TERM );
2393 error_if_expecting_TERM()
2394 if ( $expecting == TERM );
2398 # ------------------------------------------------------------
2399 # end hash of code for handling individual token types
2400 # ------------------------------------------------------------
2402 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
2404 # These block types terminate statements and do not need a trailing
2406 # patched for SWITCH/CASE/
2407 my %is_zero_continuation_block_type;
2408 @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
2409 if elsif else unless while until for foreach switch case given when);
2410 @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
2412 my %is_not_zero_continuation_block_type;
2413 @_ = qw(sort grep map do eval);
2414 @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
2416 my %is_logical_container;
2417 @_ = qw(if elsif unless while and or err not && ! || for foreach);
2418 @is_logical_container{@_} = (1) x scalar(@_);
2422 @is_binary_type{@_} = (1) x scalar(@_);
2424 my %is_binary_keyword;
2425 @_ = qw(and or err eq ne cmp);
2426 @is_binary_keyword{@_} = (1) x scalar(@_);
2428 # 'L' is token for opening { at hash key
2429 my %is_opening_type;
2431 @is_opening_type{@_} = (1) x scalar(@_);
2433 # 'R' is token for closing } at hash key
2434 my %is_closing_type;
2436 @is_closing_type{@_} = (1) x scalar(@_);
2438 my %is_redo_last_next_goto;
2439 @_ = qw(redo last next goto);
2440 @is_redo_last_next_goto{@_} = (1) x scalar(@_);
2443 @_ = qw(use require);
2444 @is_use_require{@_} = (1) x scalar(@_);
2447 @_ = qw(sub package);
2448 @is_sub_package{@_} = (1) x scalar(@_);
2450 # This hash holds the hash key in $tokenizer_self for these keywords:
2451 my %is_format_END_DATA = (
2452 'format' => '_in_format',
2453 '__END__' => '_in_end',
2454 '__DATA__' => '_in_data',
2457 # original ref: camel 3 p 147,
2458 # but perl may accept undocumented flags
2459 # perl 5.10 adds 'p' (preserve)
2460 # Perl version 5.22 added 'n'
2461 # From http://perldoc.perl.org/perlop.html we have
2462 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2463 # s/PATTERN/REPLACEMENT/msixpodualngcer
2464 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2465 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2466 # qr/STRING/msixpodualn
2467 my %quote_modifiers = (
2468 's' => '[msixpodualngcer]',
2471 'm' => '[msixpodualngc]',
2472 'qr' => '[msixpodualn]',
2479 # table showing how many quoted things to look for after quote operator..
2480 # s, y, tr have 2 (pattern and replacement)
2481 # others have 1 (pattern only)
2494 sub tokenize_this_line {
2496 # This routine breaks a line of perl code into tokens which are of use in
2497 # indentation and reformatting. One of my goals has been to define tokens
2498 # such that a newline may be inserted between any pair of tokens without
2499 # changing or invalidating the program. This version comes close to this,
2500 # although there are necessarily a few exceptions which must be caught by
2501 # the formatter. Many of these involve the treatment of bare words.
2503 # The tokens and their types are returned in arrays. See previous
2504 # routine for their names.
2506 # See also the array "valid_token_types" in the BEGIN section for an
2509 # To simplify things, token types are either a single character, or they
2510 # are identical to the tokens themselves.
2512 # As a debugging aid, the -D flag creates a file containing a side-by-side
2513 # comparison of the input string and its tokenization for each line of a file.
2514 # This is an invaluable debugging aid.
2516 # In addition to tokens, and some associated quantities, the tokenizer
2517 # also returns flags indication any special line types. These include
2518 # quotes, here_docs, formats.
2520 # -----------------------------------------------------------------------
2522 # How to add NEW_TOKENS:
2524 # New token types will undoubtedly be needed in the future both to keep up
2525 # with changes in perl and to help adapt the tokenizer to other applications.
2527 # Here are some notes on the minimal steps. I wrote these notes while
2528 # adding the 'v' token type for v-strings, which are things like version
2529 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
2530 # can use your editor to search for the string "NEW_TOKENS" to find the
2531 # appropriate sections to change):
2533 # *. Try to talk somebody else into doing it! If not, ..
2535 # *. Make a backup of your current version in case things don't work out!
2537 # *. Think of a new, unused character for the token type, and add to
2538 # the array @valid_token_types in the BEGIN section of this package.
2539 # For example, I used 'v' for v-strings.
2541 # *. Implement coding to recognize the $type of the token in this routine.
2542 # This is the hardest part, and is best done by imitating or modifying
2543 # some of the existing coding. For example, to recognize v-strings, I
2544 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
2545 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
2547 # *. Update sub operator_expected. This update is critically important but
2548 # the coding is trivial. Look at the comments in that routine for help.
2549 # For v-strings, which should behave like numbers, I just added 'v' to the
2550 # regex used to handle numbers and strings (types 'n' and 'Q').
2552 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
2553 # Perl::Tidy::Formatter for breaking lines around this token type. You can
2554 # skip this step and take the default at first, then adjust later to get
2555 # desired results. For adding type 'v', I looked at sub bond_strength and
2556 # saw that number type 'n' was using default strengths, so I didn't do
2557 # anything. I may tune it up someday if I don't like the way line
2558 # breaks with v-strings look.
2560 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
2561 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
2562 # and saw that type 'n' used spaces on both sides, so I just added 'v'
2563 # to the array @spaces_both_sides.
2565 # *. Update HtmlWriter package so that users can colorize the token as
2566 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
2567 # that package. For v-strings, I initially chose to use a default color
2568 # equal to the default for numbers, but it might be nice to change that
2571 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
2573 # *. Run lots and lots of debug tests. Start with special files designed
2574 # to test the new token type. Run with the -D flag to create a .DEBUG
2575 # file which shows the tokenization. When these work ok, test as many old
2576 # scripts as possible. Start with all of the '.t' files in the 'test'
2577 # directory of the distribution file. Compare .tdy output with previous
2578 # version and updated version to see the differences. Then include as
2579 # many more files as possible. My own technique has been to collect a huge
2580 # number of perl scripts (thousands!) into one directory and run perltidy
2581 # *, then run diff between the output of the previous version and the
2584 # *. For another example, search for the smartmatch operator '~~'
2585 # with your editor to see where updates were made for it.
2587 # -----------------------------------------------------------------------
2589 my $line_of_tokens = shift;
2590 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
2592 # patch while coding change is underway
2593 # make callers private data to allow access
2594 # $tokenizer_self = $caller_tokenizer_self;
2596 # extract line number for use in error messages
2597 $input_line_number = $line_of_tokens->{_line_number};
2599 # reinitialize for multi-line quote
2600 $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
2602 # check for pod documentation
2603 if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
2605 # must not be in multi-line quote
2606 # and must not be in an equation
2607 if ( !$in_quote && ( operator_expected( 'b', '=', 'b' ) == TERM ) )
2609 $tokenizer_self->{_in_pod} = 1;
2614 $input_line = $untrimmed_input_line;
2618 # trim start of this line unless we are continuing a quoted line
2619 # do not trim end because we might end in a quote (test: deken4.pl)
2620 # Perl::Tidy::Formatter will delete needless trailing blanks
2621 unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
2622 $input_line =~ s/^\s*//; # trim left end
2625 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
2626 # This will be used below to avoid quoting a bare word followed by
2628 my $is_END_or_DATA = $input_line =~ /^\s*__(END|DATA)__\s*$/;
2630 # update the copy of the line for use in error messages
2631 # This must be exactly what we give the pre_tokenizer
2632 $tokenizer_self->{_line_text} = $input_line;
2634 # re-initialize for the main loop
2635 $routput_token_list = []; # stack of output token indexes
2636 $routput_token_type = []; # token types
2637 $routput_block_type = []; # types of code block
2638 $routput_container_type = []; # paren types, such as if, elsif, ..
2639 $routput_type_sequence = []; # nesting sequential number
2641 $rhere_target_list = [];
2643 $tok = $last_nonblank_token;
2644 $type = $last_nonblank_type;
2645 $prototype = $last_nonblank_prototype;
2646 $last_nonblank_i = -1;
2647 $block_type = $last_nonblank_block_type;
2648 $container_type = $last_nonblank_container_type;
2649 $type_sequence = $last_nonblank_type_sequence;
2653 # tokenization is done in two stages..
2654 # stage 1 is a very simple pre-tokenization
2655 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
2657 # a little optimization for a full-line comment
2658 if ( !$in_quote && ( $input_line =~ /^#/ ) ) {
2659 $max_tokens_wanted = 1 # no use tokenizing a comment
2662 # start by breaking the line into pre-tokens
2663 ( $rtokens, $rtoken_map, $rtoken_type ) =
2664 pre_tokenize( $input_line, $max_tokens_wanted );
2666 $max_token_index = scalar( @{$rtokens} ) - 1;
2667 push( @{$rtokens}, ' ', ' ', ' ' ); # extra whitespace simplifies logic
2668 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
2669 push( @{$rtoken_type}, 'b', 'b', 'b' );
2671 # initialize for main loop
2672 foreach my $ii ( 0 .. $max_token_index + 3 ) {
2673 $routput_token_type->[$ii] = "";
2674 $routput_block_type->[$ii] = "";
2675 $routput_container_type->[$ii] = "";
2676 $routput_type_sequence->[$ii] = "";
2677 $routput_indent_flag->[$ii] = 0;
2682 # ------------------------------------------------------------
2683 # begin main tokenization loop
2684 # ------------------------------------------------------------
2686 # we are looking at each pre-token of one line and combining them
2688 while ( ++$i <= $max_token_index ) {
2690 if ($in_quote) { # continue looking for end of a quote
2691 $type = $quote_type;
2693 unless ( @{$routput_token_list} )
2694 { # initialize if continuation line
2695 push( @{$routput_token_list}, $i );
2696 $routput_token_type->[$i] = $type;
2699 $tok = $quote_character unless ( $quote_character =~ /^\s*$/ );
2701 # scan for the end of the quote or pattern
2703 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
2704 $quoted_string_1, $quoted_string_2
2707 $i, $in_quote, $quote_character,
2708 $quote_pos, $quote_depth, $quoted_string_1,
2709 $quoted_string_2, $rtokens, $rtoken_map,
2713 # all done if we didn't find it
2714 last if ($in_quote);
2716 # save pattern and replacement text for rescanning
2717 my $qs1 = $quoted_string_1;
2718 my $qs2 = $quoted_string_2;
2720 # re-initialize for next search
2721 $quote_character = '';
2724 $quoted_string_1 = "";
2725 $quoted_string_2 = "";
2726 last if ( ++$i > $max_token_index );
2728 # look for any modifiers
2729 if ($allowed_quote_modifiers) {
2731 # check for exact quote modifiers
2732 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
2733 my $str = $rtokens->[$i];
2735 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
2736 my $pos = pos($str);
2737 my $char = substr( $str, $pos - 1, 1 );
2738 $saw_modifier_e ||= ( $char eq 'e' );
2741 # For an 'e' quote modifier we must scan the replacement
2742 # text for here-doc targets.
2743 if ($saw_modifier_e) {
2745 my $rht = scan_replacement_text($qs1);
2747 # Change type from 'Q' to 'h' for quotes with
2748 # here-doc targets so that the formatter (see sub
2749 # print_line_of_tokens) will not make any line
2750 # breaks after this point.
2752 push @{$rhere_target_list}, @{$rht};
2755 my $ilast = $routput_token_list->[-1];
2756 $routput_token_type->[$ilast] = $type;
2761 if ( defined( pos($str) ) ) {
2764 if ( pos($str) == length($str) ) {
2765 last if ( ++$i > $max_token_index );
2768 # Looks like a joined quote modifier
2769 # and keyword, maybe something like
2770 # s/xxx/yyy/gefor @k=...
2771 # Example is "galgen.pl". Would have to split
2772 # the word and insert a new token in the
2773 # pre-token list. This is so rare that I haven't
2774 # done it. Will just issue a warning citation.
2776 # This error might also be triggered if my quote
2777 # modifier characters are incomplete
2781 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
2782 Please put a space between quote modifiers and trailing keywords.
2785 # print "token $rtokens->[$i]\n";
2786 # my $num = length($str) - pos($str);
2787 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
2788 # print "continuing with new token $rtokens->[$i]\n";
2790 # skipping past this token does least damage
2791 last if ( ++$i > $max_token_index );
2796 # example file: rokicki4.pl
2797 # This error might also be triggered if my quote
2798 # modifier characters are incomplete
2799 write_logfile_entry(
2800 "Note: found word $str at quote modifier location\n"
2806 $allowed_quote_modifiers = "";
2810 unless ( $tok =~ /^\s*$/ || $tok eq 'CORE::' ) {
2812 # try to catch some common errors
2813 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
2815 if ( $last_nonblank_token eq 'eq' ) {
2816 complain("Should 'eq' be '==' here ?\n");
2818 elsif ( $last_nonblank_token eq 'ne' ) {
2819 complain("Should 'ne' be '!=' here ?\n");
2823 $last_last_nonblank_token = $last_nonblank_token;
2824 $last_last_nonblank_type = $last_nonblank_type;
2825 $last_last_nonblank_block_type = $last_nonblank_block_type;
2826 $last_last_nonblank_container_type =
2827 $last_nonblank_container_type;
2828 $last_last_nonblank_type_sequence =
2829 $last_nonblank_type_sequence;
2830 $last_nonblank_token = $tok;
2831 $last_nonblank_type = $type;
2832 $last_nonblank_prototype = $prototype;
2833 $last_nonblank_block_type = $block_type;
2834 $last_nonblank_container_type = $container_type;
2835 $last_nonblank_type_sequence = $type_sequence;
2836 $last_nonblank_i = $i_tok;
2839 # store previous token type
2840 if ( $i_tok >= 0 ) {
2841 $routput_token_type->[$i_tok] = $type;
2842 $routput_block_type->[$i_tok] = $block_type;
2843 $routput_container_type->[$i_tok] = $container_type;
2844 $routput_type_sequence->[$i_tok] = $type_sequence;
2845 $routput_indent_flag->[$i_tok] = $indent_flag;
2847 my $pre_tok = $rtokens->[$i]; # get the next pre-token
2848 my $pre_type = $rtoken_type->[$i]; # and type
2850 $type = $pre_type; # to be modified as necessary
2851 $block_type = ""; # blank for all tokens except code block braces
2852 $container_type = ""; # blank for all tokens except some parens
2853 $type_sequence = ""; # blank for all tokens except ?/:
2855 $prototype = ""; # blank for all tokens except user defined subs
2858 # this pre-token will start an output token
2859 push( @{$routput_token_list}, $i_tok );
2861 # continue gathering identifier if necessary
2862 # but do not start on blanks and comments
2863 if ( $id_scan_state && $pre_type !~ /[b#]/ ) {
2865 if ( $id_scan_state =~ /^(sub|package)/ ) {
2872 last if ($id_scan_state);
2873 next if ( ( $i > 0 ) || $type );
2875 # didn't find any token; start over
2880 # handle whitespace tokens..
2881 next if ( $type eq 'b' );
2882 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : ' ';
2883 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
2885 # Build larger tokens where possible, since we are not in a quote.
2887 # First try to assemble digraphs. The following tokens are
2888 # excluded and handled specially:
2889 # '/=' is excluded because the / might start a pattern.
2890 # 'x=' is excluded since it might be $x=, with $ on previous line
2891 # '**' and *= might be typeglobs of punctuation variables
2892 # I have allowed tokens starting with <, such as <=,
2893 # because I don't think these could be valid angle operators.
2894 # test file: storrs4.pl
2895 my $test_tok = $tok . $rtokens->[ $i + 1 ];
2896 my $combine_ok = $is_digraph{$test_tok};
2898 # check for special cases which cannot be combined
2901 # '//' must be defined_or operator if an operator is expected.
2902 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
2903 # could be migrated here for clarity
2905 # Patch for RT#102371, misparsing a // in the following snippet:
2906 # state $b //= ccc();
2907 # The solution is to always accept the digraph (or trigraph) after
2908 # token type 'Z' (possible file handle). The reason is that
2909 # sub operator_expected gives TERM expected here, which is
2910 # wrong in this case.
2911 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
2912 my $next_type = $rtokens->[ $i + 1 ];
2914 operator_expected( $prev_type, $tok, $next_type );
2916 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
2917 $combine_ok = 0 if ( $expecting == TERM );
2920 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
2921 # Accept the digraphs '**' only after type 'Z'
2922 # Otherwise postpone the decision.
2923 if ( $test_tok eq '**' ) {
2924 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
2931 && ( $test_tok ne '/=' ) # might be pattern
2932 && ( $test_tok ne 'x=' ) # might be $x
2933 && ( $test_tok ne '*=' ) # typeglob?
2935 # Moved above as part of fix for
2936 # RT #114359: Missparsing of "print $x ** 0.5;
2937 # && ( $test_tok ne '**' ) # typeglob?
2943 # Now try to assemble trigraphs. Note that all possible
2944 # perl trigraphs can be constructed by appending a character
2946 $test_tok = $tok . $rtokens->[ $i + 1 ];
2948 if ( $is_trigraph{$test_tok} ) {
2953 # The only current tetragraph is the double diamond operator
2954 # and its first three characters are not a trigraph, so
2955 # we do can do a special test for it
2956 elsif ( $test_tok eq '<<>' ) {
2957 $test_tok .= $rtokens->[ $i + 2 ];
2958 if ( $is_tetragraph{$test_tok} ) {
2966 $next_tok = $rtokens->[ $i + 1 ];
2967 $next_type = $rtoken_type->[ $i + 1 ];
2969 TOKENIZER_DEBUG_FLAG_TOKENIZE && do {
2972 $last_nonblank_token, $tok,
2973 $next_tok, $brace_depth,
2974 $brace_type[$brace_depth], $paren_depth,
2975 $paren_type[$paren_depth]
2977 print STDOUT "TOKENIZE:(@debug_list)\n";
2980 # turn off attribute list on first non-blank, non-bareword
2981 if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }
2983 ###############################################################
2984 # We have the next token, $tok.
2985 # Now we have to examine this token and decide what it is
2986 # and define its $type
2988 # section 1: bare words
2989 ###############################################################
2991 if ( $pre_type eq 'w' ) {
2992 $expecting = operator_expected( $prev_type, $tok, $next_type );
2993 my ( $next_nonblank_token, $i_next ) =
2994 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2996 # ATTRS: handle sub and variable attributes
2997 if ($in_attribute_list) {
2999 # treat bare word followed by open paren like qw(
3000 if ( $next_nonblank_token eq '(' ) {
3001 $in_quote = $quote_items{'q'};
3002 $allowed_quote_modifiers = $quote_modifiers{'q'};
3008 # handle bareword not followed by open paren
3015 # quote a word followed by => operator
3016 # unless the word __END__ or __DATA__ and the only word on
3018 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
3020 if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
3021 if ( $is_constant{$current_package}{$tok} ) {
3024 elsif ( $is_user_function{$current_package}{$tok} ) {
3027 $user_function_prototype{$current_package}{$tok};
3029 elsif ( $tok =~ /^v\d+$/ ) {
3031 report_v_string($tok);
3033 else { $type = 'w' }
3039 # quote a bare word within braces..like xxx->{s}; note that we
3040 # must be sure this is not a structural brace, to avoid
3041 # mistaking {s} in the following for a quoted bare word:
3042 # for(@[){s}bla}BLA}
3043 # Also treat q in something like var{-q} as a bare word, not qoute operator
3045 $next_nonblank_token eq '}'
3047 $last_nonblank_type eq 'L'
3048 || ( $last_nonblank_type eq 'm'
3049 && $last_last_nonblank_type eq 'L' )
3057 # a bare word immediately followed by :: is not a keyword;
3058 # use $tok_kw when testing for keywords to avoid a mistake
3060 if ( $rtokens->[ $i + 1 ] eq ':'
3061 && $rtokens->[ $i + 2 ] eq ':' )
3066 # handle operator x (now we know it isn't $x=)
3067 if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) {
3068 if ( $tok eq 'x' ) {
3070 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3080 # FIXME: Patch: mark something like x4 as an integer for now
3081 # It gets fixed downstream. This is easier than
3082 # splitting the pretoken.
3087 elsif ( $tok_kw eq 'CORE::' ) {
3088 $type = $tok = $tok_kw;
3091 elsif ( ( $tok eq 'strict' )
3092 and ( $last_nonblank_token eq 'use' ) )
3094 $tokenizer_self->{_saw_use_strict} = 1;
3095 scan_bare_identifier();
3098 elsif ( ( $tok eq 'warnings' )
3099 and ( $last_nonblank_token eq 'use' ) )
3101 $tokenizer_self->{_saw_perl_dash_w} = 1;
3103 # scan as identifier, so that we pick up something like:
3104 # use warnings::register
3105 scan_bare_identifier();
3109 $tok eq 'AutoLoader'
3110 && $tokenizer_self->{_look_for_autoloader}
3112 $last_nonblank_token eq 'use'
3114 # these regexes are from AutoSplit.pm, which we want
3116 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3117 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3121 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3122 $tokenizer_self->{_saw_autoloader} = 1;
3123 $tokenizer_self->{_look_for_autoloader} = 0;
3124 scan_bare_identifier();
3128 $tok eq 'SelfLoader'
3129 && $tokenizer_self->{_look_for_selfloader}
3130 && ( $last_nonblank_token eq 'use'
3131 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
3132 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
3135 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
3136 $tokenizer_self->{_saw_selfloader} = 1;
3137 $tokenizer_self->{_look_for_selfloader} = 0;
3138 scan_bare_identifier();
3141 elsif ( ( $tok eq 'constant' )
3142 and ( $last_nonblank_token eq 'use' ) )
3144 scan_bare_identifier();
3145 my ( $next_nonblank_token, $i_next ) =
3146 find_next_nonblank_token( $i, $rtokens,
3149 if ($next_nonblank_token) {
3151 if ( $is_keyword{$next_nonblank_token} ) {
3153 # Assume qw is used as a quote and okay, as in:
3154 # use constant qw{ DEBUG 0 };
3155 # Not worth trying to parse for just a warning
3157 # NOTE: This warning is deactivated because recent
3158 # versions of perl do not complain here, but
3159 # the coding is retained for reference.
3160 if ( 0 && $next_nonblank_token ne 'qw' ) {
3162 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
3167 # FIXME: could check for error in which next token is
3168 # not a word (number, punctuation, ..)
3170 $is_constant{$current_package}{$next_nonblank_token}
3176 # various quote operators
3177 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
3179 if ( $expecting == OPERATOR ) {
3181 # Be careful not to call an error for a qw quote
3182 # where a parenthesized list is allowed. For example,
3183 # it could also be a for/foreach construct such as
3185 # foreach my $key qw\Uno Due Tres Quadro\ {
3186 # print "Set $key\n";
3190 # Or it could be a function call.
3191 # NOTE: Braces in something like &{ xxx } are not
3192 # marked as a block, we might have a method call.
3193 # &method(...), $method->(..), &{method}(...),
3194 # $ref[2](list) is ok & short for $ref[2]->(list)
3196 # See notes in 'sub code_block_type' and
3197 # 'sub is_non_structural_brace'
3201 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3202 || $is_for_foreach{$want_paren} )
3205 error_if_expecting_OPERATOR();
3208 $in_quote = $quote_items{$tok};
3209 $allowed_quote_modifiers = $quote_modifiers{$tok};
3211 # All quote types are 'Q' except possibly qw quotes.
3212 # qw quotes are special in that they may generally be trimmed
3213 # of leading and trailing whitespace. So they are given a
3214 # separate type, 'q', unless requested otherwise.
3216 ( $tok eq 'qw' && $tokenizer_self->{_trim_qw} )
3219 $quote_type = $type;
3222 # check for a statement label
3224 ( $next_nonblank_token eq ':' )
3225 && ( $rtokens->[ $i_next + 1 ] ne ':' )
3226 && ( $i_next <= $max_token_index ) # colon on same line
3230 if ( $tok !~ /[A-Z]/ ) {
3231 push @{ $tokenizer_self->{_rlower_case_labels_at} },
3240 # 'sub' || 'package'
3241 elsif ( $is_sub_package{$tok_kw} ) {
3242 error_if_expecting_OPERATOR()
3243 if ( $expecting == OPERATOR );
3247 # Note on token types for format, __DATA__, __END__:
3248 # It simplifies things to give these type ';', so that when we
3249 # start rescanning we will be expecting a token of type TERM.
3250 # We will switch to type 'k' before outputting the tokens.
3251 elsif ( $is_format_END_DATA{$tok_kw} ) {
3252 $type = ';'; # make tokenizer look for TERM next
3253 $tokenizer_self->{ $is_format_END_DATA{$tok_kw} } = 1;
3257 elsif ( $is_keyword{$tok_kw} ) {
3260 # Since for and foreach may not be followed immediately
3261 # by an opening paren, we have to remember which keyword
3262 # is associated with the next '('
3263 if ( $is_for_foreach{$tok} ) {
3264 if ( new_statement_ok() ) {
3269 # recognize 'use' statements, which are special
3270 elsif ( $is_use_require{$tok} ) {
3271 $statement_type = $tok;
3272 error_if_expecting_OPERATOR()
3273 if ( $expecting == OPERATOR );
3276 # remember my and our to check for trailing ": shared"
3277 elsif ( $is_my_our{$tok} ) {
3278 $statement_type = $tok;
3281 # Check for misplaced 'elsif' and 'else', but allow isolated
3282 # else or elsif blocks to be formatted. This is indicated
3283 # by a last noblank token of ';'
3284 elsif ( $tok eq 'elsif' ) {
3285 if ( $last_nonblank_token ne ';'
3286 && $last_nonblank_block_type !~
3287 /^(if|elsif|unless)$/ )
3290 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
3294 elsif ( $tok eq 'else' ) {
3296 # patched for SWITCH/CASE
3298 $last_nonblank_token ne ';'
3299 && $last_nonblank_block_type !~
3300 /^(if|elsif|unless|case|when)$/
3302 # patch to avoid an unwanted error message for
3303 # the case of a parenless 'case' (RT 105484):
3304 # switch ( 1 ) { case x { 2 } else { } }
3305 && $statement_type !~
3306 /^(if|elsif|unless|case|when)$/
3310 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3314 elsif ( $tok eq 'continue' ) {
3315 if ( $last_nonblank_token ne ';'
3316 && $last_nonblank_block_type !~
3317 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
3320 # note: ';' '{' and '}' in list above
3321 # because continues can follow bare blocks;
3322 # ':' is labeled block
3324 ############################################
3325 # NOTE: This check has been deactivated because
3326 # continue has an alternative usage for given/when
3327 # blocks in perl 5.10
3328 ## warning("'$tok' should follow a block\n");
3329 ############################################
3333 # patch for SWITCH/CASE if 'case' and 'when are
3334 # treated as keywords.
3335 elsif ( $tok eq 'when' || $tok eq 'case' ) {
3336 $statement_type = $tok; # next '{' is block
3340 # indent trailing if/unless/while/until
3341 # outdenting will be handled by later indentation loop
3342 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
3354 ## if ( $tok =~ /^(if|unless|while|until)$/
3355 ## && $next_nonblank_token ne '(' )
3357 ## $indent_flag = 1;
3361 # check for inline label following
3362 # /^(redo|last|next|goto)$/
3363 elsif (( $last_nonblank_type eq 'k' )
3364 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
3373 scan_bare_identifier();
3374 if ( $type eq 'w' ) {
3376 if ( $expecting == OPERATOR ) {
3378 # don't complain about possible indirect object
3382 # sub new($) { ... }
3383 # $b = new A::; # calls A::new
3384 # $c = new A; # same thing but suspicious
3385 # This will call A::new but we have a 'new' in
3386 # main:: which looks like a constant.
3388 if ( $last_nonblank_type eq 'C' ) {
3389 if ( $tok !~ /::$/ ) {
3391 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3392 Maybe indirectet object notation?
3397 error_if_expecting_OPERATOR("bareword");
3401 # mark bare words immediately followed by a paren as
3403 $next_tok = $rtokens->[ $i + 1 ];
3404 if ( $next_tok eq '(' ) {
3408 # underscore after file test operator is file handle
3409 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3413 # patch for SWITCH/CASE if 'case' and 'when are
3414 # not treated as keywords:
3418 && $brace_type[$brace_depth] eq 'switch'
3421 && $brace_type[$brace_depth] eq 'given' )
3424 $statement_type = $tok; # next '{' is block
3425 $type = 'k'; # for keyword syntax coloring
3428 # patch for SWITCH/CASE if switch and given not keywords
3429 # Switch is not a perl 5 keyword, but we will gamble
3430 # and mark switch followed by paren as a keyword. This
3431 # is only necessary to get html syntax coloring nice,
3432 # and does not commit this as being a switch/case.
3433 if ( $next_nonblank_token eq '('
3434 && ( $tok eq 'switch' || $tok eq 'given' ) )
3436 $type = 'k'; # for keyword syntax coloring
3442 ###############################################################
3443 # section 2: strings of digits
3444 ###############################################################
3445 elsif ( $pre_type eq 'd' ) {
3446 $expecting = operator_expected( $prev_type, $tok, $next_type );
3447 error_if_expecting_OPERATOR("Number")
3448 if ( $expecting == OPERATOR );
3449 my $number = scan_number();
3450 if ( !defined($number) ) {
3452 # shouldn't happen - we should always get a number
3453 warning("non-number beginning with digit--program bug\n");
3454 report_definite_bug();
3458 ###############################################################
3459 # section 3: all other tokens
3460 ###############################################################
3463 last if ( $tok eq '#' );
3464 my $code = $tokenization_code->{$tok};
3467 operator_expected( $prev_type, $tok, $next_type );
3474 # -----------------------------
3475 # end of main tokenization loop
3476 # -----------------------------
3478 if ( $i_tok >= 0 ) {
3479 $routput_token_type->[$i_tok] = $type;
3480 $routput_block_type->[$i_tok] = $block_type;
3481 $routput_container_type->[$i_tok] = $container_type;
3482 $routput_type_sequence->[$i_tok] = $type_sequence;
3483 $routput_indent_flag->[$i_tok] = $indent_flag;
3486 unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
3487 $last_last_nonblank_token = $last_nonblank_token;
3488 $last_last_nonblank_type = $last_nonblank_type;
3489 $last_last_nonblank_block_type = $last_nonblank_block_type;
3490 $last_last_nonblank_container_type = $last_nonblank_container_type;
3491 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
3492 $last_nonblank_token = $tok;
3493 $last_nonblank_type = $type;
3494 $last_nonblank_block_type = $block_type;
3495 $last_nonblank_container_type = $container_type;
3496 $last_nonblank_type_sequence = $type_sequence;
3497 $last_nonblank_prototype = $prototype;
3500 # reset indentation level if necessary at a sub or package
3501 # in an attempt to recover from a nesting error
3502 if ( $level_in_tokenizer < 0 ) {
3503 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
3504 reset_indentation_level(0);
3505 brace_warning("resetting level to 0 at $1 $2\n");
3509 # all done tokenizing this line ...
3510 # now prepare the final list of tokens and types
3512 my @token_type = (); # stack of output token types
3513 my @block_type = (); # stack of output code block types
3514 my @container_type = (); # stack of output code container types
3515 my @type_sequence = (); # stack of output type sequence numbers
3516 my @tokens = (); # output tokens
3517 my @levels = (); # structural brace levels of output tokens
3518 my @slevels = (); # secondary nesting levels of output tokens
3519 my @nesting_tokens = (); # string of tokens leading to this depth
3520 my @nesting_types = (); # string of token types leading to this depth
3521 my @nesting_blocks = (); # string of block types leading to this depth
3522 my @nesting_lists = (); # string of list types leading to this depth
3523 my @ci_string = (); # string needed to compute continuation indentation
3524 my @container_environment = (); # BLOCK or LIST
3525 my $container_environment = '';
3526 my $im = -1; # previous $i value
3528 my $ci_string_sum = ones_count($ci_string_in_tokenizer);
3530 # Computing Token Indentation
3532 # The final section of the tokenizer forms tokens and also computes
3533 # parameters needed to find indentation. It is much easier to do it
3534 # in the tokenizer than elsewhere. Here is a brief description of how
3535 # indentation is computed. Perl::Tidy computes indentation as the sum
3538 # (1) structural indentation, such as if/else/elsif blocks
3539 # (2) continuation indentation, such as long parameter call lists.
3541 # These are occasionally called primary and secondary indentation.
3543 # Structural indentation is introduced by tokens of type '{', although
3544 # the actual tokens might be '{', '(', or '['. Structural indentation
3545 # is of two types: BLOCK and non-BLOCK. Default structural indentation
3546 # is 4 characters if the standard indentation scheme is used.
3548 # Continuation indentation is introduced whenever a line at BLOCK level
3549 # is broken before its termination. Default continuation indentation
3550 # is 2 characters in the standard indentation scheme.
3552 # Both types of indentation may be nested arbitrarily deep and
3553 # interlaced. The distinction between the two is somewhat arbitrary.
3555 # For each token, we will define two variables which would apply if
3556 # the current statement were broken just before that token, so that
3557 # that token started a new line:
3559 # $level = the structural indentation level,
3560 # $ci_level = the continuation indentation level
3562 # The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
3563 # assuming defaults. However, in some special cases it is customary
3564 # to modify $ci_level from this strict value.
3566 # The total structural indentation is easy to compute by adding and
3567 # subtracting 1 from a saved value as types '{' and '}' are seen. The
3568 # running value of this variable is $level_in_tokenizer.
3570 # The total continuation is much more difficult to compute, and requires
3571 # several variables. These variables are:
3573 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
3574 # each indentation level, if there are intervening open secondary
3575 # structures just prior to that level.
3576 # $continuation_string_in_tokenizer = a string of 1's and 0's indicating
3577 # if the last token at that level is "continued", meaning that it
3578 # is not the first token of an expression.
3579 # $nesting_block_string = a string of 1's and 0's indicating, for each
3580 # indentation level, if the level is of type BLOCK or not.
3581 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
3582 # $nesting_list_string = a string of 1's and 0's indicating, for each
3583 # indentation level, if it is appropriate for list formatting.
3584 # If so, continuation indentation is used to indent long list items.
3585 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
3586 # @{$rslevel_stack} = a stack of total nesting depths at each
3587 # structural indentation level, where "total nesting depth" means
3588 # the nesting depth that would occur if every nesting token -- '{', '[',
3589 # and '(' -- , regardless of context, is used to compute a nesting
3592 #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
3593 #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
3595 my ( $ci_string_i, $level_i, $nesting_block_string_i,
3596 $nesting_list_string_i, $nesting_token_string_i,
3597 $nesting_type_string_i, );
3599 foreach my $i ( @{$routput_token_list} )
3600 { # scan the list of pre-tokens indexes
3602 # self-checking for valid token types
3603 my $type = $routput_token_type->[$i];
3604 my $forced_indentation_flag = $routput_indent_flag->[$i];
3606 # See if we should undo the $forced_indentation_flag.
3607 # Forced indentation after 'if', 'unless', 'while' and 'until'
3608 # expressions without trailing parens is optional and doesn't
3609 # always look good. It is usually okay for a trailing logical
3610 # expression, but if the expression is a function call, code block,
3611 # or some kind of list it puts in an unwanted extra indentation
3612 # level which is hard to remove.
3614 # Example where extra indentation looks ok:
3616 # if $det_a < 0 and $det_b > 0
3617 # or $det_a > 0 and $det_b < 0;
3619 # Example where extra indentation is not needed because
3620 # the eval brace also provides indentation:
3621 # print "not " if defined eval {
3622 # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
3625 # The following rule works fairly well:
3626 # Undo the flag if the end of this line, or start of the next
3627 # line, is an opening container token or a comma.
3628 # This almost always works, but if not after another pass it will
3630 if ( $forced_indentation_flag && $type eq 'k' ) {
3632 my $ilast = $routput_token_list->[$ixlast];
3633 my $toklast = $routput_token_type->[$ilast];
3634 if ( $toklast eq '#' ) {
3636 $ilast = $routput_token_list->[$ixlast];
3637 $toklast = $routput_token_type->[$ilast];
3639 if ( $toklast eq 'b' ) {
3641 $ilast = $routput_token_list->[$ixlast];
3642 $toklast = $routput_token_type->[$ilast];
3644 if ( $toklast =~ /^[\{,]$/ ) {
3645 $forced_indentation_flag = 0;
3648 ( $toklast, my $i_next ) =
3649 find_next_nonblank_token( $max_token_index, $rtokens,
3651 if ( $toklast =~ /^[\{,]$/ ) {
3652 $forced_indentation_flag = 0;
3657 # if we are already in an indented if, see if we should outdent
3658 if ($indented_if_level) {
3660 # don't try to nest trailing if's - shouldn't happen
3661 if ( $type eq 'k' ) {
3662 $forced_indentation_flag = 0;
3665 # check for the normal case - outdenting at next ';'
3666 elsif ( $type eq ';' ) {
3667 if ( $level_in_tokenizer == $indented_if_level ) {
3668 $forced_indentation_flag = -1;
3669 $indented_if_level = 0;
3673 # handle case of missing semicolon
3674 elsif ( $type eq '}' ) {
3675 if ( $level_in_tokenizer == $indented_if_level ) {
3676 $indented_if_level = 0;
3678 # TBD: This could be a subroutine call
3679 $level_in_tokenizer--;
3680 if ( @{$rslevel_stack} > 1 ) {
3681 pop( @{$rslevel_stack} );
3683 if ( length($nesting_block_string) > 1 )
3684 { # true for valid script
3685 chop $nesting_block_string;
3686 chop $nesting_list_string;
3693 my $tok = $rtokens->[$i]; # the token, but ONLY if same as pretoken
3694 $level_i = $level_in_tokenizer;
3696 # This can happen by running perltidy on non-scripts
3697 # although it could also be bug introduced by programming change.
3698 # Perl silently accepts a 032 (^Z) and takes it as the end
3699 if ( !$is_valid_token_type{$type} ) {
3700 my $val = ord($type);
3702 "unexpected character decimal $val ($type) in script\n");
3703 $tokenizer_self->{_in_error} = 1;
3706 # ----------------------------------------------------------------
3707 # TOKEN TYPE PATCHES
3708 # output __END__, __DATA__, and format as type 'k' instead of ';'
3709 # to make html colors correct, etc.
3710 my $fix_type = $type;
3711 if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
3713 # output anonymous 'sub' as keyword
3714 if ( $type eq 't' && $tok eq 'sub' ) { $fix_type = 'k' }
3716 # -----------------------------------------------------------------
3718 $nesting_token_string_i = $nesting_token_string;
3719 $nesting_type_string_i = $nesting_type_string;
3720 $nesting_block_string_i = $nesting_block_string;
3721 $nesting_list_string_i = $nesting_list_string;
3723 # set primary indentation levels based on structural braces
3724 # Note: these are set so that the leading braces have a HIGHER
3725 # level than their CONTENTS, which is convenient for indentation
3726 # Also, define continuation indentation for each token.
3727 if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
3730 # use environment before updating
3731 $container_environment =
3732 $nesting_block_flag ? 'BLOCK'
3733 : $nesting_list_flag ? 'LIST'
3736 # if the difference between total nesting levels is not 1,
3737 # there are intervening non-structural nesting types between
3738 # this '{' and the previous unclosed '{'
3739 my $intervening_secondary_structure = 0;
3740 if ( @{$rslevel_stack} ) {
3741 $intervening_secondary_structure =
3742 $slevel_in_tokenizer - $rslevel_stack->[-1];
3745 # Continuation Indentation
3747 # Having tried setting continuation indentation both in the formatter and
3748 # in the tokenizer, I can say that setting it in the tokenizer is much,
3749 # much easier. The formatter already has too much to do, and can't
3750 # make decisions on line breaks without knowing what 'ci' will be at
3751 # arbitrary locations.
3753 # But a problem with setting the continuation indentation (ci) here
3754 # in the tokenizer is that we do not know where line breaks will actually
3755 # be. As a result, we don't know if we should propagate continuation
3756 # indentation to higher levels of structure.
3758 # For nesting of only structural indentation, we never need to do this.
3759 # For example, in a long if statement, like this
3761 # if ( !$output_block_type[$i]
3762 # && ($in_statement_continuation) )
3767 # the second line has ci but we do normally give the lines within the BLOCK
3768 # any ci. This would be true if we had blocks nested arbitrarily deeply.
3770 # But consider something like this, where we have created a break after
3771 # an opening paren on line 1, and the paren is not (currently) a
3772 # structural indentation token:
3774 # my $file = $menubar->Menubutton(
3775 # qw/-text File -underline 0 -menuitems/ => [
3777 # Cascade => '~View',
3781 # The second line has ci, so it would seem reasonable to propagate it
3782 # down, giving the third line 1 ci + 1 indentation. This suggests the
3783 # following rule, which is currently used to propagating ci down: if there
3784 # are any non-structural opening parens (or brackets, or braces), before
3785 # an opening structural brace, then ci is propagated down, and otherwise
3786 # not. The variable $intervening_secondary_structure contains this
3787 # information for the current token, and the string
3788 # "$ci_string_in_tokenizer" is a stack of previous values of this
3791 # save the current states
3792 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
3793 $level_in_tokenizer++;
3795 if ($forced_indentation_flag) {
3797 # break BEFORE '?' when there is forced indentation
3798 if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
3799 if ( $type eq 'k' ) {
3800 $indented_if_level = $level_in_tokenizer;
3803 # do not change container environment here if we are not
3804 # at a real list. Adding this check prevents "blinkers"
3805 # often near 'unless" clauses, such as in the following
3810 ## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
3813 $nesting_block_string .= "$nesting_block_flag";
3817 if ( $routput_block_type->[$i] ) {
3818 $nesting_block_flag = 1;
3819 $nesting_block_string .= '1';
3822 $nesting_block_flag = 0;
3823 $nesting_block_string .= '0';
3827 # we will use continuation indentation within containers
3828 # which are not blocks and not logical expressions
3830 if ( !$routput_block_type->[$i] ) {
3832 # propagate flag down at nested open parens
3833 if ( $routput_container_type->[$i] eq '(' ) {
3834 $bit = 1 if $nesting_list_flag;
3837 # use list continuation if not a logical grouping
3838 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
3842 $is_logical_container{ $routput_container_type->[$i]
3846 $nesting_list_string .= $bit;
3847 $nesting_list_flag = $bit;
3849 $ci_string_in_tokenizer .=
3850 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
3851 $ci_string_sum = ones_count($ci_string_in_tokenizer);
3852 $continuation_string_in_tokenizer .=
3853 ( $in_statement_continuation > 0 ) ? '1' : '0';
3855 # Sometimes we want to give an opening brace continuation indentation,
3856 # and sometimes not. For code blocks, we don't do it, so that the leading
3857 # '{' gets outdented, like this:
3859 # if ( !$output_block_type[$i]
3860 # && ($in_statement_continuation) )
3863 # For other types, we will give them continuation indentation. For example,
3864 # here is how a list looks with the opening paren indented:
3867 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
3868 # [ "homer", "marge", "bart" ], );
3870 # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
3872 my $total_ci = $ci_string_sum;
3874 !$routput_block_type->[$i] # patch: skip for BLOCK
3875 && ($in_statement_continuation)
3876 && !( $forced_indentation_flag && $type eq ':' )
3879 $total_ci += $in_statement_continuation
3880 unless ( $ci_string_in_tokenizer =~ /1$/ );
3883 $ci_string_i = $total_ci;
3884 $in_statement_continuation = 0;
3889 || $forced_indentation_flag < 0 )
3892 # only a nesting error in the script would prevent popping here
3893 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
3895 $level_i = --$level_in_tokenizer;
3897 # restore previous level values
3898 if ( length($nesting_block_string) > 1 )
3899 { # true for valid script
3900 chop $nesting_block_string;
3901 $nesting_block_flag = ( $nesting_block_string =~ /1$/ );
3902 chop $nesting_list_string;
3903 $nesting_list_flag = ( $nesting_list_string =~ /1$/ );
3905 chop $ci_string_in_tokenizer;
3906 $ci_string_sum = ones_count($ci_string_in_tokenizer);
3908 $in_statement_continuation =
3909 chop $continuation_string_in_tokenizer;
3911 # zero continuation flag at terminal BLOCK '}' which
3913 if ( $routput_block_type->[$i] ) {
3915 # ...These include non-anonymous subs
3916 # note: could be sub ::abc { or sub 'abc
3917 if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
3919 # note: older versions of perl require the /gc modifier
3920 # here or else the \G does not work.
3921 if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
3923 $in_statement_continuation = 0;
3927 # ...and include all block types except user subs with
3928 # block prototypes and these: (sort|grep|map|do|eval)
3929 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
3931 $is_zero_continuation_block_type{
3932 $routput_block_type->[$i]
3935 $in_statement_continuation = 0;
3938 # ..but these are not terminal types:
3939 # /^(sort|grep|map|do|eval)$/ )
3941 $is_not_zero_continuation_block_type{
3942 $routput_block_type->[$i]
3947 # ..and a block introduced by a label
3949 elsif ( $routput_block_type->[$i] =~ /:$/ ) {
3950 $in_statement_continuation = 0;
3953 # user function with block prototype
3955 $in_statement_continuation = 0;
3959 # If we are in a list, then
3960 # we must set continuation indentation at the closing
3961 # paren of something like this (paren after $check):
3964 # ( not defined $check )
3966 # or $check eq "new"
3967 # or $check eq "old",
3969 elsif ( $tok eq ')' ) {
3970 $in_statement_continuation = 1
3971 if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
3974 elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
3977 # use environment after updating
3978 $container_environment =
3979 $nesting_block_flag ? 'BLOCK'
3980 : $nesting_list_flag ? 'LIST'
3982 $ci_string_i = $ci_string_sum + $in_statement_continuation;
3983 $nesting_block_string_i = $nesting_block_string;
3984 $nesting_list_string_i = $nesting_list_string;
3987 # not a structural indentation type..
3990 $container_environment =
3991 $nesting_block_flag ? 'BLOCK'
3992 : $nesting_list_flag ? 'LIST'
3995 # zero the continuation indentation at certain tokens so
3996 # that they will be at the same level as its container. For
3997 # commas, this simplifies the -lp indentation logic, which
3998 # counts commas. For ?: it makes them stand out.
3999 if ($nesting_list_flag) {
4000 if ( $type =~ /^[,\?\:]$/ ) {
4001 $in_statement_continuation = 0;
4005 # be sure binary operators get continuation indentation
4007 $container_environment
4008 && ( $type eq 'k' && $is_binary_keyword{$tok}
4009 || $is_binary_type{$type} )
4012 $in_statement_continuation = 1;
4015 # continuation indentation is sum of any open ci from previous
4016 # levels plus the current level
4017 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4019 # update continuation flag ...
4020 # if this isn't a blank or comment..
4021 if ( $type ne 'b' && $type ne '#' ) {
4023 # and we are in a BLOCK
4024 if ($nesting_block_flag) {
4026 # the next token after a ';' and label starts a new stmt
4027 if ( $type eq ';' || $type eq 'J' ) {
4028 $in_statement_continuation = 0;
4031 # otherwise, we are continuing the current statement
4033 $in_statement_continuation = 1;
4037 # if we are not in a BLOCK..
4040 # do not use continuation indentation if not list
4041 # environment (could be within if/elsif clause)
4042 if ( !$nesting_list_flag ) {
4043 $in_statement_continuation = 0;
4046 # otherwise, the token after a ',' starts a new term
4048 # Patch FOR RT#99961; no continuation after a ';'
4049 # This is needed because perltidy currently marks
4050 # a block preceded by a type character like % or @
4051 # as a non block, to simplify formatting. But these
4052 # are actually blocks and can have semicolons.
4053 # See code_block_type() and is_non_structural_brace().
4054 elsif ( $type eq ',' || $type eq ';' ) {
4055 $in_statement_continuation = 0;
4058 # otherwise, we are continuing the current term
4060 $in_statement_continuation = 1;
4066 if ( $level_in_tokenizer < 0 ) {
4067 unless ( $tokenizer_self->{_saw_negative_indentation} ) {
4068 $tokenizer_self->{_saw_negative_indentation} = 1;
4069 warning("Starting negative indentation\n");
4073 # set secondary nesting levels based on all containment token types
4074 # Note: these are set so that the nesting depth is the depth
4075 # of the PREVIOUS TOKEN, which is convenient for setting
4076 # the strength of token bonds
4077 my $slevel_i = $slevel_in_tokenizer;
4080 if ( $is_opening_type{$type} ) {
4081 $slevel_in_tokenizer++;
4082 $nesting_token_string .= $tok;
4083 $nesting_type_string .= $type;
4087 elsif ( $is_closing_type{$type} ) {
4088 $slevel_in_tokenizer--;
4089 my $char = chop $nesting_token_string;
4091 if ( $char ne $matching_start_token{$tok} ) {
4092 $nesting_token_string .= $char . $tok;
4093 $nesting_type_string .= $type;
4096 chop $nesting_type_string;
4100 push( @block_type, $routput_block_type->[$i] );
4101 push( @ci_string, $ci_string_i );
4102 push( @container_environment, $container_environment );
4103 push( @container_type, $routput_container_type->[$i] );
4104 push( @levels, $level_i );
4105 push( @nesting_tokens, $nesting_token_string_i );
4106 push( @nesting_types, $nesting_type_string_i );
4107 push( @slevels, $slevel_i );
4108 push( @token_type, $fix_type );
4109 push( @type_sequence, $routput_type_sequence->[$i] );
4110 push( @nesting_blocks, $nesting_block_string );
4111 push( @nesting_lists, $nesting_list_string );
4113 # now form the previous token
4116 $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
4120 substr( $input_line, $rtoken_map->[$im], $num ) );
4126 $num = length($input_line) - $rtoken_map->[$im]; # make the last token
4128 push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
4131 $tokenizer_self->{_in_attribute_list} = $in_attribute_list;
4132 $tokenizer_self->{_in_quote} = $in_quote;
4133 $tokenizer_self->{_quote_target} =
4134 $in_quote ? matching_end_token($quote_character) : "";
4135 $tokenizer_self->{_rhere_target_list} = $rhere_target_list;
4137 $line_of_tokens->{_rtoken_type} = \@token_type;
4138 $line_of_tokens->{_rtokens} = \@tokens;
4139 $line_of_tokens->{_rblock_type} = \@block_type;
4140 $line_of_tokens->{_rcontainer_type} = \@container_type;
4141 $line_of_tokens->{_rcontainer_environment} = \@container_environment;
4142 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
4143 $line_of_tokens->{_rlevels} = \@levels;
4144 $line_of_tokens->{_rslevels} = \@slevels;
4145 $line_of_tokens->{_rnesting_tokens} = \@nesting_tokens;
4146 $line_of_tokens->{_rci_levels} = \@ci_string;
4147 $line_of_tokens->{_rnesting_blocks} = \@nesting_blocks;
4151 } # end tokenize_this_line
4153 #########i#############################################################
4154 # Tokenizer routines which assist in identifying token types
4155 #######################################################################
4157 sub operator_expected {
4159 # Many perl symbols have two or more meanings. For example, '<<'
4160 # can be a shift operator or a here-doc operator. The
4161 # interpretation of these symbols depends on the current state of
4162 # the tokenizer, which may either be expecting a term or an
4163 # operator. For this example, a << would be a shift if an operator
4164 # is expected, and a here-doc if a term is expected. This routine
4165 # is called to make this decision for any current token. It returns
4166 # one of three possible values:
4168 # OPERATOR - operator expected (or at least, not a term)
4169 # UNKNOWN - can't tell
4170 # TERM - a term is expected (or at least, not an operator)
4172 # The decision is based on what has been seen so far. This
4173 # information is stored in the "$last_nonblank_type" and
4174 # "$last_nonblank_token" variables. For example, if the
4175 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
4176 # if $last_nonblank_type is 'n' (numeric), we are expecting an
4179 # If a UNKNOWN is returned, the calling routine must guess. A major
4180 # goal of this tokenizer is to minimize the possibility of returning
4181 # UNKNOWN, because a wrong guess can spoil the formatting of a
4184 # adding NEW_TOKENS: it is critically important that this routine be
4185 # updated to allow it to determine if an operator or term is to be
4186 # expected after the new token. Doing this simply involves adding
4187 # the new token character to one of the regexes in this routine or
4188 # to one of the hash lists
4189 # that it uses, which are initialized in the BEGIN section.
4190 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
4193 my ( $prev_type, $tok, $next_type ) = @_;
4195 my $op_expected = UNKNOWN;
4197 ##print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
4199 # Note: function prototype is available for token type 'U' for future
4200 # program development. It contains the leading and trailing parens,
4201 # and no blanks. It might be used to eliminate token type 'C', for
4202 # example (prototype = '()'). Thus:
4203 # if ($last_nonblank_type eq 'U') {
4204 # print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
4207 # A possible filehandle (or object) requires some care...
4208 if ( $last_nonblank_type eq 'Z' ) {
4211 if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
4212 $op_expected = UNKNOWN;
4215 # For possible file handle like "$a", Perl uses weird parsing rules.
4217 # print $a/2,"/hi"; - division
4218 # print $a / 2,"/hi"; - division
4219 # print $a/ 2,"/hi"; - division
4220 # print $a /2,"/hi"; - pattern (and error)!
4221 elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
4222 $op_expected = TERM;
4225 # Note when an operation is being done where a
4226 # filehandle might be expected, since a change in whitespace
4227 # could change the interpretation of the statement.
4229 if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
4230 complain("operator in print statement not recommended\n");
4231 $op_expected = OPERATOR;
4236 # Check for smartmatch operator before preceding brace or square bracket.
4237 # For example, at the ? after the ] in the following expressions we are
4238 # expecting an operator:
4240 # qr/3/ ~~ ['1234'] ? 1 : 0;
4241 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
4242 elsif ( $last_nonblank_type eq '}' && $last_nonblank_token eq '~~' ) {
4243 $op_expected = OPERATOR;
4246 # handle something after 'do' and 'eval'
4247 elsif ( $is_block_operator{$last_nonblank_token} ) {
4249 # something like $a = eval "expression";
4251 if ( $last_nonblank_type eq 'k' ) {
4252 $op_expected = TERM; # expression or list mode following keyword
4255 # something like $a = do { BLOCK } / 2;
4256 # or this ? after a smartmatch anonynmous hash or array reference:
4257 # qr/3/ ~~ ['1234'] ? 1 : 0;
4260 $op_expected = OPERATOR; # block mode following }
4264 # handle bare word..
4265 elsif ( $last_nonblank_type eq 'w' ) {
4267 # unfortunately, we can't tell what type of token to expect next
4268 # after most bare words
4269 $op_expected = UNKNOWN;
4272 # operator, but not term possible after these types
4273 # Note: moved ')' from type to token because parens in list context
4274 # get marked as '{' '}' now. This is a minor glitch in the following:
4275 # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
4277 elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
4278 || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
4280 $op_expected = OPERATOR;
4282 # in a 'use' statement, numbers and v-strings are not true
4283 # numbers, so to avoid incorrect error messages, we will
4284 # mark them as unknown for now (use.t)
4285 # TODO: it would be much nicer to create a new token V for VERSION
4286 # number in a use statement. Then this could be a check on type V
4287 # and related patches which change $statement_type for '=>'
4288 # and ',' could be removed. Further, it would clean things up to
4289 # scan the 'use' statement with a separate subroutine.
4290 if ( ( $statement_type eq 'use' )
4291 && ( $last_nonblank_type =~ /^[nv]$/ ) )
4293 $op_expected = UNKNOWN;
4296 # expecting VERSION or {} after package NAMESPACE
4297 elsif ($statement_type =~ /^package\b/
4298 && $last_nonblank_token =~ /^package\b/ )
4300 $op_expected = TERM;
4304 # no operator after many keywords, such as "die", "warn", etc
4305 elsif ( $expecting_term_token{$last_nonblank_token} ) {
4307 # // may follow perl functions which may be unary operators
4308 # see test file dor.t (defined or);
4310 && $next_type eq '/'
4311 && $last_nonblank_type eq 'k'
4312 && $is_keyword_taking_optional_args{$last_nonblank_token} )
4314 $op_expected = OPERATOR;
4317 $op_expected = TERM;
4321 # no operator after things like + - ** (i.e., other operators)
4322 elsif ( $expecting_term_types{$last_nonblank_type} ) {
4323 $op_expected = TERM;
4326 # a few operators, like "time", have an empty prototype () and so
4327 # take no parameters but produce a value to operate on
4328 elsif ( $expecting_operator_token{$last_nonblank_token} ) {
4329 $op_expected = OPERATOR;
4332 # post-increment and decrement produce values to be operated on
4333 elsif ( $expecting_operator_types{$last_nonblank_type} ) {
4334 $op_expected = OPERATOR;
4337 # no value to operate on after sub block
4338 elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
4340 # a right brace here indicates the end of a simple block.
4341 # all non-structural right braces have type 'R'
4342 # all braces associated with block operator keywords have been given those
4343 # keywords as "last_nonblank_token" and caught above.
4344 # (This statement is order dependent, and must come after checking
4345 # $last_nonblank_token).
4346 elsif ( $last_nonblank_type eq '}' ) {
4348 # patch for dor.t (defined or).
4350 && $next_type eq '/'
4351 && $last_nonblank_token eq ']' )
4353 $op_expected = OPERATOR;
4356 # Patch for RT #116344: misparse a ternary operator after an anonymous
4358 # return ref {} ? 1 : 0;
4359 # The right brace should really be marked type 'R' in this case, and
4360 # it is safest to return an UNKNOWN here. Expecting a TERM will
4361 # cause the '?' to always be interpreted as a pattern delimiter
4362 # rather than introducing a ternary operator.
4363 elsif ( $tok eq '?' ) {
4364 $op_expected = UNKNOWN;
4367 $op_expected = TERM;
4371 # something else..what did I forget?
4374 # collecting diagnostics on unknown operator types..see what was missed
4375 $op_expected = UNKNOWN;
4377 "OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n"
4381 TOKENIZER_DEBUG_FLAG_EXPECT && do {
4383 "EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
4385 return $op_expected;
4388 sub new_statement_ok {
4390 # return true if the current token can start a new statement
4391 # USES GLOBAL VARIABLES: $last_nonblank_type
4393 return label_ok() # a label would be ok here
4395 || $last_nonblank_type eq 'J'; # or we follow a label
4401 # Decide if a bare word followed by a colon here is a label
4402 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4403 # $brace_depth, @brace_type
4405 # if it follows an opening or closing code block curly brace..
4406 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
4407 && $last_nonblank_type eq $last_nonblank_token )
4410 # it is a label if and only if the curly encloses a code block
4411 return $brace_type[$brace_depth];
4414 # otherwise, it is a label if and only if it follows a ';' (real or fake)
4417 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
4421 sub code_block_type {
4423 # Decide if this is a block of code, and its type.
4424 # Must be called only when $type = $token = '{'
4425 # The problem is to distinguish between the start of a block of code
4426 # and the start of an anonymous hash reference
4427 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
4428 # to indicate the type of code block. (For example, 'last_nonblank_token'
4429 # might be 'if' for an if block, 'else' for an else block, etc).
4430 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4431 # $last_nonblank_block_type, $brace_depth, @brace_type
4433 # handle case of multiple '{'s
4435 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
4437 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4438 if ( $last_nonblank_token eq '{'
4439 && $last_nonblank_type eq $last_nonblank_token )
4442 # opening brace where a statement may appear is probably
4443 # a code block but might be and anonymous hash reference
4444 if ( $brace_type[$brace_depth] ) {
4445 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4449 # cannot start a code block within an anonymous hash
4455 elsif ( $last_nonblank_token eq ';' ) {
4457 # an opening brace where a statement may appear is probably
4458 # a code block but might be and anonymous hash reference
4459 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4463 # handle case of '}{'
4464 elsif ($last_nonblank_token eq '}'
4465 && $last_nonblank_type eq $last_nonblank_token )
4468 # a } { situation ...
4469 # could be hash reference after code block..(blktype1.t)
4470 if ($last_nonblank_block_type) {
4471 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4475 # must be a block if it follows a closing hash reference
4477 return $last_nonblank_token;
4481 ################################################################
4482 # NOTE: braces after type characters start code blocks, but for
4483 # simplicity these are not identified as such. See also
4484 # sub is_non_structural_brace.
4485 ################################################################
4487 ## elsif ( $last_nonblank_type eq 't' ) {
4488 ## return $last_nonblank_token;
4491 # brace after label:
4492 elsif ( $last_nonblank_type eq 'J' ) {
4493 return $last_nonblank_token;
4496 # otherwise, look at previous token. This must be a code block if
4497 # it follows any of these:
4498 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
4499 elsif ( $is_code_block_token{$last_nonblank_token} ) {
4501 # Bug Patch: Note that the opening brace after the 'if' in the following
4502 # snippet is an anonymous hash ref and not a code block!
4503 # print 'hi' if { x => 1, }->{x};
4504 # We can identify this situation because the last nonblank type
4505 # will be a keyword (instead of a closing peren)
4506 if ( $last_nonblank_token =~ /^(if|unless)$/
4507 && $last_nonblank_type eq 'k' )
4512 return $last_nonblank_token;
4516 # or a sub or package BLOCK
4517 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
4518 && $last_nonblank_token =~ /^(sub|package)\b/ )
4520 return $last_nonblank_token;
4523 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
4524 return $statement_type;
4527 # user-defined subs with block parameters (like grep/map/eval)
4528 elsif ( $last_nonblank_type eq 'G' ) {
4529 return $last_nonblank_token;
4533 elsif ( $last_nonblank_type eq 'w' ) {
4534 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4538 # Patch for bug # RT #94338 reported by Daniel Trizen
4539 # for-loop in a parenthesized block-map triggering an error message:
4540 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
4541 # Check for a code block within a parenthesized function call
4542 elsif ( $last_nonblank_token eq '(' ) {
4543 my $paren_type = $paren_type[$paren_depth];
4544 if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
4546 # We will mark this as a code block but use type 't' instead
4547 # of the name of the contining function. This will allow for
4548 # correct parsing but will usually produce better formatting.
4549 # Braces with block type 't' are not broken open automatically
4550 # in the formatter as are other code block types, and this usually
4552 return 't'; # (Not $paren_type)
4559 # handle unknown syntax ') {'
4560 # we previously appended a '()' to mark this case
4561 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
4562 return $last_nonblank_token;
4565 # anything else must be anonymous hash reference
4571 sub decide_if_code_block {
4573 # USES GLOBAL VARIABLES: $last_nonblank_token
4574 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4576 my ( $next_nonblank_token, $i_next ) =
4577 find_next_nonblank_token( $i, $rtokens, $max_token_index );
4579 # we are at a '{' where a statement may appear.
4580 # We must decide if this brace starts an anonymous hash or a code
4582 # return "" if anonymous hash, and $last_nonblank_token otherwise
4584 # initialize to be code BLOCK
4585 my $code_block_type = $last_nonblank_token;
4587 # Check for the common case of an empty anonymous hash reference:
4588 # Maybe something like sub { { } }
4589 if ( $next_nonblank_token eq '}' ) {
4590 $code_block_type = "";
4595 # To guess if this '{' is an anonymous hash reference, look ahead
4596 # and test as follows:
4598 # it is a hash reference if next come:
4599 # - a string or digit followed by a comma or =>
4600 # - bareword followed by =>
4601 # otherwise it is a code block
4603 # Examples of anonymous hash ref:
4607 # Examples of code blocks:
4608 # {1; print "hello\n", 1;}
4611 # We are only going to look ahead one more (nonblank/comment) line.
4612 # Strange formatting could cause a bad guess, but that's unlikely.
4616 # Ignore the rest of this line if it is a side comment
4617 if ( $next_nonblank_token ne '#' ) {
4618 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
4619 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
4621 my ( $rpre_tokens, $rpre_types ) =
4622 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
4623 # generous, and prevents
4625 # time in mangled files
4626 if ( defined($rpre_types) && @{$rpre_types} ) {
4627 push @pre_types, @{$rpre_types};
4628 push @pre_tokens, @{$rpre_tokens};
4631 # put a sentinel token to simplify stopping the search
4632 push @pre_types, '}';
4633 push @pre_types, '}';
4636 $jbeg = 1 if $pre_types[0] eq 'b';
4638 # first look for one of these
4640 # - bareword with leading -
4644 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
4646 # find the closing quote; don't worry about escapes
4647 my $quote_mark = $pre_types[$j];
4648 foreach my $k ( $j + 1 .. $#pre_types - 1 ) {
4649 if ( $pre_types[$k] eq $quote_mark ) {
4651 my $next = $pre_types[$j];
4656 elsif ( $pre_types[$j] eq 'd' ) {
4659 elsif ( $pre_types[$j] eq 'w' ) {
4662 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
4667 $j++ if $pre_types[$j] eq 'b';
4669 # Patched for RT #95708
4672 # it is a comma which is not a pattern delimeter except for qw
4674 $pre_types[$j] eq ','
4675 && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
4679 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
4682 $code_block_type = "";
4687 return $code_block_type;
4690 sub report_unexpected {
4692 # report unexpected token type and show where it is
4693 # USES GLOBAL VARIABLES: $tokenizer_self
4694 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
4695 $rpretoken_type, $input_line )
4698 if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) {
4699 my $msg = "found $found where $expecting expected";
4700 my $pos = $rpretoken_map->[$i_tok];
4701 interrupt_logfile();
4702 my $input_line_number = $tokenizer_self->{_last_line_number};
4703 my ( $offset, $numbered_line, $underline ) =
4704 make_numbered_line( $input_line_number, $input_line, $pos );
4705 $underline = write_on_underline( $underline, $pos - $offset, '^' );
4708 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
4709 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
4711 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
4712 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
4715 $num = $pos - $pos_prev;
4717 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
4720 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
4721 $trailer = " (previous token underlined)";
4723 warning( $numbered_line . "\n" );
4724 warning( $underline . "\n" );
4725 warning( $msg . $trailer . "\n" );
4731 sub is_non_structural_brace {
4733 # Decide if a brace or bracket is structural or non-structural
4734 # by looking at the previous token and type
4735 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
4737 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
4738 # Tentatively deactivated because it caused the wrong operator expectation
4740 # $user = @vars[1] / 100;
4741 # Must update sub operator_expected before re-implementing.
4742 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
4746 ################################################################
4747 # NOTE: braces after type characters start code blocks, but for
4748 # simplicity these are not identified as such. See also
4749 # sub code_block_type
4750 ################################################################
4752 ##if ($last_nonblank_type eq 't') {return 0}
4754 # otherwise, it is non-structural if it is decorated
4755 # by type information.
4756 # For example, the '{' here is non-structural: ${xxx}
4758 $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
4760 # or if we follow a hash or array closing curly brace or bracket
4761 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
4762 # because the first '}' would have been given type 'R'
4763 || $last_nonblank_type =~ /^([R\]])$/
4767 #########i#############################################################
4768 # Tokenizer routines for tracking container nesting depths
4769 #######################################################################
4771 # The following routines keep track of nesting depths of the nesting
4772 # types, ( [ { and ?. This is necessary for determining the indentation
4773 # level, and also for debugging programs. Not only do they keep track of
4774 # nesting depths of the individual brace types, but they check that each
4775 # of the other brace types is balanced within matching pairs. For
4776 # example, if the program sees this sequence:
4780 # then it can determine that there is an extra left paren somewhere
4781 # between the { and the }. And so on with every other possible
4782 # combination of outer and inner brace types. For another
4787 # which has an extra ] within the parens.
4789 # The brace types have indexes 0 .. 3 which are indexes into
4792 # The pair ? : are treated as just another nesting type, with ? acting
4793 # as the opening brace and : acting as the closing brace.
4797 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
4799 # saves the nesting depth of brace type $b (where $b is either of the other
4800 # nesting types) when brace type $a enters a new depth. When this depth
4801 # decreases, a check is made that the current depth of brace types $b is
4802 # unchanged, or otherwise there must have been an error. This can
4803 # be very useful for localizing errors, particularly when perl runs to
4804 # the end of a large file (such as this one) and announces that there
4805 # is a problem somewhere.
4807 # A numerical sequence number is maintained for every nesting type,
4808 # so that each matching pair can be uniquely identified in a simple
4811 sub increase_nesting_depth {
4812 my ( $aa, $pos ) = @_;
4814 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4815 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
4817 $current_depth[$aa]++;
4819 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
4820 my $input_line_number = $tokenizer_self->{_last_line_number};
4821 my $input_line = $tokenizer_self->{_line_text};
4823 # Sequence numbers increment by number of items. This keeps
4824 # a unique set of numbers but still allows the relative location
4825 # of any type to be determined.
4826 $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
4827 my $seqno = $nesting_sequence_number[$aa];
4828 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
4830 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
4831 [ $input_line_number, $input_line, $pos ];
4833 for my $bb ( 0 .. $#closing_brace_names ) {
4834 next if ( $bb == $aa );
4835 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
4838 # set a flag for indenting a nested ternary statement
4840 if ( $aa == QUESTION_COLON ) {
4841 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
4842 if ( $current_depth[$aa] > 1 ) {
4843 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
4844 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
4845 if ( $pdepth == $total_depth - 1 ) {
4847 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
4852 $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
4853 $statement_type = "";
4854 return ( $seqno, $indent );
4857 sub decrease_nesting_depth {
4859 my ( $aa, $pos ) = @_;
4861 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4862 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
4865 my $input_line_number = $tokenizer_self->{_last_line_number};
4866 my $input_line = $tokenizer_self->{_line_text};
4870 if ( $current_depth[$aa] > 0 ) {
4872 # set a flag for un-indenting after seeing a nested ternary statement
4873 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
4874 if ( $aa == QUESTION_COLON ) {
4875 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
4877 $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
4879 # check that any brace types $bb contained within are balanced
4880 for my $bb ( 0 .. $#closing_brace_names ) {
4881 next if ( $bb == $aa );
4883 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
4884 $current_depth[$bb] )
4887 $current_depth[$bb] -
4888 $depth_array[$aa][$bb][ $current_depth[$aa] ];
4890 # don't whine too many times
4891 my $saw_brace_error = get_saw_brace_error();
4893 $saw_brace_error <= MAX_NAG_MESSAGES
4895 # if too many closing types have occurred, we probably
4896 # already caught this error
4897 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
4900 interrupt_logfile();
4902 $starting_line_of_current_depth[$aa]
4903 [ $current_depth[$aa] ];
4905 my $rel = [ $input_line_number, $input_line, $pos ];
4909 if ( $diff == 1 || $diff == -1 ) {
4917 ? $opening_brace_names[$bb]
4918 : $closing_brace_names[$bb];
4919 write_error_indicator_pair( @{$rsl}, '^' );
4921 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
4926 $starting_line_of_current_depth[$bb]
4927 [ $current_depth[$bb] ];
4930 " The most recent un-matched $bname is on line $ml\n";
4931 write_error_indicator_pair( @{$rml}, '^' );
4933 write_error_indicator_pair( @{$rel}, '^' );
4937 increment_brace_error();
4940 $current_depth[$aa]--;
4944 my $saw_brace_error = get_saw_brace_error();
4945 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
4947 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
4949 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
4951 increment_brace_error();
4953 return ( $seqno, $outdent );
4956 sub check_final_nesting_depths {
4958 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
4960 for my $aa ( 0 .. $#closing_brace_names ) {
4962 if ( $current_depth[$aa] ) {
4964 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
4967 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
4968 The most recent un-matched $opening_brace_names[$aa] is on line $sl
4970 indicate_error( $msg, @{$rsl}, '^' );
4971 increment_brace_error();
4977 #########i#############################################################
4978 # Tokenizer routines for looking ahead in input stream
4979 #######################################################################
4981 sub peek_ahead_for_n_nonblank_pre_tokens {
4983 # returns next n pretokens if they exist
4984 # returns undef's if hits eof without seeing any pretokens
4985 # USES GLOBAL VARIABLES: $tokenizer_self
4986 my $max_pretokens = shift;
4989 my ( $rpre_tokens, $rmap, $rpre_types );
4991 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
4993 $line =~ s/^\s*//; # trim leading blanks
4994 next if ( length($line) <= 0 ); # skip blank
4995 next if ( $line =~ /^#/ ); # skip comment
4996 ( $rpre_tokens, $rmap, $rpre_types ) =
4997 pre_tokenize( $line, $max_pretokens );
5000 return ( $rpre_tokens, $rpre_types );
5003 # look ahead for next non-blank, non-comment line of code
5004 sub peek_ahead_for_nonblank_token {
5006 # USES GLOBAL VARIABLES: $tokenizer_self
5007 my ( $rtokens, $max_token_index ) = @_;
5011 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
5013 $line =~ s/^\s*//; # trim leading blanks
5014 next if ( length($line) <= 0 ); # skip blank
5015 next if ( $line =~ /^#/ ); # skip comment
5016 my ( $rtok, $rmap, $rtype ) =
5017 pre_tokenize( $line, 2 ); # only need 2 pre-tokens
5018 my $j = $max_token_index + 1;
5020 foreach my $tok ( @{$rtok} ) {
5021 last if ( $tok =~ "\n" );
5022 $rtokens->[ ++$j ] = $tok;
5029 #########i#############################################################
5030 # Tokenizer guessing routines for ambiguous situations
5031 #######################################################################
5033 sub guess_if_pattern_or_conditional {
5035 # this routine is called when we have encountered a ? following an
5036 # unknown bareword, and we must decide if it starts a pattern or not
5038 # $i - token index of the ? starting possible pattern
5039 # output parameters:
5040 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
5041 # msg = a warning or diagnostic message
5042 # USES GLOBAL VARIABLES: $last_nonblank_token
5044 # FIXME: this needs to be rewritten
5046 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5048 my $msg = "guessing that ? after $last_nonblank_token starts a ";
5050 if ( $i >= $max_token_index ) {
5051 $msg .= "conditional (no end to pattern found on the line)\n";
5056 my $next_token = $rtokens->[$i]; # first token after ?
5058 # look for a possible ending ? on this line..
5060 my $quote_depth = 0;
5061 my $quote_character = '';
5065 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5068 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5069 $quote_pos, $quote_depth, $max_token_index );
5073 # we didn't find an ending ? on this line,
5074 # so we bias towards conditional
5076 $msg .= "conditional (no ending ? on this line)\n";
5078 # we found an ending ?, so we bias towards a pattern
5082 # Watch out for an ending ? in quotes, like this
5083 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
5087 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
5088 my $tok = $rtokens->[$ii];
5089 if ( $tok eq ":" ) { $colons++ }
5090 if ( $tok eq "'" ) { $s_quote++ }
5091 if ( $tok eq '"' ) { $d_quote++ }
5093 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
5095 $msg .= "found ending ? but unbalanced quote chars\n";
5097 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5099 $msg .= "pattern (found ending ? and pattern expected)\n";
5102 $msg .= "pattern (uncertain, but found ending ?)\n";
5106 return ( $is_pattern, $msg );
5109 sub guess_if_pattern_or_division {
5111 # this routine is called when we have encountered a / following an
5112 # unknown bareword, and we must decide if it starts a pattern or is a
5115 # $i - token index of the / starting possible pattern
5116 # output parameters:
5117 # $is_pattern = 0 if probably division, =1 if probably a pattern
5118 # msg = a warning or diagnostic message
5119 # USES GLOBAL VARIABLES: $last_nonblank_token
5120 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5122 my $msg = "guessing that / after $last_nonblank_token starts a ";
5124 if ( $i >= $max_token_index ) {
5125 $msg .= "division (no end to pattern found on the line)\n";
5129 my $divide_expected =
5130 numerator_expected( $i, $rtokens, $max_token_index );
5132 my $next_token = $rtokens->[$i]; # first token after slash
5134 # look for a possible ending / on this line..
5136 my $quote_depth = 0;
5137 my $quote_character = '';
5141 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5144 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5145 $quote_pos, $quote_depth, $max_token_index );
5149 # we didn't find an ending / on this line,
5150 # so we bias towards division
5151 if ( $divide_expected >= 0 ) {
5153 $msg .= "division (no ending / on this line)\n";
5156 $msg = "multi-line pattern (division not possible)\n";
5162 # we found an ending /, so we bias towards a pattern
5165 if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5167 if ( $divide_expected >= 0 ) {
5169 if ( $i - $ibeg > 60 ) {
5170 $msg .= "division (matching / too distant)\n";
5174 $msg .= "pattern (but division possible too)\n";
5180 $msg .= "pattern (division not possible)\n";
5185 if ( $divide_expected >= 0 ) {
5187 $msg .= "division (pattern not possible)\n";
5192 "pattern (uncertain, but division would not work here)\n";
5197 return ( $is_pattern, $msg );
5200 # try to resolve here-doc vs. shift by looking ahead for
5201 # non-code or the end token (currently only looks for end token)
5202 # returns 1 if it is probably a here doc, 0 if not
5203 sub guess_if_here_doc {
5205 # This is how many lines we will search for a target as part of the
5206 # guessing strategy. It is a constant because there is probably
5207 # little reason to change it.
5208 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
5210 my $HERE_DOC_WINDOW = 40;
5212 my $next_token = shift;
5213 my $here_doc_expected = 0;
5216 my $msg = "checking <<";
5218 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
5222 if ( $line =~ /^$next_token$/ ) {
5223 $msg .= " -- found target $next_token ahead $k lines\n";
5224 $here_doc_expected = 1; # got it
5227 last if ( $k >= $HERE_DOC_WINDOW );
5230 unless ($here_doc_expected) {
5232 if ( !defined($line) ) {
5233 $here_doc_expected = -1; # hit eof without seeing target
5234 $msg .= " -- must be shift; target $next_token not in file\n";
5237 else { # still unsure..taking a wild guess
5239 if ( !$is_constant{$current_package}{$next_token} ) {
5240 $here_doc_expected = 1;
5242 " -- guessing it's a here-doc ($next_token not a constant)\n";
5246 " -- guessing it's a shift ($next_token is a constant)\n";
5250 write_logfile_entry($msg);
5251 return $here_doc_expected;
5254 #########i#############################################################
5255 # Tokenizer Routines for scanning identifiers and related items
5256 #######################################################################
5258 sub scan_bare_identifier_do {
5260 # this routine is called to scan a token starting with an alphanumeric
5261 # variable or package separator, :: or '.
5262 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
5263 # $last_nonblank_type,@paren_type, $paren_depth
5265 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
5269 my $package = undef;
5273 # we have to back up one pretoken at a :: since each : is one pretoken
5274 if ( $tok eq '::' ) { $i_beg-- }
5275 if ( $tok eq '->' ) { $i_beg-- }
5276 my $pos_beg = $rtoken_map->[$i_beg];
5277 pos($input_line) = $pos_beg;
5284 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
5286 my $pos = pos($input_line);
5287 my $numc = $pos - $pos_beg;
5288 $tok = substr( $input_line, $pos_beg, $numc );
5290 # type 'w' includes anything without leading type info
5291 # ($,%,@,*) including something like abc::def::ghi
5295 if ( defined($2) ) { $sub_name = $2; }
5296 if ( defined($1) ) {
5299 # patch: don't allow isolated package name which just ends
5300 # in the old style package separator (single quote). Example:
5302 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
5306 $package =~ s/\'/::/g;
5307 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5308 $package =~ s/::$//;
5311 $package = $current_package;
5313 if ( $is_keyword{$tok} ) {
5318 # if it is a bareword..
5319 if ( $type eq 'w' ) {
5321 # check for v-string with leading 'v' type character
5322 # (This seems to have precedence over filehandle, type 'Y')
5323 if ( $tok =~ /^v\d[_\d]*$/ ) {
5325 # we only have the first part - something like 'v101' -
5327 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
5328 $pos = pos($input_line);
5329 $numc = $pos - $pos_beg;
5330 $tok = substr( $input_line, $pos_beg, $numc );
5334 # warn if this version can't handle v-strings
5335 report_v_string($tok);
5338 elsif ( $is_constant{$package}{$sub_name} ) {
5342 # bareword after sort has implied empty prototype; for example:
5343 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
5344 # This has priority over whatever the user has specified.
5345 elsif ($last_nonblank_token eq 'sort'
5346 && $last_nonblank_type eq 'k' )
5351 # Note: strangely, perl does not seem to really let you create
5352 # functions which act like eval and do, in the sense that eval
5353 # and do may have operators following the final }, but any operators
5354 # that you create with prototype (&) apparently do not allow
5355 # trailing operators, only terms. This seems strange.
5356 # If this ever changes, here is the update
5357 # to make perltidy behave accordingly:
5359 # elsif ( $is_block_function{$package}{$tok} ) {
5360 # $tok='eval'; # patch to do braces like eval - doesn't work
5363 # FIXME: This could become a separate type to allow for different
5365 elsif ( $is_block_function{$package}{$sub_name} ) {
5369 elsif ( $is_block_list_function{$package}{$sub_name} ) {
5372 elsif ( $is_user_function{$package}{$sub_name} ) {
5374 $prototype = $user_function_prototype{$package}{$sub_name};
5377 # check for indirect object
5380 # added 2001-03-27: must not be followed immediately by '('
5382 ( $input_line !~ m/\G\(/gc )
5387 # preceded by keyword like 'print', 'printf' and friends
5388 $is_indirect_object_taker{$last_nonblank_token}
5390 # or preceded by something like 'print(' or 'printf('
5392 ( $last_nonblank_token eq '(' )
5393 && $is_indirect_object_taker{ $paren_type[$paren_depth]
5401 # may not be indirect object unless followed by a space
5402 if ( $input_line =~ m/\G\s+/gc ) {
5406 # Perl's indirect object notation is a very bad
5407 # thing and can cause subtle bugs, especially for
5408 # beginning programmers. And I haven't even been
5409 # able to figure out a sane warning scheme which
5410 # doesn't get in the way of good scripts.
5412 # Complain if a filehandle has any lower case
5413 # letters. This is suggested good practice.
5414 # Use 'sub_name' because something like
5415 # main::MYHANDLE is ok for filehandle
5416 if ( $sub_name =~ /[a-z]/ ) {
5418 # could be bug caused by older perltidy if
5420 if ( $input_line =~ m/\G\s*\(/gc ) {
5422 "Caution: unknown word '$tok' in indirect object slot\n"
5428 # bareword not followed by a space -- may not be filehandle
5429 # (may be function call defined in a 'use' statement)
5436 # Now we must convert back from character position
5437 # to pre_token index.
5438 # I don't think an error flag can occur here ..but who knows
5441 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5443 warning("scan_bare_identifier: Possibly invalid tokenization\n");
5447 # no match but line not blank - could be syntax error
5448 # perl will take '::' alone without complaint
5452 # change this warning to log message if it becomes annoying
5453 warning("didn't find identifier after leading ::\n");
5455 return ( $i, $tok, $type, $prototype );
5460 # This is the new scanner and will eventually replace scan_identifier.
5461 # Only type 'sub' and 'package' are implemented.
5462 # Token types $ * % @ & -> are not yet implemented.
5464 # Scan identifier following a type token.
5465 # The type of call depends on $id_scan_state: $id_scan_state = ''
5466 # for starting call, in which case $tok must be the token defining
5469 # If the type token is the last nonblank token on the line, a value
5470 # of $id_scan_state = $tok is returned, indicating that further
5471 # calls must be made to get the identifier. If the type token is
5472 # not the last nonblank token on the line, the identifier is
5473 # scanned and handled and a value of '' is returned.
5474 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
5475 # $statement_type, $tokenizer_self
5477 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
5481 my ( $i_beg, $pos_beg );
5483 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5484 #my ($a,$b,$c) = caller;
5485 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
5487 # on re-entry, start scanning at first token on the line
5488 if ($id_scan_state) {
5493 # on initial entry, start scanning just after type token
5496 $id_scan_state = $tok;
5500 # find $i_beg = index of next nonblank token,
5501 # and handle empty lines
5503 my $next_nonblank_token = $rtokens->[$i_beg];
5504 if ( $i_beg > $max_token_index ) {
5509 # only a '#' immediately after a '$' is not a comment
5510 if ( $next_nonblank_token eq '#' ) {
5511 unless ( $tok eq '$' ) {
5516 if ( $next_nonblank_token =~ /^\s/ ) {
5517 ( $next_nonblank_token, $i_beg ) =
5518 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
5520 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
5526 # handle non-blank line; identifier, if any, must follow
5527 unless ($blank_line) {
5529 if ( $id_scan_state eq 'sub' ) {
5530 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
5531 $input_line, $i, $i_beg,
5532 $tok, $type, $rtokens,
5533 $rtoken_map, $id_scan_state, $max_token_index
5537 elsif ( $id_scan_state eq 'package' ) {
5538 ( $i, $tok, $type ) =
5539 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
5540 $rtoken_map, $max_token_index );
5541 $id_scan_state = '';
5545 warning("invalid token in scan_id: $tok\n");
5546 $id_scan_state = '';
5550 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
5554 "Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
5556 report_definite_bug();
5559 TOKENIZER_DEBUG_FLAG_NSCAN && do {
5561 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5563 return ( $i, $tok, $type, $id_scan_state );
5566 sub check_prototype {
5567 my ( $proto, $package, $subname ) = @_;
5568 return unless ( defined($package) && defined($subname) );
5569 if ( defined($proto) ) {
5570 $proto =~ s/^\s*\(\s*//;
5571 $proto =~ s/\s*\)$//;
5573 $is_user_function{$package}{$subname} = 1;
5574 $user_function_prototype{$package}{$subname} = "($proto)";
5576 # prototypes containing '&' must be treated specially..
5577 if ( $proto =~ /\&/ ) {
5579 # right curly braces of prototypes ending in
5580 # '&' may be followed by an operator
5581 if ( $proto =~ /\&$/ ) {
5582 $is_block_function{$package}{$subname} = 1;
5585 # right curly braces of prototypes NOT ending in
5586 # '&' may NOT be followed by an operator
5587 elsif ( $proto !~ /\&$/ ) {
5588 $is_block_list_function{$package}{$subname} = 1;
5593 $is_constant{$package}{$subname} = 1;
5597 $is_user_function{$package}{$subname} = 1;
5602 sub do_scan_package {
5604 # do_scan_package parses a package name
5605 # it is called with $i_beg equal to the index of the first nonblank
5606 # token following a 'package' token.
5607 # USES GLOBAL VARIABLES: $current_package,
5610 # package NAMESPACE VERSION
5611 # package NAMESPACE BLOCK
5612 # package NAMESPACE VERSION BLOCK
5614 # If VERSION is provided, package sets the $VERSION variable in the given
5615 # namespace to a version object with the VERSION provided. VERSION must be
5616 # a "strict" style version number as defined by the version module: a
5617 # positive decimal number (integer or decimal-fraction) without
5618 # exponentiation or else a dotted-decimal v-string with a leading 'v'
5619 # character and at least three components.
5620 # reference http://perldoc.perl.org/functions/package.html
5622 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
5625 my $package = undef;
5626 my $pos_beg = $rtoken_map->[$i_beg];
5627 pos($input_line) = $pos_beg;
5629 # handle non-blank line; package name, if any, must follow
5630 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) {
5632 $package = ( defined($1) && $1 ) ? $1 : 'main';
5633 $package =~ s/\'/::/g;
5634 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5635 $package =~ s/::$//;
5636 my $pos = pos($input_line);
5637 my $numc = $pos - $pos_beg;
5638 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
5641 # Now we must convert back from character position
5642 # to pre_token index.
5643 # I don't think an error flag can occur here ..but ?
5646 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5647 if ($error) { warning("Possibly invalid package\n") }
5648 $current_package = $package;
5650 # we should now have package NAMESPACE
5651 # now expecting VERSION, BLOCK, or ; to follow ...
5652 # package NAMESPACE VERSION
5653 # package NAMESPACE BLOCK
5654 # package NAMESPACE VERSION BLOCK
5655 my ( $next_nonblank_token, $i_next ) =
5656 find_next_nonblank_token( $i, $rtokens, $max_token_index );
5658 # check that something recognizable follows, but do not parse.
5659 # A VERSION number will be parsed later as a number or v-string in the
5660 # normal way. What is important is to set the statement type if
5661 # everything looks okay so that the operator_expected() routine
5662 # knows that the number is in a package statement.
5663 # Examples of valid primitive tokens that might follow are:
5665 if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) {
5666 $statement_type = $tok;
5670 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
5675 # no match but line not blank --
5676 # could be a label with name package, like package: , for example.
5681 return ( $i, $tok, $type );
5684 sub scan_identifier_do {
5686 # This routine assembles tokens into identifiers. It maintains a
5687 # scan state, id_scan_state. It updates id_scan_state based upon
5688 # current id_scan_state and token, and returns an updated
5689 # id_scan_state and the next index after the identifier.
5690 # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
5691 # $last_nonblank_type
5693 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
5694 $expecting, $container_type )
5698 my $tok_begin = $rtokens->[$i_begin];
5699 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
5700 my $id_scan_state_begin = $id_scan_state;
5701 my $identifier_begin = $identifier;
5702 my $tok = $tok_begin;
5705 my $in_prototype_or_signature = $container_type =~ /^sub/;
5707 # these flags will be used to help figure out the type:
5708 my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ );
5711 # allow old package separator (') except in 'use' statement
5712 my $allow_tick = ( $last_nonblank_token ne 'use' );
5714 # get started by defining a type and a state if necessary
5715 unless ($id_scan_state) {
5716 $context = UNKNOWN_CONTEXT;
5719 if ( $tok eq '>' ) {
5725 if ( $tok eq '$' || $tok eq '*' ) {
5726 $id_scan_state = '$';
5727 $context = SCALAR_CONTEXT;
5729 elsif ( $tok eq '%' || $tok eq '@' ) {
5730 $id_scan_state = '$';
5731 $context = LIST_CONTEXT;
5733 elsif ( $tok eq '&' ) {
5734 $id_scan_state = '&';
5736 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
5737 $saw_alpha = 0; # 'sub' is considered type info here
5738 $id_scan_state = '$';
5739 $identifier .= ' '; # need a space to separate sub from sub name
5741 elsif ( $tok eq '::' ) {
5742 $id_scan_state = 'A';
5744 elsif ( $tok =~ /^[A-Za-z_]/ ) {
5745 $id_scan_state = ':';
5747 elsif ( $tok eq '->' ) {
5748 $id_scan_state = '$';
5753 my ( $a, $b, $c ) = caller;
5754 warning("Program Bug: scan_identifier given bad token = $tok \n");
5755 warning(" called from sub $a line: $c\n");
5756 report_definite_bug();
5758 $saw_type = !$saw_alpha;
5762 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
5765 # now loop to gather the identifier
5768 while ( $i < $max_token_index ) {
5769 $i_save = $i unless ( $tok =~ /^\s*$/ );
5770 $tok = $rtokens->[ ++$i ];
5772 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
5777 if ( $id_scan_state eq '$' ) { # starting variable name
5779 if ( $tok eq '$' ) {
5781 $identifier .= $tok;
5783 # we've got a punctuation variable if end of line (punct.t)
5784 if ( $i == $max_token_index ) {
5786 $id_scan_state = '';
5791 # POSTDEFREF ->@ ->% ->& ->*
5792 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
5793 $identifier .= $tok;
5795 elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric ..
5797 $id_scan_state = ':'; # now need ::
5798 $identifier .= $tok;
5800 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
5802 $id_scan_state = ':'; # now need ::
5803 $identifier .= $tok;
5805 # Perl will accept leading digits in identifiers,
5806 # although they may not always produce useful results.
5807 # Something like $main::0 is ok. But this also works:
5809 # sub howdy::123::bubba{ print "bubba $54321!\n" }
5810 # howdy::123::bubba();
5813 elsif ( $tok =~ /^[0-9]/ ) { # numeric
5815 $id_scan_state = ':'; # now need ::
5816 $identifier .= $tok;
5818 elsif ( $tok eq '::' ) {
5819 $id_scan_state = 'A';
5820 $identifier .= $tok;
5823 # $# and POSTDEFREF ->$#
5824 elsif ( ( $tok eq '#' ) && ( $identifier =~ /\$$/ ) ) { # $#array
5825 $identifier .= $tok; # keep same state, a $ could follow
5827 elsif ( $tok eq '{' ) {
5829 # check for something like ${#} or ${©}
5833 || $identifier eq '@'
5834 || $identifier eq '$#'
5836 && $i + 2 <= $max_token_index
5837 && $rtokens->[ $i + 2 ] eq '}'
5838 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
5841 my $next2 = $rtokens->[ $i + 2 ];
5842 my $next1 = $rtokens->[ $i + 1 ];
5843 $identifier .= $tok . $next1 . $next2;
5845 $id_scan_state = '';
5849 # skip something like ${xxx} or ->{
5850 $id_scan_state = '';
5852 # if this is the first token of a line, any tokens for this
5853 # identifier have already been accumulated
5854 if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
5859 # space ok after leading $ % * & @
5860 elsif ( $tok =~ /^\s*$/ ) {
5862 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
5864 if ( length($identifier) > 1 ) {
5865 $id_scan_state = '';
5867 $type = 'i'; # probably punctuation variable
5872 # spaces after $'s are common, and space after @
5873 # is harmless, so only complain about space
5874 # after other type characters. Space after $ and
5875 # @ will be removed in formatting. Report space
5876 # after % and * because they might indicate a
5877 # parsing error. In other words '% ' might be a
5878 # modulo operator. Delete this warning if it
5880 if ( $identifier !~ /^[\@\$]$/ ) {
5882 "Space in identifier, following $identifier\n";
5888 # space after '->' is ok
5890 elsif ( $tok eq '^' ) {
5892 # check for some special variables like $^W
5893 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5894 $identifier .= $tok;
5895 $id_scan_state = 'A';
5897 # Perl accepts '$^]' or '@^]', but
5898 # there must not be a space before the ']'.
5899 my $next1 = $rtokens->[ $i + 1 ];
5900 if ( $next1 eq ']' ) {
5902 $identifier .= $next1;
5903 $id_scan_state = "";
5908 $id_scan_state = '';
5911 else { # something else
5913 if ( $in_prototype_or_signature && $tok =~ /^[\),=]/ ) {
5914 $id_scan_state = '';
5916 $type = 'i'; # probably punctuation variable
5920 # check for various punctuation variables
5921 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5922 $identifier .= $tok;
5925 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
5926 elsif ( $tok eq '*' && $identifier =~ /([\@\%\$\*\&]|\$\#)$/ ) {
5927 $identifier .= $tok;
5930 elsif ( $identifier eq '$#' ) {
5932 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
5934 # perl seems to allow just these: $#: $#- $#+
5935 elsif ( $tok =~ /^[\:\-\+]$/ ) {
5937 $identifier .= $tok;
5941 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
5944 elsif ( $identifier eq '$$' ) {
5946 # perl does not allow references to punctuation
5947 # variables without braces. For example, this
5951 # You would have to use
5955 if ( $tok eq '{' ) { $type = 't' }
5956 else { $type = 'i' }
5958 elsif ( $identifier eq '->' ) {
5963 if ( length($identifier) == 1 ) { $identifier = ''; }
5965 $id_scan_state = '';
5969 elsif ( $id_scan_state eq '&' ) { # starting sub call?
5971 if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric ..
5972 $id_scan_state = ':'; # now need ::
5974 $identifier .= $tok;
5976 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
5977 $id_scan_state = ':'; # now need ::
5979 $identifier .= $tok;
5981 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
5982 $id_scan_state = ':'; # now need ::
5984 $identifier .= $tok;
5986 elsif ( $tok =~ /^\s*$/ ) { # allow space
5988 elsif ( $tok eq '::' ) { # leading ::
5989 $id_scan_state = 'A'; # accept alpha next
5990 $identifier .= $tok;
5992 elsif ( $tok eq '{' ) {
5993 if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
5995 $id_scan_state = '';
6000 # punctuation variable?
6001 # testfile: cunningham4.pl
6003 # We have to be careful here. If we are in an unknown state,
6004 # we will reject the punctuation variable. In the following
6005 # example the '&' is a binary operator but we are in an unknown
6006 # state because there is no sigil on 'Prima', so we don't
6007 # know what it is. But it is a bad guess that
6008 # '&~' is a function variable.
6009 # $self->{text}->{colorMap}->[
6010 # Prima::PodView::COLOR_CODE_FOREGROUND
6011 # & ~tb::COLOR_INDEX ] =
6013 if ( $identifier eq '&' && $expecting ) {
6014 $identifier .= $tok;
6021 $id_scan_state = '';
6025 elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::)
6027 if ( $tok =~ /^[A-Za-z_]/ ) { # found it
6028 $identifier .= $tok;
6029 $id_scan_state = ':'; # now need ::
6032 elsif ( $tok eq "'" && $allow_tick ) {
6033 $identifier .= $tok;
6034 $id_scan_state = ':'; # now need ::
6037 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
6038 $identifier .= $tok;
6039 $id_scan_state = ':'; # now need ::
6042 elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6043 $id_scan_state = '(';
6044 $identifier .= $tok;
6046 elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6047 $id_scan_state = ')';
6048 $identifier .= $tok;
6051 $id_scan_state = '';
6056 elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
6058 if ( $tok eq '::' ) { # got it
6059 $identifier .= $tok;
6060 $id_scan_state = 'A'; # now require alpha
6062 elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here
6063 $identifier .= $tok;
6064 $id_scan_state = ':'; # now need ::
6067 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
6068 $identifier .= $tok;
6069 $id_scan_state = ':'; # now need ::
6072 elsif ( $tok eq "'" && $allow_tick ) { # tick
6074 if ( $is_keyword{$identifier} ) {
6075 $id_scan_state = ''; # that's all
6079 $identifier .= $tok;
6082 elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6083 $id_scan_state = '(';
6084 $identifier .= $tok;
6086 elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6087 $id_scan_state = ')';
6088 $identifier .= $tok;
6091 $id_scan_state = ''; # that's all
6096 elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype
6098 if ( $tok eq '(' ) { # got it
6099 $identifier .= $tok;
6100 $id_scan_state = ')'; # now find the end of it
6102 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
6103 $identifier .= $tok;
6106 $id_scan_state = ''; # that's all - no prototype
6111 elsif ( $id_scan_state eq ')' ) { # looking for ) to end
6113 if ( $tok eq ')' ) { # got it
6114 $identifier .= $tok;
6115 $id_scan_state = ''; # all done
6118 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
6119 $identifier .= $tok;
6121 else { # probable error in script, but keep going
6122 warning("Unexpected '$tok' while seeking end of prototype\n");
6123 $identifier .= $tok;
6126 else { # can get here due to error in initialization
6127 $id_scan_state = '';
6133 if ( $id_scan_state eq ')' ) {
6134 warning("Hit end of line while seeking ) to end prototype\n");
6137 # once we enter the actual identifier, it may not extend beyond
6138 # the end of the current line
6139 if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
6140 $id_scan_state = '';
6142 if ( $i < 0 ) { $i = 0 }
6149 if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
6152 else { $type = 'i' }
6154 elsif ( $identifier eq '->' ) {
6158 ( length($identifier) > 1 )
6160 # In something like '@$=' we have an identifier '@$'
6161 # In something like '$${' we have type '$$' (and only
6162 # part of an identifier)
6163 && !( $identifier =~ /\$$/ && $tok eq '{' )
6164 && ( $identifier !~ /^(sub |package )$/ )
6169 else { $type = 't' }
6171 elsif ($saw_alpha) {
6173 # type 'w' includes anything without leading type info
6174 # ($,%,@,*) including something like abc::def::ghi
6179 } # this can happen on a restart
6184 if ($message) { write_logfile_entry($message) }
6191 TOKENIZER_DEBUG_FLAG_SCAN_ID && do {
6192 my ( $a, $b, $c ) = caller;
6194 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
6196 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
6198 return ( $i, $tok, $type, $id_scan_state, $identifier );
6203 # saved package and subnames in case prototype is on separate line
6204 my ( $package_saved, $subname_saved );
6208 # do_scan_sub parses a sub name and prototype
6209 # it is called with $i_beg equal to the index of the first nonblank
6210 # token following a 'sub' token.
6212 # TODO: add future error checks to be sure we have a valid
6213 # sub name. For example, 'sub &doit' is wrong. Also, be sure
6214 # a name is given if and only if a non-anonymous sub is
6216 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
6217 # $in_attribute_list, %saw_function_definition,
6221 $input_line, $i, $i_beg,
6222 $tok, $type, $rtokens,
6223 $rtoken_map, $id_scan_state, $max_token_index
6225 $id_scan_state = ""; # normally we get everything in one call
6226 my $subname = undef;
6227 my $package = undef;
6232 my $pos_beg = $rtoken_map->[$i_beg];
6233 pos($input_line) = $pos_beg;
6235 # Look for the sub NAME
6237 $input_line =~ m/\G\s*
6238 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
6239 (\w+) # NAME - required
6246 $package = ( defined($1) && $1 ) ? $1 : $current_package;
6247 $package =~ s/\'/::/g;
6248 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6249 $package =~ s/::$//;
6250 my $pos = pos($input_line);
6251 my $numc = $pos - $pos_beg;
6252 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
6256 # Now look for PROTO ATTRS
6257 # Look for prototype/attributes which are usually on the same
6258 # line as the sub name but which might be on a separate line.
6259 # For example, we might have an anonymous sub with attributes,
6260 # or a prototype on a separate line from its sub name
6262 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
6263 # does not look like a prototype, we assume it is a SIGNATURE and we
6264 # will stop and let the the standard tokenizer handle it. In
6265 # particular, we stop if we see any nested parens, braces, or commas.
6266 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
6268 $input_line =~ m/\G(\s*\([^\)\(\}\{\,]*\))? # PROTO
6269 (\s*:)? # ATTRS leading ':'
6277 # If we also found the sub name on this call then append PROTO.
6278 # This is not necessary but for compatability with previous
6279 # versions when the -csc flag is used:
6280 if ( $match && $proto ) {
6285 # Handle prototype on separate line from subname
6286 if ($subname_saved) {
6287 $package = $package_saved;
6288 $subname = $subname_saved;
6289 $tok = $last_nonblank_token;
6296 # ATTRS: if there are attributes, back up and let the ':' be
6297 # found later by the scanner.
6298 my $pos = pos($input_line);
6300 $pos -= length($attrs);
6303 my $next_nonblank_token = $tok;
6305 # catch case of line with leading ATTR ':' after anonymous sub
6306 if ( $pos == $pos_beg && $tok eq ':' ) {
6308 $in_attribute_list = 1;
6311 # Otherwise, if we found a match we must convert back from
6312 # string position to the pre_token index for continued parsing.
6315 # I don't think an error flag can occur here ..but ?
6317 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
6319 if ($error) { warning("Possibly invalid sub\n") }
6321 # check for multiple definitions of a sub
6322 ( $next_nonblank_token, my $i_next ) =
6323 find_next_nonblank_token_on_this_line( $i, $rtokens,
6327 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
6328 { # skip blank or side comment
6329 my ( $rpre_tokens, $rpre_types ) =
6330 peek_ahead_for_n_nonblank_pre_tokens(1);
6331 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
6332 $next_nonblank_token = $rpre_tokens->[0];
6335 $next_nonblank_token = '}';
6338 $package_saved = "";
6339 $subname_saved = "";
6341 # See what's next...
6342 if ( $next_nonblank_token eq '{' ) {
6345 # Check for multiple definitions of a sub, but
6346 # it is ok to have multiple sub BEGIN, etc,
6347 # so we do not complain if name is all caps
6348 if ( $saw_function_definition{$package}{$subname}
6349 && $subname !~ /^[A-Z]+$/ )
6351 my $lno = $saw_function_definition{$package}{$subname};
6353 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
6356 $saw_function_definition{$package}{$subname} =
6357 $tokenizer_self->{_last_line_number};
6360 elsif ( $next_nonblank_token eq ';' ) {
6362 elsif ( $next_nonblank_token eq '}' ) {
6365 # ATTRS - if an attribute list follows, remember the name
6366 # of the sub so the next opening brace can be labeled.
6367 # Setting 'statement_type' causes any ':'s to introduce
6369 elsif ( $next_nonblank_token eq ':' ) {
6370 $statement_type = $tok;
6373 # if we stopped before an open paren ...
6374 elsif ( $next_nonblank_token eq '(' ) {
6376 # If we DID NOT see this paren above then it must be on the
6377 # next line so we will set a flag to come back here and see if
6380 # Otherwise, we assume it is a SIGNATURE rather than a
6381 # PROTOTYPE and let the normal tokenizer handle it as a list
6382 if ( !$saw_opening_paren ) {
6383 $id_scan_state = 'sub'; # we must come back to get proto
6384 $package_saved = $package;
6385 $subname_saved = $subname;
6387 $statement_type = $tok;
6389 elsif ($next_nonblank_token) { # EOF technically ok
6391 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
6394 check_prototype( $proto, $package, $subname );
6397 # no match but line not blank
6400 return ( $i, $tok, $type, $id_scan_state );
6404 #########i###############################################################
6405 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
6406 #########################################################################
6408 sub find_next_nonblank_token {
6409 my ( $i, $rtokens, $max_token_index ) = @_;
6411 if ( $i >= $max_token_index ) {
6412 if ( !peeked_ahead() ) {
6415 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
6418 my $next_nonblank_token = $rtokens->[ ++$i ];
6420 if ( $next_nonblank_token =~ /^\s*$/ ) {
6421 $next_nonblank_token = $rtokens->[ ++$i ];
6423 return ( $next_nonblank_token, $i );
6426 sub numerator_expected {
6428 # this is a filter for a possible numerator, in support of guessing
6429 # for the / pattern delimiter token.
6434 # Note: I am using the convention that variables ending in
6435 # _expected have these 3 possible values.
6436 my ( $i, $rtokens, $max_token_index ) = @_;
6437 my $numerator_expected = 0;
6439 my $next_token = $rtokens->[ $i + 1 ];
6440 if ( $next_token eq '=' ) { $i++; } # handle /=
6441 my ( $next_nonblank_token, $i_next ) =
6442 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6444 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
6445 $numerator_expected = 1;
6449 if ( $next_nonblank_token =~ /^\s*$/ ) {
6450 $numerator_expected = 0;
6453 $numerator_expected = -1;
6456 return $numerator_expected;
6459 sub pattern_expected {
6461 # This is the start of a filter for a possible pattern.
6462 # It looks at the token after a possible pattern and tries to
6463 # determine if that token could end a pattern.
6468 my ( $i, $rtokens, $max_token_index ) = @_;
6471 my $next_token = $rtokens->[ $i + 1 ];
6472 if ( $next_token =~ /^[msixpodualgc]/ ) { $i++; } # skip possible modifier
6473 my ( $next_nonblank_token, $i_next ) =
6474 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6476 # list of tokens which may follow a pattern
6477 # (can probably be expanded)
6478 if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ )
6484 if ( $next_nonblank_token =~ /^\s*$/ ) {
6494 sub find_next_nonblank_token_on_this_line {
6495 my ( $i, $rtokens, $max_token_index ) = @_;
6496 my $next_nonblank_token;
6498 if ( $i < $max_token_index ) {
6499 $next_nonblank_token = $rtokens->[ ++$i ];
6501 if ( $next_nonblank_token =~ /^\s*$/ ) {
6503 if ( $i < $max_token_index ) {
6504 $next_nonblank_token = $rtokens->[ ++$i ];
6509 $next_nonblank_token = "";
6511 return ( $next_nonblank_token, $i );
6514 sub find_angle_operator_termination {
6516 # We are looking at a '<' and want to know if it is an angle operator.
6518 # $i = pretoken index of ending '>' if found, current $i otherwise
6519 # $type = 'Q' if found, '>' otherwise
6520 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
6523 pos($input_line) = 1 + $rtoken_map->[$i];
6527 # we just have to find the next '>' if a term is expected
6528 if ( $expecting == TERM ) { $filter = '[\>]' }
6530 # we have to guess if we don't know what is expected
6531 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
6533 # shouldn't happen - we shouldn't be here if operator is expected
6534 else { warning("Program Bug in find_angle_operator_termination\n") }
6536 # To illustrate what we might be looking at, in case we are
6537 # guessing, here are some examples of valid angle operators
6544 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
6545 # <${PREFIX}*img*.$IMAGE_TYPE>
6546 # <img*.$IMAGE_TYPE>
6547 # <Timg*.$IMAGE_TYPE>
6548 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
6550 # Here are some examples of lines which do not have angle operators:
6551 # return undef unless $self->[2]++ < $#{$self->[1]};
6554 # the following line from dlister.pl caused trouble:
6555 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
6557 # If the '<' starts an angle operator, it must end on this line and
6558 # it must not have certain characters like ';' and '=' in it. I use
6559 # this to limit the testing. This filter should be improved if
6562 if ( $input_line =~ /($filter)/g ) {
6566 # We MAY have found an angle operator termination if we get
6567 # here, but we need to do more to be sure we haven't been
6569 my $pos = pos($input_line);
6571 my $pos_beg = $rtoken_map->[$i];
6572 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
6574 # Reject if the closing '>' follows a '-' as in:
6575 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
6576 if ( $expecting eq UNKNOWN ) {
6577 my $check = substr( $input_line, $pos - 2, 1 );
6578 if ( $check eq '-' ) {
6579 return ( $i, $type );
6583 ######################################debug#####
6584 #write_diagnostics( "ANGLE? :$str\n");
6585 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
6586 ######################################debug#####
6590 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6592 # It may be possible that a quote ends midway in a pretoken.
6593 # If this happens, it may be necessary to split the pretoken.
6596 "Possible tokinization error..please check this line\n");
6597 report_possible_bug();
6600 # Now let's see where we stand....
6601 # OK if math op not possible
6602 if ( $expecting == TERM ) {
6605 # OK if there are no more than 2 pre-tokens inside
6606 # (not possible to write 2 token math between < and >)
6607 # This catches most common cases
6608 elsif ( $i <= $i_beg + 3 ) {
6609 write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
6615 # Let's try a Brace Test: any braces inside must balance
6617 while ( $str =~ /\{/g ) { $br++ }
6618 while ( $str =~ /\}/g ) { $br-- }
6620 while ( $str =~ /\[/g ) { $sb++ }
6621 while ( $str =~ /\]/g ) { $sb-- }
6623 while ( $str =~ /\(/g ) { $pr++ }
6624 while ( $str =~ /\)/g ) { $pr-- }
6626 # if braces do not balance - not angle operator
6627 if ( $br || $sb || $pr ) {
6631 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
6634 # we should keep doing more checks here...to be continued
6635 # Tentatively accepting this as a valid angle operator.
6636 # There are lots more things that can be checked.
6639 "ANGLE-Guessing yes: $str expecting=$expecting\n");
6640 write_logfile_entry("Guessing angle operator here: $str\n");
6645 # didn't find ending >
6647 if ( $expecting == TERM ) {
6648 warning("No ending > for angle operator\n");
6652 return ( $i, $type );
6655 sub scan_number_do {
6657 # scan a number in any of the formats that Perl accepts
6658 # Underbars (_) are allowed in decimal numbers.
6659 # input parameters -
6660 # $input_line - the string to scan
6661 # $i - pre_token index to start scanning
6662 # $rtoken_map - reference to the pre_token map giving starting
6663 # character position in $input_line of token $i
6664 # output parameters -
6665 # $i - last pre_token index of the number just scanned
6666 # number - the number (characters); or undef if not a number
6668 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
6669 my $pos_beg = $rtoken_map->[$i];
6673 my $type = $input_type;
6675 my $first_char = substr( $input_line, $pos_beg, 1 );
6677 # Look for bad starting characters; Shouldn't happen..
6678 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
6679 warning("Program bug - scan_number given character $first_char\n");
6680 report_definite_bug();
6681 return ( $i, $type, $number );
6684 # handle v-string without leading 'v' character ('Two Dot' rule)
6686 # TODO: v-strings may contain underscores
6687 pos($input_line) = $pos_beg;
6688 if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
6689 $pos = pos($input_line);
6690 my $numc = $pos - $pos_beg;
6691 $number = substr( $input_line, $pos_beg, $numc );
6693 report_v_string($number);
6696 # handle octal, hex, binary
6697 if ( !defined($number) ) {
6698 pos($input_line) = $pos_beg;
6700 /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
6702 $pos = pos($input_line);
6703 my $numc = $pos - $pos_beg;
6704 $number = substr( $input_line, $pos_beg, $numc );
6710 if ( !defined($number) ) {
6711 pos($input_line) = $pos_beg;
6713 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
6714 $pos = pos($input_line);
6716 # watch out for things like 0..40 which would give 0. by this;
6717 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
6718 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
6722 my $numc = $pos - $pos_beg;
6723 $number = substr( $input_line, $pos_beg, $numc );
6728 # filter out non-numbers like e + - . e2 .e3 +e6
6729 # the rule: at least one digit, and any 'e' must be preceded by a digit
6731 $number !~ /\d/ # no digits
6732 || ( $number =~ /^(.*)[eE]/
6733 && $1 !~ /\d/ ) # or no digits before the 'e'
6737 $type = $input_type;
6738 return ( $i, $type, $number );
6741 # Found a number; now we must convert back from character position
6742 # to pre_token index. An error here implies user syntax error.
6743 # An example would be an invalid octal number like '009'.
6746 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6747 if ($error) { warning("Possibly invalid number\n") }
6749 return ( $i, $type, $number );
6752 sub inverse_pretoken_map {
6754 # Starting with the current pre_token index $i, scan forward until
6755 # finding the index of the next pre_token whose position is $pos.
6756 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
6759 while ( ++$i <= $max_token_index ) {
6761 if ( $pos <= $rtoken_map->[$i] ) {
6763 # Let the calling routine handle errors in which we do not
6764 # land on a pre-token boundary. It can happen by running
6765 # perltidy on some non-perl scripts, for example.
6766 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
6771 return ( $i, $error );
6776 # find the target of a here document, if any
6778 # $i - token index of the second < of <<
6779 # ($i must be less than the last token index if this is called)
6780 # output parameters:
6781 # $found_target = 0 didn't find target; =1 found target
6782 # HERE_TARGET - the target string (may be empty string)
6783 # $i - unchanged if not here doc,
6784 # or index of the last token of the here target
6785 # $saw_error - flag noting unbalanced quote on here target
6786 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6788 my $found_target = 0;
6789 my $here_doc_target = '';
6790 my $here_quote_character = '';
6792 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
6793 $next_token = $rtokens->[ $i + 1 ];
6795 # perl allows a backslash before the target string (heredoc.t)
6797 if ( $next_token eq '\\' ) {
6799 $next_token = $rtokens->[ $i + 2 ];
6802 ( $next_nonblank_token, $i_next_nonblank ) =
6803 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
6805 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
6808 my $quote_depth = 0;
6813 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
6816 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
6817 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
6819 if ($in_quote) { # didn't find end of quote, so no target found
6821 if ( $expecting == TERM ) {
6823 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
6828 else { # found ending quote
6833 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
6834 $tokj = $rtokens->[$j];
6836 # we have to remove any backslash before the quote character
6837 # so that the here-doc-target exactly matches this string
6841 && $rtokens->[ $j + 1 ] eq $here_quote_character );
6842 $here_doc_target .= $tokj;
6847 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
6849 write_logfile_entry(
6850 "found blank here-target after <<; suggest using \"\"\n");
6853 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
6855 my $here_doc_expected;
6856 if ( $expecting == UNKNOWN ) {
6857 $here_doc_expected = guess_if_here_doc($next_token);
6860 $here_doc_expected = 1;
6863 if ($here_doc_expected) {
6865 $here_doc_target = $next_token;
6872 if ( $expecting == TERM ) {
6874 write_logfile_entry("Note: bare here-doc operator <<\n");
6881 # patch to neglect any prepended backslash
6882 if ( $found_target && $backslash ) { $i++ }
6884 return ( $found_target, $here_doc_target, $here_quote_character, $i,
6890 # follow (or continue following) quoted string(s)
6891 # $in_quote return code:
6893 # 1 - still must find end of quote whose target is $quote_character
6894 # 2 - still looking for end of first of two quotes
6896 # Returns updated strings:
6897 # $quoted_string_1 = quoted string seen while in_quote=1
6898 # $quoted_string_2 = quoted string seen while in_quote=2
6900 $i, $in_quote, $quote_character,
6901 $quote_pos, $quote_depth, $quoted_string_1,
6902 $quoted_string_2, $rtokens, $rtoken_map,
6906 my $in_quote_starting = $in_quote;
6909 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
6912 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6915 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
6916 $quote_pos, $quote_depth, $max_token_index );
6917 $quoted_string_2 .= $quoted_string;
6918 if ( $in_quote == 1 ) {
6919 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
6920 $quote_character = '';
6923 $quoted_string_2 .= "\n";
6927 if ( $in_quote == 1 ) { # one (more) quote to follow
6930 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6933 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6934 $quote_pos, $quote_depth, $max_token_index );
6935 $quoted_string_1 .= $quoted_string;
6936 if ( $in_quote == 1 ) {
6937 $quoted_string_1 .= "\n";
6940 return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6941 $quoted_string_1, $quoted_string_2 );
6944 sub follow_quoted_string {
6946 # scan for a specific token, skipping escaped characters
6947 # if the quote character is blank, use the first non-blank character
6949 # $rtokens = reference to the array of tokens
6950 # $i = the token index of the first character to search
6951 # $in_quote = number of quoted strings being followed
6952 # $beginning_tok = the starting quote character
6953 # $quote_pos = index to check next for alphanumeric delimiter
6954 # output parameters:
6955 # $i = the token index of the ending quote character
6956 # $in_quote = decremented if found end, unchanged if not
6957 # $beginning_tok = the starting quote character
6958 # $quote_pos = index to check next for alphanumeric delimiter
6959 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
6960 # $quoted_string = the text of the quote (without quotation tokens)
6961 my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
6964 my ( $tok, $end_tok );
6966 my $quoted_string = "";
6968 TOKENIZER_DEBUG_FLAG_QUOTE && do {
6970 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
6973 # get the corresponding end token
6974 if ( $beginning_tok !~ /^\s*$/ ) {
6975 $end_tok = matching_end_token($beginning_tok);
6978 # a blank token means we must find and use the first non-blank one
6980 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
6982 while ( $i < $max_token_index ) {
6983 $tok = $rtokens->[ ++$i ];
6985 if ( $tok !~ /^\s*$/ ) {
6987 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
6988 $i = $max_token_index;
6992 if ( length($tok) > 1 ) {
6993 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
6994 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
6997 $beginning_tok = $tok;
7000 $end_tok = matching_end_token($beginning_tok);
7006 $allow_quote_comments = 1;
7011 # There are two different loops which search for the ending quote
7012 # character. In the rare case of an alphanumeric quote delimiter, we
7013 # have to look through alphanumeric tokens character-by-character, since
7014 # the pre-tokenization process combines multiple alphanumeric
7015 # characters, whereas for a non-alphanumeric delimiter, only tokens of
7016 # length 1 can match.
7018 ###################################################################
7019 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
7020 # "quote_pos" is the position the current word to begin searching
7021 ###################################################################
7022 if ( $beginning_tok =~ /\w/ ) {
7024 # Note this because it is not recommended practice except
7025 # for obfuscated perl contests
7026 if ( $in_quote == 1 ) {
7027 write_logfile_entry(
7028 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
7031 while ( $i < $max_token_index ) {
7033 if ( $quote_pos == 0 || ( $i < 0 ) ) {
7034 $tok = $rtokens->[ ++$i ];
7036 if ( $tok eq '\\' ) {
7038 # retain backslash unless it hides the end token
7039 $quoted_string .= $tok
7040 unless $rtokens->[ $i + 1 ] eq $end_tok;
7042 last if ( $i >= $max_token_index );
7043 $tok = $rtokens->[ ++$i ];
7046 my $old_pos = $quote_pos;
7048 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
7052 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
7054 if ( $quote_pos > 0 ) {
7057 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
7061 if ( $quote_depth == 0 ) {
7067 $quoted_string .= substr( $tok, $old_pos );
7072 ########################################################################
7073 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
7074 ########################################################################
7077 while ( $i < $max_token_index ) {
7078 $tok = $rtokens->[ ++$i ];
7080 if ( $tok eq $end_tok ) {
7083 if ( $quote_depth == 0 ) {
7088 elsif ( $tok eq $beginning_tok ) {
7091 elsif ( $tok eq '\\' ) {
7093 # retain backslash unless it hides the beginning or end token
7094 $tok = $rtokens->[ ++$i ];
7095 $quoted_string .= '\\'
7096 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
7098 $quoted_string .= $tok;
7101 if ( $i > $max_token_index ) { $i = $max_token_index }
7102 return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
7106 sub indicate_error {
7107 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
7108 interrupt_logfile();
7110 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
7115 sub write_error_indicator_pair {
7116 my ( $line_number, $input_line, $pos, $carrat ) = @_;
7117 my ( $offset, $numbered_line, $underline ) =
7118 make_numbered_line( $line_number, $input_line, $pos );
7119 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
7120 warning( $numbered_line . "\n" );
7121 $underline =~ s/\s*$//;
7122 warning( $underline . "\n" );
7126 sub make_numbered_line {
7128 # Given an input line, its line number, and a character position of
7129 # interest, create a string not longer than 80 characters of the form
7130 # $lineno: sub_string
7131 # such that the sub_string of $str contains the position of interest
7133 # Here is an example of what we want, in this case we add trailing
7134 # '...' because the line is long.
7136 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7138 # Here is another example, this time in which we used leading '...'
7139 # because of excessive length:
7141 # 2: ... er of the World Wide Web Consortium's
7143 # input parameters are:
7144 # $lineno = line number
7145 # $str = the text of the line
7146 # $pos = position of interest (the error) : 0 = first character
7149 # - $offset = an offset which corrects the position in case we only
7150 # display part of a line, such that $pos-$offset is the effective
7151 # position from the start of the displayed line.
7152 # - $numbered_line = the numbered line as above,
7153 # - $underline = a blank 'underline' which is all spaces with the same
7154 # number of characters as the numbered line.
7156 my ( $lineno, $str, $pos ) = @_;
7157 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
7158 my $excess = length($str) - $offset - 68;
7159 my $numc = ( $excess > 0 ) ? 68 : undef;
7161 if ( defined($numc) ) {
7162 if ( $offset == 0 ) {
7163 $str = substr( $str, $offset, $numc - 4 ) . " ...";
7166 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
7171 if ( $offset == 0 ) {
7174 $str = "... " . substr( $str, $offset + 4 );
7178 my $numbered_line = sprintf( "%d: ", $lineno );
7179 $offset -= length($numbered_line);
7180 $numbered_line .= $str;
7181 my $underline = " " x length($numbered_line);
7182 return ( $offset, $numbered_line, $underline );
7185 sub write_on_underline {
7187 # The "underline" is a string that shows where an error is; it starts
7188 # out as a string of blanks with the same length as the numbered line of
7189 # code above it, and we have to add marking to show where an error is.
7190 # In the example below, we want to write the string '--^' just below
7191 # the line of bad code:
7193 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7195 # We are given the current underline string, plus a position and a
7196 # string to write on it.
7198 # In the above example, there will be 2 calls to do this:
7199 # First call: $pos=19, pos_chr=^
7200 # Second call: $pos=16, pos_chr=---
7202 # This is a trivial thing to do with substr, but there is some
7205 my ( $underline, $pos, $pos_chr ) = @_;
7207 # check for error..shouldn't happen
7208 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
7211 my $excess = length($pos_chr) + $pos - length($underline);
7212 if ( $excess > 0 ) {
7213 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
7215 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
7216 return ($underline);
7221 # Break a string, $str, into a sequence of preliminary tokens. We
7222 # are interested in these types of tokens:
7223 # words (type='w'), example: 'max_tokens_wanted'
7224 # digits (type = 'd'), example: '0755'
7225 # whitespace (type = 'b'), example: ' '
7226 # any other single character (i.e. punct; type = the character itself).
7227 # We cannot do better than this yet because we might be in a quoted
7228 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
7230 my ( $str, $max_tokens_wanted ) = @_;
7232 # we return references to these 3 arrays:
7233 my @tokens = (); # array of the tokens themselves
7234 my @token_map = (0); # string position of start of each token
7235 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
7240 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
7243 # note that this must come before words!
7244 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
7247 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
7249 # single-character punctuation
7250 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
7254 return ( \@tokens, \@token_map, \@type );
7258 push @token_map, pos($str);
7260 } while ( --$max_tokens_wanted != 0 );
7262 return ( \@tokens, \@token_map, \@type );
7267 # this is an old debug routine
7268 # not called, but saved for reference
7269 my ( $rtokens, $rtoken_map ) = @_;
7270 my $num = scalar( @{$rtokens} );
7272 foreach my $i ( 0 .. $num - 1 ) {
7273 my $len = length( $rtokens->[$i] );
7274 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
7280 my %matching_end_token;
7283 %matching_end_token = (
7291 sub matching_end_token {
7293 # return closing character for a pattern
7294 my $beginning_token = shift;
7295 if ( $matching_end_token{$beginning_token} ) {
7296 return $matching_end_token{$beginning_token};
7298 return ($beginning_token);
7302 sub dump_token_types {
7303 my ( $class, $fh ) = @_;
7305 # This should be the latest list of token types in use
7306 # adding NEW_TOKENS: add a comment here
7307 print $fh <<'END_OF_LIST';
7309 Here is a list of the token types currently used for lines of type 'CODE'.
7310 For the following tokens, the "type" of a token is just the token itself.
7312 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7313 ( ) <= >= == =~ !~ != ++ -- /= x=
7314 ... **= <<= >>= &&= ||= //= <=>
7315 , + - / * | % ! x ~ = \ ? : . < > ^ &
7317 The following additional token types are defined:
7320 b blank (white space)
7321 { indent: opening structural curly brace or square bracket or paren
7322 (code block, anonymous hash reference, or anonymous array reference)
7323 } outdent: right structural curly brace or square bracket or paren
7324 [ left non-structural square bracket (enclosing an array index)
7325 ] right non-structural square bracket
7326 ( left non-structural paren (all but a list right of an =)
7327 ) right non-structural paren
7328 L left non-structural curly brace (enclosing a key)
7329 R right non-structural curly brace
7330 ; terminal semicolon
7331 f indicates a semicolon in a "for" statement
7332 h here_doc operator <<
7334 Q indicates a quote or pattern
7335 q indicates a qw quote block
7337 C user-defined constant or constant function (with void prototype = ())
7338 U user-defined function taking parameters
7339 G user-defined function taking block parameter (like grep/map/eval)
7340 M (unused, but reserved for subroutine definition name)
7341 P (unused, but -html uses it to label pod text)
7342 t type indicater such as %,$,@,*,&,sub
7343 w bare word (perhaps a subroutine call)
7344 i identifier of some type (with leading %, $, @, *, &, sub, -> )
7347 F a file test operator (like -e)
7349 Z identifier in indirect object slot: may be file handle, object
7350 J LABEL: code block label
7351 j LABEL after next, last, redo, goto
7354 pp pre-increment operator ++
7355 mm pre-decrement operator --
7356 A : used as attribute separator
7358 Here are the '_line_type' codes used internally:
7359 SYSTEM - system-specific code before hash-bang line
7360 CODE - line of perl code (including comments)
7361 POD_START - line starting pod, such as '=head'
7362 POD - pod documentation text
7363 POD_END - last line of pod section, '=cut'
7364 HERE - text of here-document
7365 HERE_END - last line of here-doc (target word)
7366 FORMAT - format section
7367 FORMAT_END - last line of format section, '.'
7368 DATA_START - __DATA__ line
7369 DATA - unidentified text following __DATA__
7370 END_START - __END__ line
7371 END - unidentified text following __END__
7372 ERROR - we are in big trouble, probably not a perl script
7380 # These names are used in error messages
7381 @opening_brace_names = qw# '{' '[' '(' '?' #;
7382 @closing_brace_names = qw# '}' ']' ')' ':' #;
7387 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7388 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
7390 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
7392 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
7393 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
7395 my @tetragraphs = qw( <<>> );
7396 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
7398 # make a hash of all valid token types for self-checking the tokenizer
7399 # (adding NEW_TOKENS : select a new character and add to this list)
7400 my @valid_token_types = qw#
7401 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
7402 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
7404 push( @valid_token_types, @digraphs );
7405 push( @valid_token_types, @trigraphs );
7406 push( @valid_token_types, @tetragraphs );
7407 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
7408 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
7410 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
7411 my @file_test_operators =
7412 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
7413 @is_file_test_operator{@file_test_operators} =
7414 (1) x scalar(@file_test_operators);
7416 # these functions have prototypes of the form (&), so when they are
7417 # followed by a block, that block MAY BE followed by an operator.
7418 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
7420 @is_block_operator{@q} = (1) x scalar(@q);
7422 # these functions allow an identifier in the indirect object slot
7423 @q = qw( print printf sort exec system say);
7424 @is_indirect_object_taker{@q} = (1) x scalar(@q);
7426 # These tokens may precede a code block
7427 # patched for SWITCH/CASE/CATCH. Actually these could be removed
7428 # now and we could let the extended-syntax coding handle them
7430 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
7431 unless do while until eval for foreach map grep sort
7432 switch case given when catch try finally);
7433 @is_code_block_token{@q} = (1) x scalar(@q);
7435 # I'll build the list of keywords incrementally
7438 # keywords and tokens after which a value or pattern is expected,
7439 # but not an operator. In other words, these should consume terms
7440 # to their right, or at least they are not expected to be followed
7441 # immediately by operators.
7442 my @value_requestor = qw(
7665 # patched above for SWITCH/CASE given/when err say
7666 # 'err' is a fairly safe addition.
7667 # TODO: 'default' still needed if appropriate
7668 # 'use feature' seen, but perltidy works ok without it.
7669 # Concerned that 'default' could break code.
7670 push( @Keywords, @value_requestor );
7672 # These are treated the same but are not keywords:
7677 push( @value_requestor, @extra_vr );
7679 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
7681 # this list contains keywords which do not look for arguments,
7682 # so that they might be followed by an operator, or at least
7684 my @operator_requestor = qw(
7708 push( @Keywords, @operator_requestor );
7710 # These are treated the same but are not considered keywords:
7717 push( @operator_requestor, @extra_or );
7719 @expecting_operator_token{@operator_requestor} =
7720 (1) x scalar(@operator_requestor);
7722 # these token TYPES expect trailing operator but not a term
7723 # note: ++ and -- are post-increment and decrement, 'C' = constant
7724 my @operator_requestor_types = qw( ++ -- C <> q );
7725 @expecting_operator_types{@operator_requestor_types} =
7726 (1) x scalar(@operator_requestor_types);
7728 # these token TYPES consume values (terms)
7729 # note: pp and mm are pre-increment and decrement
7730 # f=semicolon in for, F=file test operator
7731 my @value_requestor_type = qw#
7732 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
7733 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
7734 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~
7735 f F pp mm Y p m U J G j >> << ^ t
7736 ~. ^. |. &. ^.= |.= &.=
7738 push( @value_requestor_type, ',' )
7739 ; # (perl doesn't like a ',' in a qw block)
7740 @expecting_term_types{@value_requestor_type} =
7741 (1) x scalar(@value_requestor_type);
7743 # Note: the following valid token types are not assigned here to
7744 # hashes requesting to be followed by values or terms, but are
7745 # instead currently hard-coded into sub operator_expected:
7746 # ) -> :: Q R Z ] b h i k n v w } #
7748 # For simple syntax checking, it is nice to have a list of operators which
7749 # will really be unhappy if not followed by a term. This includes most
7751 %really_want_term = %expecting_term_types;
7753 # with these exceptions...
7754 delete $really_want_term{'U'}; # user sub, depends on prototype
7755 delete $really_want_term{'F'}; # file test works on $_ if no following term
7756 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
7759 @q = qw(q qq qw qx qr s y tr m);
7760 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
7762 # These keywords are handled specially in the tokenizer code:
7763 my @special_keywords = qw(
7779 push( @Keywords, @special_keywords );
7781 # Keywords after which list formatting may be used
7782 # WARNING: do not include |map|grep|eval or perl may die on
7783 # syntax errors (map1.t).
7784 my @keyword_taking_list = qw(
7858 @is_keyword_taking_list{@keyword_taking_list} =
7859 (1) x scalar(@keyword_taking_list);
7861 # perl functions which may be unary operators
7862 my @keyword_taking_optional_args = qw(
7872 @is_keyword_taking_optional_args{@keyword_taking_optional_args} =
7873 (1) x scalar(@keyword_taking_optional_args);
7875 # These are not used in any way yet
7876 # my @unused_keywords = qw(
7882 # The list of keywords was originally extracted from function 'keyword' in
7883 # perl file toke.c version 5.005.03, using this utility, plus a
7884 # little editing: (file getkwd.pl):
7885 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
7886 # Add 'get' prefix where necessary, then split into the above lists.
7887 # This list should be updated as necessary.
7888 # The list should not contain these special variables:
7889 # ARGV DATA ENV SIG STDERR STDIN STDOUT
7892 @is_keyword{@Keywords} = (1) x scalar(@Keywords);