1 ########################################################################
3 # the Perl::Tidy::Tokenizer package is essentially a filter which
4 # reads lines of perl source code from a source object and provides
5 # corresponding tokenized lines through its get_line() method. Lines
6 # flow from the source_object to the caller like this:
8 # source_object --> LineBuffer_object --> Tokenizer --> calling routine
9 # get_line() get_line() get_line() line_of_tokens
11 # The source object can be any object with a get_line() method which
12 # supplies one line (a character string) perl call.
13 # The LineBuffer object is created by the Tokenizer.
14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
15 # containing one tokenized line for each call to its get_line() method.
17 # WARNING: This is not a real class yet. Only one tokenizer my be used.
19 ########################################################################
21 package Perl::Tidy::Tokenizer;
24 our $VERSION = '20200110';
26 use Perl::Tidy::LineBuffer;
30 # Caution: these debug flags produce a lot of output
31 # They should all be 0 except when debugging small scripts
33 use constant TOKENIZER_DEBUG_FLAG_EXPECT => 0;
34 use constant TOKENIZER_DEBUG_FLAG_NSCAN => 0;
35 use constant TOKENIZER_DEBUG_FLAG_QUOTE => 0;
36 use constant TOKENIZER_DEBUG_FLAG_SCAN_ID => 0;
37 use constant TOKENIZER_DEBUG_FLAG_TOKENIZE => 0;
39 my $debug_warning = sub {
40 print STDOUT "TOKENIZER_DEBUGGING with key $_[0]\n";
43 TOKENIZER_DEBUG_FLAG_EXPECT && $debug_warning->('EXPECT');
44 TOKENIZER_DEBUG_FLAG_NSCAN && $debug_warning->('NSCAN');
45 TOKENIZER_DEBUG_FLAG_QUOTE && $debug_warning->('QUOTE');
46 TOKENIZER_DEBUG_FLAG_SCAN_ID && $debug_warning->('SCAN_ID');
47 TOKENIZER_DEBUG_FLAG_TOKENIZE && $debug_warning->('TOKENIZE');
53 # PACKAGE VARIABLES for processing an entire FILE.
59 $last_nonblank_block_type
67 %user_function_prototype
69 %is_block_list_function
70 %saw_function_definition
79 @nesting_sequence_number
80 @current_sequence_number
82 @paren_semicolon_count
83 @paren_structural_type
85 @brace_structural_type
89 @square_bracket_structural_type
92 @nested_statement_type
93 @starting_line_of_current_depth
96 # GLOBAL CONSTANTS for routines in this package
98 %is_indirect_object_taker
100 %expecting_operator_token
101 %expecting_operator_types
102 %expecting_term_types
103 %expecting_term_token
105 %is_file_test_operator
114 %is_keyword_taking_list
115 %is_keyword_taking_optional_args
116 %is_q_qq_qw_qx_qr_s_y_tr_m
121 # possible values of operator_expected()
122 use constant TERM => -1;
123 use constant UNKNOWN => 0;
124 use constant OPERATOR => 1;
126 # possible values of context
127 use constant SCALAR_CONTEXT => -1;
128 use constant UNKNOWN_CONTEXT => 0;
129 use constant LIST_CONTEXT => 1;
131 # Maximum number of little messages; probably need not be changed.
132 use constant MAX_NAG_MESSAGES => 6;
136 # methods to count instances
138 sub get_count { return $_count; }
139 sub _increment_count { return ++$_count }
140 sub _decrement_count { return --$_count }
145 $self->_decrement_count();
151 # Check Tokenizer parameters
157 # Install any aliases to 'sub'
158 if ( $rOpts->{'sub-alias-list'} ) {
160 # Note that any 'sub-alias-list' has been preprocessed to
161 # be a trimmed, space-separated list which includes 'sub'
162 # for example, it might be 'sub method fun'
163 my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
164 foreach my $word (@sub_alias_list) {
173 my ( $class, @args ) = @_;
175 # Note: 'tabs' and 'indent_columns' are temporary and should be
178 source_object => undef,
179 debugger_object => undef,
180 diagnostics_object => undef,
181 logger_object => undef,
182 starting_level => undef,
185 look_for_hash_bang => 0,
187 look_for_autoloader => 1,
188 look_for_selfloader => 1,
189 starting_line_number => 1,
190 extended_syntax => 0,
192 my %args = ( %defaults, @args );
194 # we are given an object with a get_line() method to supply source lines
195 my $source_object = $args{source_object};
197 # we create another object with a get_line() and peek_ahead() method
198 my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
200 # Tokenizer state data is as follows:
201 # _rhere_target_list reference to list of here-doc targets
202 # _here_doc_target the target string for a here document
203 # _here_quote_character the type of here-doc quoting (" ' ` or none)
204 # to determine if interpolation is done
205 # _quote_target character we seek if chasing a quote
206 # _line_start_quote line where we started looking for a long quote
207 # _in_here_doc flag indicating if we are in a here-doc
208 # _in_pod flag set if we are in pod documentation
209 # _in_error flag set if we saw severe error (binary in script)
210 # _in_data flag set if we are in __DATA__ section
211 # _in_end flag set if we are in __END__ section
212 # _in_format flag set if we are in a format description
213 # _in_attribute_list flag telling if we are looking for attributes
214 # _in_quote flag telling if we are chasing a quote
215 # _starting_level indentation level of first line
216 # _line_buffer_object object with get_line() method to supply source code
217 # _diagnostics_object place to write debugging information
218 # _unexpected_error_count error count used to limit output
219 # _lower_case_labels_at line numbers where lower case labels seen
220 # _hit_bug program bug detected
222 _rhere_target_list => [],
224 _here_doc_target => "",
225 _here_quote_character => "",
231 _in_attribute_list => 0,
234 _line_start_quote => -1,
235 _starting_level => $args{starting_level},
236 _know_starting_level => defined( $args{starting_level} ),
237 _tabsize => $args{tabsize},
238 _indent_columns => $args{indent_columns},
239 _look_for_hash_bang => $args{look_for_hash_bang},
240 _trim_qw => $args{trim_qw},
241 _continuation_indentation => $args{continuation_indentation},
242 _outdent_labels => $args{outdent_labels},
243 _last_line_number => $args{starting_line_number} - 1,
244 _saw_perl_dash_P => 0,
245 _saw_perl_dash_w => 0,
246 _saw_use_strict => 0,
249 _look_for_autoloader => $args{look_for_autoloader},
250 _look_for_selfloader => $args{look_for_selfloader},
251 _saw_autoloader => 0,
252 _saw_selfloader => 0,
256 _saw_negative_indentation => 0,
257 _started_tokenizing => 0,
258 _line_buffer_object => $line_buffer_object,
259 _debugger_object => $args{debugger_object},
260 _diagnostics_object => $args{diagnostics_object},
261 _logger_object => $args{logger_object},
262 _unexpected_error_count => 0,
263 _started_looking_for_here_target_at => 0,
264 _nearly_matched_here_target_at => undef,
266 _rlower_case_labels_at => undef,
267 _extended_syntax => $args{extended_syntax},
270 prepare_for_a_new_file();
271 find_starting_indentation_level();
273 bless $tokenizer_self, $class;
275 # This is not a full class yet, so die if an attempt is made to
276 # create more than one object.
278 if ( _increment_count() > 1 ) {
280 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
283 return $tokenizer_self;
287 # interface to Perl::Tidy::Logger routines
290 my $logger_object = $tokenizer_self->{_logger_object};
291 if ($logger_object) {
292 $logger_object->warning($msg);
299 my $logger_object = $tokenizer_self->{_logger_object};
300 if ($logger_object) {
301 $logger_object->complain($msg);
306 sub write_logfile_entry {
308 my $logger_object = $tokenizer_self->{_logger_object};
309 if ($logger_object) {
310 $logger_object->write_logfile_entry($msg);
315 sub interrupt_logfile {
316 my $logger_object = $tokenizer_self->{_logger_object};
317 if ($logger_object) {
318 $logger_object->interrupt_logfile();
324 my $logger_object = $tokenizer_self->{_logger_object};
325 if ($logger_object) {
326 $logger_object->resume_logfile();
331 sub increment_brace_error {
332 my $logger_object = $tokenizer_self->{_logger_object};
333 if ($logger_object) {
334 $logger_object->increment_brace_error();
339 sub report_definite_bug {
340 $tokenizer_self->{_hit_bug} = 1;
341 my $logger_object = $tokenizer_self->{_logger_object};
342 if ($logger_object) {
343 $logger_object->report_definite_bug();
350 my $logger_object = $tokenizer_self->{_logger_object};
351 if ($logger_object) {
352 $logger_object->brace_warning($msg);
357 sub get_saw_brace_error {
358 my $logger_object = $tokenizer_self->{_logger_object};
359 if ($logger_object) {
360 return $logger_object->get_saw_brace_error();
367 sub get_unexpected_error_count {
369 return $self->{_unexpected_error_count};
372 # interface to Perl::Tidy::Diagnostics routines
373 sub write_diagnostics {
375 if ( $tokenizer_self->{_diagnostics_object} ) {
376 $tokenizer_self->{_diagnostics_object}->write_diagnostics($msg);
381 sub report_tokenization_errors {
384 my $severe_error = $self->{_in_error};
386 my $level = get_indentation_level();
387 if ( $level != $tokenizer_self->{_starting_level} ) {
388 warning("final indentation level: $level\n");
391 check_final_nesting_depths();
393 if ( $tokenizer_self->{_look_for_hash_bang}
394 && !$tokenizer_self->{_saw_hash_bang} )
397 "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
400 if ( $tokenizer_self->{_in_format} ) {
401 warning("hit EOF while in format description\n");
404 if ( $tokenizer_self->{_in_pod} ) {
406 # Just write log entry if this is after __END__ or __DATA__
407 # because this happens to often, and it is not likely to be
409 if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
411 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
417 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
423 if ( $tokenizer_self->{_in_here_doc} ) {
425 my $here_doc_target = $tokenizer_self->{_here_doc_target};
426 my $started_looking_for_here_target_at =
427 $tokenizer_self->{_started_looking_for_here_target_at};
428 if ($here_doc_target) {
430 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
435 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
438 my $nearly_matched_here_target_at =
439 $tokenizer_self->{_nearly_matched_here_target_at};
440 if ($nearly_matched_here_target_at) {
442 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
447 if ( $tokenizer_self->{_in_quote} ) {
449 my $line_start_quote = $tokenizer_self->{_line_start_quote};
450 my $quote_target = $tokenizer_self->{_quote_target};
452 ( $tokenizer_self->{_in_attribute_list} )
456 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
460 if ( $tokenizer_self->{_hit_bug} ) {
464 my $logger_object = $tokenizer_self->{_logger_object};
466 # TODO: eventually may want to activate this to cause file to be output verbatim
469 # Set the severe error for a fairly high warning count because
470 # some of the warnings do not harm formatting, such as duplicate
472 my $warning_count = $logger_object->{_warning_count};
473 if ( $warning_count > 50 ) {
477 # Brace errors are significant, so set the severe error flag at
479 my $saw_brace_error = $logger_object->{_saw_brace_error};
480 if ( $saw_brace_error > 2 ) {
485 unless ( $tokenizer_self->{_saw_perl_dash_w} ) {
487 write_logfile_entry("Suggest including '-w parameter'\n");
490 write_logfile_entry("Suggest including 'use warnings;'\n");
494 if ( $tokenizer_self->{_saw_perl_dash_P} ) {
495 write_logfile_entry("Use of -P parameter for defines is discouraged\n");
498 unless ( $tokenizer_self->{_saw_use_strict} ) {
499 write_logfile_entry("Suggest including 'use strict;'\n");
502 # it is suggested that labels have at least one upper case character
503 # for legibility and to avoid code breakage as new keywords are introduced
504 if ( $tokenizer_self->{_rlower_case_labels_at} ) {
505 my @lower_case_labels_at =
506 @{ $tokenizer_self->{_rlower_case_labels_at} };
508 "Suggest using upper case characters in label(s)\n");
510 write_logfile_entry(" defined at line(s): (@lower_case_labels_at)\n");
512 return $severe_error;
515 sub report_v_string {
517 # warn if this version can't handle v-strings
519 unless ( $tokenizer_self->{_saw_v_string} ) {
520 $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number};
524 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
530 sub get_input_line_number {
531 return $tokenizer_self->{_last_line_number};
534 # returns the next tokenized line
539 # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
540 # $square_bracket_depth, $paren_depth
542 my $input_line = $tokenizer_self->{_line_buffer_object}->get_line();
543 $tokenizer_self->{_line_text} = $input_line;
545 return unless ($input_line);
547 my $input_line_number = ++$tokenizer_self->{_last_line_number};
549 # Find and remove what characters terminate this line, including any
551 my $input_line_separator = "";
552 if ( chomp($input_line) ) { $input_line_separator = $/ }
554 # TODO: what other characters should be included here?
555 if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
556 $input_line_separator = $2 . $input_line_separator;
559 # for backwards compatibility we keep the line text terminated with
560 # a newline character
562 $tokenizer_self->{_line_text} = $input_line; # update
564 # create a data structure describing this line which will be
565 # returned to the caller.
567 # _line_type codes are:
568 # SYSTEM - system-specific code before hash-bang line
569 # CODE - line of perl code (including comments)
570 # POD_START - line starting pod, such as '=head'
571 # POD - pod documentation text
572 # POD_END - last line of pod section, '=cut'
573 # HERE - text of here-document
574 # HERE_END - last line of here-doc (target word)
575 # FORMAT - format section
576 # FORMAT_END - last line of format section, '.'
577 # DATA_START - __DATA__ line
578 # DATA - unidentified text following __DATA__
579 # END_START - __END__ line
580 # END - unidentified text following __END__
581 # ERROR - we are in big trouble, probably not a perl script
584 # _curly_brace_depth - depth of curly braces at start of line
585 # _square_bracket_depth - depth of square brackets at start of line
586 # _paren_depth - depth of parens at start of line
587 # _starting_in_quote - this line continues a multi-line quote
588 # (so don't trim leading blanks!)
589 # _ending_in_quote - this line ends in a multi-line quote
590 # (so don't trim trailing blanks!)
591 my $line_of_tokens = {
593 _line_text => $input_line,
594 _line_number => $input_line_number,
595 _rtoken_type => undef,
599 _rblock_type => undef,
600 _rcontainer_type => undef,
601 _rcontainer_environment => undef,
602 _rtype_sequence => undef,
603 _rnesting_tokens => undef,
604 _rci_levels => undef,
605 _rnesting_blocks => undef,
606 _guessed_indentation_level => 0,
607 _starting_in_quote => 0, # to be set by subroutine
608 _ending_in_quote => 0,
609 _curly_brace_depth => $brace_depth,
610 _square_bracket_depth => $square_bracket_depth,
611 _paren_depth => $paren_depth,
612 _quote_character => '',
615 # must print line unchanged if we are in a here document
616 if ( $tokenizer_self->{_in_here_doc} ) {
618 $line_of_tokens->{_line_type} = 'HERE';
619 my $here_doc_target = $tokenizer_self->{_here_doc_target};
620 my $here_quote_character = $tokenizer_self->{_here_quote_character};
621 my $candidate_target = $input_line;
622 chomp $candidate_target;
624 # Handle <<~ targets, which are indicated here by a leading space on
625 # the here quote character
626 if ( $here_quote_character =~ /^\s/ ) {
627 $candidate_target =~ s/^\s*//;
629 if ( $candidate_target eq $here_doc_target ) {
630 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
631 $line_of_tokens->{_line_type} = 'HERE_END';
632 write_logfile_entry("Exiting HERE document $here_doc_target\n");
634 my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
635 if ( @{$rhere_target_list} ) { # there can be multiple here targets
636 ( $here_doc_target, $here_quote_character ) =
637 @{ shift @{$rhere_target_list} };
638 $tokenizer_self->{_here_doc_target} = $here_doc_target;
639 $tokenizer_self->{_here_quote_character} =
640 $here_quote_character;
642 "Entering HERE document $here_doc_target\n");
643 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
644 $tokenizer_self->{_started_looking_for_here_target_at} =
648 $tokenizer_self->{_in_here_doc} = 0;
649 $tokenizer_self->{_here_doc_target} = "";
650 $tokenizer_self->{_here_quote_character} = "";
654 # check for error of extra whitespace
655 # note for PERL6: leading whitespace is allowed
657 $candidate_target =~ s/\s*$//;
658 $candidate_target =~ s/^\s*//;
659 if ( $candidate_target eq $here_doc_target ) {
660 $tokenizer_self->{_nearly_matched_here_target_at} =
664 return $line_of_tokens;
667 # must print line unchanged if we are in a format section
668 elsif ( $tokenizer_self->{_in_format} ) {
670 if ( $input_line =~ /^\.[\s#]*$/ ) {
671 write_logfile_entry("Exiting format section\n");
672 $tokenizer_self->{_in_format} = 0;
673 $line_of_tokens->{_line_type} = 'FORMAT_END';
676 $line_of_tokens->{_line_type} = 'FORMAT';
678 return $line_of_tokens;
681 # must print line unchanged if we are in pod documentation
682 elsif ( $tokenizer_self->{_in_pod} ) {
684 $line_of_tokens->{_line_type} = 'POD';
685 if ( $input_line =~ /^=cut/ ) {
686 $line_of_tokens->{_line_type} = 'POD_END';
687 write_logfile_entry("Exiting POD section\n");
688 $tokenizer_self->{_in_pod} = 0;
690 if ( $input_line =~ /^\#\!.*perl\b/ ) {
692 "Hash-bang in pod can cause older versions of perl to fail! \n"
696 return $line_of_tokens;
699 # must print line unchanged if we have seen a severe error (i.e., we
700 # are seeing illegal tokens and cannot continue. Syntax errors do
701 # not pass this route). Calling routine can decide what to do, but
702 # the default can be to just pass all lines as if they were after __END__
703 elsif ( $tokenizer_self->{_in_error} ) {
704 $line_of_tokens->{_line_type} = 'ERROR';
705 return $line_of_tokens;
708 # print line unchanged if we are __DATA__ section
709 elsif ( $tokenizer_self->{_in_data} ) {
711 # ...but look for POD
712 # Note that the _in_data and _in_end flags remain set
713 # so that we return to that state after seeing the
714 # end of a pod section
715 if ( $input_line =~ /^=(?!cut)/ ) {
716 $line_of_tokens->{_line_type} = 'POD_START';
717 write_logfile_entry("Entering POD section\n");
718 $tokenizer_self->{_in_pod} = 1;
719 return $line_of_tokens;
722 $line_of_tokens->{_line_type} = 'DATA';
723 return $line_of_tokens;
727 # print line unchanged if we are in __END__ section
728 elsif ( $tokenizer_self->{_in_end} ) {
730 # ...but look for POD
731 # Note that the _in_data and _in_end flags remain set
732 # so that we return to that state after seeing the
733 # end of a pod section
734 if ( $input_line =~ /^=(?!cut)/ ) {
735 $line_of_tokens->{_line_type} = 'POD_START';
736 write_logfile_entry("Entering POD section\n");
737 $tokenizer_self->{_in_pod} = 1;
738 return $line_of_tokens;
741 $line_of_tokens->{_line_type} = 'END';
742 return $line_of_tokens;
746 # check for a hash-bang line if we haven't seen one
747 if ( !$tokenizer_self->{_saw_hash_bang} ) {
748 if ( $input_line =~ /^\#\!.*perl\b/ ) {
749 $tokenizer_self->{_saw_hash_bang} = $input_line_number;
751 # check for -w and -P flags
752 if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
753 $tokenizer_self->{_saw_perl_dash_P} = 1;
756 if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
757 $tokenizer_self->{_saw_perl_dash_w} = 1;
761 ( $input_line_number > 1 )
763 # leave any hash bang in a BEGIN block alone
764 # i.e. see 'debugger-duck_type.t'
766 $last_nonblank_block_type
767 && $last_nonblank_block_type eq 'BEGIN'
769 && ( !$tokenizer_self->{_look_for_hash_bang} )
773 # this is helpful for VMS systems; we may have accidentally
774 # tokenized some DCL commands
775 if ( $tokenizer_self->{_started_tokenizing} ) {
777 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
781 complain("Useless hash-bang after line 1\n");
785 # Report the leading hash-bang as a system line
786 # This will prevent -dac from deleting it
788 $line_of_tokens->{_line_type} = 'SYSTEM';
789 return $line_of_tokens;
794 # wait for a hash-bang before parsing if the user invoked us with -x
795 if ( $tokenizer_self->{_look_for_hash_bang}
796 && !$tokenizer_self->{_saw_hash_bang} )
798 $line_of_tokens->{_line_type} = 'SYSTEM';
799 return $line_of_tokens;
802 # a first line of the form ': #' will be marked as SYSTEM
803 # since lines of this form may be used by tcsh
804 if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
805 $line_of_tokens->{_line_type} = 'SYSTEM';
806 return $line_of_tokens;
809 # now we know that it is ok to tokenize the line...
810 # the line tokenizer will modify any of these private variables:
818 my $ending_in_quote_last = $tokenizer_self->{_in_quote};
819 tokenize_this_line($line_of_tokens);
821 # Now finish defining the return structure and return it
822 $line_of_tokens->{_ending_in_quote} = $tokenizer_self->{_in_quote};
824 # handle severe error (binary data in script)
825 if ( $tokenizer_self->{_in_error} ) {
826 $tokenizer_self->{_in_quote} = 0; # to avoid any more messages
827 warning("Giving up after error\n");
828 $line_of_tokens->{_line_type} = 'ERROR';
829 reset_indentation_level(0); # avoid error messages
830 return $line_of_tokens;
833 # handle start of pod documentation
834 if ( $tokenizer_self->{_in_pod} ) {
836 # This gets tricky..above a __DATA__ or __END__ section, perl
837 # accepts '=cut' as the start of pod section. But afterwards,
838 # only pod utilities see it and they may ignore an =cut without
839 # leading =head. In any case, this isn't good.
840 if ( $input_line =~ /^=cut\b/ ) {
841 if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
842 complain("=cut while not in pod ignored\n");
843 $tokenizer_self->{_in_pod} = 0;
844 $line_of_tokens->{_line_type} = 'POD_END';
847 $line_of_tokens->{_line_type} = 'POD_START';
849 "=cut starts a pod section .. this can fool pod utilities.\n"
851 write_logfile_entry("Entering POD section\n");
856 $line_of_tokens->{_line_type} = 'POD_START';
857 write_logfile_entry("Entering POD section\n");
860 return $line_of_tokens;
863 # update indentation levels for log messages
864 if ( $input_line !~ /^\s*$/ ) {
865 my $rlevels = $line_of_tokens->{_rlevels};
866 $line_of_tokens->{_guessed_indentation_level} =
867 guess_old_indentation_level($input_line);
870 # see if this line contains here doc targets
871 my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
872 if ( @{$rhere_target_list} ) {
874 my ( $here_doc_target, $here_quote_character ) =
875 @{ shift @{$rhere_target_list} };
876 $tokenizer_self->{_in_here_doc} = 1;
877 $tokenizer_self->{_here_doc_target} = $here_doc_target;
878 $tokenizer_self->{_here_quote_character} = $here_quote_character;
879 write_logfile_entry("Entering HERE document $here_doc_target\n");
880 $tokenizer_self->{_started_looking_for_here_target_at} =
884 # NOTE: __END__ and __DATA__ statements are written unformatted
885 # because they can theoretically contain additional characters
886 # which are not tokenized (and cannot be read with <DATA> either!).
887 if ( $tokenizer_self->{_in_data} ) {
888 $line_of_tokens->{_line_type} = 'DATA_START';
889 write_logfile_entry("Starting __DATA__ section\n");
890 $tokenizer_self->{_saw_data} = 1;
892 # keep parsing after __DATA__ if use SelfLoader was seen
893 if ( $tokenizer_self->{_saw_selfloader} ) {
894 $tokenizer_self->{_in_data} = 0;
896 "SelfLoader seen, continuing; -nlsl deactivates\n");
899 return $line_of_tokens;
902 elsif ( $tokenizer_self->{_in_end} ) {
903 $line_of_tokens->{_line_type} = 'END_START';
904 write_logfile_entry("Starting __END__ section\n");
905 $tokenizer_self->{_saw_end} = 1;
907 # keep parsing after __END__ if use AutoLoader was seen
908 if ( $tokenizer_self->{_saw_autoloader} ) {
909 $tokenizer_self->{_in_end} = 0;
911 "AutoLoader seen, continuing; -nlal deactivates\n");
913 return $line_of_tokens;
916 # now, finally, we know that this line is type 'CODE'
917 $line_of_tokens->{_line_type} = 'CODE';
919 # remember if we have seen any real code
920 if ( !$tokenizer_self->{_started_tokenizing}
921 && $input_line !~ /^\s*$/
922 && $input_line !~ /^\s*#/ )
924 $tokenizer_self->{_started_tokenizing} = 1;
927 if ( $tokenizer_self->{_debugger_object} ) {
928 $tokenizer_self->{_debugger_object}->write_debug_entry($line_of_tokens);
931 # Note: if keyword 'format' occurs in this line code, it is still CODE
932 # (keyword 'format' need not start a line)
933 if ( $tokenizer_self->{_in_format} ) {
934 write_logfile_entry("Entering format section\n");
937 if ( $tokenizer_self->{_in_quote}
938 and ( $tokenizer_self->{_line_start_quote} < 0 ) )
941 #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
943 ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ )
945 $tokenizer_self->{_line_start_quote} = $input_line_number;
947 "Start multi-line quote or pattern ending in $quote_target\n");
950 elsif ( ( $tokenizer_self->{_line_start_quote} >= 0 )
951 && !$tokenizer_self->{_in_quote} )
953 $tokenizer_self->{_line_start_quote} = -1;
954 write_logfile_entry("End of multi-line quote or pattern\n");
957 # we are returning a line of CODE
958 return $line_of_tokens;
961 sub find_starting_indentation_level {
963 # We need to find the indentation level of the first line of the
964 # script being formatted. Often it will be zero for an entire file,
965 # but if we are formatting a local block of code (within an editor for
966 # example) it may not be zero. The user may specify this with the
967 # -sil=n parameter but normally doesn't so we have to guess.
969 # USES GLOBAL VARIABLES: $tokenizer_self
970 my $starting_level = 0;
972 # use value if given as parameter
973 if ( $tokenizer_self->{_know_starting_level} ) {
974 $starting_level = $tokenizer_self->{_starting_level};
977 # if we know there is a hash_bang line, the level must be zero
978 elsif ( $tokenizer_self->{_look_for_hash_bang} ) {
979 $tokenizer_self->{_know_starting_level} = 1;
982 # otherwise figure it out from the input file
987 # keep looking at lines until we find a hash bang or piece of code
990 $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
993 # if first line is #! then assume starting level is zero
994 if ( $i == 1 && $line =~ /^\#\!/ ) {
998 next if ( $line =~ /^\s*#/ ); # skip past comments
999 next if ( $line =~ /^\s*$/ ); # skip past blank lines
1000 $starting_level = guess_old_indentation_level($line);
1003 $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1004 write_logfile_entry("$msg");
1006 $tokenizer_self->{_starting_level} = $starting_level;
1007 reset_indentation_level($starting_level);
1011 sub guess_old_indentation_level {
1014 # Guess the indentation level of an input line.
1016 # For the first line of code this result will define the starting
1017 # indentation level. It will mainly be non-zero when perltidy is applied
1018 # within an editor to a local block of code.
1020 # This is an impossible task in general because we can't know what tabs
1021 # meant for the old script and how many spaces were used for one
1022 # indentation level in the given input script. For example it may have
1023 # been previously formatted with -i=7 -et=3. But we can at least try to
1024 # make sure that perltidy guesses correctly if it is applied repeatedly to
1025 # a block of code within an editor, so that the block stays at the same
1026 # level when perltidy is applied repeatedly.
1028 # USES GLOBAL VARIABLES: $tokenizer_self
1031 # find leading tabs, spaces, and any statement label
1033 if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1035 # If there are leading tabs, we use the tab scheme for this run, if
1036 # any, so that the code will remain stable when editing.
1037 if ($1) { $spaces += length($1) * $tokenizer_self->{_tabsize} }
1039 if ($2) { $spaces += length($2) }
1041 # correct for outdented labels
1042 if ( $3 && $tokenizer_self->{'_outdent_labels'} ) {
1043 $spaces += $tokenizer_self->{_continuation_indentation};
1047 # compute indentation using the value of -i for this run.
1048 # If -i=0 is used for this run (which is possible) it doesn't matter
1049 # what we do here but we'll guess that the old run used 4 spaces per level.
1050 my $indent_columns = $tokenizer_self->{_indent_columns};
1051 $indent_columns = 4 if ( !$indent_columns );
1052 $level = int( $spaces / $indent_columns );
1056 # This is a currently unused debug routine
1057 sub dump_functions {
1060 foreach my $pkg ( keys %is_user_function ) {
1061 print $fh "\nnon-constant subs in package $pkg\n";
1063 foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1065 if ( $is_block_list_function{$pkg}{$sub} ) {
1066 $msg = 'block_list';
1069 if ( $is_block_function{$pkg}{$sub} ) {
1072 print $fh "$sub $msg\n";
1076 foreach my $pkg ( keys %is_constant ) {
1077 print $fh "\nconstants and constant subs in package $pkg\n";
1079 foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1088 # count number of 1's in a string of 1's and 0's
1089 # example: ones_count("010101010101") gives 6
1091 return $str =~ tr/1/0/;
1094 sub prepare_for_a_new_file {
1096 # previous tokens needed to determine what to expect next
1097 $last_nonblank_token = ';'; # the only possible starting state which
1098 $last_nonblank_type = ';'; # will make a leading brace a code block
1099 $last_nonblank_block_type = '';
1101 # scalars for remembering statement types across multiple lines
1102 $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
1103 $in_attribute_list = 0;
1105 # scalars for remembering where we are in the file
1106 $current_package = "main";
1107 $context = UNKNOWN_CONTEXT;
1109 # hashes used to remember function information
1110 %is_constant = (); # user-defined constants
1111 %is_user_function = (); # user-defined functions
1112 %user_function_prototype = (); # their prototypes
1113 %is_block_function = ();
1114 %is_block_list_function = ();
1115 %saw_function_definition = ();
1117 # variables used to track depths of various containers
1118 # and report nesting errors
1121 $square_bracket_depth = 0;
1122 @current_depth = (0) x scalar @closing_brace_names;
1125 @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1126 @current_sequence_number = ();
1127 $paren_type[$paren_depth] = '';
1128 $paren_semicolon_count[$paren_depth] = 0;
1129 $paren_structural_type[$brace_depth] = '';
1130 $brace_type[$brace_depth] = ';'; # identify opening brace as code block
1131 $brace_structural_type[$brace_depth] = '';
1132 $brace_context[$brace_depth] = UNKNOWN_CONTEXT;
1133 $brace_package[$paren_depth] = $current_package;
1134 $square_bracket_type[$square_bracket_depth] = '';
1135 $square_bracket_structural_type[$square_bracket_depth] = '';
1137 initialize_tokenizer_state();
1141 { # begin tokenize_this_line
1143 use constant BRACE => 0;
1144 use constant SQUARE_BRACKET => 1;
1145 use constant PAREN => 2;
1146 use constant QUESTION_COLON => 3;
1148 # TV1: scalars for processing one LINE.
1149 # Re-initialized on each entry to sub tokenize_this_line.
1151 $block_type, $container_type, $expecting,
1152 $i, $i_tok, $input_line,
1153 $input_line_number, $last_nonblank_i, $max_token_index,
1154 $next_tok, $next_type, $peeked_ahead,
1155 $prototype, $rhere_target_list, $rtoken_map,
1156 $rtoken_type, $rtokens, $tok,
1157 $type, $type_sequence, $indent_flag,
1160 # TV2: refs to ARRAYS for processing one LINE
1161 # Re-initialized on each call.
1162 my $routput_token_list = []; # stack of output token indexes
1163 my $routput_token_type = []; # token types
1164 my $routput_block_type = []; # types of code block
1165 my $routput_container_type = []; # paren types, such as if, elsif, ..
1166 my $routput_type_sequence = []; # nesting sequential number
1167 my $routput_indent_flag = []; #
1169 # TV3: SCALARS for quote variables. These are initialized with a
1170 # subroutine call and continually updated as lines are processed.
1171 my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1172 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1174 # TV4: SCALARS for multi-line identifiers and
1175 # statements. These are initialized with a subroutine call
1176 # and continually updated as lines are processed.
1177 my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1179 # TV5: SCALARS for tracking indentation level.
1180 # Initialized once and continually updated as lines are
1183 $nesting_token_string, $nesting_type_string,
1184 $nesting_block_string, $nesting_block_flag,
1185 $nesting_list_string, $nesting_list_flag,
1186 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1187 $in_statement_continuation, $level_in_tokenizer,
1188 $slevel_in_tokenizer, $rslevel_stack,
1191 # TV6: SCALARS for remembering several previous
1192 # tokens. Initialized once and continually updated as
1193 # lines are processed.
1195 $last_nonblank_container_type, $last_nonblank_type_sequence,
1196 $last_last_nonblank_token, $last_last_nonblank_type,
1197 $last_last_nonblank_block_type, $last_last_nonblank_container_type,
1198 $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1201 # ----------------------------------------------------------------
1202 # beginning of tokenizer variable access and manipulation routines
1203 # ----------------------------------------------------------------
1205 sub initialize_tokenizer_state {
1207 # TV1: initialized on each call
1208 # TV2: initialized on each call
1212 $quote_character = "";
1215 $quoted_string_1 = "";
1216 $quoted_string_2 = "";
1217 $allowed_quote_modifiers = "";
1220 $id_scan_state = '';
1223 $indented_if_level = 0;
1226 $nesting_token_string = "";
1227 $nesting_type_string = "";
1228 $nesting_block_string = '1'; # initially in a block
1229 $nesting_block_flag = 1;
1230 $nesting_list_string = '0'; # initially not in a list
1231 $nesting_list_flag = 0; # initially not in a list
1232 $ci_string_in_tokenizer = "";
1233 $continuation_string_in_tokenizer = "0";
1234 $in_statement_continuation = 0;
1235 $level_in_tokenizer = 0;
1236 $slevel_in_tokenizer = 0;
1237 $rslevel_stack = [];
1240 $last_nonblank_container_type = '';
1241 $last_nonblank_type_sequence = '';
1242 $last_last_nonblank_token = ';';
1243 $last_last_nonblank_type = ';';
1244 $last_last_nonblank_block_type = '';
1245 $last_last_nonblank_container_type = '';
1246 $last_last_nonblank_type_sequence = '';
1247 $last_nonblank_prototype = "";
1251 sub save_tokenizer_state {
1254 $block_type, $container_type, $expecting,
1255 $i, $i_tok, $input_line,
1256 $input_line_number, $last_nonblank_i, $max_token_index,
1257 $next_tok, $next_type, $peeked_ahead,
1258 $prototype, $rhere_target_list, $rtoken_map,
1259 $rtoken_type, $rtokens, $tok,
1260 $type, $type_sequence, $indent_flag,
1264 $routput_token_list, $routput_token_type,
1265 $routput_block_type, $routput_container_type,
1266 $routput_type_sequence, $routput_indent_flag,
1270 $in_quote, $quote_type,
1271 $quote_character, $quote_pos,
1272 $quote_depth, $quoted_string_1,
1273 $quoted_string_2, $allowed_quote_modifiers,
1277 [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1280 $nesting_token_string, $nesting_type_string,
1281 $nesting_block_string, $nesting_block_flag,
1282 $nesting_list_string, $nesting_list_flag,
1283 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1284 $in_statement_continuation, $level_in_tokenizer,
1285 $slevel_in_tokenizer, $rslevel_stack,
1289 $last_nonblank_container_type,
1290 $last_nonblank_type_sequence,
1291 $last_last_nonblank_token,
1292 $last_last_nonblank_type,
1293 $last_last_nonblank_block_type,
1294 $last_last_nonblank_container_type,
1295 $last_last_nonblank_type_sequence,
1296 $last_nonblank_prototype,
1298 return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1301 sub restore_tokenizer_state {
1303 my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1305 $block_type, $container_type, $expecting,
1306 $i, $i_tok, $input_line,
1307 $input_line_number, $last_nonblank_i, $max_token_index,
1308 $next_tok, $next_type, $peeked_ahead,
1309 $prototype, $rhere_target_list, $rtoken_map,
1310 $rtoken_type, $rtokens, $tok,
1311 $type, $type_sequence, $indent_flag,
1315 $routput_token_list, $routput_token_type,
1316 $routput_block_type, $routput_container_type,
1317 $routput_type_sequence, $routput_type_sequence,
1321 $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1322 $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1325 ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1329 $nesting_token_string, $nesting_type_string,
1330 $nesting_block_string, $nesting_block_flag,
1331 $nesting_list_string, $nesting_list_flag,
1332 $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
1333 $in_statement_continuation, $level_in_tokenizer,
1334 $slevel_in_tokenizer, $rslevel_stack,
1338 $last_nonblank_container_type,
1339 $last_nonblank_type_sequence,
1340 $last_last_nonblank_token,
1341 $last_last_nonblank_type,
1342 $last_last_nonblank_block_type,
1343 $last_last_nonblank_container_type,
1344 $last_last_nonblank_type_sequence,
1345 $last_nonblank_prototype,
1350 sub get_indentation_level {
1352 # patch to avoid reporting error if indented if is not terminated
1353 if ($indented_if_level) { return $level_in_tokenizer - 1 }
1354 return $level_in_tokenizer;
1357 sub reset_indentation_level {
1358 $level_in_tokenizer = $slevel_in_tokenizer = shift;
1359 push @{$rslevel_stack}, $slevel_in_tokenizer;
1365 $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1366 return $peeked_ahead;
1369 # ------------------------------------------------------------
1370 # end of tokenizer variable access and manipulation routines
1371 # ------------------------------------------------------------
1373 # ------------------------------------------------------------
1374 # beginning of various scanner interface routines
1375 # ------------------------------------------------------------
1376 sub scan_replacement_text {
1378 # check for here-docs in replacement text invoked by
1379 # a substitution operator with executable modifier 'e'.
1384 # $rht = reference to any here-doc targets
1385 my ($replacement_text) = @_;
1388 return unless ( $replacement_text =~ /<</ );
1390 write_logfile_entry("scanning replacement text for here-doc targets\n");
1392 # save the logger object for error messages
1393 my $logger_object = $tokenizer_self->{_logger_object};
1395 # localize all package variables
1397 $tokenizer_self, $last_nonblank_token,
1398 $last_nonblank_type, $last_nonblank_block_type,
1399 $statement_type, $in_attribute_list,
1400 $current_package, $context,
1401 %is_constant, %is_user_function,
1402 %user_function_prototype, %is_block_function,
1403 %is_block_list_function, %saw_function_definition,
1404 $brace_depth, $paren_depth,
1405 $square_bracket_depth, @current_depth,
1406 @total_depth, $total_depth,
1407 @nesting_sequence_number, @current_sequence_number,
1408 @paren_type, @paren_semicolon_count,
1409 @paren_structural_type, @brace_type,
1410 @brace_structural_type, @brace_context,
1411 @brace_package, @square_bracket_type,
1412 @square_bracket_structural_type, @depth_array,
1413 @starting_line_of_current_depth, @nested_ternary_flag,
1414 @nested_statement_type,
1417 # save all lexical variables
1418 my $rstate = save_tokenizer_state();
1419 _decrement_count(); # avoid error check for multiple tokenizers
1421 # make a new tokenizer
1423 my $rpending_logfile_message;
1425 Perl::Tidy::LineSource->new( \$replacement_text, $rOpts,
1426 $rpending_logfile_message );
1427 my $tokenizer = Perl::Tidy::Tokenizer->new(
1428 source_object => $source_object,
1429 logger_object => $logger_object,
1430 starting_line_number => $input_line_number,
1433 # scan the replacement text
1434 1 while ( $tokenizer->get_line() );
1436 # remove any here doc targets
1438 if ( $tokenizer_self->{_in_here_doc} ) {
1442 $tokenizer_self->{_here_doc_target},
1443 $tokenizer_self->{_here_quote_character}
1445 if ( $tokenizer_self->{_rhere_target_list} ) {
1446 push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} };
1447 $tokenizer_self->{_rhere_target_list} = undef;
1449 $tokenizer_self->{_in_here_doc} = undef;
1452 # now its safe to report errors
1453 my $severe_error = $tokenizer->report_tokenization_errors();
1455 # TODO: Could propagate a severe error up
1457 # restore all tokenizer lexical variables
1458 restore_tokenizer_state($rstate);
1460 # return the here doc targets
1464 sub scan_bare_identifier {
1465 ( $i, $tok, $type, $prototype ) =
1466 scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1467 $rtoken_map, $max_token_index );
1471 sub scan_identifier {
1472 ( $i, $tok, $type, $id_scan_state, $identifier ) =
1473 scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
1474 $max_token_index, $expecting, $paren_type[$paren_depth] );
1479 ( $i, $tok, $type, $id_scan_state ) =
1480 scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
1481 $id_scan_state, $max_token_index );
1487 ( $i, $type, $number ) =
1488 scan_number_do( $input_line, $i, $rtoken_map, $type,
1493 # a sub to warn if token found where term expected
1494 sub error_if_expecting_TERM {
1495 if ( $expecting == TERM ) {
1496 if ( $really_want_term{$last_nonblank_type} ) {
1497 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
1498 $rtoken_map, $rtoken_type, $input_line );
1505 # a sub to warn if token found where operator expected
1506 sub error_if_expecting_OPERATOR {
1508 if ( $expecting == OPERATOR ) {
1509 if ( !defined($thing) ) { $thing = $tok }
1510 report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
1511 $rtoken_map, $rtoken_type, $input_line );
1512 if ( $i_tok == 0 ) {
1513 interrupt_logfile();
1514 warning("Missing ';' above?\n");
1522 # ------------------------------------------------------------
1523 # end scanner interfaces
1524 # ------------------------------------------------------------
1527 @_ = qw(for foreach);
1528 @is_for_foreach{@_} = (1) x scalar(@_);
1532 @is_my_our{@_} = (1) x scalar(@_);
1534 # These keywords may introduce blocks after parenthesized expressions,
1536 # keyword ( .... ) { BLOCK }
1537 # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1538 my %is_blocktype_with_paren;
1540 qw(if elsif unless while until for foreach switch case given when catch);
1541 @is_blocktype_with_paren{@_} = (1) x scalar(@_);
1543 # ------------------------------------------------------------
1544 # begin hash of code for handling most token types
1545 # ------------------------------------------------------------
1546 my $tokenization_code = {
1548 # no special code for these types yet, but syntax checks
1583 error_if_expecting_TERM()
1584 if ( $expecting == TERM );
1587 error_if_expecting_TERM()
1588 if ( $expecting == TERM );
1592 # start looking for a scalar
1593 error_if_expecting_OPERATOR("Scalar")
1594 if ( $expecting == OPERATOR );
1597 if ( $identifier eq '$^W' ) {
1598 $tokenizer_self->{_saw_perl_dash_w} = 1;
1601 # Check for identifier in indirect object slot
1602 # (vorboard.pl, sort.t). Something like:
1603 # /^(print|printf|sort|exec|system)$/
1605 $is_indirect_object_taker{$last_nonblank_token}
1607 || ( ( $last_nonblank_token eq '(' )
1608 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
1609 || ( $last_nonblank_type =~ /^[Uw]$/ ) # possible object
1618 $paren_semicolon_count[$paren_depth] = 0;
1620 $container_type = $want_paren;
1623 elsif ( $statement_type =~ /^sub\b/ ) {
1624 $container_type = $statement_type;
1627 $container_type = $last_nonblank_token;
1629 # We can check for a syntax error here of unexpected '(',
1630 # but this is going to get messy...
1632 $expecting == OPERATOR
1634 # be sure this is not a method call of the form
1635 # &method(...), $method->(..), &{method}(...),
1636 # $ref[2](list) is ok & short for $ref[2]->(list)
1637 # NOTE: at present, braces in something like &{ xxx }
1638 # are not marked as a block, we might have a method call
1639 && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/
1644 # ref: camel 3 p 703.
1645 if ( $last_last_nonblank_token eq 'do' ) {
1647 "do SUBROUTINE is deprecated; consider & or -> notation\n"
1652 # if this is an empty list, (), then it is not an
1653 # error; for example, we might have a constant pi and
1654 # invoke it with pi() or just pi;
1655 my ( $next_nonblank_token, $i_next ) =
1656 find_next_nonblank_token( $i, $rtokens,
1658 if ( $next_nonblank_token ne ')' ) {
1660 error_if_expecting_OPERATOR('(');
1662 if ( $last_nonblank_type eq 'C' ) {
1664 "$last_nonblank_token has a void prototype\n";
1666 elsif ( $last_nonblank_type eq 'i' ) {
1668 && $last_nonblank_token =~ /^\$/ )
1671 "Do you mean '$last_nonblank_token->(' ?\n";
1675 interrupt_logfile();
1679 } ## end if ( $next_nonblank_token...
1680 } ## end else [ if ( $last_last_nonblank_token...
1681 } ## end if ( $expecting == OPERATOR...
1683 $paren_type[$paren_depth] = $container_type;
1684 ( $type_sequence, $indent_flag ) =
1685 increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1687 # propagate types down through nested parens
1688 # for example: the second paren in 'if ((' would be structural
1689 # since the first is.
1691 if ( $last_nonblank_token eq '(' ) {
1692 $type = $last_nonblank_type;
1695 # We exclude parens as structural after a ',' because it
1696 # causes subtle problems with continuation indentation for
1697 # something like this, where the first 'or' will not get
1702 # ( not defined $check )
1704 # or $check eq "new"
1705 # or $check eq "old",
1708 # Likewise, we exclude parens where a statement can start
1709 # because of problems with continuation indentation, like
1712 # ($firstline =~ /^#\!.*perl/)
1713 # and (print $File::Find::name, "\n")
1716 # (ref($usage_fref) =~ /CODE/)
1718 # : (&blast_usage, &blast_params, &blast_general_params);
1724 if ( $last_nonblank_type eq ')' ) {
1726 "Syntax error? found token '$last_nonblank_type' then '('\n"
1729 $paren_structural_type[$paren_depth] = $type;
1733 ( $type_sequence, $indent_flag ) =
1734 decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1736 if ( $paren_structural_type[$paren_depth] eq '{' ) {
1740 $container_type = $paren_type[$paren_depth];
1742 # restore statement type as 'sub' at closing paren of a signature
1743 # so that a subsequent ':' is identified as an attribute
1744 if ( $container_type =~ /^sub\b/ ) {
1745 $statement_type = $container_type;
1749 if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
1750 my $num_sc = $paren_semicolon_count[$paren_depth];
1751 if ( $num_sc > 0 && $num_sc != 2 ) {
1752 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
1756 if ( $paren_depth > 0 ) { $paren_depth-- }
1759 if ( $last_nonblank_type eq ',' ) {
1760 complain("Repeated ','s \n");
1763 # patch for operator_expected: note if we are in the list (use.t)
1764 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
1765 ## FIXME: need to move this elsewhere, perhaps check after a '('
1766 ## elsif ($last_nonblank_token eq '(') {
1767 ## warning("Leading ','s illegal in some versions of perl\n");
1771 $context = UNKNOWN_CONTEXT;
1772 $statement_type = '';
1776 if ( $is_for_foreach{ $paren_type[$paren_depth] } )
1777 { # mark ; in for loop
1779 # Be careful: we do not want a semicolon such as the
1780 # following to be included:
1782 # for (sort {strcoll($a,$b);} keys %investments) {
1784 if ( $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
1785 && $square_bracket_depth ==
1786 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
1790 $paren_semicolon_count[$paren_depth]++;
1796 error_if_expecting_OPERATOR("String")
1797 if ( $expecting == OPERATOR );
1800 $allowed_quote_modifiers = "";
1803 error_if_expecting_OPERATOR("String")
1804 if ( $expecting == OPERATOR );
1807 $allowed_quote_modifiers = "";
1810 error_if_expecting_OPERATOR("String")
1811 if ( $expecting == OPERATOR );
1814 $allowed_quote_modifiers = "";
1819 # a pattern cannot follow certain keywords which take optional
1820 # arguments, like 'shift' and 'pop'. See also '?'.
1821 if ( $last_nonblank_type eq 'k'
1822 && $is_keyword_taking_optional_args{$last_nonblank_token} )
1826 elsif ( $expecting == UNKNOWN ) { # indeterminate, must guess..
1828 ( $is_pattern, $msg ) =
1829 guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
1833 write_diagnostics("DIVIDE:$msg\n");
1834 write_logfile_entry($msg);
1837 else { $is_pattern = ( $expecting == TERM ) }
1842 $allowed_quote_modifiers = '[msixpodualngc]';
1844 else { # not a pattern; check for a /= token
1846 if ( $rtokens->[ $i + 1 ] eq '=' ) { # form token /=
1852 #DEBUG - collecting info on what tokens follow a divide
1853 # for development of guessing algorithm
1854 #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
1855 # #write_diagnostics( "DIVIDE? $input_line\n" );
1861 # if we just saw a ')', we will label this block with
1862 # its type. We need to do this to allow sub
1863 # code_block_type to determine if this brace starts a
1864 # code block or anonymous hash. (The type of a paren
1865 # pair is the preceding token, such as 'if', 'else',
1867 $container_type = "";
1869 # ATTRS: for a '{' following an attribute list, reset
1870 # things to look like we just saw the sub name
1871 if ( $statement_type =~ /^sub/ ) {
1872 $last_nonblank_token = $statement_type;
1873 $last_nonblank_type = 'i';
1874 $statement_type = "";
1877 # patch for SWITCH/CASE: hide these keywords from an immediately
1878 # following opening brace
1879 elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
1880 && $statement_type eq $last_nonblank_token )
1882 $last_nonblank_token = ";";
1885 elsif ( $last_nonblank_token eq ')' ) {
1886 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
1888 # defensive move in case of a nesting error (pbug.t)
1889 # in which this ')' had no previous '('
1890 # this nesting error will have been caught
1891 if ( !defined($last_nonblank_token) ) {
1892 $last_nonblank_token = 'if';
1895 # check for syntax error here;
1896 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
1897 if ( $tokenizer_self->{'_extended_syntax'} ) {
1899 # we append a trailing () to mark this as an unknown
1900 # block type. This allows perltidy to format some
1901 # common extensions of perl syntax.
1902 # This is used by sub code_block_type
1903 $last_nonblank_token .= '()';
1907 join( ' ', sort keys %is_blocktype_with_paren );
1909 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
1915 # patch for paren-less for/foreach glitch, part 2.
1916 # see note below under 'qw'
1917 elsif ($last_nonblank_token eq 'qw'
1918 && $is_for_foreach{$want_paren} )
1920 $last_nonblank_token = $want_paren;
1921 if ( $last_last_nonblank_token eq $want_paren ) {
1923 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
1930 # now identify which of the three possible types of
1931 # curly braces we have: hash index container, anonymous
1932 # hash reference, or code block.
1934 # non-structural (hash index) curly brace pair
1935 # get marked 'L' and 'R'
1936 if ( is_non_structural_brace() ) {
1939 # patch for SWITCH/CASE:
1940 # allow paren-less identifier after 'when'
1941 # if the brace is preceded by a space
1942 if ( $statement_type eq 'when'
1943 && $last_nonblank_type eq 'i'
1944 && $last_last_nonblank_type eq 'k'
1945 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
1948 $block_type = $statement_type;
1952 # code and anonymous hash have the same type, '{', but are
1953 # distinguished by 'block_type',
1954 # which will be blank for an anonymous hash
1957 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
1960 # patch to promote bareword type to function taking block
1962 && $last_nonblank_type eq 'w'
1963 && $last_nonblank_i >= 0 )
1965 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
1966 $routput_token_type->[$last_nonblank_i] = 'G';
1970 # patch for SWITCH/CASE: if we find a stray opening block brace
1971 # where we might accept a 'case' or 'when' block, then take it
1972 if ( $statement_type eq 'case'
1973 || $statement_type eq 'when' )
1975 if ( !$block_type || $block_type eq '}' ) {
1976 $block_type = $statement_type;
1981 $brace_type[ ++$brace_depth ] = $block_type;
1982 $brace_package[$brace_depth] = $current_package;
1983 $brace_structural_type[$brace_depth] = $type;
1984 $brace_context[$brace_depth] = $context;
1985 ( $type_sequence, $indent_flag ) =
1986 increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
1989 $block_type = $brace_type[$brace_depth];
1990 if ($block_type) { $statement_type = '' }
1991 if ( defined( $brace_package[$brace_depth] ) ) {
1992 $current_package = $brace_package[$brace_depth];
1995 # can happen on brace error (caught elsewhere)
1998 ( $type_sequence, $indent_flag ) =
1999 decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2001 if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2005 # propagate type information for 'do' and 'eval' blocks, and also
2006 # for smartmatch operator. This is necessary to enable us to know
2007 # if an operator or term is expected next.
2008 if ( $is_block_operator{$block_type} ) {
2012 $context = $brace_context[$brace_depth];
2013 if ( $brace_depth > 0 ) { $brace_depth--; }
2015 '&' => sub { # maybe sub call? start looking
2017 # We have to check for sub call unless we are sure we
2018 # are expecting an operator. This example from s2p
2019 # got mistaken as a q operator in an early version:
2020 # print BODY &q(<<'EOT');
2021 if ( $expecting != OPERATOR ) {
2023 # But only look for a sub call if we are expecting a term or
2024 # if there is no existing space after the &.
2025 # For example we probably don't want & as sub call here:
2026 # Fcntl::S_IRUSR & $mode;
2027 if ( $expecting == TERM || $next_type ne 'b' ) {
2034 '<' => sub { # angle operator or less than?
2036 if ( $expecting != OPERATOR ) {
2038 find_angle_operator_termination( $input_line, $i, $rtoken_map,
2039 $expecting, $max_token_index );
2041 if ( $type eq '<' && $expecting == TERM ) {
2042 error_if_expecting_TERM();
2043 interrupt_logfile();
2044 warning("Unterminated <> operator?\n");
2051 '?' => sub { # ?: conditional or starting pattern?
2055 # Patch for rt #126965
2056 # a pattern cannot follow certain keywords which take optional
2057 # arguments, like 'shift' and 'pop'. See also '/'.
2058 if ( $last_nonblank_type eq 'k'
2059 && $is_keyword_taking_optional_args{$last_nonblank_token} )
2064 # patch for RT#131288, user constant function without prototype
2065 # last type is 'U' followed by ?.
2066 elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2069 elsif ( $expecting == UNKNOWN ) {
2071 # In older versions of Perl, a bare ? can be a pattern
2072 # delimiter. Sometime after Perl 5.10 this seems to have
2073 # been dropped, but we have to support it in order to format
2074 # older programs. For example, the following line worked
2076 # ?(.*)? && (print $1,"\n");
2077 # In current versions it would have to be written with slashes:
2078 # /(.*)/ && (print $1,"\n");
2080 ( $is_pattern, $msg ) =
2081 guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2084 if ($msg) { write_logfile_entry($msg) }
2086 else { $is_pattern = ( $expecting == TERM ) }
2091 $allowed_quote_modifiers = '[msixpodualngc]';
2094 ( $type_sequence, $indent_flag ) =
2095 increase_nesting_depth( QUESTION_COLON,
2096 $rtoken_map->[$i_tok] );
2099 '*' => sub { # typeglob, or multiply?
2101 if ( $expecting == TERM ) {
2106 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2111 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2115 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2123 '.' => sub { # what kind of . ?
2125 if ( $expecting != OPERATOR ) {
2127 if ( $type eq '.' ) {
2128 error_if_expecting_TERM()
2129 if ( $expecting == TERM );
2137 # if this is the first nonblank character, call it a label
2138 # since perl seems to just swallow it
2139 if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2143 # ATTRS: check for a ':' which introduces an attribute list
2144 # (this might eventually get its own token type)
2145 elsif ( $statement_type =~ /^sub\b/ ) {
2147 $in_attribute_list = 1;
2150 # check for scalar attribute, such as
2151 # my $foo : shared = 1;
2152 elsif ($is_my_our{$statement_type}
2153 && $current_depth[QUESTION_COLON] == 0 )
2156 $in_attribute_list = 1;
2159 # otherwise, it should be part of a ?/: operator
2161 ( $type_sequence, $indent_flag ) =
2162 decrease_nesting_depth( QUESTION_COLON,
2163 $rtoken_map->[$i_tok] );
2164 if ( $last_nonblank_token eq '?' ) {
2165 warning("Syntax error near ? :\n");
2169 '+' => sub { # what kind of plus?
2171 if ( $expecting == TERM ) {
2172 my $number = scan_number();
2174 # unary plus is safest assumption if not a number
2175 if ( !defined($number) ) { $type = 'p'; }
2177 elsif ( $expecting == OPERATOR ) {
2180 if ( $next_type eq 'w' ) { $type = 'p' }
2185 error_if_expecting_OPERATOR("Array")
2186 if ( $expecting == OPERATOR );
2189 '%' => sub { # hash or modulo?
2191 # first guess is hash if no following blank
2192 if ( $expecting == UNKNOWN ) {
2193 if ( $next_type ne 'b' ) { $expecting = TERM }
2195 if ( $expecting == TERM ) {
2200 $square_bracket_type[ ++$square_bracket_depth ] =
2201 $last_nonblank_token;
2202 ( $type_sequence, $indent_flag ) =
2203 increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2205 # It may seem odd, but structural square brackets have
2206 # type '{' and '}'. This simplifies the indentation logic.
2207 if ( !is_non_structural_brace() ) {
2210 $square_bracket_structural_type[$square_bracket_depth] = $type;
2213 ( $type_sequence, $indent_flag ) =
2214 decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2216 if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
2221 # propagate type information for smartmatch operator. This is
2222 # necessary to enable us to know if an operator or term is expected
2224 if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
2225 $tok = $square_bracket_type[$square_bracket_depth];
2228 if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
2230 '-' => sub { # what kind of minus?
2232 if ( ( $expecting != OPERATOR )
2233 && $is_file_test_operator{$next_tok} )
2235 my ( $next_nonblank_token, $i_next ) =
2236 find_next_nonblank_token( $i + 1, $rtokens,
2239 # check for a quoted word like "-w=>xx";
2240 # it is sufficient to just check for a following '='
2241 if ( $next_nonblank_token eq '=' ) {
2250 elsif ( $expecting == TERM ) {
2251 my $number = scan_number();
2253 # maybe part of bareword token? unary is safest
2254 if ( !defined($number) ) { $type = 'm'; }
2257 elsif ( $expecting == OPERATOR ) {
2261 if ( $next_type eq 'w' ) {
2269 # check for special variables like ${^WARNING_BITS}
2270 if ( $expecting == TERM ) {
2272 # FIXME: this should work but will not catch errors
2273 # because we also have to be sure that previous token is
2274 # a type character ($,@,%).
2275 if ( $last_nonblank_token eq '{'
2276 && ( $next_tok =~ /^[A-Za-z_]/ ) )
2279 if ( $next_tok eq 'W' ) {
2280 $tokenizer_self->{_saw_perl_dash_w} = 1;
2282 $tok = $tok . $next_tok;
2288 unless ( error_if_expecting_TERM() ) {
2290 # Something like this is valid but strange:
2292 complain("The '^' seems unusual here\n");
2298 '::' => sub { # probably a sub call
2299 scan_bare_identifier();
2301 '<<' => sub { # maybe a here-doc?
2303 unless ( $i < $max_token_index )
2304 ; # here-doc not possible if end of line
2306 if ( $expecting != OPERATOR ) {
2307 my ( $found_target, $here_doc_target, $here_quote_character,
2310 $found_target, $here_doc_target, $here_quote_character, $i,
2313 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2316 if ($found_target) {
2317 push @{$rhere_target_list},
2318 [ $here_doc_target, $here_quote_character ];
2320 if ( length($here_doc_target) > 80 ) {
2321 my $truncated = substr( $here_doc_target, 0, 80 );
2322 complain("Long here-target: '$truncated' ...\n");
2324 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2326 "Unconventional here-target: '$here_doc_target'\n");
2329 elsif ( $expecting == TERM ) {
2330 unless ($saw_error) {
2332 # shouldn't happen..
2333 warning("Program bug; didn't find here doc target\n");
2334 report_definite_bug();
2341 '<<~' => sub { # a here-doc, new type added in v26
2343 unless ( $i < $max_token_index )
2344 ; # here-doc not possible if end of line
2345 if ( $expecting != OPERATOR ) {
2346 my ( $found_target, $here_doc_target, $here_quote_character,
2349 $found_target, $here_doc_target, $here_quote_character, $i,
2352 = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2355 if ($found_target) {
2357 if ( length($here_doc_target) > 80 ) {
2358 my $truncated = substr( $here_doc_target, 0, 80 );
2359 complain("Long here-target: '$truncated' ...\n");
2361 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2363 "Unconventional here-target: '$here_doc_target'\n");
2366 # Note that we put a leading space on the here quote
2367 # character indicate that it may be preceded by spaces
2368 $here_quote_character = " " . $here_quote_character;
2369 push @{$rhere_target_list},
2370 [ $here_doc_target, $here_quote_character ];
2373 elsif ( $expecting == TERM ) {
2374 unless ($saw_error) {
2376 # shouldn't happen..
2377 warning("Program bug; didn't find here doc target\n");
2378 report_definite_bug();
2387 # if -> points to a bare word, we must scan for an identifier,
2388 # otherwise something like ->y would look like the y operator
2392 # type = 'pp' for pre-increment, '++' for post-increment
2394 if ( $expecting == TERM ) { $type = 'pp' }
2395 elsif ( $expecting == UNKNOWN ) {
2396 my ( $next_nonblank_token, $i_next ) =
2397 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2398 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
2403 if ( $last_nonblank_type eq $tok ) {
2404 complain("Repeated '=>'s \n");
2407 # patch for operator_expected: note if we are in the list (use.t)
2408 # TODO: make version numbers a new token type
2409 if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2412 # type = 'mm' for pre-decrement, '--' for post-decrement
2415 if ( $expecting == TERM ) { $type = 'mm' }
2416 elsif ( $expecting == UNKNOWN ) {
2417 my ( $next_nonblank_token, $i_next ) =
2418 find_next_nonblank_token( $i, $rtokens, $max_token_index );
2419 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
2424 error_if_expecting_TERM()
2425 if ( $expecting == TERM );
2429 error_if_expecting_TERM()
2430 if ( $expecting == TERM );
2434 error_if_expecting_TERM()
2435 if ( $expecting == TERM );
2439 # ------------------------------------------------------------
2440 # end hash of code for handling individual token types
2441 # ------------------------------------------------------------
2443 my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
2445 # These block types terminate statements and do not need a trailing
2447 # patched for SWITCH/CASE/
2448 my %is_zero_continuation_block_type;
2449 @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
2450 if elsif else unless while until for foreach switch case given when);
2451 @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
2453 my %is_not_zero_continuation_block_type;
2454 @_ = qw(sort grep map do eval);
2455 @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
2457 my %is_logical_container;
2458 @_ = qw(if elsif unless while and or err not && ! || for foreach);
2459 @is_logical_container{@_} = (1) x scalar(@_);
2463 @is_binary_type{@_} = (1) x scalar(@_);
2465 my %is_binary_keyword;
2466 @_ = qw(and or err eq ne cmp);
2467 @is_binary_keyword{@_} = (1) x scalar(@_);
2469 # 'L' is token for opening { at hash key
2470 my %is_opening_type;
2472 @is_opening_type{@_} = (1) x scalar(@_);
2474 # 'R' is token for closing } at hash key
2475 my %is_closing_type;
2477 @is_closing_type{@_} = (1) x scalar(@_);
2479 my %is_redo_last_next_goto;
2480 @_ = qw(redo last next goto);
2481 @is_redo_last_next_goto{@_} = (1) x scalar(@_);
2484 @_ = qw(use require);
2485 @is_use_require{@_} = (1) x scalar(@_);
2487 # This hash holds the hash key in $tokenizer_self for these keywords:
2488 my %is_format_END_DATA = (
2489 'format' => '_in_format',
2490 '__END__' => '_in_end',
2491 '__DATA__' => '_in_data',
2494 # original ref: camel 3 p 147,
2495 # but perl may accept undocumented flags
2496 # perl 5.10 adds 'p' (preserve)
2497 # Perl version 5.22 added 'n'
2498 # From http://perldoc.perl.org/perlop.html we have
2499 # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2500 # s/PATTERN/REPLACEMENT/msixpodualngcer
2501 # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2502 # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2503 # qr/STRING/msixpodualn
2504 my %quote_modifiers = (
2505 's' => '[msixpodualngcer]',
2508 'm' => '[msixpodualngc]',
2509 'qr' => '[msixpodualn]',
2516 # table showing how many quoted things to look for after quote operator..
2517 # s, y, tr have 2 (pattern and replacement)
2518 # others have 1 (pattern only)
2531 sub tokenize_this_line {
2533 # This routine breaks a line of perl code into tokens which are of use in
2534 # indentation and reformatting. One of my goals has been to define tokens
2535 # such that a newline may be inserted between any pair of tokens without
2536 # changing or invalidating the program. This version comes close to this,
2537 # although there are necessarily a few exceptions which must be caught by
2538 # the formatter. Many of these involve the treatment of bare words.
2540 # The tokens and their types are returned in arrays. See previous
2541 # routine for their names.
2543 # See also the array "valid_token_types" in the BEGIN section for an
2546 # To simplify things, token types are either a single character, or they
2547 # are identical to the tokens themselves.
2549 # As a debugging aid, the -D flag creates a file containing a side-by-side
2550 # comparison of the input string and its tokenization for each line of a file.
2551 # This is an invaluable debugging aid.
2553 # In addition to tokens, and some associated quantities, the tokenizer
2554 # also returns flags indication any special line types. These include
2555 # quotes, here_docs, formats.
2557 # -----------------------------------------------------------------------
2559 # How to add NEW_TOKENS:
2561 # New token types will undoubtedly be needed in the future both to keep up
2562 # with changes in perl and to help adapt the tokenizer to other applications.
2564 # Here are some notes on the minimal steps. I wrote these notes while
2565 # adding the 'v' token type for v-strings, which are things like version
2566 # numbers 5.6.0, and ip addresses, and will use that as an example. ( You
2567 # can use your editor to search for the string "NEW_TOKENS" to find the
2568 # appropriate sections to change):
2570 # *. Try to talk somebody else into doing it! If not, ..
2572 # *. Make a backup of your current version in case things don't work out!
2574 # *. Think of a new, unused character for the token type, and add to
2575 # the array @valid_token_types in the BEGIN section of this package.
2576 # For example, I used 'v' for v-strings.
2578 # *. Implement coding to recognize the $type of the token in this routine.
2579 # This is the hardest part, and is best done by imitating or modifying
2580 # some of the existing coding. For example, to recognize v-strings, I
2581 # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
2582 # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
2584 # *. Update sub operator_expected. This update is critically important but
2585 # the coding is trivial. Look at the comments in that routine for help.
2586 # For v-strings, which should behave like numbers, I just added 'v' to the
2587 # regex used to handle numbers and strings (types 'n' and 'Q').
2589 # *. Implement a 'bond strength' rule in sub set_bond_strengths in
2590 # Perl::Tidy::Formatter for breaking lines around this token type. You can
2591 # skip this step and take the default at first, then adjust later to get
2592 # desired results. For adding type 'v', I looked at sub bond_strength and
2593 # saw that number type 'n' was using default strengths, so I didn't do
2594 # anything. I may tune it up someday if I don't like the way line
2595 # breaks with v-strings look.
2597 # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
2598 # Perl::Tidy::Formatter. For adding type 'v', I looked at this routine
2599 # and saw that type 'n' used spaces on both sides, so I just added 'v'
2600 # to the array @spaces_both_sides.
2602 # *. Update HtmlWriter package so that users can colorize the token as
2603 # desired. This is quite easy; see comments identified by 'NEW_TOKENS' in
2604 # that package. For v-strings, I initially chose to use a default color
2605 # equal to the default for numbers, but it might be nice to change that
2608 # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
2610 # *. Run lots and lots of debug tests. Start with special files designed
2611 # to test the new token type. Run with the -D flag to create a .DEBUG
2612 # file which shows the tokenization. When these work ok, test as many old
2613 # scripts as possible. Start with all of the '.t' files in the 'test'
2614 # directory of the distribution file. Compare .tdy output with previous
2615 # version and updated version to see the differences. Then include as
2616 # many more files as possible. My own technique has been to collect a huge
2617 # number of perl scripts (thousands!) into one directory and run perltidy
2618 # *, then run diff between the output of the previous version and the
2621 # *. For another example, search for the smartmatch operator '~~'
2622 # with your editor to see where updates were made for it.
2624 # -----------------------------------------------------------------------
2626 my $line_of_tokens = shift;
2627 my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
2629 # patch while coding change is underway
2630 # make callers private data to allow access
2631 # $tokenizer_self = $caller_tokenizer_self;
2633 # extract line number for use in error messages
2634 $input_line_number = $line_of_tokens->{_line_number};
2636 # reinitialize for multi-line quote
2637 $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
2639 # check for pod documentation
2640 if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
2642 # must not be in multi-line quote
2643 # and must not be in an equation
2644 if ( !$in_quote && ( operator_expected( 'b', '=', 'b' ) == TERM ) )
2646 $tokenizer_self->{_in_pod} = 1;
2651 $input_line = $untrimmed_input_line;
2655 # trim start of this line unless we are continuing a quoted line
2656 # do not trim end because we might end in a quote (test: deken4.pl)
2657 # Perl::Tidy::Formatter will delete needless trailing blanks
2658 unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
2659 $input_line =~ s/^\s*//; # trim left end
2662 # Set a flag to indicate if we might be at an __END__ or __DATA__ line
2663 # This will be used below to avoid quoting a bare word followed by
2665 my $is_END_or_DATA = $input_line =~ /^\s*__(END|DATA)__\s*$/;
2667 # update the copy of the line for use in error messages
2668 # This must be exactly what we give the pre_tokenizer
2669 $tokenizer_self->{_line_text} = $input_line;
2671 # re-initialize for the main loop
2672 $routput_token_list = []; # stack of output token indexes
2673 $routput_token_type = []; # token types
2674 $routput_block_type = []; # types of code block
2675 $routput_container_type = []; # paren types, such as if, elsif, ..
2676 $routput_type_sequence = []; # nesting sequential number
2678 $rhere_target_list = [];
2680 $tok = $last_nonblank_token;
2681 $type = $last_nonblank_type;
2682 $prototype = $last_nonblank_prototype;
2683 $last_nonblank_i = -1;
2684 $block_type = $last_nonblank_block_type;
2685 $container_type = $last_nonblank_container_type;
2686 $type_sequence = $last_nonblank_type_sequence;
2690 # tokenization is done in two stages..
2691 # stage 1 is a very simple pre-tokenization
2692 my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
2694 # a little optimization for a full-line comment
2695 if ( !$in_quote && ( $input_line =~ /^#/ ) ) {
2696 $max_tokens_wanted = 1 # no use tokenizing a comment
2699 # start by breaking the line into pre-tokens
2700 ( $rtokens, $rtoken_map, $rtoken_type ) =
2701 pre_tokenize( $input_line, $max_tokens_wanted );
2703 $max_token_index = scalar( @{$rtokens} ) - 1;
2704 push( @{$rtokens}, ' ', ' ', ' ' ); # extra whitespace simplifies logic
2705 push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced
2706 push( @{$rtoken_type}, 'b', 'b', 'b' );
2708 # initialize for main loop
2709 foreach my $ii ( 0 .. $max_token_index + 3 ) {
2710 $routput_token_type->[$ii] = "";
2711 $routput_block_type->[$ii] = "";
2712 $routput_container_type->[$ii] = "";
2713 $routput_type_sequence->[$ii] = "";
2714 $routput_indent_flag->[$ii] = 0;
2719 # ------------------------------------------------------------
2720 # begin main tokenization loop
2721 # ------------------------------------------------------------
2723 # we are looking at each pre-token of one line and combining them
2725 while ( ++$i <= $max_token_index ) {
2727 if ($in_quote) { # continue looking for end of a quote
2728 $type = $quote_type;
2730 unless ( @{$routput_token_list} )
2731 { # initialize if continuation line
2732 push( @{$routput_token_list}, $i );
2733 $routput_token_type->[$i] = $type;
2736 $tok = $quote_character unless ( $quote_character =~ /^\s*$/ );
2738 # scan for the end of the quote or pattern
2740 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
2741 $quoted_string_1, $quoted_string_2
2744 $i, $in_quote, $quote_character,
2745 $quote_pos, $quote_depth, $quoted_string_1,
2746 $quoted_string_2, $rtokens, $rtoken_map,
2750 # all done if we didn't find it
2751 last if ($in_quote);
2753 # save pattern and replacement text for rescanning
2754 my $qs1 = $quoted_string_1;
2755 my $qs2 = $quoted_string_2;
2757 # re-initialize for next search
2758 $quote_character = '';
2761 $quoted_string_1 = "";
2762 $quoted_string_2 = "";
2763 last if ( ++$i > $max_token_index );
2765 # look for any modifiers
2766 if ($allowed_quote_modifiers) {
2768 # check for exact quote modifiers
2769 if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
2770 my $str = $rtokens->[$i];
2772 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
2773 my $pos = pos($str);
2774 my $char = substr( $str, $pos - 1, 1 );
2775 $saw_modifier_e ||= ( $char eq 'e' );
2778 # For an 'e' quote modifier we must scan the replacement
2779 # text for here-doc targets.
2780 if ($saw_modifier_e) {
2782 my $rht = scan_replacement_text($qs1);
2784 # Change type from 'Q' to 'h' for quotes with
2785 # here-doc targets so that the formatter (see sub
2786 # print_line_of_tokens) will not make any line
2787 # breaks after this point.
2789 push @{$rhere_target_list}, @{$rht};
2792 my $ilast = $routput_token_list->[-1];
2793 $routput_token_type->[$ilast] = $type;
2798 if ( defined( pos($str) ) ) {
2801 if ( pos($str) == length($str) ) {
2802 last if ( ++$i > $max_token_index );
2805 # Looks like a joined quote modifier
2806 # and keyword, maybe something like
2807 # s/xxx/yyy/gefor @k=...
2808 # Example is "galgen.pl". Would have to split
2809 # the word and insert a new token in the
2810 # pre-token list. This is so rare that I haven't
2811 # done it. Will just issue a warning citation.
2813 # This error might also be triggered if my quote
2814 # modifier characters are incomplete
2818 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
2819 Please put a space between quote modifiers and trailing keywords.
2822 # print "token $rtokens->[$i]\n";
2823 # my $num = length($str) - pos($str);
2824 # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
2825 # print "continuing with new token $rtokens->[$i]\n";
2827 # skipping past this token does least damage
2828 last if ( ++$i > $max_token_index );
2833 # example file: rokicki4.pl
2834 # This error might also be triggered if my quote
2835 # modifier characters are incomplete
2836 write_logfile_entry(
2837 "Note: found word $str at quote modifier location\n"
2843 $allowed_quote_modifiers = "";
2847 unless ( $tok =~ /^\s*$/ || $tok eq 'CORE::' ) {
2849 # try to catch some common errors
2850 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
2852 if ( $last_nonblank_token eq 'eq' ) {
2853 complain("Should 'eq' be '==' here ?\n");
2855 elsif ( $last_nonblank_token eq 'ne' ) {
2856 complain("Should 'ne' be '!=' here ?\n");
2860 $last_last_nonblank_token = $last_nonblank_token;
2861 $last_last_nonblank_type = $last_nonblank_type;
2862 $last_last_nonblank_block_type = $last_nonblank_block_type;
2863 $last_last_nonblank_container_type =
2864 $last_nonblank_container_type;
2865 $last_last_nonblank_type_sequence =
2866 $last_nonblank_type_sequence;
2867 $last_nonblank_token = $tok;
2868 $last_nonblank_type = $type;
2869 $last_nonblank_prototype = $prototype;
2870 $last_nonblank_block_type = $block_type;
2871 $last_nonblank_container_type = $container_type;
2872 $last_nonblank_type_sequence = $type_sequence;
2873 $last_nonblank_i = $i_tok;
2876 # store previous token type
2877 if ( $i_tok >= 0 ) {
2878 $routput_token_type->[$i_tok] = $type;
2879 $routput_block_type->[$i_tok] = $block_type;
2880 $routput_container_type->[$i_tok] = $container_type;
2881 $routput_type_sequence->[$i_tok] = $type_sequence;
2882 $routput_indent_flag->[$i_tok] = $indent_flag;
2884 my $pre_tok = $rtokens->[$i]; # get the next pre-token
2885 my $pre_type = $rtoken_type->[$i]; # and type
2887 $type = $pre_type; # to be modified as necessary
2888 $block_type = ""; # blank for all tokens except code block braces
2889 $container_type = ""; # blank for all tokens except some parens
2890 $type_sequence = ""; # blank for all tokens except ?/:
2892 $prototype = ""; # blank for all tokens except user defined subs
2895 # this pre-token will start an output token
2896 push( @{$routput_token_list}, $i_tok );
2898 # continue gathering identifier if necessary
2899 # but do not start on blanks and comments
2900 if ( $id_scan_state && $pre_type !~ /[b#]/ ) {
2902 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
2909 last if ($id_scan_state);
2910 next if ( ( $i > 0 ) || $type );
2912 # didn't find any token; start over
2917 # handle whitespace tokens..
2918 next if ( $type eq 'b' );
2919 my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : ' ';
2920 my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
2922 # Build larger tokens where possible, since we are not in a quote.
2924 # First try to assemble digraphs. The following tokens are
2925 # excluded and handled specially:
2926 # '/=' is excluded because the / might start a pattern.
2927 # 'x=' is excluded since it might be $x=, with $ on previous line
2928 # '**' and *= might be typeglobs of punctuation variables
2929 # I have allowed tokens starting with <, such as <=,
2930 # because I don't think these could be valid angle operators.
2931 # test file: storrs4.pl
2932 my $test_tok = $tok . $rtokens->[ $i + 1 ];
2933 my $combine_ok = $is_digraph{$test_tok};
2935 # check for special cases which cannot be combined
2938 # '//' must be defined_or operator if an operator is expected.
2939 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
2940 # could be migrated here for clarity
2942 # Patch for RT#102371, misparsing a // in the following snippet:
2943 # state $b //= ccc();
2944 # The solution is to always accept the digraph (or trigraph) after
2945 # token type 'Z' (possible file handle). The reason is that
2946 # sub operator_expected gives TERM expected here, which is
2947 # wrong in this case.
2948 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
2949 my $next_type = $rtokens->[ $i + 1 ];
2951 operator_expected( $prev_type, $tok, $next_type );
2953 # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
2954 $combine_ok = 0 if ( $expecting == TERM );
2957 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
2958 # Accept the digraphs '**' only after type 'Z'
2959 # Otherwise postpone the decision.
2960 if ( $test_tok eq '**' ) {
2961 if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
2968 && ( $test_tok ne '/=' ) # might be pattern
2969 && ( $test_tok ne 'x=' ) # might be $x
2970 && ( $test_tok ne '*=' ) # typeglob?
2972 # Moved above as part of fix for
2973 # RT #114359: Missparsing of "print $x ** 0.5;
2974 # && ( $test_tok ne '**' ) # typeglob?
2980 # Now try to assemble trigraphs. Note that all possible
2981 # perl trigraphs can be constructed by appending a character
2983 $test_tok = $tok . $rtokens->[ $i + 1 ];
2985 if ( $is_trigraph{$test_tok} ) {
2990 # The only current tetragraph is the double diamond operator
2991 # and its first three characters are not a trigraph, so
2992 # we do can do a special test for it
2993 elsif ( $test_tok eq '<<>' ) {
2994 $test_tok .= $rtokens->[ $i + 2 ];
2995 if ( $is_tetragraph{$test_tok} ) {
3003 $next_tok = $rtokens->[ $i + 1 ];
3004 $next_type = $rtoken_type->[ $i + 1 ];
3006 TOKENIZER_DEBUG_FLAG_TOKENIZE && do {
3009 $last_nonblank_token, $tok,
3010 $next_tok, $brace_depth,
3011 $brace_type[$brace_depth], $paren_depth,
3012 $paren_type[$paren_depth]
3014 print STDOUT "TOKENIZE:(@debug_list)\n";
3017 # turn off attribute list on first non-blank, non-bareword
3018 if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }
3020 ###############################################################
3021 # We have the next token, $tok.
3022 # Now we have to examine this token and decide what it is
3023 # and define its $type
3025 # section 1: bare words
3026 ###############################################################
3028 if ( $pre_type eq 'w' ) {
3029 $expecting = operator_expected( $prev_type, $tok, $next_type );
3030 my ( $next_nonblank_token, $i_next ) =
3031 find_next_nonblank_token( $i, $rtokens, $max_token_index );
3033 # ATTRS: handle sub and variable attributes
3034 if ($in_attribute_list) {
3036 # treat bare word followed by open paren like qw(
3037 if ( $next_nonblank_token eq '(' ) {
3038 $in_quote = $quote_items{'q'};
3039 $allowed_quote_modifiers = $quote_modifiers{'q'};
3045 # handle bareword not followed by open paren
3052 # quote a word followed by => operator
3053 # unless the word __END__ or __DATA__ and the only word on
3055 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
3057 if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
3058 if ( $is_constant{$current_package}{$tok} ) {
3061 elsif ( $is_user_function{$current_package}{$tok} ) {
3064 $user_function_prototype{$current_package}{$tok};
3066 elsif ( $tok =~ /^v\d+$/ ) {
3068 report_v_string($tok);
3070 else { $type = 'w' }
3076 # quote a bare word within braces..like xxx->{s}; note that we
3077 # must be sure this is not a structural brace, to avoid
3078 # mistaking {s} in the following for a quoted bare word:
3079 # for(@[){s}bla}BLA}
3080 # Also treat q in something like var{-q} as a bare word, not qoute operator
3082 $next_nonblank_token eq '}'
3084 $last_nonblank_type eq 'L'
3085 || ( $last_nonblank_type eq 'm'
3086 && $last_last_nonblank_type eq 'L' )
3094 # a bare word immediately followed by :: is not a keyword;
3095 # use $tok_kw when testing for keywords to avoid a mistake
3097 if ( $rtokens->[ $i + 1 ] eq ':'
3098 && $rtokens->[ $i + 2 ] eq ':' )
3103 # handle operator x (now we know it isn't $x=)
3104 if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) {
3105 if ( $tok eq 'x' ) {
3107 if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
3117 # FIXME: Patch: mark something like x4 as an integer for now
3118 # It gets fixed downstream. This is easier than
3119 # splitting the pretoken.
3124 elsif ( $tok_kw eq 'CORE::' ) {
3125 $type = $tok = $tok_kw;
3128 elsif ( ( $tok eq 'strict' )
3129 and ( $last_nonblank_token eq 'use' ) )
3131 $tokenizer_self->{_saw_use_strict} = 1;
3132 scan_bare_identifier();
3135 elsif ( ( $tok eq 'warnings' )
3136 and ( $last_nonblank_token eq 'use' ) )
3138 $tokenizer_self->{_saw_perl_dash_w} = 1;
3140 # scan as identifier, so that we pick up something like:
3141 # use warnings::register
3142 scan_bare_identifier();
3146 $tok eq 'AutoLoader'
3147 && $tokenizer_self->{_look_for_autoloader}
3149 $last_nonblank_token eq 'use'
3151 # these regexes are from AutoSplit.pm, which we want
3153 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3154 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3158 write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3159 $tokenizer_self->{_saw_autoloader} = 1;
3160 $tokenizer_self->{_look_for_autoloader} = 0;
3161 scan_bare_identifier();
3165 $tok eq 'SelfLoader'
3166 && $tokenizer_self->{_look_for_selfloader}
3167 && ( $last_nonblank_token eq 'use'
3168 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
3169 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
3172 write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
3173 $tokenizer_self->{_saw_selfloader} = 1;
3174 $tokenizer_self->{_look_for_selfloader} = 0;
3175 scan_bare_identifier();
3178 elsif ( ( $tok eq 'constant' )
3179 and ( $last_nonblank_token eq 'use' ) )
3181 scan_bare_identifier();
3182 my ( $next_nonblank_token, $i_next ) =
3183 find_next_nonblank_token( $i, $rtokens,
3186 if ($next_nonblank_token) {
3188 if ( $is_keyword{$next_nonblank_token} ) {
3190 # Assume qw is used as a quote and okay, as in:
3191 # use constant qw{ DEBUG 0 };
3192 # Not worth trying to parse for just a warning
3194 # NOTE: This warning is deactivated because recent
3195 # versions of perl do not complain here, but
3196 # the coding is retained for reference.
3197 if ( 0 && $next_nonblank_token ne 'qw' ) {
3199 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
3204 # FIXME: could check for error in which next token is
3205 # not a word (number, punctuation, ..)
3207 $is_constant{$current_package}{$next_nonblank_token}
3213 # various quote operators
3214 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
3216 if ( $expecting == OPERATOR ) {
3218 # Be careful not to call an error for a qw quote
3219 # where a parenthesized list is allowed. For example,
3220 # it could also be a for/foreach construct such as
3222 # foreach my $key qw\Uno Due Tres Quadro\ {
3223 # print "Set $key\n";
3227 # Or it could be a function call.
3228 # NOTE: Braces in something like &{ xxx } are not
3229 # marked as a block, we might have a method call.
3230 # &method(...), $method->(..), &{method}(...),
3231 # $ref[2](list) is ok & short for $ref[2]->(list)
3233 # See notes in 'sub code_block_type' and
3234 # 'sub is_non_structural_brace'
3238 && ( $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3239 || $is_for_foreach{$want_paren} )
3242 error_if_expecting_OPERATOR();
3245 $in_quote = $quote_items{$tok};
3246 $allowed_quote_modifiers = $quote_modifiers{$tok};
3248 # All quote types are 'Q' except possibly qw quotes.
3249 # qw quotes are special in that they may generally be trimmed
3250 # of leading and trailing whitespace. So they are given a
3251 # separate type, 'q', unless requested otherwise.
3253 ( $tok eq 'qw' && $tokenizer_self->{_trim_qw} )
3256 $quote_type = $type;
3259 # check for a statement label
3261 ( $next_nonblank_token eq ':' )
3262 && ( $rtokens->[ $i_next + 1 ] ne ':' )
3263 && ( $i_next <= $max_token_index ) # colon on same line
3267 if ( $tok !~ /[A-Z]/ ) {
3268 push @{ $tokenizer_self->{_rlower_case_labels_at} },
3277 # 'sub' || 'package'
3278 elsif ( $is_sub{$tok_kw} || $is_package{$tok_kw} ) {
3279 error_if_expecting_OPERATOR()
3280 if ( $expecting == OPERATOR );
3284 # Note on token types for format, __DATA__, __END__:
3285 # It simplifies things to give these type ';', so that when we
3286 # start rescanning we will be expecting a token of type TERM.
3287 # We will switch to type 'k' before outputting the tokens.
3288 elsif ( $is_format_END_DATA{$tok_kw} ) {
3289 $type = ';'; # make tokenizer look for TERM next
3290 $tokenizer_self->{ $is_format_END_DATA{$tok_kw} } = 1;
3294 elsif ( $is_keyword{$tok_kw} ) {
3297 # Since for and foreach may not be followed immediately
3298 # by an opening paren, we have to remember which keyword
3299 # is associated with the next '('
3300 if ( $is_for_foreach{$tok} ) {
3301 if ( new_statement_ok() ) {
3306 # recognize 'use' statements, which are special
3307 elsif ( $is_use_require{$tok} ) {
3308 $statement_type = $tok;
3309 error_if_expecting_OPERATOR()
3310 if ( $expecting == OPERATOR );
3313 # remember my and our to check for trailing ": shared"
3314 elsif ( $is_my_our{$tok} ) {
3315 $statement_type = $tok;
3318 # Check for misplaced 'elsif' and 'else', but allow isolated
3319 # else or elsif blocks to be formatted. This is indicated
3320 # by a last noblank token of ';'
3321 elsif ( $tok eq 'elsif' ) {
3322 if ( $last_nonblank_token ne ';'
3323 && $last_nonblank_block_type !~
3324 /^(if|elsif|unless)$/ )
3327 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
3331 elsif ( $tok eq 'else' ) {
3333 # patched for SWITCH/CASE
3335 $last_nonblank_token ne ';'
3336 && $last_nonblank_block_type !~
3337 /^(if|elsif|unless|case|when)$/
3339 # patch to avoid an unwanted error message for
3340 # the case of a parenless 'case' (RT 105484):
3341 # switch ( 1 ) { case x { 2 } else { } }
3342 && $statement_type !~
3343 /^(if|elsif|unless|case|when)$/
3347 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3351 elsif ( $tok eq 'continue' ) {
3352 if ( $last_nonblank_token ne ';'
3353 && $last_nonblank_block_type !~
3354 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
3357 # note: ';' '{' and '}' in list above
3358 # because continues can follow bare blocks;
3359 # ':' is labeled block
3361 ############################################
3362 # NOTE: This check has been deactivated because
3363 # continue has an alternative usage for given/when
3364 # blocks in perl 5.10
3365 ## warning("'$tok' should follow a block\n");
3366 ############################################
3370 # patch for SWITCH/CASE if 'case' and 'when are
3371 # treated as keywords.
3372 elsif ( $tok eq 'when' || $tok eq 'case' ) {
3373 $statement_type = $tok; # next '{' is block
3377 # indent trailing if/unless/while/until
3378 # outdenting will be handled by later indentation loop
3379 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
3391 ## if ( $tok =~ /^(if|unless|while|until)$/
3392 ## && $next_nonblank_token ne '(' )
3394 ## $indent_flag = 1;
3398 # check for inline label following
3399 # /^(redo|last|next|goto)$/
3400 elsif (( $last_nonblank_type eq 'k' )
3401 && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
3410 scan_bare_identifier();
3411 if ( $type eq 'w' ) {
3413 if ( $expecting == OPERATOR ) {
3415 # don't complain about possible indirect object
3419 # sub new($) { ... }
3420 # $b = new A::; # calls A::new
3421 # $c = new A; # same thing but suspicious
3422 # This will call A::new but we have a 'new' in
3423 # main:: which looks like a constant.
3425 if ( $last_nonblank_type eq 'C' ) {
3426 if ( $tok !~ /::$/ ) {
3428 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3429 Maybe indirectet object notation?
3434 error_if_expecting_OPERATOR("bareword");
3438 # mark bare words immediately followed by a paren as
3440 $next_tok = $rtokens->[ $i + 1 ];
3441 if ( $next_tok eq '(' ) {
3445 # underscore after file test operator is file handle
3446 if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3450 # patch for SWITCH/CASE if 'case' and 'when are
3451 # not treated as keywords:
3455 && $brace_type[$brace_depth] eq 'switch'
3458 && $brace_type[$brace_depth] eq 'given' )
3461 $statement_type = $tok; # next '{' is block
3462 $type = 'k'; # for keyword syntax coloring
3465 # patch for SWITCH/CASE if switch and given not keywords
3466 # Switch is not a perl 5 keyword, but we will gamble
3467 # and mark switch followed by paren as a keyword. This
3468 # is only necessary to get html syntax coloring nice,
3469 # and does not commit this as being a switch/case.
3470 if ( $next_nonblank_token eq '('
3471 && ( $tok eq 'switch' || $tok eq 'given' ) )
3473 $type = 'k'; # for keyword syntax coloring
3479 ###############################################################
3480 # section 2: strings of digits
3481 ###############################################################
3482 elsif ( $pre_type eq 'd' ) {
3483 $expecting = operator_expected( $prev_type, $tok, $next_type );
3484 error_if_expecting_OPERATOR("Number")
3485 if ( $expecting == OPERATOR );
3486 my $number = scan_number();
3487 if ( !defined($number) ) {
3489 # shouldn't happen - we should always get a number
3490 warning("non-number beginning with digit--program bug\n");
3491 report_definite_bug();
3495 ###############################################################
3496 # section 3: all other tokens
3497 ###############################################################
3500 last if ( $tok eq '#' );
3501 my $code = $tokenization_code->{$tok};
3504 operator_expected( $prev_type, $tok, $next_type );
3511 # -----------------------------
3512 # end of main tokenization loop
3513 # -----------------------------
3515 if ( $i_tok >= 0 ) {
3516 $routput_token_type->[$i_tok] = $type;
3517 $routput_block_type->[$i_tok] = $block_type;
3518 $routput_container_type->[$i_tok] = $container_type;
3519 $routput_type_sequence->[$i_tok] = $type_sequence;
3520 $routput_indent_flag->[$i_tok] = $indent_flag;
3523 unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
3524 $last_last_nonblank_token = $last_nonblank_token;
3525 $last_last_nonblank_type = $last_nonblank_type;
3526 $last_last_nonblank_block_type = $last_nonblank_block_type;
3527 $last_last_nonblank_container_type = $last_nonblank_container_type;
3528 $last_last_nonblank_type_sequence = $last_nonblank_type_sequence;
3529 $last_nonblank_token = $tok;
3530 $last_nonblank_type = $type;
3531 $last_nonblank_block_type = $block_type;
3532 $last_nonblank_container_type = $container_type;
3533 $last_nonblank_type_sequence = $type_sequence;
3534 $last_nonblank_prototype = $prototype;
3537 # reset indentation level if necessary at a sub or package
3538 # in an attempt to recover from a nesting error
3539 if ( $level_in_tokenizer < 0 ) {
3540 if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
3541 reset_indentation_level(0);
3542 brace_warning("resetting level to 0 at $1 $2\n");
3546 # all done tokenizing this line ...
3547 # now prepare the final list of tokens and types
3549 my @token_type = (); # stack of output token types
3550 my @block_type = (); # stack of output code block types
3551 my @container_type = (); # stack of output code container types
3552 my @type_sequence = (); # stack of output type sequence numbers
3553 my @tokens = (); # output tokens
3554 my @levels = (); # structural brace levels of output tokens
3555 my @slevels = (); # secondary nesting levels of output tokens
3556 my @nesting_tokens = (); # string of tokens leading to this depth
3557 my @nesting_types = (); # string of token types leading to this depth
3558 my @nesting_blocks = (); # string of block types leading to this depth
3559 my @nesting_lists = (); # string of list types leading to this depth
3560 my @ci_string = (); # string needed to compute continuation indentation
3561 my @container_environment = (); # BLOCK or LIST
3562 my $container_environment = '';
3563 my $im = -1; # previous $i value
3565 my $ci_string_sum = ones_count($ci_string_in_tokenizer);
3567 # Computing Token Indentation
3569 # The final section of the tokenizer forms tokens and also computes
3570 # parameters needed to find indentation. It is much easier to do it
3571 # in the tokenizer than elsewhere. Here is a brief description of how
3572 # indentation is computed. Perl::Tidy computes indentation as the sum
3575 # (1) structural indentation, such as if/else/elsif blocks
3576 # (2) continuation indentation, such as long parameter call lists.
3578 # These are occasionally called primary and secondary indentation.
3580 # Structural indentation is introduced by tokens of type '{', although
3581 # the actual tokens might be '{', '(', or '['. Structural indentation
3582 # is of two types: BLOCK and non-BLOCK. Default structural indentation
3583 # is 4 characters if the standard indentation scheme is used.
3585 # Continuation indentation is introduced whenever a line at BLOCK level
3586 # is broken before its termination. Default continuation indentation
3587 # is 2 characters in the standard indentation scheme.
3589 # Both types of indentation may be nested arbitrarily deep and
3590 # interlaced. The distinction between the two is somewhat arbitrary.
3592 # For each token, we will define two variables which would apply if
3593 # the current statement were broken just before that token, so that
3594 # that token started a new line:
3596 # $level = the structural indentation level,
3597 # $ci_level = the continuation indentation level
3599 # The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
3600 # assuming defaults. However, in some special cases it is customary
3601 # to modify $ci_level from this strict value.
3603 # The total structural indentation is easy to compute by adding and
3604 # subtracting 1 from a saved value as types '{' and '}' are seen. The
3605 # running value of this variable is $level_in_tokenizer.
3607 # The total continuation is much more difficult to compute, and requires
3608 # several variables. These variables are:
3610 # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
3611 # each indentation level, if there are intervening open secondary
3612 # structures just prior to that level.
3613 # $continuation_string_in_tokenizer = a string of 1's and 0's indicating
3614 # if the last token at that level is "continued", meaning that it
3615 # is not the first token of an expression.
3616 # $nesting_block_string = a string of 1's and 0's indicating, for each
3617 # indentation level, if the level is of type BLOCK or not.
3618 # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
3619 # $nesting_list_string = a string of 1's and 0's indicating, for each
3620 # indentation level, if it is appropriate for list formatting.
3621 # If so, continuation indentation is used to indent long list items.
3622 # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
3623 # @{$rslevel_stack} = a stack of total nesting depths at each
3624 # structural indentation level, where "total nesting depth" means
3625 # the nesting depth that would occur if every nesting token -- '{', '[',
3626 # and '(' -- , regardless of context, is used to compute a nesting
3629 #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
3630 #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
3632 my ( $ci_string_i, $level_i, $nesting_block_string_i,
3633 $nesting_list_string_i, $nesting_token_string_i,
3634 $nesting_type_string_i, );
3636 foreach my $i ( @{$routput_token_list} )
3637 { # scan the list of pre-tokens indexes
3639 # self-checking for valid token types
3640 my $type = $routput_token_type->[$i];
3641 my $forced_indentation_flag = $routput_indent_flag->[$i];
3643 # See if we should undo the $forced_indentation_flag.
3644 # Forced indentation after 'if', 'unless', 'while' and 'until'
3645 # expressions without trailing parens is optional and doesn't
3646 # always look good. It is usually okay for a trailing logical
3647 # expression, but if the expression is a function call, code block,
3648 # or some kind of list it puts in an unwanted extra indentation
3649 # level which is hard to remove.
3651 # Example where extra indentation looks ok:
3653 # if $det_a < 0 and $det_b > 0
3654 # or $det_a > 0 and $det_b < 0;
3656 # Example where extra indentation is not needed because
3657 # the eval brace also provides indentation:
3658 # print "not " if defined eval {
3659 # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
3662 # The following rule works fairly well:
3663 # Undo the flag if the end of this line, or start of the next
3664 # line, is an opening container token or a comma.
3665 # This almost always works, but if not after another pass it will
3667 if ( $forced_indentation_flag && $type eq 'k' ) {
3669 my $ilast = $routput_token_list->[$ixlast];
3670 my $toklast = $routput_token_type->[$ilast];
3671 if ( $toklast eq '#' ) {
3673 $ilast = $routput_token_list->[$ixlast];
3674 $toklast = $routput_token_type->[$ilast];
3676 if ( $toklast eq 'b' ) {
3678 $ilast = $routput_token_list->[$ixlast];
3679 $toklast = $routput_token_type->[$ilast];
3681 if ( $toklast =~ /^[\{,]$/ ) {
3682 $forced_indentation_flag = 0;
3685 ( $toklast, my $i_next ) =
3686 find_next_nonblank_token( $max_token_index, $rtokens,
3688 if ( $toklast =~ /^[\{,]$/ ) {
3689 $forced_indentation_flag = 0;
3694 # if we are already in an indented if, see if we should outdent
3695 if ($indented_if_level) {
3697 # don't try to nest trailing if's - shouldn't happen
3698 if ( $type eq 'k' ) {
3699 $forced_indentation_flag = 0;
3702 # check for the normal case - outdenting at next ';'
3703 elsif ( $type eq ';' ) {
3704 if ( $level_in_tokenizer == $indented_if_level ) {
3705 $forced_indentation_flag = -1;
3706 $indented_if_level = 0;
3710 # handle case of missing semicolon
3711 elsif ( $type eq '}' ) {
3712 if ( $level_in_tokenizer == $indented_if_level ) {
3713 $indented_if_level = 0;
3715 # TBD: This could be a subroutine call
3716 $level_in_tokenizer--;
3717 if ( @{$rslevel_stack} > 1 ) {
3718 pop( @{$rslevel_stack} );
3720 if ( length($nesting_block_string) > 1 )
3721 { # true for valid script
3722 chop $nesting_block_string;
3723 chop $nesting_list_string;
3730 my $tok = $rtokens->[$i]; # the token, but ONLY if same as pretoken
3731 $level_i = $level_in_tokenizer;
3733 # This can happen by running perltidy on non-scripts
3734 # although it could also be bug introduced by programming change.
3735 # Perl silently accepts a 032 (^Z) and takes it as the end
3736 if ( !$is_valid_token_type{$type} ) {
3737 my $val = ord($type);
3739 "unexpected character decimal $val ($type) in script\n");
3740 $tokenizer_self->{_in_error} = 1;
3743 # ----------------------------------------------------------------
3744 # TOKEN TYPE PATCHES
3745 # output __END__, __DATA__, and format as type 'k' instead of ';'
3746 # to make html colors correct, etc.
3747 my $fix_type = $type;
3748 if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
3750 # output anonymous 'sub' as keyword
3751 if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
3753 # -----------------------------------------------------------------
3755 $nesting_token_string_i = $nesting_token_string;
3756 $nesting_type_string_i = $nesting_type_string;
3757 $nesting_block_string_i = $nesting_block_string;
3758 $nesting_list_string_i = $nesting_list_string;
3760 # set primary indentation levels based on structural braces
3761 # Note: these are set so that the leading braces have a HIGHER
3762 # level than their CONTENTS, which is convenient for indentation
3763 # Also, define continuation indentation for each token.
3764 if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
3767 # use environment before updating
3768 $container_environment =
3769 $nesting_block_flag ? 'BLOCK'
3770 : $nesting_list_flag ? 'LIST'
3773 # if the difference between total nesting levels is not 1,
3774 # there are intervening non-structural nesting types between
3775 # this '{' and the previous unclosed '{'
3776 my $intervening_secondary_structure = 0;
3777 if ( @{$rslevel_stack} ) {
3778 $intervening_secondary_structure =
3779 $slevel_in_tokenizer - $rslevel_stack->[-1];
3782 # Continuation Indentation
3784 # Having tried setting continuation indentation both in the formatter and
3785 # in the tokenizer, I can say that setting it in the tokenizer is much,
3786 # much easier. The formatter already has too much to do, and can't
3787 # make decisions on line breaks without knowing what 'ci' will be at
3788 # arbitrary locations.
3790 # But a problem with setting the continuation indentation (ci) here
3791 # in the tokenizer is that we do not know where line breaks will actually
3792 # be. As a result, we don't know if we should propagate continuation
3793 # indentation to higher levels of structure.
3795 # For nesting of only structural indentation, we never need to do this.
3796 # For example, in a long if statement, like this
3798 # if ( !$output_block_type[$i]
3799 # && ($in_statement_continuation) )
3804 # the second line has ci but we do normally give the lines within the BLOCK
3805 # any ci. This would be true if we had blocks nested arbitrarily deeply.
3807 # But consider something like this, where we have created a break after
3808 # an opening paren on line 1, and the paren is not (currently) a
3809 # structural indentation token:
3811 # my $file = $menubar->Menubutton(
3812 # qw/-text File -underline 0 -menuitems/ => [
3814 # Cascade => '~View',
3818 # The second line has ci, so it would seem reasonable to propagate it
3819 # down, giving the third line 1 ci + 1 indentation. This suggests the
3820 # following rule, which is currently used to propagating ci down: if there
3821 # are any non-structural opening parens (or brackets, or braces), before
3822 # an opening structural brace, then ci is propagated down, and otherwise
3823 # not. The variable $intervening_secondary_structure contains this
3824 # information for the current token, and the string
3825 # "$ci_string_in_tokenizer" is a stack of previous values of this
3828 # save the current states
3829 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
3830 $level_in_tokenizer++;
3832 if ($forced_indentation_flag) {
3834 # break BEFORE '?' when there is forced indentation
3835 if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
3836 if ( $type eq 'k' ) {
3837 $indented_if_level = $level_in_tokenizer;
3840 # do not change container environment here if we are not
3841 # at a real list. Adding this check prevents "blinkers"
3842 # often near 'unless" clauses, such as in the following
3847 ## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
3850 $nesting_block_string .= "$nesting_block_flag";
3854 if ( $routput_block_type->[$i] ) {
3855 $nesting_block_flag = 1;
3856 $nesting_block_string .= '1';
3859 $nesting_block_flag = 0;
3860 $nesting_block_string .= '0';
3864 # we will use continuation indentation within containers
3865 # which are not blocks and not logical expressions
3867 if ( !$routput_block_type->[$i] ) {
3869 # propagate flag down at nested open parens
3870 if ( $routput_container_type->[$i] eq '(' ) {
3871 $bit = 1 if $nesting_list_flag;
3874 # use list continuation if not a logical grouping
3875 # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
3879 $is_logical_container{ $routput_container_type->[$i]
3883 $nesting_list_string .= $bit;
3884 $nesting_list_flag = $bit;
3886 $ci_string_in_tokenizer .=
3887 ( $intervening_secondary_structure != 0 ) ? '1' : '0';
3888 $ci_string_sum = ones_count($ci_string_in_tokenizer);
3889 $continuation_string_in_tokenizer .=
3890 ( $in_statement_continuation > 0 ) ? '1' : '0';
3892 # Sometimes we want to give an opening brace continuation indentation,
3893 # and sometimes not. For code blocks, we don't do it, so that the leading
3894 # '{' gets outdented, like this:
3896 # if ( !$output_block_type[$i]
3897 # && ($in_statement_continuation) )
3900 # For other types, we will give them continuation indentation. For example,
3901 # here is how a list looks with the opening paren indented:
3904 # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
3905 # [ "homer", "marge", "bart" ], );
3907 # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4)
3909 my $total_ci = $ci_string_sum;
3911 !$routput_block_type->[$i] # patch: skip for BLOCK
3912 && ($in_statement_continuation)
3913 && !( $forced_indentation_flag && $type eq ':' )
3916 $total_ci += $in_statement_continuation
3917 unless ( $ci_string_in_tokenizer =~ /1$/ );
3920 $ci_string_i = $total_ci;
3921 $in_statement_continuation = 0;
3926 || $forced_indentation_flag < 0 )
3929 # only a nesting error in the script would prevent popping here
3930 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
3932 $level_i = --$level_in_tokenizer;
3934 # restore previous level values
3935 if ( length($nesting_block_string) > 1 )
3936 { # true for valid script
3937 chop $nesting_block_string;
3938 $nesting_block_flag = ( $nesting_block_string =~ /1$/ );
3939 chop $nesting_list_string;
3940 $nesting_list_flag = ( $nesting_list_string =~ /1$/ );
3942 chop $ci_string_in_tokenizer;
3943 $ci_string_sum = ones_count($ci_string_in_tokenizer);
3945 $in_statement_continuation =
3946 chop $continuation_string_in_tokenizer;
3948 # zero continuation flag at terminal BLOCK '}' which
3950 if ( $routput_block_type->[$i] ) {
3952 # ...These include non-anonymous subs
3953 # note: could be sub ::abc { or sub 'abc
3954 if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
3956 # note: older versions of perl require the /gc modifier
3957 # here or else the \G does not work.
3958 if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
3960 $in_statement_continuation = 0;
3964 # ...and include all block types except user subs with
3965 # block prototypes and these: (sort|grep|map|do|eval)
3966 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
3968 $is_zero_continuation_block_type{
3969 $routput_block_type->[$i]
3972 $in_statement_continuation = 0;
3975 # ..but these are not terminal types:
3976 # /^(sort|grep|map|do|eval)$/ )
3978 $is_not_zero_continuation_block_type{
3979 $routput_block_type->[$i]
3984 # ..and a block introduced by a label
3986 elsif ( $routput_block_type->[$i] =~ /:$/ ) {
3987 $in_statement_continuation = 0;
3990 # user function with block prototype
3992 $in_statement_continuation = 0;
3996 # If we are in a list, then
3997 # we must set continuation indentation at the closing
3998 # paren of something like this (paren after $check):
4001 # ( not defined $check )
4003 # or $check eq "new"
4004 # or $check eq "old",
4006 elsif ( $tok eq ')' ) {
4007 $in_statement_continuation = 1
4008 if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
4011 elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
4014 # use environment after updating
4015 $container_environment =
4016 $nesting_block_flag ? 'BLOCK'
4017 : $nesting_list_flag ? 'LIST'
4019 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4020 $nesting_block_string_i = $nesting_block_string;
4021 $nesting_list_string_i = $nesting_list_string;
4024 # not a structural indentation type..
4027 $container_environment =
4028 $nesting_block_flag ? 'BLOCK'
4029 : $nesting_list_flag ? 'LIST'
4032 # zero the continuation indentation at certain tokens so
4033 # that they will be at the same level as its container. For
4034 # commas, this simplifies the -lp indentation logic, which
4035 # counts commas. For ?: it makes them stand out.
4036 if ($nesting_list_flag) {
4037 if ( $type =~ /^[,\?\:]$/ ) {
4038 $in_statement_continuation = 0;
4042 # be sure binary operators get continuation indentation
4044 $container_environment
4045 && ( $type eq 'k' && $is_binary_keyword{$tok}
4046 || $is_binary_type{$type} )
4049 $in_statement_continuation = 1;
4052 # continuation indentation is sum of any open ci from previous
4053 # levels plus the current level
4054 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4056 # update continuation flag ...
4057 # if this isn't a blank or comment..
4058 if ( $type ne 'b' && $type ne '#' ) {
4060 # and we are in a BLOCK
4061 if ($nesting_block_flag) {
4063 # the next token after a ';' and label starts a new stmt
4064 if ( $type eq ';' || $type eq 'J' ) {
4065 $in_statement_continuation = 0;
4068 # otherwise, we are continuing the current statement
4070 $in_statement_continuation = 1;
4074 # if we are not in a BLOCK..
4077 # do not use continuation indentation if not list
4078 # environment (could be within if/elsif clause)
4079 if ( !$nesting_list_flag ) {
4080 $in_statement_continuation = 0;
4083 # otherwise, the token after a ',' starts a new term
4085 # Patch FOR RT#99961; no continuation after a ';'
4086 # This is needed because perltidy currently marks
4087 # a block preceded by a type character like % or @
4088 # as a non block, to simplify formatting. But these
4089 # are actually blocks and can have semicolons.
4090 # See code_block_type() and is_non_structural_brace().
4091 elsif ( $type eq ',' || $type eq ';' ) {
4092 $in_statement_continuation = 0;
4095 # otherwise, we are continuing the current term
4097 $in_statement_continuation = 1;
4103 if ( $level_in_tokenizer < 0 ) {
4104 unless ( $tokenizer_self->{_saw_negative_indentation} ) {
4105 $tokenizer_self->{_saw_negative_indentation} = 1;
4106 warning("Starting negative indentation\n");
4110 # set secondary nesting levels based on all containment token types
4111 # Note: these are set so that the nesting depth is the depth
4112 # of the PREVIOUS TOKEN, which is convenient for setting
4113 # the strength of token bonds
4114 my $slevel_i = $slevel_in_tokenizer;
4117 if ( $is_opening_type{$type} ) {
4118 $slevel_in_tokenizer++;
4119 $nesting_token_string .= $tok;
4120 $nesting_type_string .= $type;
4124 elsif ( $is_closing_type{$type} ) {
4125 $slevel_in_tokenizer--;
4126 my $char = chop $nesting_token_string;
4128 if ( $char ne $matching_start_token{$tok} ) {
4129 $nesting_token_string .= $char . $tok;
4130 $nesting_type_string .= $type;
4133 chop $nesting_type_string;
4137 push( @block_type, $routput_block_type->[$i] );
4138 push( @ci_string, $ci_string_i );
4139 push( @container_environment, $container_environment );
4140 push( @container_type, $routput_container_type->[$i] );
4141 push( @levels, $level_i );
4142 push( @nesting_tokens, $nesting_token_string_i );
4143 push( @nesting_types, $nesting_type_string_i );
4144 push( @slevels, $slevel_i );
4145 push( @token_type, $fix_type );
4146 push( @type_sequence, $routput_type_sequence->[$i] );
4147 push( @nesting_blocks, $nesting_block_string );
4148 push( @nesting_lists, $nesting_list_string );
4150 # now form the previous token
4153 $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters
4157 substr( $input_line, $rtoken_map->[$im], $num ) );
4163 $num = length($input_line) - $rtoken_map->[$im]; # make the last token
4165 push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
4168 $tokenizer_self->{_in_attribute_list} = $in_attribute_list;
4169 $tokenizer_self->{_in_quote} = $in_quote;
4170 $tokenizer_self->{_quote_target} =
4171 $in_quote ? matching_end_token($quote_character) : "";
4172 $tokenizer_self->{_rhere_target_list} = $rhere_target_list;
4174 $line_of_tokens->{_rtoken_type} = \@token_type;
4175 $line_of_tokens->{_rtokens} = \@tokens;
4176 $line_of_tokens->{_rblock_type} = \@block_type;
4177 $line_of_tokens->{_rcontainer_type} = \@container_type;
4178 $line_of_tokens->{_rcontainer_environment} = \@container_environment;
4179 $line_of_tokens->{_rtype_sequence} = \@type_sequence;
4180 $line_of_tokens->{_rlevels} = \@levels;
4181 $line_of_tokens->{_rslevels} = \@slevels;
4182 $line_of_tokens->{_rnesting_tokens} = \@nesting_tokens;
4183 $line_of_tokens->{_rci_levels} = \@ci_string;
4184 $line_of_tokens->{_rnesting_blocks} = \@nesting_blocks;
4188 } # end tokenize_this_line
4190 #########i#############################################################
4191 # Tokenizer routines which assist in identifying token types
4192 #######################################################################
4194 sub operator_expected {
4196 # Many perl symbols have two or more meanings. For example, '<<'
4197 # can be a shift operator or a here-doc operator. The
4198 # interpretation of these symbols depends on the current state of
4199 # the tokenizer, which may either be expecting a term or an
4200 # operator. For this example, a << would be a shift if an operator
4201 # is expected, and a here-doc if a term is expected. This routine
4202 # is called to make this decision for any current token. It returns
4203 # one of three possible values:
4205 # OPERATOR - operator expected (or at least, not a term)
4206 # UNKNOWN - can't tell
4207 # TERM - a term is expected (or at least, not an operator)
4209 # The decision is based on what has been seen so far. This
4210 # information is stored in the "$last_nonblank_type" and
4211 # "$last_nonblank_token" variables. For example, if the
4212 # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
4213 # if $last_nonblank_type is 'n' (numeric), we are expecting an
4216 # If a UNKNOWN is returned, the calling routine must guess. A major
4217 # goal of this tokenizer is to minimize the possibility of returning
4218 # UNKNOWN, because a wrong guess can spoil the formatting of a
4221 # adding NEW_TOKENS: it is critically important that this routine be
4222 # updated to allow it to determine if an operator or term is to be
4223 # expected after the new token. Doing this simply involves adding
4224 # the new token character to one of the regexes in this routine or
4225 # to one of the hash lists
4226 # that it uses, which are initialized in the BEGIN section.
4227 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
4230 my ( $prev_type, $tok, $next_type ) = @_;
4232 my $op_expected = UNKNOWN;
4234 ##print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
4236 # Note: function prototype is available for token type 'U' for future
4237 # program development. It contains the leading and trailing parens,
4238 # and no blanks. It might be used to eliminate token type 'C', for
4239 # example (prototype = '()'). Thus:
4240 # if ($last_nonblank_type eq 'U') {
4241 # print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
4244 # A possible filehandle (or object) requires some care...
4245 if ( $last_nonblank_type eq 'Z' ) {
4248 if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
4249 $op_expected = UNKNOWN;
4252 # For possible file handle like "$a", Perl uses weird parsing rules.
4254 # print $a/2,"/hi"; - division
4255 # print $a / 2,"/hi"; - division
4256 # print $a/ 2,"/hi"; - division
4257 # print $a /2,"/hi"; - pattern (and error)!
4258 elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
4259 $op_expected = TERM;
4262 # Note when an operation is being done where a
4263 # filehandle might be expected, since a change in whitespace
4264 # could change the interpretation of the statement.
4266 if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
4268 # Do not complain in 'use' statements, which have special syntax.
4269 # For example, from RT#130344:
4270 # use lib $FindBin::Bin . '/lib';
4271 if ( $statement_type ne 'use' ) {
4272 complain("operator in print statement not recommended\n");
4274 $op_expected = OPERATOR;
4279 # Check for smartmatch operator before preceding brace or square bracket.
4280 # For example, at the ? after the ] in the following expressions we are
4281 # expecting an operator:
4283 # qr/3/ ~~ ['1234'] ? 1 : 0;
4284 # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
4285 elsif ( $last_nonblank_type eq '}' && $last_nonblank_token eq '~~' ) {
4286 $op_expected = OPERATOR;
4289 # handle something after 'do' and 'eval'
4290 elsif ( $is_block_operator{$last_nonblank_token} ) {
4292 # something like $a = eval "expression";
4294 if ( $last_nonblank_type eq 'k' ) {
4295 $op_expected = TERM; # expression or list mode following keyword
4298 # something like $a = do { BLOCK } / 2;
4299 # or this ? after a smartmatch anonynmous hash or array reference:
4300 # qr/3/ ~~ ['1234'] ? 1 : 0;
4303 $op_expected = OPERATOR; # block mode following }
4307 # handle bare word..
4308 elsif ( $last_nonblank_type eq 'w' ) {
4310 # unfortunately, we can't tell what type of token to expect next
4311 # after most bare words
4312 $op_expected = UNKNOWN;
4315 # operator, but not term possible after these types
4316 # Note: moved ')' from type to token because parens in list context
4317 # get marked as '{' '}' now. This is a minor glitch in the following:
4318 # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
4320 elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
4321 || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
4323 $op_expected = OPERATOR;
4325 # in a 'use' statement, numbers and v-strings are not true
4326 # numbers, so to avoid incorrect error messages, we will
4327 # mark them as unknown for now (use.t)
4328 # TODO: it would be much nicer to create a new token V for VERSION
4329 # number in a use statement. Then this could be a check on type V
4330 # and related patches which change $statement_type for '=>'
4331 # and ',' could be removed. Further, it would clean things up to
4332 # scan the 'use' statement with a separate subroutine.
4333 if ( ( $statement_type eq 'use' )
4334 && ( $last_nonblank_type =~ /^[nv]$/ ) )
4336 $op_expected = UNKNOWN;
4339 # expecting VERSION or {} after package NAMESPACE
4340 elsif ($statement_type =~ /^package\b/
4341 && $last_nonblank_token =~ /^package\b/ )
4343 $op_expected = TERM;
4347 # no operator after many keywords, such as "die", "warn", etc
4348 elsif ( $expecting_term_token{$last_nonblank_token} ) {
4350 # // may follow perl functions which may be unary operators
4351 # see test file dor.t (defined or);
4353 && $next_type eq '/'
4354 && $last_nonblank_type eq 'k'
4355 && $is_keyword_taking_optional_args{$last_nonblank_token} )
4357 $op_expected = OPERATOR;
4360 $op_expected = TERM;
4364 # no operator after things like + - ** (i.e., other operators)
4365 elsif ( $expecting_term_types{$last_nonblank_type} ) {
4366 $op_expected = TERM;
4369 # a few operators, like "time", have an empty prototype () and so
4370 # take no parameters but produce a value to operate on
4371 elsif ( $expecting_operator_token{$last_nonblank_token} ) {
4372 $op_expected = OPERATOR;
4375 # post-increment and decrement produce values to be operated on
4376 elsif ( $expecting_operator_types{$last_nonblank_type} ) {
4377 $op_expected = OPERATOR;
4380 # no value to operate on after sub block
4381 elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
4383 # a right brace here indicates the end of a simple block.
4384 # all non-structural right braces have type 'R'
4385 # all braces associated with block operator keywords have been given those
4386 # keywords as "last_nonblank_token" and caught above.
4387 # (This statement is order dependent, and must come after checking
4388 # $last_nonblank_token).
4389 elsif ( $last_nonblank_type eq '}' ) {
4391 # patch for dor.t (defined or).
4393 && $next_type eq '/'
4394 && $last_nonblank_token eq ']' )
4396 $op_expected = OPERATOR;
4399 # Patch for RT #116344: misparse a ternary operator after an anonymous
4401 # return ref {} ? 1 : 0;
4402 # The right brace should really be marked type 'R' in this case, and
4403 # it is safest to return an UNKNOWN here. Expecting a TERM will
4404 # cause the '?' to always be interpreted as a pattern delimiter
4405 # rather than introducing a ternary operator.
4406 elsif ( $tok eq '?' ) {
4407 $op_expected = UNKNOWN;
4410 $op_expected = TERM;
4414 # something else..what did I forget?
4417 # collecting diagnostics on unknown operator types..see what was missed
4418 $op_expected = UNKNOWN;
4420 "OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n"
4424 TOKENIZER_DEBUG_FLAG_EXPECT && do {
4426 "EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
4428 return $op_expected;
4431 sub new_statement_ok {
4433 # return true if the current token can start a new statement
4434 # USES GLOBAL VARIABLES: $last_nonblank_type
4436 return label_ok() # a label would be ok here
4438 || $last_nonblank_type eq 'J'; # or we follow a label
4444 # Decide if a bare word followed by a colon here is a label
4445 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4446 # $brace_depth, @brace_type
4448 # if it follows an opening or closing code block curly brace..
4449 if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
4450 && $last_nonblank_type eq $last_nonblank_token )
4453 # it is a label if and only if the curly encloses a code block
4454 return $brace_type[$brace_depth];
4457 # otherwise, it is a label if and only if it follows a ';' (real or fake)
4460 return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
4464 sub code_block_type {
4466 # Decide if this is a block of code, and its type.
4467 # Must be called only when $type = $token = '{'
4468 # The problem is to distinguish between the start of a block of code
4469 # and the start of an anonymous hash reference
4470 # Returns "" if not code block, otherwise returns 'last_nonblank_token'
4471 # to indicate the type of code block. (For example, 'last_nonblank_token'
4472 # might be 'if' for an if block, 'else' for an else block, etc).
4473 # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4474 # $last_nonblank_block_type, $brace_depth, @brace_type
4476 # handle case of multiple '{'s
4478 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
4480 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4481 if ( $last_nonblank_token eq '{'
4482 && $last_nonblank_type eq $last_nonblank_token )
4485 # opening brace where a statement may appear is probably
4486 # a code block but might be and anonymous hash reference
4487 if ( $brace_type[$brace_depth] ) {
4488 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4492 # cannot start a code block within an anonymous hash
4498 elsif ( $last_nonblank_token eq ';' ) {
4500 # an opening brace where a statement may appear is probably
4501 # a code block but might be and anonymous hash reference
4502 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4506 # handle case of '}{'
4507 elsif ($last_nonblank_token eq '}'
4508 && $last_nonblank_type eq $last_nonblank_token )
4511 # a } { situation ...
4512 # could be hash reference after code block..(blktype1.t)
4513 if ($last_nonblank_block_type) {
4514 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4518 # must be a block if it follows a closing hash reference
4520 return $last_nonblank_token;
4524 ################################################################
4525 # NOTE: braces after type characters start code blocks, but for
4526 # simplicity these are not identified as such. See also
4527 # sub is_non_structural_brace.
4528 ################################################################
4530 ## elsif ( $last_nonblank_type eq 't' ) {
4531 ## return $last_nonblank_token;
4534 # brace after label:
4535 elsif ( $last_nonblank_type eq 'J' ) {
4536 return $last_nonblank_token;
4539 # otherwise, look at previous token. This must be a code block if
4540 # it follows any of these:
4541 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
4542 elsif ( $is_code_block_token{$last_nonblank_token} ) {
4544 # Bug Patch: Note that the opening brace after the 'if' in the following
4545 # snippet is an anonymous hash ref and not a code block!
4546 # print 'hi' if { x => 1, }->{x};
4547 # We can identify this situation because the last nonblank type
4548 # will be a keyword (instead of a closing peren)
4549 if ( $last_nonblank_token =~ /^(if|unless)$/
4550 && $last_nonblank_type eq 'k' )
4555 return $last_nonblank_token;
4559 # or a sub or package BLOCK
4560 elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
4561 && $last_nonblank_token =~ /^(sub|package)\b/ )
4563 return $last_nonblank_token;
4567 elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
4568 && ( $is_sub{$last_nonblank_token} ) )
4573 elsif ( $statement_type =~ /^(sub|package)\b/ ) {
4574 return $statement_type;
4577 # user-defined subs with block parameters (like grep/map/eval)
4578 elsif ( $last_nonblank_type eq 'G' ) {
4579 return $last_nonblank_token;
4583 elsif ( $last_nonblank_type eq 'w' ) {
4584 return decide_if_code_block( $i, $rtokens, $rtoken_type,
4588 # Patch for bug # RT #94338 reported by Daniel Trizen
4589 # for-loop in a parenthesized block-map triggering an error message:
4590 # map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
4591 # Check for a code block within a parenthesized function call
4592 elsif ( $last_nonblank_token eq '(' ) {
4593 my $paren_type = $paren_type[$paren_depth];
4594 if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
4596 # We will mark this as a code block but use type 't' instead
4597 # of the name of the contining function. This will allow for
4598 # correct parsing but will usually produce better formatting.
4599 # Braces with block type 't' are not broken open automatically
4600 # in the formatter as are other code block types, and this usually
4602 return 't'; # (Not $paren_type)
4609 # handle unknown syntax ') {'
4610 # we previously appended a '()' to mark this case
4611 elsif ( $last_nonblank_token =~ /\(\)$/ ) {
4612 return $last_nonblank_token;
4615 # anything else must be anonymous hash reference
4621 sub decide_if_code_block {
4623 # USES GLOBAL VARIABLES: $last_nonblank_token
4624 my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4626 my ( $next_nonblank_token, $i_next ) =
4627 find_next_nonblank_token( $i, $rtokens, $max_token_index );
4629 # we are at a '{' where a statement may appear.
4630 # We must decide if this brace starts an anonymous hash or a code
4632 # return "" if anonymous hash, and $last_nonblank_token otherwise
4634 # initialize to be code BLOCK
4635 my $code_block_type = $last_nonblank_token;
4637 # Check for the common case of an empty anonymous hash reference:
4638 # Maybe something like sub { { } }
4639 if ( $next_nonblank_token eq '}' ) {
4640 $code_block_type = "";
4645 # To guess if this '{' is an anonymous hash reference, look ahead
4646 # and test as follows:
4648 # it is a hash reference if next come:
4649 # - a string or digit followed by a comma or =>
4650 # - bareword followed by =>
4651 # otherwise it is a code block
4653 # Examples of anonymous hash ref:
4657 # Examples of code blocks:
4658 # {1; print "hello\n", 1;}
4661 # We are only going to look ahead one more (nonblank/comment) line.
4662 # Strange formatting could cause a bad guess, but that's unlikely.
4666 # Ignore the rest of this line if it is a side comment
4667 if ( $next_nonblank_token ne '#' ) {
4668 @pre_types = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
4669 @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
4671 my ( $rpre_tokens, $rpre_types ) =
4672 peek_ahead_for_n_nonblank_pre_tokens(20); # 20 is arbitrary but
4673 # generous, and prevents
4675 # time in mangled files
4676 if ( defined($rpre_types) && @{$rpre_types} ) {
4677 push @pre_types, @{$rpre_types};
4678 push @pre_tokens, @{$rpre_tokens};
4681 # put a sentinel token to simplify stopping the search
4682 push @pre_types, '}';
4683 push @pre_types, '}';
4686 $jbeg = 1 if $pre_types[0] eq 'b';
4688 # first look for one of these
4690 # - bareword with leading -
4694 if ( $pre_types[$j] =~ /^[\'\"]/ ) {
4696 # find the closing quote; don't worry about escapes
4697 my $quote_mark = $pre_types[$j];
4698 foreach my $k ( $j + 1 .. @pre_types - 2 ) {
4699 if ( $pre_types[$k] eq $quote_mark ) {
4701 my $next = $pre_types[$j];
4706 elsif ( $pre_types[$j] eq 'd' ) {
4709 elsif ( $pre_types[$j] eq 'w' ) {
4712 elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
4717 $j++ if $pre_types[$j] eq 'b';
4719 # Patched for RT #95708
4722 # it is a comma which is not a pattern delimeter except for qw
4724 $pre_types[$j] eq ','
4725 && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
4729 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
4732 $code_block_type = "";
4737 return $code_block_type;
4740 sub report_unexpected {
4742 # report unexpected token type and show where it is
4743 # USES GLOBAL VARIABLES: $tokenizer_self
4744 my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
4745 $rpretoken_type, $input_line )
4748 if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) {
4749 my $msg = "found $found where $expecting expected";
4750 my $pos = $rpretoken_map->[$i_tok];
4751 interrupt_logfile();
4752 my $input_line_number = $tokenizer_self->{_last_line_number};
4753 my ( $offset, $numbered_line, $underline ) =
4754 make_numbered_line( $input_line_number, $input_line, $pos );
4755 $underline = write_on_underline( $underline, $pos - $offset, '^' );
4758 if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
4759 my $pos_prev = $rpretoken_map->[$last_nonblank_i];
4761 if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
4762 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
4765 $num = $pos - $pos_prev;
4767 if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
4770 write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
4771 $trailer = " (previous token underlined)";
4773 $underline =~ s/\s+$//;
4774 warning( $numbered_line . "\n" );
4775 warning( $underline . "\n" );
4776 warning( $msg . $trailer . "\n" );
4782 sub is_non_structural_brace {
4784 # Decide if a brace or bracket is structural or non-structural
4785 # by looking at the previous token and type
4786 # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
4788 # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
4789 # Tentatively deactivated because it caused the wrong operator expectation
4791 # $user = @vars[1] / 100;
4792 # Must update sub operator_expected before re-implementing.
4793 # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
4797 ################################################################
4798 # NOTE: braces after type characters start code blocks, but for
4799 # simplicity these are not identified as such. See also
4800 # sub code_block_type
4801 ################################################################
4803 ##if ($last_nonblank_type eq 't') {return 0}
4805 # otherwise, it is non-structural if it is decorated
4806 # by type information.
4807 # For example, the '{' here is non-structural: ${xxx}
4809 $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
4811 # or if we follow a hash or array closing curly brace or bracket
4812 # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
4813 # because the first '}' would have been given type 'R'
4814 || $last_nonblank_type =~ /^([R\]])$/
4818 #########i#############################################################
4819 # Tokenizer routines for tracking container nesting depths
4820 #######################################################################
4822 # The following routines keep track of nesting depths of the nesting
4823 # types, ( [ { and ?. This is necessary for determining the indentation
4824 # level, and also for debugging programs. Not only do they keep track of
4825 # nesting depths of the individual brace types, but they check that each
4826 # of the other brace types is balanced within matching pairs. For
4827 # example, if the program sees this sequence:
4831 # then it can determine that there is an extra left paren somewhere
4832 # between the { and the }. And so on with every other possible
4833 # combination of outer and inner brace types. For another
4838 # which has an extra ] within the parens.
4840 # The brace types have indexes 0 .. 3 which are indexes into
4843 # The pair ? : are treated as just another nesting type, with ? acting
4844 # as the opening brace and : acting as the closing brace.
4848 # $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
4850 # saves the nesting depth of brace type $b (where $b is either of the other
4851 # nesting types) when brace type $a enters a new depth. When this depth
4852 # decreases, a check is made that the current depth of brace types $b is
4853 # unchanged, or otherwise there must have been an error. This can
4854 # be very useful for localizing errors, particularly when perl runs to
4855 # the end of a large file (such as this one) and announces that there
4856 # is a problem somewhere.
4858 # A numerical sequence number is maintained for every nesting type,
4859 # so that each matching pair can be uniquely identified in a simple
4862 sub increase_nesting_depth {
4863 my ( $aa, $pos ) = @_;
4865 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4866 # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
4868 $current_depth[$aa]++;
4870 $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
4871 my $input_line_number = $tokenizer_self->{_last_line_number};
4872 my $input_line = $tokenizer_self->{_line_text};
4874 # Sequence numbers increment by number of items. This keeps
4875 # a unique set of numbers but still allows the relative location
4876 # of any type to be determined.
4877 $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
4878 my $seqno = $nesting_sequence_number[$aa];
4879 $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
4881 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
4882 [ $input_line_number, $input_line, $pos ];
4884 for my $bb ( 0 .. @closing_brace_names - 1 ) {
4885 next if ( $bb == $aa );
4886 $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
4889 # set a flag for indenting a nested ternary statement
4891 if ( $aa == QUESTION_COLON ) {
4892 $nested_ternary_flag[ $current_depth[$aa] ] = 0;
4893 if ( $current_depth[$aa] > 1 ) {
4894 if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
4895 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
4896 if ( $pdepth == $total_depth - 1 ) {
4898 $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
4903 $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
4904 $statement_type = "";
4905 return ( $seqno, $indent );
4908 sub decrease_nesting_depth {
4910 my ( $aa, $pos ) = @_;
4912 # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4913 # @current_sequence_number, @depth_array, @starting_line_of_current_depth
4916 my $input_line_number = $tokenizer_self->{_last_line_number};
4917 my $input_line = $tokenizer_self->{_line_text};
4921 if ( $current_depth[$aa] > 0 ) {
4923 # set a flag for un-indenting after seeing a nested ternary statement
4924 $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
4925 if ( $aa == QUESTION_COLON ) {
4926 $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
4928 $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
4930 # check that any brace types $bb contained within are balanced
4931 for my $bb ( 0 .. @closing_brace_names - 1 ) {
4932 next if ( $bb == $aa );
4934 unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
4935 $current_depth[$bb] )
4938 $current_depth[$bb] -
4939 $depth_array[$aa][$bb][ $current_depth[$aa] ];
4941 # don't whine too many times
4942 my $saw_brace_error = get_saw_brace_error();
4944 $saw_brace_error <= MAX_NAG_MESSAGES
4946 # if too many closing types have occurred, we probably
4947 # already caught this error
4948 && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
4951 interrupt_logfile();
4953 $starting_line_of_current_depth[$aa]
4954 [ $current_depth[$aa] ];
4956 my $rel = [ $input_line_number, $input_line, $pos ];
4960 if ( $diff == 1 || $diff == -1 ) {
4968 ? $opening_brace_names[$bb]
4969 : $closing_brace_names[$bb];
4970 write_error_indicator_pair( @{$rsl}, '^' );
4972 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
4977 $starting_line_of_current_depth[$bb]
4978 [ $current_depth[$bb] ];
4981 " The most recent un-matched $bname is on line $ml\n";
4982 write_error_indicator_pair( @{$rml}, '^' );
4984 write_error_indicator_pair( @{$rel}, '^' );
4988 increment_brace_error();
4991 $current_depth[$aa]--;
4995 my $saw_brace_error = get_saw_brace_error();
4996 if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
4998 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
5000 indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
5002 increment_brace_error();
5004 return ( $seqno, $outdent );
5007 sub check_final_nesting_depths {
5009 # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
5011 for my $aa ( 0 .. @closing_brace_names - 1 ) {
5013 if ( $current_depth[$aa] ) {
5015 $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
5018 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
5019 The most recent un-matched $opening_brace_names[$aa] is on line $sl
5021 indicate_error( $msg, @{$rsl}, '^' );
5022 increment_brace_error();
5028 #########i#############################################################
5029 # Tokenizer routines for looking ahead in input stream
5030 #######################################################################
5032 sub peek_ahead_for_n_nonblank_pre_tokens {
5034 # returns next n pretokens if they exist
5035 # returns undef's if hits eof without seeing any pretokens
5036 # USES GLOBAL VARIABLES: $tokenizer_self
5037 my $max_pretokens = shift;
5040 my ( $rpre_tokens, $rmap, $rpre_types );
5042 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
5044 $line =~ s/^\s*//; # trim leading blanks
5045 next if ( length($line) <= 0 ); # skip blank
5046 next if ( $line =~ /^#/ ); # skip comment
5047 ( $rpre_tokens, $rmap, $rpre_types ) =
5048 pre_tokenize( $line, $max_pretokens );
5051 return ( $rpre_tokens, $rpre_types );
5054 # look ahead for next non-blank, non-comment line of code
5055 sub peek_ahead_for_nonblank_token {
5057 # USES GLOBAL VARIABLES: $tokenizer_self
5058 my ( $rtokens, $max_token_index ) = @_;
5062 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
5064 $line =~ s/^\s*//; # trim leading blanks
5065 next if ( length($line) <= 0 ); # skip blank
5066 next if ( $line =~ /^#/ ); # skip comment
5067 my ( $rtok, $rmap, $rtype ) =
5068 pre_tokenize( $line, 2 ); # only need 2 pre-tokens
5069 my $j = $max_token_index + 1;
5071 foreach my $tok ( @{$rtok} ) {
5072 last if ( $tok =~ "\n" );
5073 $rtokens->[ ++$j ] = $tok;
5080 #########i#############################################################
5081 # Tokenizer guessing routines for ambiguous situations
5082 #######################################################################
5084 sub guess_if_pattern_or_conditional {
5086 # this routine is called when we have encountered a ? following an
5087 # unknown bareword, and we must decide if it starts a pattern or not
5089 # $i - token index of the ? starting possible pattern
5090 # output parameters:
5091 # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
5092 # msg = a warning or diagnostic message
5093 # USES GLOBAL VARIABLES: $last_nonblank_token
5095 # FIXME: this needs to be rewritten
5097 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5099 my $msg = "guessing that ? after $last_nonblank_token starts a ";
5101 if ( $i >= $max_token_index ) {
5102 $msg .= "conditional (no end to pattern found on the line)\n";
5107 my $next_token = $rtokens->[$i]; # first token after ?
5109 # look for a possible ending ? on this line..
5111 my $quote_depth = 0;
5112 my $quote_character = '';
5116 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5119 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5120 $quote_pos, $quote_depth, $max_token_index );
5124 # we didn't find an ending ? on this line,
5125 # so we bias towards conditional
5127 $msg .= "conditional (no ending ? on this line)\n";
5129 # we found an ending ?, so we bias towards a pattern
5133 # Watch out for an ending ? in quotes, like this
5134 # my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
5138 foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
5139 my $tok = $rtokens->[$ii];
5140 if ( $tok eq ":" ) { $colons++ }
5141 if ( $tok eq "'" ) { $s_quote++ }
5142 if ( $tok eq '"' ) { $d_quote++ }
5144 if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
5146 $msg .= "found ending ? but unbalanced quote chars\n";
5148 elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5150 $msg .= "pattern (found ending ? and pattern expected)\n";
5153 $msg .= "pattern (uncertain, but found ending ?)\n";
5157 return ( $is_pattern, $msg );
5160 sub guess_if_pattern_or_division {
5162 # this routine is called when we have encountered a / following an
5163 # unknown bareword, and we must decide if it starts a pattern or is a
5166 # $i - token index of the / starting possible pattern
5167 # output parameters:
5168 # $is_pattern = 0 if probably division, =1 if probably a pattern
5169 # msg = a warning or diagnostic message
5170 # USES GLOBAL VARIABLES: $last_nonblank_token
5171 my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5173 my $msg = "guessing that / after $last_nonblank_token starts a ";
5175 if ( $i >= $max_token_index ) {
5176 $msg .= "division (no end to pattern found on the line)\n";
5180 my $divide_expected =
5181 numerator_expected( $i, $rtokens, $max_token_index );
5183 my $next_token = $rtokens->[$i]; # first token after slash
5185 # look for a possible ending / on this line..
5187 my $quote_depth = 0;
5188 my $quote_character = '';
5192 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5195 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5196 $quote_pos, $quote_depth, $max_token_index );
5200 # we didn't find an ending / on this line,
5201 # so we bias towards division
5202 if ( $divide_expected >= 0 ) {
5204 $msg .= "division (no ending / on this line)\n";
5207 $msg = "multi-line pattern (division not possible)\n";
5213 # we found an ending /, so we bias towards a pattern
5216 if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5218 if ( $divide_expected >= 0 ) {
5220 if ( $i - $ibeg > 60 ) {
5221 $msg .= "division (matching / too distant)\n";
5225 $msg .= "pattern (but division possible too)\n";
5231 $msg .= "pattern (division not possible)\n";
5236 if ( $divide_expected >= 0 ) {
5238 $msg .= "division (pattern not possible)\n";
5243 "pattern (uncertain, but division would not work here)\n";
5248 return ( $is_pattern, $msg );
5251 # try to resolve here-doc vs. shift by looking ahead for
5252 # non-code or the end token (currently only looks for end token)
5253 # returns 1 if it is probably a here doc, 0 if not
5254 sub guess_if_here_doc {
5256 # This is how many lines we will search for a target as part of the
5257 # guessing strategy. It is a constant because there is probably
5258 # little reason to change it.
5259 # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
5261 my $HERE_DOC_WINDOW = 40;
5263 my $next_token = shift;
5264 my $here_doc_expected = 0;
5267 my $msg = "checking <<";
5269 while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
5273 if ( $line =~ /^$next_token$/ ) {
5274 $msg .= " -- found target $next_token ahead $k lines\n";
5275 $here_doc_expected = 1; # got it
5278 last if ( $k >= $HERE_DOC_WINDOW );
5281 unless ($here_doc_expected) {
5283 if ( !defined($line) ) {
5284 $here_doc_expected = -1; # hit eof without seeing target
5285 $msg .= " -- must be shift; target $next_token not in file\n";
5288 else { # still unsure..taking a wild guess
5290 if ( !$is_constant{$current_package}{$next_token} ) {
5291 $here_doc_expected = 1;
5293 " -- guessing it's a here-doc ($next_token not a constant)\n";
5297 " -- guessing it's a shift ($next_token is a constant)\n";
5301 write_logfile_entry($msg);
5302 return $here_doc_expected;
5305 #########i#############################################################
5306 # Tokenizer Routines for scanning identifiers and related items
5307 #######################################################################
5309 sub scan_bare_identifier_do {
5311 # this routine is called to scan a token starting with an alphanumeric
5312 # variable or package separator, :: or '.
5313 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
5314 # $last_nonblank_type,@paren_type, $paren_depth
5316 my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
5320 my $package = undef;
5324 # we have to back up one pretoken at a :: since each : is one pretoken
5325 if ( $tok eq '::' ) { $i_beg-- }
5326 if ( $tok eq '->' ) { $i_beg-- }
5327 my $pos_beg = $rtoken_map->[$i_beg];
5328 pos($input_line) = $pos_beg;
5335 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
5337 my $pos = pos($input_line);
5338 my $numc = $pos - $pos_beg;
5339 $tok = substr( $input_line, $pos_beg, $numc );
5341 # type 'w' includes anything without leading type info
5342 # ($,%,@,*) including something like abc::def::ghi
5346 if ( defined($2) ) { $sub_name = $2; }
5347 if ( defined($1) ) {
5350 # patch: don't allow isolated package name which just ends
5351 # in the old style package separator (single quote). Example:
5353 if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
5357 $package =~ s/\'/::/g;
5358 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5359 $package =~ s/::$//;
5362 $package = $current_package;
5364 if ( $is_keyword{$tok} ) {
5369 # if it is a bareword..
5370 if ( $type eq 'w' ) {
5372 # check for v-string with leading 'v' type character
5373 # (This seems to have precedence over filehandle, type 'Y')
5374 if ( $tok =~ /^v\d[_\d]*$/ ) {
5376 # we only have the first part - something like 'v101' -
5378 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
5379 $pos = pos($input_line);
5380 $numc = $pos - $pos_beg;
5381 $tok = substr( $input_line, $pos_beg, $numc );
5385 # warn if this version can't handle v-strings
5386 report_v_string($tok);
5389 elsif ( $is_constant{$package}{$sub_name} ) {
5393 # bareword after sort has implied empty prototype; for example:
5394 # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
5395 # This has priority over whatever the user has specified.
5396 elsif ($last_nonblank_token eq 'sort'
5397 && $last_nonblank_type eq 'k' )
5402 # Note: strangely, perl does not seem to really let you create
5403 # functions which act like eval and do, in the sense that eval
5404 # and do may have operators following the final }, but any operators
5405 # that you create with prototype (&) apparently do not allow
5406 # trailing operators, only terms. This seems strange.
5407 # If this ever changes, here is the update
5408 # to make perltidy behave accordingly:
5410 # elsif ( $is_block_function{$package}{$tok} ) {
5411 # $tok='eval'; # patch to do braces like eval - doesn't work
5414 # FIXME: This could become a separate type to allow for different
5416 elsif ( $is_block_function{$package}{$sub_name} ) {
5420 elsif ( $is_block_list_function{$package}{$sub_name} ) {
5423 elsif ( $is_user_function{$package}{$sub_name} ) {
5425 $prototype = $user_function_prototype{$package}{$sub_name};
5428 # check for indirect object
5431 # added 2001-03-27: must not be followed immediately by '('
5433 ( $input_line !~ m/\G\(/gc )
5438 # preceded by keyword like 'print', 'printf' and friends
5439 $is_indirect_object_taker{$last_nonblank_token}
5441 # or preceded by something like 'print(' or 'printf('
5443 ( $last_nonblank_token eq '(' )
5444 && $is_indirect_object_taker{ $paren_type[$paren_depth]
5452 # may not be indirect object unless followed by a space
5453 if ( $input_line =~ m/\G\s+/gc ) {
5457 # Perl's indirect object notation is a very bad
5458 # thing and can cause subtle bugs, especially for
5459 # beginning programmers. And I haven't even been
5460 # able to figure out a sane warning scheme which
5461 # doesn't get in the way of good scripts.
5463 # Complain if a filehandle has any lower case
5464 # letters. This is suggested good practice.
5465 # Use 'sub_name' because something like
5466 # main::MYHANDLE is ok for filehandle
5467 if ( $sub_name =~ /[a-z]/ ) {
5469 # could be bug caused by older perltidy if
5471 if ( $input_line =~ m/\G\s*\(/gc ) {
5473 "Caution: unknown word '$tok' in indirect object slot\n"
5479 # bareword not followed by a space -- may not be filehandle
5480 # (may be function call defined in a 'use' statement)
5487 # Now we must convert back from character position
5488 # to pre_token index.
5489 # I don't think an error flag can occur here ..but who knows
5492 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5494 warning("scan_bare_identifier: Possibly invalid tokenization\n");
5498 # no match but line not blank - could be syntax error
5499 # perl will take '::' alone without complaint
5503 # change this warning to log message if it becomes annoying
5504 warning("didn't find identifier after leading ::\n");
5506 return ( $i, $tok, $type, $prototype );
5511 # This is the new scanner and will eventually replace scan_identifier.
5512 # Only type 'sub' and 'package' are implemented.
5513 # Token types $ * % @ & -> are not yet implemented.
5515 # Scan identifier following a type token.
5516 # The type of call depends on $id_scan_state: $id_scan_state = ''
5517 # for starting call, in which case $tok must be the token defining
5520 # If the type token is the last nonblank token on the line, a value
5521 # of $id_scan_state = $tok is returned, indicating that further
5522 # calls must be made to get the identifier. If the type token is
5523 # not the last nonblank token on the line, the identifier is
5524 # scanned and handled and a value of '' is returned.
5525 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
5526 # $statement_type, $tokenizer_self
5528 my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
5532 my ( $i_beg, $pos_beg );
5534 #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5535 #my ($a,$b,$c) = caller;
5536 #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
5538 # on re-entry, start scanning at first token on the line
5539 if ($id_scan_state) {
5544 # on initial entry, start scanning just after type token
5547 $id_scan_state = $tok;
5551 # find $i_beg = index of next nonblank token,
5552 # and handle empty lines
5554 my $next_nonblank_token = $rtokens->[$i_beg];
5555 if ( $i_beg > $max_token_index ) {
5560 # only a '#' immediately after a '$' is not a comment
5561 if ( $next_nonblank_token eq '#' ) {
5562 unless ( $tok eq '$' ) {
5567 if ( $next_nonblank_token =~ /^\s/ ) {
5568 ( $next_nonblank_token, $i_beg ) =
5569 find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
5571 if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
5577 # handle non-blank line; identifier, if any, must follow
5578 unless ($blank_line) {
5580 if ( $is_sub{$id_scan_state} ) {
5581 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
5582 $input_line, $i, $i_beg,
5583 $tok, $type, $rtokens,
5584 $rtoken_map, $id_scan_state, $max_token_index
5588 elsif ( $is_package{$id_scan_state} ) {
5589 ( $i, $tok, $type ) =
5590 do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
5591 $rtoken_map, $max_token_index );
5592 $id_scan_state = '';
5596 warning("invalid token in scan_id: $tok\n");
5597 $id_scan_state = '';
5601 if ( $id_scan_state && ( !defined($type) || !$type ) ) {
5605 "Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
5607 report_definite_bug();
5610 TOKENIZER_DEBUG_FLAG_NSCAN && do {
5612 "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5614 return ( $i, $tok, $type, $id_scan_state );
5617 sub check_prototype {
5618 my ( $proto, $package, $subname ) = @_;
5619 return unless ( defined($package) && defined($subname) );
5620 if ( defined($proto) ) {
5621 $proto =~ s/^\s*\(\s*//;
5622 $proto =~ s/\s*\)$//;
5624 $is_user_function{$package}{$subname} = 1;
5625 $user_function_prototype{$package}{$subname} = "($proto)";
5627 # prototypes containing '&' must be treated specially..
5628 if ( $proto =~ /\&/ ) {
5630 # right curly braces of prototypes ending in
5631 # '&' may be followed by an operator
5632 if ( $proto =~ /\&$/ ) {
5633 $is_block_function{$package}{$subname} = 1;
5636 # right curly braces of prototypes NOT ending in
5637 # '&' may NOT be followed by an operator
5638 elsif ( $proto !~ /\&$/ ) {
5639 $is_block_list_function{$package}{$subname} = 1;
5644 $is_constant{$package}{$subname} = 1;
5648 $is_user_function{$package}{$subname} = 1;
5653 sub do_scan_package {
5655 # do_scan_package parses a package name
5656 # it is called with $i_beg equal to the index of the first nonblank
5657 # token following a 'package' token.
5658 # USES GLOBAL VARIABLES: $current_package,
5661 # package NAMESPACE VERSION
5662 # package NAMESPACE BLOCK
5663 # package NAMESPACE VERSION BLOCK
5665 # If VERSION is provided, package sets the $VERSION variable in the given
5666 # namespace to a version object with the VERSION provided. VERSION must be
5667 # a "strict" style version number as defined by the version module: a
5668 # positive decimal number (integer or decimal-fraction) without
5669 # exponentiation or else a dotted-decimal v-string with a leading 'v'
5670 # character and at least three components.
5671 # reference http://perldoc.perl.org/functions/package.html
5673 my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
5676 my $package = undef;
5677 my $pos_beg = $rtoken_map->[$i_beg];
5678 pos($input_line) = $pos_beg;
5680 # handle non-blank line; package name, if any, must follow
5681 if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) {
5683 $package = ( defined($1) && $1 ) ? $1 : 'main';
5684 $package =~ s/\'/::/g;
5685 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5686 $package =~ s/::$//;
5687 my $pos = pos($input_line);
5688 my $numc = $pos - $pos_beg;
5689 $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
5692 # Now we must convert back from character position
5693 # to pre_token index.
5694 # I don't think an error flag can occur here ..but ?
5697 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5698 if ($error) { warning("Possibly invalid package\n") }
5699 $current_package = $package;
5701 # we should now have package NAMESPACE
5702 # now expecting VERSION, BLOCK, or ; to follow ...
5703 # package NAMESPACE VERSION
5704 # package NAMESPACE BLOCK
5705 # package NAMESPACE VERSION BLOCK
5706 my ( $next_nonblank_token, $i_next ) =
5707 find_next_nonblank_token( $i, $rtokens, $max_token_index );
5709 # check that something recognizable follows, but do not parse.
5710 # A VERSION number will be parsed later as a number or v-string in the
5711 # normal way. What is important is to set the statement type if
5712 # everything looks okay so that the operator_expected() routine
5713 # knows that the number is in a package statement.
5714 # Examples of valid primitive tokens that might follow are:
5716 if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) {
5717 $statement_type = $tok;
5721 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
5726 # no match but line not blank --
5727 # could be a label with name package, like package: , for example.
5732 return ( $i, $tok, $type );
5735 sub scan_identifier_do {
5737 # This routine assembles tokens into identifiers. It maintains a
5738 # scan state, id_scan_state. It updates id_scan_state based upon
5739 # current id_scan_state and token, and returns an updated
5740 # id_scan_state and the next index after the identifier.
5741 # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
5742 # $last_nonblank_type
5744 my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
5745 $expecting, $container_type )
5749 my $tok_begin = $rtokens->[$i_begin];
5750 if ( $tok_begin eq ':' ) { $tok_begin = '::' }
5751 my $id_scan_state_begin = $id_scan_state;
5752 my $identifier_begin = $identifier;
5753 my $tok = $tok_begin;
5756 my $in_prototype_or_signature = $container_type =~ /^sub/;
5758 # these flags will be used to help figure out the type:
5759 my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ );
5762 # allow old package separator (') except in 'use' statement
5763 my $allow_tick = ( $last_nonblank_token ne 'use' );
5765 # get started by defining a type and a state if necessary
5766 unless ($id_scan_state) {
5767 $context = UNKNOWN_CONTEXT;
5770 if ( $tok eq '>' ) {
5776 if ( $tok eq '$' || $tok eq '*' ) {
5777 $id_scan_state = '$';
5778 $context = SCALAR_CONTEXT;
5780 elsif ( $tok eq '%' || $tok eq '@' ) {
5781 $id_scan_state = '$';
5782 $context = LIST_CONTEXT;
5784 elsif ( $tok eq '&' ) {
5785 $id_scan_state = '&';
5787 elsif ( $tok eq 'sub' or $tok eq 'package' ) {
5788 $saw_alpha = 0; # 'sub' is considered type info here
5789 $id_scan_state = '$';
5790 $identifier .= ' '; # need a space to separate sub from sub name
5792 elsif ( $tok eq '::' ) {
5793 $id_scan_state = 'A';
5795 elsif ( $tok =~ /^[A-Za-z_]/ ) {
5796 $id_scan_state = ':';
5798 elsif ( $tok eq '->' ) {
5799 $id_scan_state = '$';
5804 my ( $a, $b, $c ) = caller;
5805 warning("Program Bug: scan_identifier given bad token = $tok \n");
5806 warning(" called from sub $a line: $c\n");
5807 report_definite_bug();
5809 $saw_type = !$saw_alpha;
5813 $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
5816 # now loop to gather the identifier
5819 while ( $i < $max_token_index ) {
5820 $i_save = $i unless ( $tok =~ /^\s*$/ );
5821 $tok = $rtokens->[ ++$i ];
5823 if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
5828 if ( $id_scan_state eq '$' ) { # starting variable name
5830 if ( $tok eq '$' ) {
5832 $identifier .= $tok;
5834 # we've got a punctuation variable if end of line (punct.t)
5835 if ( $i == $max_token_index ) {
5837 $id_scan_state = '';
5842 # POSTDEFREF ->@ ->% ->& ->*
5843 elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
5844 $identifier .= $tok;
5846 elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric ..
5848 $id_scan_state = ':'; # now need ::
5849 $identifier .= $tok;
5851 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
5853 $id_scan_state = ':'; # now need ::
5854 $identifier .= $tok;
5856 # Perl will accept leading digits in identifiers,
5857 # although they may not always produce useful results.
5858 # Something like $main::0 is ok. But this also works:
5860 # sub howdy::123::bubba{ print "bubba $54321!\n" }
5861 # howdy::123::bubba();
5864 elsif ( $tok =~ /^[0-9]/ ) { # numeric
5866 $id_scan_state = ':'; # now need ::
5867 $identifier .= $tok;
5869 elsif ( $tok eq '::' ) {
5870 $id_scan_state = 'A';
5871 $identifier .= $tok;
5874 # $# and POSTDEFREF ->$#
5875 elsif ( ( $tok eq '#' ) && ( $identifier =~ /\$$/ ) ) { # $#array
5876 $identifier .= $tok; # keep same state, a $ could follow
5878 elsif ( $tok eq '{' ) {
5880 # check for something like ${#} or ${©}
5884 || $identifier eq '@'
5885 || $identifier eq '$#'
5887 && $i + 2 <= $max_token_index
5888 && $rtokens->[ $i + 2 ] eq '}'
5889 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
5892 my $next2 = $rtokens->[ $i + 2 ];
5893 my $next1 = $rtokens->[ $i + 1 ];
5894 $identifier .= $tok . $next1 . $next2;
5896 $id_scan_state = '';
5900 # skip something like ${xxx} or ->{
5901 $id_scan_state = '';
5903 # if this is the first token of a line, any tokens for this
5904 # identifier have already been accumulated
5905 if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
5910 # space ok after leading $ % * & @
5911 elsif ( $tok =~ /^\s*$/ ) {
5913 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
5915 if ( length($identifier) > 1 ) {
5916 $id_scan_state = '';
5918 $type = 'i'; # probably punctuation variable
5923 # spaces after $'s are common, and space after @
5924 # is harmless, so only complain about space
5925 # after other type characters. Space after $ and
5926 # @ will be removed in formatting. Report space
5927 # after % and * because they might indicate a
5928 # parsing error. In other words '% ' might be a
5929 # modulo operator. Delete this warning if it
5931 if ( $identifier !~ /^[\@\$]$/ ) {
5933 "Space in identifier, following $identifier\n";
5939 # space after '->' is ok
5941 elsif ( $tok eq '^' ) {
5943 # check for some special variables like $^W
5944 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5945 $identifier .= $tok;
5946 $id_scan_state = 'A';
5948 # Perl accepts '$^]' or '@^]', but
5949 # there must not be a space before the ']'.
5950 my $next1 = $rtokens->[ $i + 1 ];
5951 if ( $next1 eq ']' ) {
5953 $identifier .= $next1;
5954 $id_scan_state = "";
5959 $id_scan_state = '';
5962 else { # something else
5964 if ( $in_prototype_or_signature && $tok =~ /^[\),=]/ ) {
5965 $id_scan_state = '';
5967 $type = 'i'; # probably punctuation variable
5971 # check for various punctuation variables
5972 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5973 $identifier .= $tok;
5976 # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#*
5977 elsif ( $tok eq '*' && $identifier =~ /([\@\%\$\*\&]|\$\#)$/ ) {
5978 $identifier .= $tok;
5981 elsif ( $identifier eq '$#' ) {
5983 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
5985 # perl seems to allow just these: $#: $#- $#+
5986 elsif ( $tok =~ /^[\:\-\+]$/ ) {
5988 $identifier .= $tok;
5992 write_logfile_entry( 'Use of $# is deprecated' . "\n" );
5995 elsif ( $identifier eq '$$' ) {
5997 # perl does not allow references to punctuation
5998 # variables without braces. For example, this
6002 # You would have to use
6006 if ( $tok eq '{' ) { $type = 't' }
6007 else { $type = 'i' }
6009 elsif ( $identifier eq '->' ) {
6014 if ( length($identifier) == 1 ) { $identifier = ''; }
6016 $id_scan_state = '';
6020 elsif ( $id_scan_state eq '&' ) { # starting sub call?
6022 if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric ..
6023 $id_scan_state = ':'; # now need ::
6025 $identifier .= $tok;
6027 elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
6028 $id_scan_state = ':'; # now need ::
6030 $identifier .= $tok;
6032 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
6033 $id_scan_state = ':'; # now need ::
6035 $identifier .= $tok;
6037 elsif ( $tok =~ /^\s*$/ ) { # allow space
6039 elsif ( $tok eq '::' ) { # leading ::
6040 $id_scan_state = 'A'; # accept alpha next
6041 $identifier .= $tok;
6043 elsif ( $tok eq '{' ) {
6044 if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
6046 $id_scan_state = '';
6051 # punctuation variable?
6052 # testfile: cunningham4.pl
6054 # We have to be careful here. If we are in an unknown state,
6055 # we will reject the punctuation variable. In the following
6056 # example the '&' is a binary operator but we are in an unknown
6057 # state because there is no sigil on 'Prima', so we don't
6058 # know what it is. But it is a bad guess that
6059 # '&~' is a function variable.
6060 # $self->{text}->{colorMap}->[
6061 # Prima::PodView::COLOR_CODE_FOREGROUND
6062 # & ~tb::COLOR_INDEX ] =
6064 if ( $identifier eq '&' && $expecting ) {
6065 $identifier .= $tok;
6072 $id_scan_state = '';
6076 elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::)
6078 if ( $tok =~ /^[A-Za-z_]/ ) { # found it
6079 $identifier .= $tok;
6080 $id_scan_state = ':'; # now need ::
6083 elsif ( $tok eq "'" && $allow_tick ) {
6084 $identifier .= $tok;
6085 $id_scan_state = ':'; # now need ::
6088 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
6089 $identifier .= $tok;
6090 $id_scan_state = ':'; # now need ::
6093 elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6094 $id_scan_state = '(';
6095 $identifier .= $tok;
6097 elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6098 $id_scan_state = ')';
6099 $identifier .= $tok;
6102 $id_scan_state = '';
6107 elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
6109 if ( $tok eq '::' ) { # got it
6110 $identifier .= $tok;
6111 $id_scan_state = 'A'; # now require alpha
6113 elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here
6114 $identifier .= $tok;
6115 $id_scan_state = ':'; # now need ::
6118 elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
6119 $identifier .= $tok;
6120 $id_scan_state = ':'; # now need ::
6123 elsif ( $tok eq "'" && $allow_tick ) { # tick
6125 if ( $is_keyword{$identifier} ) {
6126 $id_scan_state = ''; # that's all
6130 $identifier .= $tok;
6133 elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6134 $id_scan_state = '(';
6135 $identifier .= $tok;
6137 elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6138 $id_scan_state = ')';
6139 $identifier .= $tok;
6142 $id_scan_state = ''; # that's all
6147 elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype
6149 if ( $tok eq '(' ) { # got it
6150 $identifier .= $tok;
6151 $id_scan_state = ')'; # now find the end of it
6153 elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
6154 $identifier .= $tok;
6157 $id_scan_state = ''; # that's all - no prototype
6162 elsif ( $id_scan_state eq ')' ) { # looking for ) to end
6164 if ( $tok eq ')' ) { # got it
6165 $identifier .= $tok;
6166 $id_scan_state = ''; # all done
6169 elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
6170 $identifier .= $tok;
6172 else { # probable error in script, but keep going
6173 warning("Unexpected '$tok' while seeking end of prototype\n");
6174 $identifier .= $tok;
6177 else { # can get here due to error in initialization
6178 $id_scan_state = '';
6184 if ( $id_scan_state eq ')' ) {
6185 warning("Hit end of line while seeking ) to end prototype\n");
6188 # once we enter the actual identifier, it may not extend beyond
6189 # the end of the current line
6190 if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
6191 $id_scan_state = '';
6193 if ( $i < 0 ) { $i = 0 }
6200 if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
6203 else { $type = 'i' }
6205 elsif ( $identifier eq '->' ) {
6209 ( length($identifier) > 1 )
6211 # In something like '@$=' we have an identifier '@$'
6212 # In something like '$${' we have type '$$' (and only
6213 # part of an identifier)
6214 && !( $identifier =~ /\$$/ && $tok eq '{' )
6215 && ( $identifier !~ /^(sub |package )$/ )
6220 else { $type = 't' }
6222 elsif ($saw_alpha) {
6224 # type 'w' includes anything without leading type info
6225 # ($,%,@,*) including something like abc::def::ghi
6230 } # this can happen on a restart
6235 if ($message) { write_logfile_entry($message) }
6242 TOKENIZER_DEBUG_FLAG_SCAN_ID && do {
6243 my ( $a, $b, $c ) = caller;
6245 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
6247 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
6249 return ( $i, $tok, $type, $id_scan_state, $identifier );
6254 # saved package and subnames in case prototype is on separate line
6255 my ( $package_saved, $subname_saved );
6259 # do_scan_sub parses a sub name and prototype
6260 # it is called with $i_beg equal to the index of the first nonblank
6261 # token following a 'sub' token.
6263 # TODO: add future error checks to be sure we have a valid
6264 # sub name. For example, 'sub &doit' is wrong. Also, be sure
6265 # a name is given if and only if a non-anonymous sub is
6267 # USES GLOBAL VARS: $current_package, $last_nonblank_token,
6268 # $in_attribute_list, %saw_function_definition,
6272 $input_line, $i, $i_beg,
6273 $tok, $type, $rtokens,
6274 $rtoken_map, $id_scan_state, $max_token_index
6276 $id_scan_state = ""; # normally we get everything in one call
6277 my $subname = undef;
6278 my $package = undef;
6283 my $pos_beg = $rtoken_map->[$i_beg];
6284 pos($input_line) = $pos_beg;
6286 # Look for the sub NAME
6288 $input_line =~ m/\G\s*
6289 ((?:\w*(?:'|::))*) # package - something that ends in :: or '
6290 (\w+) # NAME - required
6297 $package = ( defined($1) && $1 ) ? $1 : $current_package;
6298 $package =~ s/\'/::/g;
6299 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6300 $package =~ s/::$//;
6301 my $pos = pos($input_line);
6302 my $numc = $pos - $pos_beg;
6303 $tok = 'sub ' . substr( $input_line, $pos_beg, $numc );
6307 # Now look for PROTO ATTRS
6308 # Look for prototype/attributes which are usually on the same
6309 # line as the sub name but which might be on a separate line.
6310 # For example, we might have an anonymous sub with attributes,
6311 # or a prototype on a separate line from its sub name
6313 # NOTE: We only want to parse PROTOTYPES here. If we see anything that
6314 # does not look like a prototype, we assume it is a SIGNATURE and we
6315 # will stop and let the the standard tokenizer handle it. In
6316 # particular, we stop if we see any nested parens, braces, or commas.
6317 my $saw_opening_paren = $input_line =~ /\G\s*\(/;
6319 $input_line =~ m/\G(\s*\([^\)\(\}\{\,]*\))? # PROTO
6320 (\s*:)? # ATTRS leading ':'
6328 # If we also found the sub name on this call then append PROTO.
6329 # This is not necessary but for compatibility with previous
6330 # versions when the -csc flag is used:
6331 if ( $match && $proto ) {
6336 # Handle prototype on separate line from subname
6337 if ($subname_saved) {
6338 $package = $package_saved;
6339 $subname = $subname_saved;
6340 $tok = $last_nonblank_token;
6347 # ATTRS: if there are attributes, back up and let the ':' be
6348 # found later by the scanner.
6349 my $pos = pos($input_line);
6351 $pos -= length($attrs);
6354 my $next_nonblank_token = $tok;
6356 # catch case of line with leading ATTR ':' after anonymous sub
6357 if ( $pos == $pos_beg && $tok eq ':' ) {
6359 $in_attribute_list = 1;
6362 # Otherwise, if we found a match we must convert back from
6363 # string position to the pre_token index for continued parsing.
6366 # I don't think an error flag can occur here ..but ?
6368 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
6370 if ($error) { warning("Possibly invalid sub\n") }
6372 # check for multiple definitions of a sub
6373 ( $next_nonblank_token, my $i_next ) =
6374 find_next_nonblank_token_on_this_line( $i, $rtokens,
6378 if ( $next_nonblank_token =~ /^(\s*|#)$/ )
6379 { # skip blank or side comment
6380 my ( $rpre_tokens, $rpre_types ) =
6381 peek_ahead_for_n_nonblank_pre_tokens(1);
6382 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
6383 $next_nonblank_token = $rpre_tokens->[0];
6386 $next_nonblank_token = '}';
6389 $package_saved = "";
6390 $subname_saved = "";
6392 # See what's next...
6393 if ( $next_nonblank_token eq '{' ) {
6396 # Check for multiple definitions of a sub, but
6397 # it is ok to have multiple sub BEGIN, etc,
6398 # so we do not complain if name is all caps
6399 if ( $saw_function_definition{$package}{$subname}
6400 && $subname !~ /^[A-Z]+$/ )
6402 my $lno = $saw_function_definition{$package}{$subname};
6404 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
6407 $saw_function_definition{$package}{$subname} =
6408 $tokenizer_self->{_last_line_number};
6411 elsif ( $next_nonblank_token eq ';' ) {
6413 elsif ( $next_nonblank_token eq '}' ) {
6416 # ATTRS - if an attribute list follows, remember the name
6417 # of the sub so the next opening brace can be labeled.
6418 # Setting 'statement_type' causes any ':'s to introduce
6420 elsif ( $next_nonblank_token eq ':' ) {
6421 $statement_type = $tok;
6424 # if we stopped before an open paren ...
6425 elsif ( $next_nonblank_token eq '(' ) {
6427 # If we DID NOT see this paren above then it must be on the
6428 # next line so we will set a flag to come back here and see if
6431 # Otherwise, we assume it is a SIGNATURE rather than a
6432 # PROTOTYPE and let the normal tokenizer handle it as a list
6433 if ( !$saw_opening_paren ) {
6434 $id_scan_state = 'sub'; # we must come back to get proto
6435 $package_saved = $package;
6436 $subname_saved = $subname;
6438 $statement_type = $tok;
6440 elsif ($next_nonblank_token) { # EOF technically ok
6441 $subname = "" unless defined($subname);
6443 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
6446 check_prototype( $proto, $package, $subname );
6449 # no match but line not blank
6452 return ( $i, $tok, $type, $id_scan_state );
6456 #########i###############################################################
6457 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
6458 #########################################################################
6460 sub find_next_nonblank_token {
6461 my ( $i, $rtokens, $max_token_index ) = @_;
6463 if ( $i >= $max_token_index ) {
6464 if ( !peeked_ahead() ) {
6467 peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
6470 my $next_nonblank_token = $rtokens->[ ++$i ];
6472 if ( $next_nonblank_token =~ /^\s*$/ ) {
6473 $next_nonblank_token = $rtokens->[ ++$i ];
6475 return ( $next_nonblank_token, $i );
6478 sub numerator_expected {
6480 # this is a filter for a possible numerator, in support of guessing
6481 # for the / pattern delimiter token.
6486 # Note: I am using the convention that variables ending in
6487 # _expected have these 3 possible values.
6488 my ( $i, $rtokens, $max_token_index ) = @_;
6489 my $numerator_expected = 0;
6491 my $next_token = $rtokens->[ $i + 1 ];
6492 if ( $next_token eq '=' ) { $i++; } # handle /=
6493 my ( $next_nonblank_token, $i_next ) =
6494 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6496 if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
6497 $numerator_expected = 1;
6501 if ( $next_nonblank_token =~ /^\s*$/ ) {
6502 $numerator_expected = 0;
6505 $numerator_expected = -1;
6508 return $numerator_expected;
6511 sub pattern_expected {
6513 # This is the start of a filter for a possible pattern.
6514 # It looks at the token after a possible pattern and tries to
6515 # determine if that token could end a pattern.
6520 my ( $i, $rtokens, $max_token_index ) = @_;
6523 my $next_token = $rtokens->[ $i + 1 ];
6524 if ( $next_token =~ /^[msixpodualgc]/ ) { $i++; } # skip possible modifier
6525 my ( $next_nonblank_token, $i_next ) =
6526 find_next_nonblank_token( $i, $rtokens, $max_token_index );
6528 # list of tokens which may follow a pattern
6529 # (can probably be expanded)
6530 if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ )
6536 if ( $next_nonblank_token =~ /^\s*$/ ) {
6546 sub find_next_nonblank_token_on_this_line {
6547 my ( $i, $rtokens, $max_token_index ) = @_;
6548 my $next_nonblank_token;
6550 if ( $i < $max_token_index ) {
6551 $next_nonblank_token = $rtokens->[ ++$i ];
6553 if ( $next_nonblank_token =~ /^\s*$/ ) {
6555 if ( $i < $max_token_index ) {
6556 $next_nonblank_token = $rtokens->[ ++$i ];
6561 $next_nonblank_token = "";
6563 return ( $next_nonblank_token, $i );
6566 sub find_angle_operator_termination {
6568 # We are looking at a '<' and want to know if it is an angle operator.
6570 # $i = pretoken index of ending '>' if found, current $i otherwise
6571 # $type = 'Q' if found, '>' otherwise
6572 my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
6575 pos($input_line) = 1 + $rtoken_map->[$i];
6579 # we just have to find the next '>' if a term is expected
6580 if ( $expecting == TERM ) { $filter = '[\>]' }
6582 # we have to guess if we don't know what is expected
6583 elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
6585 # shouldn't happen - we shouldn't be here if operator is expected
6586 else { warning("Program Bug in find_angle_operator_termination\n") }
6588 # To illustrate what we might be looking at, in case we are
6589 # guessing, here are some examples of valid angle operators
6596 # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
6597 # <${PREFIX}*img*.$IMAGE_TYPE>
6598 # <img*.$IMAGE_TYPE>
6599 # <Timg*.$IMAGE_TYPE>
6600 # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
6602 # Here are some examples of lines which do not have angle operators:
6603 # return unless $self->[2]++ < $#{$self->[1]};
6606 # the following line from dlister.pl caused trouble:
6607 # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
6609 # If the '<' starts an angle operator, it must end on this line and
6610 # it must not have certain characters like ';' and '=' in it. I use
6611 # this to limit the testing. This filter should be improved if
6614 if ( $input_line =~ /($filter)/g ) {
6618 # We MAY have found an angle operator termination if we get
6619 # here, but we need to do more to be sure we haven't been
6621 my $pos = pos($input_line);
6623 my $pos_beg = $rtoken_map->[$i];
6624 my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
6626 # Reject if the closing '>' follows a '-' as in:
6627 # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
6628 if ( $expecting eq UNKNOWN ) {
6629 my $check = substr( $input_line, $pos - 2, 1 );
6630 if ( $check eq '-' ) {
6631 return ( $i, $type );
6635 ######################################debug#####
6636 #write_diagnostics( "ANGLE? :$str\n");
6637 #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
6638 ######################################debug#####
6642 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6644 # It may be possible that a quote ends midway in a pretoken.
6645 # If this happens, it may be necessary to split the pretoken.
6648 "Possible tokinization error..please check this line\n");
6649 report_possible_bug();
6652 # Now let's see where we stand....
6653 # OK if math op not possible
6654 if ( $expecting == TERM ) {
6657 # OK if there are no more than 2 pre-tokens inside
6658 # (not possible to write 2 token math between < and >)
6659 # This catches most common cases
6660 elsif ( $i <= $i_beg + 3 ) {
6661 write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
6667 # Let's try a Brace Test: any braces inside must balance
6669 while ( $str =~ /\{/g ) { $br++ }
6670 while ( $str =~ /\}/g ) { $br-- }
6672 while ( $str =~ /\[/g ) { $sb++ }
6673 while ( $str =~ /\]/g ) { $sb-- }
6675 while ( $str =~ /\(/g ) { $pr++ }
6676 while ( $str =~ /\)/g ) { $pr-- }
6678 # if braces do not balance - not angle operator
6679 if ( $br || $sb || $pr ) {
6683 "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
6686 # we should keep doing more checks here...to be continued
6687 # Tentatively accepting this as a valid angle operator.
6688 # There are lots more things that can be checked.
6691 "ANGLE-Guessing yes: $str expecting=$expecting\n");
6692 write_logfile_entry("Guessing angle operator here: $str\n");
6697 # didn't find ending >
6699 if ( $expecting == TERM ) {
6700 warning("No ending > for angle operator\n");
6704 return ( $i, $type );
6707 sub scan_number_do {
6709 # scan a number in any of the formats that Perl accepts
6710 # Underbars (_) are allowed in decimal numbers.
6711 # input parameters -
6712 # $input_line - the string to scan
6713 # $i - pre_token index to start scanning
6714 # $rtoken_map - reference to the pre_token map giving starting
6715 # character position in $input_line of token $i
6716 # output parameters -
6717 # $i - last pre_token index of the number just scanned
6718 # number - the number (characters); or undef if not a number
6720 my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
6721 my $pos_beg = $rtoken_map->[$i];
6725 my $type = $input_type;
6727 my $first_char = substr( $input_line, $pos_beg, 1 );
6729 # Look for bad starting characters; Shouldn't happen..
6730 if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
6731 warning("Program bug - scan_number given character $first_char\n");
6732 report_definite_bug();
6733 return ( $i, $type, $number );
6736 # handle v-string without leading 'v' character ('Two Dot' rule)
6738 # TODO: v-strings may contain underscores
6739 pos($input_line) = $pos_beg;
6740 if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
6741 $pos = pos($input_line);
6742 my $numc = $pos - $pos_beg;
6743 $number = substr( $input_line, $pos_beg, $numc );
6745 report_v_string($number);
6748 # handle octal, hex, binary
6749 if ( !defined($number) ) {
6750 pos($input_line) = $pos_beg;
6752 /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
6754 $pos = pos($input_line);
6755 my $numc = $pos - $pos_beg;
6756 $number = substr( $input_line, $pos_beg, $numc );
6762 if ( !defined($number) ) {
6763 pos($input_line) = $pos_beg;
6765 if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
6766 $pos = pos($input_line);
6768 # watch out for things like 0..40 which would give 0. by this;
6769 if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
6770 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
6774 my $numc = $pos - $pos_beg;
6775 $number = substr( $input_line, $pos_beg, $numc );
6780 # filter out non-numbers like e + - . e2 .e3 +e6
6781 # the rule: at least one digit, and any 'e' must be preceded by a digit
6783 $number !~ /\d/ # no digits
6784 || ( $number =~ /^(.*)[eE]/
6785 && $1 !~ /\d/ ) # or no digits before the 'e'
6789 $type = $input_type;
6790 return ( $i, $type, $number );
6793 # Found a number; now we must convert back from character position
6794 # to pre_token index. An error here implies user syntax error.
6795 # An example would be an invalid octal number like '009'.
6798 inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6799 if ($error) { warning("Possibly invalid number\n") }
6801 return ( $i, $type, $number );
6804 sub inverse_pretoken_map {
6806 # Starting with the current pre_token index $i, scan forward until
6807 # finding the index of the next pre_token whose position is $pos.
6808 my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
6811 while ( ++$i <= $max_token_index ) {
6813 if ( $pos <= $rtoken_map->[$i] ) {
6815 # Let the calling routine handle errors in which we do not
6816 # land on a pre-token boundary. It can happen by running
6817 # perltidy on some non-perl scripts, for example.
6818 if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
6823 return ( $i, $error );
6828 # find the target of a here document, if any
6830 # $i - token index of the second < of <<
6831 # ($i must be less than the last token index if this is called)
6832 # output parameters:
6833 # $found_target = 0 didn't find target; =1 found target
6834 # HERE_TARGET - the target string (may be empty string)
6835 # $i - unchanged if not here doc,
6836 # or index of the last token of the here target
6837 # $saw_error - flag noting unbalanced quote on here target
6838 my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6840 my $found_target = 0;
6841 my $here_doc_target = '';
6842 my $here_quote_character = '';
6844 my ( $next_nonblank_token, $i_next_nonblank, $next_token );
6845 $next_token = $rtokens->[ $i + 1 ];
6847 # perl allows a backslash before the target string (heredoc.t)
6849 if ( $next_token eq '\\' ) {
6851 $next_token = $rtokens->[ $i + 2 ];
6854 ( $next_nonblank_token, $i_next_nonblank ) =
6855 find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
6857 if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
6860 my $quote_depth = 0;
6865 $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
6868 = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
6869 $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
6871 if ($in_quote) { # didn't find end of quote, so no target found
6873 if ( $expecting == TERM ) {
6875 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
6880 else { # found ending quote
6884 foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
6885 $tokj = $rtokens->[$j];
6887 # we have to remove any backslash before the quote character
6888 # so that the here-doc-target exactly matches this string
6892 && $rtokens->[ $j + 1 ] eq $here_quote_character );
6893 $here_doc_target .= $tokj;
6898 elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
6900 write_logfile_entry(
6901 "found blank here-target after <<; suggest using \"\"\n");
6904 elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
6906 my $here_doc_expected;
6907 if ( $expecting == UNKNOWN ) {
6908 $here_doc_expected = guess_if_here_doc($next_token);
6911 $here_doc_expected = 1;
6914 if ($here_doc_expected) {
6916 $here_doc_target = $next_token;
6923 if ( $expecting == TERM ) {
6925 write_logfile_entry("Note: bare here-doc operator <<\n");
6932 # patch to neglect any prepended backslash
6933 if ( $found_target && $backslash ) { $i++ }
6935 return ( $found_target, $here_doc_target, $here_quote_character, $i,
6941 # follow (or continue following) quoted string(s)
6942 # $in_quote return code:
6944 # 1 - still must find end of quote whose target is $quote_character
6945 # 2 - still looking for end of first of two quotes
6947 # Returns updated strings:
6948 # $quoted_string_1 = quoted string seen while in_quote=1
6949 # $quoted_string_2 = quoted string seen while in_quote=2
6951 $i, $in_quote, $quote_character,
6952 $quote_pos, $quote_depth, $quoted_string_1,
6953 $quoted_string_2, $rtokens, $rtoken_map,
6957 my $in_quote_starting = $in_quote;
6960 if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
6963 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6966 = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
6967 $quote_pos, $quote_depth, $max_token_index );
6968 $quoted_string_2 .= $quoted_string;
6969 if ( $in_quote == 1 ) {
6970 if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
6971 $quote_character = '';
6974 $quoted_string_2 .= "\n";
6978 if ( $in_quote == 1 ) { # one (more) quote to follow
6981 $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6984 = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6985 $quote_pos, $quote_depth, $max_token_index );
6986 $quoted_string_1 .= $quoted_string;
6987 if ( $in_quote == 1 ) {
6988 $quoted_string_1 .= "\n";
6991 return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6992 $quoted_string_1, $quoted_string_2 );
6995 sub follow_quoted_string {
6997 # scan for a specific token, skipping escaped characters
6998 # if the quote character is blank, use the first non-blank character
7000 # $rtokens = reference to the array of tokens
7001 # $i = the token index of the first character to search
7002 # $in_quote = number of quoted strings being followed
7003 # $beginning_tok = the starting quote character
7004 # $quote_pos = index to check next for alphanumeric delimiter
7005 # output parameters:
7006 # $i = the token index of the ending quote character
7007 # $in_quote = decremented if found end, unchanged if not
7008 # $beginning_tok = the starting quote character
7009 # $quote_pos = index to check next for alphanumeric delimiter
7010 # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
7011 # $quoted_string = the text of the quote (without quotation tokens)
7012 my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
7015 my ( $tok, $end_tok );
7017 my $quoted_string = "";
7019 TOKENIZER_DEBUG_FLAG_QUOTE && do {
7021 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
7024 # get the corresponding end token
7025 if ( $beginning_tok !~ /^\s*$/ ) {
7026 $end_tok = matching_end_token($beginning_tok);
7029 # a blank token means we must find and use the first non-blank one
7031 my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
7033 while ( $i < $max_token_index ) {
7034 $tok = $rtokens->[ ++$i ];
7036 if ( $tok !~ /^\s*$/ ) {
7038 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
7039 $i = $max_token_index;
7043 if ( length($tok) > 1 ) {
7044 if ( $quote_pos <= 0 ) { $quote_pos = 1 }
7045 $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
7048 $beginning_tok = $tok;
7051 $end_tok = matching_end_token($beginning_tok);
7057 $allow_quote_comments = 1;
7062 # There are two different loops which search for the ending quote
7063 # character. In the rare case of an alphanumeric quote delimiter, we
7064 # have to look through alphanumeric tokens character-by-character, since
7065 # the pre-tokenization process combines multiple alphanumeric
7066 # characters, whereas for a non-alphanumeric delimiter, only tokens of
7067 # length 1 can match.
7069 ###################################################################
7070 # Case 1 (rare): loop for case of alphanumeric quote delimiter..
7071 # "quote_pos" is the position the current word to begin searching
7072 ###################################################################
7073 if ( $beginning_tok =~ /\w/ ) {
7075 # Note this because it is not recommended practice except
7076 # for obfuscated perl contests
7077 if ( $in_quote == 1 ) {
7078 write_logfile_entry(
7079 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
7082 while ( $i < $max_token_index ) {
7084 if ( $quote_pos == 0 || ( $i < 0 ) ) {
7085 $tok = $rtokens->[ ++$i ];
7087 if ( $tok eq '\\' ) {
7089 # retain backslash unless it hides the end token
7090 $quoted_string .= $tok
7091 unless $rtokens->[ $i + 1 ] eq $end_tok;
7093 last if ( $i >= $max_token_index );
7094 $tok = $rtokens->[ ++$i ];
7097 my $old_pos = $quote_pos;
7099 unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
7103 $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
7105 if ( $quote_pos > 0 ) {
7108 substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
7112 if ( $quote_depth == 0 ) {
7118 $quoted_string .= substr( $tok, $old_pos );
7123 ########################################################################
7124 # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
7125 ########################################################################
7128 while ( $i < $max_token_index ) {
7129 $tok = $rtokens->[ ++$i ];
7131 if ( $tok eq $end_tok ) {
7134 if ( $quote_depth == 0 ) {
7139 elsif ( $tok eq $beginning_tok ) {
7142 elsif ( $tok eq '\\' ) {
7144 # retain backslash unless it hides the beginning or end token
7145 $tok = $rtokens->[ ++$i ];
7146 $quoted_string .= '\\'
7147 unless ( $tok eq $end_tok || $tok eq $beginning_tok );
7149 $quoted_string .= $tok;
7152 if ( $i > $max_token_index ) { $i = $max_token_index }
7153 return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
7157 sub indicate_error {
7158 my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
7159 interrupt_logfile();
7161 write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
7166 sub write_error_indicator_pair {
7167 my ( $line_number, $input_line, $pos, $carrat ) = @_;
7168 my ( $offset, $numbered_line, $underline ) =
7169 make_numbered_line( $line_number, $input_line, $pos );
7170 $underline = write_on_underline( $underline, $pos - $offset, $carrat );
7171 warning( $numbered_line . "\n" );
7172 $underline =~ s/\s*$//;
7173 warning( $underline . "\n" );
7177 sub make_numbered_line {
7179 # Given an input line, its line number, and a character position of
7180 # interest, create a string not longer than 80 characters of the form
7181 # $lineno: sub_string
7182 # such that the sub_string of $str contains the position of interest
7184 # Here is an example of what we want, in this case we add trailing
7185 # '...' because the line is long.
7187 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7189 # Here is another example, this time in which we used leading '...'
7190 # because of excessive length:
7192 # 2: ... er of the World Wide Web Consortium's
7194 # input parameters are:
7195 # $lineno = line number
7196 # $str = the text of the line
7197 # $pos = position of interest (the error) : 0 = first character
7200 # - $offset = an offset which corrects the position in case we only
7201 # display part of a line, such that $pos-$offset is the effective
7202 # position from the start of the displayed line.
7203 # - $numbered_line = the numbered line as above,
7204 # - $underline = a blank 'underline' which is all spaces with the same
7205 # number of characters as the numbered line.
7207 my ( $lineno, $str, $pos ) = @_;
7208 my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
7209 my $excess = length($str) - $offset - 68;
7210 my $numc = ( $excess > 0 ) ? 68 : undef;
7212 if ( defined($numc) ) {
7213 if ( $offset == 0 ) {
7214 $str = substr( $str, $offset, $numc - 4 ) . " ...";
7217 $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
7222 if ( $offset == 0 ) {
7225 $str = "... " . substr( $str, $offset + 4 );
7229 my $numbered_line = sprintf( "%d: ", $lineno );
7230 $offset -= length($numbered_line);
7231 $numbered_line .= $str;
7232 my $underline = " " x length($numbered_line);
7233 return ( $offset, $numbered_line, $underline );
7236 sub write_on_underline {
7238 # The "underline" is a string that shows where an error is; it starts
7239 # out as a string of blanks with the same length as the numbered line of
7240 # code above it, and we have to add marking to show where an error is.
7241 # In the example below, we want to write the string '--^' just below
7242 # the line of bad code:
7244 # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7246 # We are given the current underline string, plus a position and a
7247 # string to write on it.
7249 # In the above example, there will be 2 calls to do this:
7250 # First call: $pos=19, pos_chr=^
7251 # Second call: $pos=16, pos_chr=---
7253 # This is a trivial thing to do with substr, but there is some
7256 my ( $underline, $pos, $pos_chr ) = @_;
7258 # check for error..shouldn't happen
7259 unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
7262 my $excess = length($pos_chr) + $pos - length($underline);
7263 if ( $excess > 0 ) {
7264 $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
7266 substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
7267 return ($underline);
7272 # Break a string, $str, into a sequence of preliminary tokens. We
7273 # are interested in these types of tokens:
7274 # words (type='w'), example: 'max_tokens_wanted'
7275 # digits (type = 'd'), example: '0755'
7276 # whitespace (type = 'b'), example: ' '
7277 # any other single character (i.e. punct; type = the character itself).
7278 # We cannot do better than this yet because we might be in a quoted
7279 # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
7281 my ( $str, $max_tokens_wanted ) = @_;
7283 # we return references to these 3 arrays:
7284 my @tokens = (); # array of the tokens themselves
7285 my @token_map = (0); # string position of start of each token
7286 my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
7291 if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
7294 # note that this must come before words!
7295 elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
7298 elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
7300 # single-character punctuation
7301 elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
7305 return ( \@tokens, \@token_map, \@type );
7309 push @token_map, pos($str);
7311 } while ( --$max_tokens_wanted != 0 );
7313 return ( \@tokens, \@token_map, \@type );
7318 # this is an old debug routine
7319 # not called, but saved for reference
7320 my ( $rtokens, $rtoken_map ) = @_;
7321 my $num = scalar( @{$rtokens} );
7323 foreach my $i ( 0 .. $num - 1 ) {
7324 my $len = length( $rtokens->[$i] );
7325 print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
7331 my %matching_end_token;
7334 %matching_end_token = (
7342 sub matching_end_token {
7344 # return closing character for a pattern
7345 my $beginning_token = shift;
7346 if ( $matching_end_token{$beginning_token} ) {
7347 return $matching_end_token{$beginning_token};
7349 return ($beginning_token);
7353 sub dump_token_types {
7354 my ( $class, $fh ) = @_;
7356 # This should be the latest list of token types in use
7357 # adding NEW_TOKENS: add a comment here
7358 print $fh <<'END_OF_LIST';
7360 Here is a list of the token types currently used for lines of type 'CODE'.
7361 For the following tokens, the "type" of a token is just the token itself.
7363 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7364 ( ) <= >= == =~ !~ != ++ -- /= x=
7365 ... **= <<= >>= &&= ||= //= <=>
7366 , + - / * | % ! x ~ = \ ? : . < > ^ &
7368 The following additional token types are defined:
7371 b blank (white space)
7372 { indent: opening structural curly brace or square bracket or paren
7373 (code block, anonymous hash reference, or anonymous array reference)
7374 } outdent: right structural curly brace or square bracket or paren
7375 [ left non-structural square bracket (enclosing an array index)
7376 ] right non-structural square bracket
7377 ( left non-structural paren (all but a list right of an =)
7378 ) right non-structural paren
7379 L left non-structural curly brace (enclosing a key)
7380 R right non-structural curly brace
7381 ; terminal semicolon
7382 f indicates a semicolon in a "for" statement
7383 h here_doc operator <<
7385 Q indicates a quote or pattern
7386 q indicates a qw quote block
7388 C user-defined constant or constant function (with void prototype = ())
7389 U user-defined function taking parameters
7390 G user-defined function taking block parameter (like grep/map/eval)
7391 M (unused, but reserved for subroutine definition name)
7392 P (unused, but -html uses it to label pod text)
7393 t type indicater such as %,$,@,*,&,sub
7394 w bare word (perhaps a subroutine call)
7395 i identifier of some type (with leading %, $, @, *, &, sub, -> )
7398 F a file test operator (like -e)
7400 Z identifier in indirect object slot: may be file handle, object
7401 J LABEL: code block label
7402 j LABEL after next, last, redo, goto
7405 pp pre-increment operator ++
7406 mm pre-decrement operator --
7407 A : used as attribute separator
7409 Here are the '_line_type' codes used internally:
7410 SYSTEM - system-specific code before hash-bang line
7411 CODE - line of perl code (including comments)
7412 POD_START - line starting pod, such as '=head'
7413 POD - pod documentation text
7414 POD_END - last line of pod section, '=cut'
7415 HERE - text of here-document
7416 HERE_END - last line of here-doc (target word)
7417 FORMAT - format section
7418 FORMAT_END - last line of format section, '.'
7419 DATA_START - __DATA__ line
7420 DATA - unidentified text following __DATA__
7421 END_START - __END__ line
7422 END - unidentified text following __END__
7423 ERROR - we are in big trouble, probably not a perl script
7431 # These names are used in error messages
7432 @opening_brace_names = qw# '{' '[' '(' '?' #;
7433 @closing_brace_names = qw# '}' ']' ')' ':' #;
7438 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7439 <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
7441 @is_digraph{@digraphs} = (1) x scalar(@digraphs);
7443 my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
7444 @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
7446 my @tetragraphs = qw( <<>> );
7447 @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
7449 # make a hash of all valid token types for self-checking the tokenizer
7450 # (adding NEW_TOKENS : select a new character and add to this list)
7451 my @valid_token_types = qw#
7452 A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
7453 { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
7455 push( @valid_token_types, @digraphs );
7456 push( @valid_token_types, @trigraphs );
7457 push( @valid_token_types, @tetragraphs );
7458 push( @valid_token_types, ( '#', ',', 'CORE::' ) );
7459 @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
7461 # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
7462 my @file_test_operators =
7463 qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
7464 @is_file_test_operator{@file_test_operators} =
7465 (1) x scalar(@file_test_operators);
7467 # these functions have prototypes of the form (&), so when they are
7468 # followed by a block, that block MAY BE followed by an operator.
7469 # Smartmatch operator ~~ may be followed by anonymous hash or array ref
7471 @is_block_operator{@q} = (1) x scalar(@q);
7473 # these functions allow an identifier in the indirect object slot
7474 @q = qw( print printf sort exec system say);
7475 @is_indirect_object_taker{@q} = (1) x scalar(@q);
7477 # These tokens may precede a code block
7478 # patched for SWITCH/CASE/CATCH. Actually these could be removed
7479 # now and we could let the extended-syntax coding handle them
7481 qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
7482 unless do while until eval for foreach map grep sort
7483 switch case given when catch try finally);
7484 @is_code_block_token{@q} = (1) x scalar(@q);
7486 # I'll build the list of keywords incrementally
7489 # keywords and tokens after which a value or pattern is expected,
7490 # but not an operator. In other words, these should consume terms
7491 # to their right, or at least they are not expected to be followed
7492 # immediately by operators.
7493 my @value_requestor = qw(
7716 # patched above for SWITCH/CASE given/when err say
7717 # 'err' is a fairly safe addition.
7718 # TODO: 'default' still needed if appropriate
7719 # 'use feature' seen, but perltidy works ok without it.
7720 # Concerned that 'default' could break code.
7721 push( @Keywords, @value_requestor );
7723 # These are treated the same but are not keywords:
7728 push( @value_requestor, @extra_vr );
7730 @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
7732 # this list contains keywords which do not look for arguments,
7733 # so that they might be followed by an operator, or at least
7735 my @operator_requestor = qw(
7759 push( @Keywords, @operator_requestor );
7761 # These are treated the same but are not considered keywords:
7768 push( @operator_requestor, @extra_or );
7770 @expecting_operator_token{@operator_requestor} =
7771 (1) x scalar(@operator_requestor);
7773 # these token TYPES expect trailing operator but not a term
7774 # note: ++ and -- are post-increment and decrement, 'C' = constant
7775 my @operator_requestor_types = qw( ++ -- C <> q );
7776 @expecting_operator_types{@operator_requestor_types} =
7777 (1) x scalar(@operator_requestor_types);
7779 # these token TYPES consume values (terms)
7780 # note: pp and mm are pre-increment and decrement
7781 # f=semicolon in for, F=file test operator
7782 my @value_requestor_type = qw#
7783 L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
7784 **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
7785 <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~
7786 f F pp mm Y p m U J G j >> << ^ t
7787 ~. ^. |. &. ^.= |.= &.=
7789 push( @value_requestor_type, ',' )
7790 ; # (perl doesn't like a ',' in a qw block)
7791 @expecting_term_types{@value_requestor_type} =
7792 (1) x scalar(@value_requestor_type);
7794 # Note: the following valid token types are not assigned here to
7795 # hashes requesting to be followed by values or terms, but are
7796 # instead currently hard-coded into sub operator_expected:
7797 # ) -> :: Q R Z ] b h i k n v w } #
7799 # For simple syntax checking, it is nice to have a list of operators which
7800 # will really be unhappy if not followed by a term. This includes most
7802 %really_want_term = %expecting_term_types;
7804 # with these exceptions...
7805 delete $really_want_term{'U'}; # user sub, depends on prototype
7806 delete $really_want_term{'F'}; # file test works on $_ if no following term
7807 delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
7810 @q = qw(q qq qw qx qr s y tr m);
7811 @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
7814 @is_sub{@q} = (1) x scalar(@q);
7817 @is_package{@q} = (1) x scalar(@q);
7819 # These keywords are handled specially in the tokenizer code:
7820 my @special_keywords = qw(
7836 push( @Keywords, @special_keywords );
7838 # Keywords after which list formatting may be used
7839 # WARNING: do not include |map|grep|eval or perl may die on
7840 # syntax errors (map1.t).
7841 my @keyword_taking_list = qw(
7915 @is_keyword_taking_list{@keyword_taking_list} =
7916 (1) x scalar(@keyword_taking_list);
7918 # perl functions which may be unary operators
7919 my @keyword_taking_optional_args = qw(
7929 @is_keyword_taking_optional_args{@keyword_taking_optional_args} =
7930 (1) x scalar(@keyword_taking_optional_args);
7932 # These are not used in any way yet
7933 # my @unused_keywords = qw(
7939 # The list of keywords was originally extracted from function 'keyword' in
7940 # perl file toke.c version 5.005.03, using this utility, plus a
7941 # little editing: (file getkwd.pl):
7942 # while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
7943 # Add 'get' prefix where necessary, then split into the above lists.
7944 # This list should be updated as necessary.
7945 # The list should not contain these special variables:
7946 # ARGV DATA ENV SIG STDERR STDIN STDOUT
7949 @is_keyword{@Keywords} = (1) x scalar(@Keywords);