X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=lib%2FPerl%2FTidy%2FTokenizer.pm;h=c012e47f9d89b0a0c068a375f21c8ece343ced89;hb=57d829ae0e2c75828f8ecc9c7139579350927dbc;hp=e1d644a96bb06a3e89949e14a4edf45ff62e1270;hpb=7f27e55dce5925b2bbe8fcfca64f385e917a52be;p=perltidy.git diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index e1d644a..c012e47 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -1,6 +1,6 @@ -######################################################################## +##################################################################### # -# the Perl::Tidy::Tokenizer package is essentially a filter which +# The Perl::Tidy::Tokenizer package is essentially a filter which # reads lines of perl source code from a source object and provides # corresponding tokenized lines through its get_line() method. Lines # flow from the source_object to the caller like this: @@ -14,43 +14,21 @@ # The Tokenizer returns a reference to a data structure 'line_of_tokens' # containing one tokenized line for each call to its get_line() method. # -# WARNING: This is not a real class yet. Only one tokenizer my be used. +# WARNING: This is not a real class. Only one tokenizer my be used. # ######################################################################## package Perl::Tidy::Tokenizer; use strict; use warnings; -our $VERSION = '20200110'; +our $VERSION = '20210717'; use Perl::Tidy::LineBuffer; - -BEGIN { - - # Caution: these debug flags produce a lot of output - # They should all be 0 except when debugging small scripts - - use constant TOKENIZER_DEBUG_FLAG_EXPECT => 0; - use constant TOKENIZER_DEBUG_FLAG_NSCAN => 0; - use constant TOKENIZER_DEBUG_FLAG_QUOTE => 0; - use constant TOKENIZER_DEBUG_FLAG_SCAN_ID => 0; - use constant TOKENIZER_DEBUG_FLAG_TOKENIZE => 0; - - my $debug_warning = sub { - print STDOUT "TOKENIZER_DEBUGGING with key $_[0]\n"; - }; - - TOKENIZER_DEBUG_FLAG_EXPECT && $debug_warning->('EXPECT'); - TOKENIZER_DEBUG_FLAG_NSCAN && $debug_warning->('NSCAN'); - TOKENIZER_DEBUG_FLAG_QUOTE && $debug_warning->('QUOTE'); - TOKENIZER_DEBUG_FLAG_SCAN_ID && $debug_warning->('SCAN_ID'); - TOKENIZER_DEBUG_FLAG_TOKENIZE && $debug_warning->('TOKENIZE'); - -} - use Carp; # PACKAGE VARIABLES for processing an entire FILE. +# These must be package variables because most may get localized during +# processing. Most are initialized in sub prepare_for_a_new_file. use vars qw{ $tokenizer_self @@ -68,6 +46,7 @@ use vars qw{ %is_block_function %is_block_list_function %saw_function_definition + %saw_use_module $brace_depth $paren_depth @@ -93,7 +72,8 @@ use vars qw{ @starting_line_of_current_depth }; -# GLOBAL CONSTANTS for routines in this package +# GLOBAL CONSTANTS for routines in this package, +# Initialized in a BEGIN block. use vars qw{ %is_indirect_object_taker %is_block_operator @@ -112,12 +92,27 @@ use vars qw{ @opening_brace_names @closing_brace_names %is_keyword_taking_list - %is_keyword_taking_optional_args + %is_keyword_taking_optional_arg + %is_keyword_rejecting_slash_as_pattern_delimiter + %is_keyword_rejecting_question_as_pattern_delimiter %is_q_qq_qw_qx_qr_s_y_tr_m %is_sub %is_package + %is_comma_question_colon + %other_line_endings + $code_skipping_pattern_begin + $code_skipping_pattern_end }; +# GLOBAL VARIABLES which are constant after being configured by user-supplied +# parameters. They remain constant as a file is being processed. 
+my ( + + $rOpts_code_skipping, + $code_skipping_pattern_begin, + $code_skipping_pattern_end, +); + # possible values of operator_expected() use constant TERM => -1; use constant UNKNOWN => 0; @@ -131,7 +126,68 @@ use constant LIST_CONTEXT => 1; # Maximum number of little messages; probably need not be changed. use constant MAX_NAG_MESSAGES => 6; -{ +BEGIN { + + # Array index names for $self + my $i = 0; + use constant { + _rhere_target_list_ => $i++, + _in_here_doc_ => $i++, + _here_doc_target_ => $i++, + _here_quote_character_ => $i++, + _in_data_ => $i++, + _in_end_ => $i++, + _in_format_ => $i++, + _in_error_ => $i++, + _in_pod_ => $i++, + _in_skipped_ => $i++, + _in_attribute_list_ => $i++, + _in_quote_ => $i++, + _quote_target_ => $i++, + _line_start_quote_ => $i++, + _starting_level_ => $i++, + _know_starting_level_ => $i++, + _tabsize_ => $i++, + _indent_columns_ => $i++, + _look_for_hash_bang_ => $i++, + _trim_qw_ => $i++, + _continuation_indentation_ => $i++, + _outdent_labels_ => $i++, + _last_line_number_ => $i++, + _saw_perl_dash_P_ => $i++, + _saw_perl_dash_w_ => $i++, + _saw_use_strict_ => $i++, + _saw_v_string_ => $i++, + _hit_bug_ => $i++, + _look_for_autoloader_ => $i++, + _look_for_selfloader_ => $i++, + _saw_autoloader_ => $i++, + _saw_selfloader_ => $i++, + _saw_hash_bang_ => $i++, + _saw_end_ => $i++, + _saw_data_ => $i++, + _saw_negative_indentation_ => $i++, + _started_tokenizing_ => $i++, + _line_buffer_object_ => $i++, + _debugger_object_ => $i++, + _diagnostics_object_ => $i++, + _logger_object_ => $i++, + _unexpected_error_count_ => $i++, + _started_looking_for_here_target_at_ => $i++, + _nearly_matched_here_target_at_ => $i++, + _line_of_text_ => $i++, + _rlower_case_labels_at_ => $i++, + _extended_syntax_ => $i++, + _maximum_level_ => $i++, + _true_brace_error_count_ => $i++, + _rOpts_maximum_level_errors_ => $i++, + _rOpts_maximum_unexpected_errors_ => $i++, + _rOpts_logfile_ => $i++, + _rOpts_ => $i++, + }; +} + +{ ## closure for subs to count instances # methods to count instances my $_count = 0; @@ -146,6 +202,60 @@ sub DESTROY { return; } +sub AUTOLOAD { + + # Catch any undefined sub calls so that we are sure to get + # some diagnostic information. This sub should never be called + # except for a programming error. + our $AUTOLOAD; + return if ( $AUTOLOAD =~ /\bDESTROY$/ ); + my ( $pkg, $fname, $lno ) = caller(); + my $my_package = __PACKAGE__; + print STDERR <{$opt_name}; + unless ($param) { $param = $default } + $param =~ s/^\s*//; # allow leading spaces to be like format-skipping + if ( $param !~ /^#/ ) { + Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n"); + } + my $pattern = '^\s*' . $param . 
'\b'; + if ( bad_pattern($pattern) ) { + Die( +"ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n" + ); + } + return $pattern; +} + sub check_options { # Check Tokenizer parameters @@ -165,6 +275,12 @@ sub check_options { $is_sub{$word} = 1; } } + + $rOpts_code_skipping = $rOpts->{'code-skipping'}; + $code_skipping_pattern_begin = + make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<>V' ); return; } @@ -188,90 +304,102 @@ sub new { look_for_selfloader => 1, starting_line_number => 1, extended_syntax => 0, + rOpts => {}, ); my %args = ( %defaults, @args ); # we are given an object with a get_line() method to supply source lines my $source_object = $args{source_object}; + my $rOpts = $args{rOpts}; # we create another object with a get_line() and peek_ahead() method my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object); # Tokenizer state data is as follows: - # _rhere_target_list reference to list of here-doc targets - # _here_doc_target the target string for a here document - # _here_quote_character the type of here-doc quoting (" ' ` or none) - # to determine if interpolation is done - # _quote_target character we seek if chasing a quote - # _line_start_quote line where we started looking for a long quote - # _in_here_doc flag indicating if we are in a here-doc - # _in_pod flag set if we are in pod documentation - # _in_error flag set if we saw severe error (binary in script) - # _in_data flag set if we are in __DATA__ section - # _in_end flag set if we are in __END__ section - # _in_format flag set if we are in a format description - # _in_attribute_list flag telling if we are looking for attributes - # _in_quote flag telling if we are chasing a quote - # _starting_level indentation level of first line - # _line_buffer_object object with get_line() method to supply source code - # _diagnostics_object place to write debugging information - # _unexpected_error_count error count used to limit output - # _lower_case_labels_at line numbers where lower case labels seen - # _hit_bug program bug detected - $tokenizer_self = { - _rhere_target_list => [], - _in_here_doc => 0, - _here_doc_target => "", - _here_quote_character => "", - _in_data => 0, - _in_end => 0, - _in_format => 0, - _in_error => 0, - _in_pod => 0, - _in_attribute_list => 0, - _in_quote => 0, - _quote_target => "", - _line_start_quote => -1, - _starting_level => $args{starting_level}, - _know_starting_level => defined( $args{starting_level} ), - _tabsize => $args{tabsize}, - _indent_columns => $args{indent_columns}, - _look_for_hash_bang => $args{look_for_hash_bang}, - _trim_qw => $args{trim_qw}, - _continuation_indentation => $args{continuation_indentation}, - _outdent_labels => $args{outdent_labels}, - _last_line_number => $args{starting_line_number} - 1, - _saw_perl_dash_P => 0, - _saw_perl_dash_w => 0, - _saw_use_strict => 0, - _saw_v_string => 0, - _hit_bug => 0, - _look_for_autoloader => $args{look_for_autoloader}, - _look_for_selfloader => $args{look_for_selfloader}, - _saw_autoloader => 0, - _saw_selfloader => 0, - _saw_hash_bang => 0, - _saw_end => 0, - _saw_data => 0, - _saw_negative_indentation => 0, - _started_tokenizing => 0, - _line_buffer_object => $line_buffer_object, - _debugger_object => $args{debugger_object}, - _diagnostics_object => $args{diagnostics_object}, - _logger_object => $args{logger_object}, - _unexpected_error_count => 0, - _started_looking_for_here_target_at => 0, - _nearly_matched_here_target_at => undef, - _line_text => "", - 
_rlower_case_labels_at => undef, - _extended_syntax => $args{extended_syntax}, - }; + # _rhere_target_list_ reference to list of here-doc targets + # _here_doc_target_ the target string for a here document + # _here_quote_character_ the type of here-doc quoting (" ' ` or none) + # to determine if interpolation is done + # _quote_target_ character we seek if chasing a quote + # _line_start_quote_ line where we started looking for a long quote + # _in_here_doc_ flag indicating if we are in a here-doc + # _in_pod_ flag set if we are in pod documentation + # _in_skipped_ flag set if we are in a skipped section + # _in_error_ flag set if we saw severe error (binary in script) + # _in_data_ flag set if we are in __DATA__ section + # _in_end_ flag set if we are in __END__ section + # _in_format_ flag set if we are in a format description + # _in_attribute_list_ flag telling if we are looking for attributes + # _in_quote_ flag telling if we are chasing a quote + # _starting_level_ indentation level of first line + # _line_buffer_object_ object with get_line() method to supply source code + # _diagnostics_object_ place to write debugging information + # _unexpected_error_count_ error count used to limit output + # _lower_case_labels_at_ line numbers where lower case labels seen + # _hit_bug_ program bug detected + + my $self = []; + $self->[_rhere_target_list_] = []; + $self->[_in_here_doc_] = 0; + $self->[_here_doc_target_] = ""; + $self->[_here_quote_character_] = ""; + $self->[_in_data_] = 0; + $self->[_in_end_] = 0; + $self->[_in_format_] = 0; + $self->[_in_error_] = 0; + $self->[_in_pod_] = 0; + $self->[_in_skipped_] = 0; + $self->[_in_attribute_list_] = 0; + $self->[_in_quote_] = 0; + $self->[_quote_target_] = ""; + $self->[_line_start_quote_] = -1; + $self->[_starting_level_] = $args{starting_level}; + $self->[_know_starting_level_] = defined( $args{starting_level} ); + $self->[_tabsize_] = $args{tabsize}; + $self->[_indent_columns_] = $args{indent_columns}; + $self->[_look_for_hash_bang_] = $args{look_for_hash_bang}; + $self->[_trim_qw_] = $args{trim_qw}; + $self->[_continuation_indentation_] = $args{continuation_indentation}; + $self->[_outdent_labels_] = $args{outdent_labels}; + $self->[_last_line_number_] = $args{starting_line_number} - 1; + $self->[_saw_perl_dash_P_] = 0; + $self->[_saw_perl_dash_w_] = 0; + $self->[_saw_use_strict_] = 0; + $self->[_saw_v_string_] = 0; + $self->[_hit_bug_] = 0; + $self->[_look_for_autoloader_] = $args{look_for_autoloader}; + $self->[_look_for_selfloader_] = $args{look_for_selfloader}; + $self->[_saw_autoloader_] = 0; + $self->[_saw_selfloader_] = 0; + $self->[_saw_hash_bang_] = 0; + $self->[_saw_end_] = 0; + $self->[_saw_data_] = 0; + $self->[_saw_negative_indentation_] = 0; + $self->[_started_tokenizing_] = 0; + $self->[_line_buffer_object_] = $line_buffer_object; + $self->[_debugger_object_] = $args{debugger_object}; + $self->[_diagnostics_object_] = $args{diagnostics_object}; + $self->[_logger_object_] = $args{logger_object}; + $self->[_unexpected_error_count_] = 0; + $self->[_started_looking_for_here_target_at_] = 0; + $self->[_nearly_matched_here_target_at_] = undef; + $self->[_line_of_text_] = ""; + $self->[_rlower_case_labels_at_] = undef; + $self->[_extended_syntax_] = $args{extended_syntax}; + $self->[_maximum_level_] = 0; + $self->[_true_brace_error_count_] = 0; + $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'}; + $self->[_rOpts_maximum_unexpected_errors_] = + $rOpts->{'maximum-unexpected-errors'}; + 
$self->[_rOpts_logfile_] = $rOpts->{'logfile'}; + $self->[_rOpts_] = $rOpts; + bless $self, $class; + + $tokenizer_self = $self; prepare_for_a_new_file(); find_starting_indentation_level(); - bless $tokenizer_self, $class; - # This is not a full class yet, so die if an attempt is made to # create more than one object. @@ -280,14 +408,14 @@ sub new { "Attempt to create more than 1 object in $class, which is not a true class yet\n"; } - return $tokenizer_self; + return $self; } # interface to Perl::Tidy::Logger routines sub warning { my $msg = shift; - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->warning($msg); } @@ -296,7 +424,7 @@ sub warning { sub complain { my $msg = shift; - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->complain($msg); } @@ -305,7 +433,7 @@ sub complain { sub write_logfile_entry { my $msg = shift; - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->write_logfile_entry($msg); } @@ -313,7 +441,7 @@ sub write_logfile_entry { } sub interrupt_logfile { - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->interrupt_logfile(); } @@ -321,7 +449,7 @@ sub interrupt_logfile { } sub resume_logfile { - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->resume_logfile(); } @@ -329,7 +457,7 @@ sub resume_logfile { } sub increment_brace_error { - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->increment_brace_error(); } @@ -337,8 +465,8 @@ sub increment_brace_error { } sub report_definite_bug { - $tokenizer_self->{_hit_bug} = 1; - my $logger_object = $tokenizer_self->{_logger_object}; + $tokenizer_self->[_hit_bug_] = 1; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->report_definite_bug(); } @@ -347,7 +475,7 @@ sub report_definite_bug { sub brace_warning { my $msg = shift; - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { $logger_object->brace_warning($msg); } @@ -355,7 +483,7 @@ sub brace_warning { } sub get_saw_brace_error { - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; if ($logger_object) { return $logger_object->get_saw_brace_error(); } @@ -365,48 +493,90 @@ sub get_saw_brace_error { } sub get_unexpected_error_count { - my ($self) = shift; - return $self->{_unexpected_error_count}; + my ($self) = @_; + return $self->[_unexpected_error_count_]; } # interface to Perl::Tidy::Diagnostics routines sub write_diagnostics { my $msg = shift; - if ( $tokenizer_self->{_diagnostics_object} ) { - $tokenizer_self->{_diagnostics_object}->write_diagnostics($msg); + if ( $tokenizer_self->[_diagnostics_object_] ) { + $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg); } return; } +sub get_maximum_level { + return $tokenizer_self->[_maximum_level_]; +} + sub report_tokenization_errors { - my $self = shift; - my $severe_error = $self->{_in_error}; + my ($self) = @_; + + # Report any 
tokenization errors and return a flag '$severe_error'. + # Set $severe_error = 1 if the tokenizations errors are so severe that + # the formatter should not attempt to format the file. Instead, it will + # just output the file verbatim. + + # set severe error flag if tokenizer has encountered file reading problems + # (i.e. unexpected binary characters) + my $severe_error = $self->[_in_error_]; + + my $maxle = $self->[_rOpts_maximum_level_errors_]; + my $maxue = $self->[_rOpts_maximum_unexpected_errors_]; + $maxle = 1 unless defined($maxle); + $maxue = 0 unless defined($maxue); my $level = get_indentation_level(); - if ( $level != $tokenizer_self->{_starting_level} ) { + if ( $level != $tokenizer_self->[_starting_level_] ) { warning("final indentation level: $level\n"); + my $level_diff = $tokenizer_self->[_starting_level_] - $level; + if ( $level_diff < 0 ) { $level_diff = -$level_diff } + + # Set severe error flag if the level error is greater than 1. + # The formatter can function for any level error but it is probably + # best not to attempt formatting for a high level error. + if ( $maxle >= 0 && $level_diff > $maxle ) { + $severe_error = 1; + warning(<{_look_for_hash_bang} - && !$tokenizer_self->{_saw_hash_bang} ) + # Likewise, large numbers of brace errors usually indicate non-perl + # scirpts, so set the severe error flag at a low number. This is similar + # to the level check, but different because braces may balance but be + # incorrectly interlaced. + if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) { + $severe_error = 1; + } + + if ( $tokenizer_self->[_look_for_hash_bang_] + && !$tokenizer_self->[_saw_hash_bang_] ) { warning( "hit EOF without seeing hash-bang line; maybe don't need -x?\n"); } - if ( $tokenizer_self->{_in_format} ) { + if ( $tokenizer_self->[_in_format_] ) { warning("hit EOF while in format description\n"); } - if ( $tokenizer_self->{_in_pod} ) { + if ( $tokenizer_self->[_in_skipped_] ) { + write_logfile_entry( + "hit EOF while in lines skipped with --code-skipping\n"); + } + + if ( $tokenizer_self->[_in_pod_] ) { # Just write log entry if this is after __END__ or __DATA__ # because this happens to often, and it is not likely to be # a parsing error. 
- if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) { + if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) { write_logfile_entry( "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n" ); @@ -420,11 +590,11 @@ sub report_tokenization_errors { } - if ( $tokenizer_self->{_in_here_doc} ) { + if ( $tokenizer_self->[_in_here_doc_] ) { $severe_error = 1; - my $here_doc_target = $tokenizer_self->{_here_doc_target}; + my $here_doc_target = $tokenizer_self->[_here_doc_target_]; my $started_looking_for_here_target_at = - $tokenizer_self->{_started_looking_for_here_target_at}; + $tokenizer_self->[_started_looking_for_here_target_at_]; if ($here_doc_target) { warning( "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n" @@ -436,7 +606,7 @@ sub report_tokenization_errors { ); } my $nearly_matched_here_target_at = - $tokenizer_self->{_nearly_matched_here_target_at}; + $tokenizer_self->[_nearly_matched_here_target_at_]; if ($nearly_matched_here_target_at) { warning( "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n" @@ -444,12 +614,13 @@ sub report_tokenization_errors { } } - if ( $tokenizer_self->{_in_quote} ) { + # Something is seriously wrong if we ended inside a quote + if ( $tokenizer_self->[_in_quote_] ) { $severe_error = 1; - my $line_start_quote = $tokenizer_self->{_line_start_quote}; - my $quote_target = $tokenizer_self->{_quote_target}; + my $line_start_quote = $tokenizer_self->[_line_start_quote_]; + my $quote_target = $tokenizer_self->[_quote_target_]; my $what = - ( $tokenizer_self->{_in_attribute_list} ) + ( $tokenizer_self->[_in_attribute_list_] ) ? "attribute list" : "quote/pattern"; warning( @@ -457,32 +628,25 @@ sub report_tokenization_errors { ); } - if ( $tokenizer_self->{_hit_bug} ) { + if ( $tokenizer_self->[_hit_bug_] ) { $severe_error = 1; } - my $logger_object = $tokenizer_self->{_logger_object}; - -# TODO: eventually may want to activate this to cause file to be output verbatim - if (0) { - - # Set the severe error for a fairly high warning count because - # some of the warnings do not harm formatting, such as duplicate - # sub names. - my $warning_count = $logger_object->{_warning_count}; - if ( $warning_count > 50 ) { - $severe_error = 1; - } - - # Brace errors are significant, so set the severe error flag at - # a low number. - my $saw_brace_error = $logger_object->{_saw_brace_error}; - if ( $saw_brace_error > 2 ) { - $severe_error = 1; - } + # Multiple "unexpected" type tokenization errors usually indicate parsing + # non-perl scripts, or that something is seriously wrong, so we should + # avoid formatting them. This can happen for example if we run perltidy on + # a shell script or an html file. But unfortunately this check can + # interfere with some extended syntaxes, such as RPerl, so it has to be off + # by default. 
+ my $ue_count = $tokenizer_self->[_unexpected_error_count_]; + if ( $maxue > 0 && $ue_count > $maxue ) { + warning(< -maxue=$maxue; use -maxue=0 to force formatting +EOM + $severe_error = 1; } - unless ( $tokenizer_self->{_saw_perl_dash_w} ) { + unless ( $tokenizer_self->[_saw_perl_dash_w_] ) { if ( $] < 5.006 ) { write_logfile_entry("Suggest including '-w parameter'\n"); } @@ -491,19 +655,19 @@ sub report_tokenization_errors { } } - if ( $tokenizer_self->{_saw_perl_dash_P} ) { + if ( $tokenizer_self->[_saw_perl_dash_P_] ) { write_logfile_entry("Use of -P parameter for defines is discouraged\n"); } - unless ( $tokenizer_self->{_saw_use_strict} ) { + unless ( $tokenizer_self->[_saw_use_strict_] ) { write_logfile_entry("Suggest including 'use strict;'\n"); } # it is suggested that labels have at least one upper case character # for legibility and to avoid code breakage as new keywords are introduced - if ( $tokenizer_self->{_rlower_case_labels_at} ) { + if ( $tokenizer_self->[_rlower_case_labels_at_] ) { my @lower_case_labels_at = - @{ $tokenizer_self->{_rlower_case_labels_at} }; + @{ $tokenizer_self->[_rlower_case_labels_at_] }; write_logfile_entry( "Suggest using upper case characters in label(s)\n"); local $" = ')('; @@ -516,8 +680,9 @@ sub report_v_string { # warn if this version can't handle v-strings my $tok = shift; - unless ( $tokenizer_self->{_saw_v_string} ) { - $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number}; + unless ( $tokenizer_self->[_saw_v_string_] ) { + $tokenizer_self->[_saw_v_string_] = + $tokenizer_self->[_last_line_number_]; } if ( $] < 5.006 ) { warning( @@ -528,7 +693,7 @@ sub report_v_string { } sub get_input_line_number { - return $tokenizer_self->{_last_line_number}; + return $tokenizer_self->[_last_line_number_]; } # returns the next tokenized line @@ -539,27 +704,35 @@ sub get_line { # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth, # $square_bracket_depth, $paren_depth - my $input_line = $tokenizer_self->{_line_buffer_object}->get_line(); - $tokenizer_self->{_line_text} = $input_line; + my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line(); + $tokenizer_self->[_line_of_text_] = $input_line; return unless ($input_line); - my $input_line_number = ++$tokenizer_self->{_last_line_number}; + my $input_line_number = ++$tokenizer_self->[_last_line_number_]; + + my $write_logfile_entry = sub { + my ($msg) = @_; + write_logfile_entry("Line $input_line_number: $msg"); + }; # Find and remove what characters terminate this line, including any # control r my $input_line_separator = ""; if ( chomp($input_line) ) { $input_line_separator = $/ } - # TODO: what other characters should be included here? - if ( $input_line =~ s/((\r|\035|\032)+)$// ) { - $input_line_separator = $2 . $input_line_separator; + # The first test here very significantly speeds things up, but be sure to + # keep the regex and hash %other_line_endings the same. + if ( $other_line_endings{ substr( $input_line, -1 ) } ) { + if ( $input_line =~ s/((\r|\035|\032)+)$// ) { + $input_line_separator = $2 . $input_line_separator; + } } # for backwards compatibility we keep the line text terminated with # a newline character $input_line .= "\n"; - $tokenizer_self->{_line_text} = $input_line; # update + $tokenizer_self->[_line_of_text_] = $input_line; # update # create a data structure describing this line which will be # returned to the caller. 
@@ -592,32 +765,32 @@ sub get_line { _line_type => 'EOF', _line_text => $input_line, _line_number => $input_line_number, - _rtoken_type => undef, - _rtokens => undef, - _rlevels => undef, - _rslevels => undef, - _rblock_type => undef, - _rcontainer_type => undef, - _rcontainer_environment => undef, - _rtype_sequence => undef, - _rnesting_tokens => undef, - _rci_levels => undef, - _rnesting_blocks => undef, _guessed_indentation_level => 0, - _starting_in_quote => 0, # to be set by subroutine - _ending_in_quote => 0, - _curly_brace_depth => $brace_depth, - _square_bracket_depth => $square_bracket_depth, - _paren_depth => $paren_depth, - _quote_character => '', + _curly_brace_depth => $brace_depth, + _square_bracket_depth => $square_bracket_depth, + _paren_depth => $paren_depth, + _quote_character => '', +## _rtoken_type => undef, +## _rtokens => undef, +## _rlevels => undef, +## _rslevels => undef, +## _rblock_type => undef, +## _rcontainer_type => undef, +## _rcontainer_environment => undef, +## _rtype_sequence => undef, +## _rnesting_tokens => undef, +## _rci_levels => undef, +## _rnesting_blocks => undef, +## _starting_in_quote => 0, +## _ending_in_quote => 0, }; # must print line unchanged if we are in a here document - if ( $tokenizer_self->{_in_here_doc} ) { + if ( $tokenizer_self->[_in_here_doc_] ) { $line_of_tokens->{_line_type} = 'HERE'; - my $here_doc_target = $tokenizer_self->{_here_doc_target}; - my $here_quote_character = $tokenizer_self->{_here_quote_character}; + my $here_doc_target = $tokenizer_self->[_here_doc_target_]; + my $here_quote_character = $tokenizer_self->[_here_quote_character_]; my $candidate_target = $input_line; chomp $candidate_target; @@ -627,27 +800,27 @@ sub get_line { $candidate_target =~ s/^\s*//; } if ( $candidate_target eq $here_doc_target ) { - $tokenizer_self->{_nearly_matched_here_target_at} = undef; - $line_of_tokens->{_line_type} = 'HERE_END'; - write_logfile_entry("Exiting HERE document $here_doc_target\n"); + $tokenizer_self->[_nearly_matched_here_target_at_] = undef; + $line_of_tokens->{_line_type} = 'HERE_END'; + $write_logfile_entry->("Exiting HERE document $here_doc_target\n"); - my $rhere_target_list = $tokenizer_self->{_rhere_target_list}; + my $rhere_target_list = $tokenizer_self->[_rhere_target_list_]; if ( @{$rhere_target_list} ) { # there can be multiple here targets ( $here_doc_target, $here_quote_character ) = @{ shift @{$rhere_target_list} }; - $tokenizer_self->{_here_doc_target} = $here_doc_target; - $tokenizer_self->{_here_quote_character} = + $tokenizer_self->[_here_doc_target_] = $here_doc_target; + $tokenizer_self->[_here_quote_character_] = $here_quote_character; - write_logfile_entry( + $write_logfile_entry->( "Entering HERE document $here_doc_target\n"); - $tokenizer_self->{_nearly_matched_here_target_at} = undef; - $tokenizer_self->{_started_looking_for_here_target_at} = + $tokenizer_self->[_nearly_matched_here_target_at_] = undef; + $tokenizer_self->[_started_looking_for_here_target_at_] = $input_line_number; } else { - $tokenizer_self->{_in_here_doc} = 0; - $tokenizer_self->{_here_doc_target} = ""; - $tokenizer_self->{_here_quote_character} = ""; + $tokenizer_self->[_in_here_doc_] = 0; + $tokenizer_self->[_here_doc_target_] = ""; + $tokenizer_self->[_here_quote_character_] = ""; } } @@ -657,37 +830,49 @@ sub get_line { $candidate_target =~ s/\s*$//; $candidate_target =~ s/^\s*//; if ( $candidate_target eq $here_doc_target ) { - $tokenizer_self->{_nearly_matched_here_target_at} = + 
$tokenizer_self->[_nearly_matched_here_target_at_] = $input_line_number; } } return $line_of_tokens; } - # must print line unchanged if we are in a format section - elsif ( $tokenizer_self->{_in_format} ) { + # Print line unchanged if we are in a format section + elsif ( $tokenizer_self->[_in_format_] ) { if ( $input_line =~ /^\.[\s#]*$/ ) { - write_logfile_entry("Exiting format section\n"); - $tokenizer_self->{_in_format} = 0; - $line_of_tokens->{_line_type} = 'FORMAT_END'; + + # Decrement format depth count at a '.' after a 'format' + $tokenizer_self->[_in_format_]--; + + # This is the end when count reaches 0 + if ( !$tokenizer_self->[_in_format_] ) { + $write_logfile_entry->("Exiting format section\n"); + $line_of_tokens->{_line_type} = 'FORMAT_END'; + } } else { $line_of_tokens->{_line_type} = 'FORMAT'; + if ( $input_line =~ /^\s*format\s+\w+/ ) { + + # Increment format depth count at a 'format' within a 'format' + # This is a simple way to handle nested formats (issue c019). + $tokenizer_self->[_in_format_]++; + } } return $line_of_tokens; } # must print line unchanged if we are in pod documentation - elsif ( $tokenizer_self->{_in_pod} ) { + elsif ( $tokenizer_self->[_in_pod_] ) { $line_of_tokens->{_line_type} = 'POD'; if ( $input_line =~ /^=cut/ ) { $line_of_tokens->{_line_type} = 'POD_END'; - write_logfile_entry("Exiting POD section\n"); - $tokenizer_self->{_in_pod} = 0; + $write_logfile_entry->("Exiting POD section\n"); + $tokenizer_self->[_in_pod_] = 0; } - if ( $input_line =~ /^\#\!.*perl\b/ ) { + if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) { warning( "Hash-bang in pod can cause older versions of perl to fail! \n" ); @@ -696,26 +881,38 @@ sub get_line { return $line_of_tokens; } + # print line unchanged if in skipped section + elsif ( $tokenizer_self->[_in_skipped_] ) { + + # NOTE: marked as the existing type 'FORMAT' to keep html working + $line_of_tokens->{_line_type} = 'FORMAT'; + if ( $input_line =~ /$code_skipping_pattern_end/ ) { + $write_logfile_entry->("Exiting code-skipping section\n"); + $tokenizer_self->[_in_skipped_] = 0; + } + return $line_of_tokens; + } + # must print line unchanged if we have seen a severe error (i.e., we # are seeing illegal tokens and cannot continue. Syntax errors do # not pass this route). 
Calling routine can decide what to do, but # the default can be to just pass all lines as if they were after __END__ - elsif ( $tokenizer_self->{_in_error} ) { + elsif ( $tokenizer_self->[_in_error_] ) { $line_of_tokens->{_line_type} = 'ERROR'; return $line_of_tokens; } # print line unchanged if we are __DATA__ section - elsif ( $tokenizer_self->{_in_data} ) { + elsif ( $tokenizer_self->[_in_data_] ) { # ...but look for POD # Note that the _in_data and _in_end flags remain set # so that we return to that state after seeing the # end of a pod section - if ( $input_line =~ /^=(?!cut)/ ) { + if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - write_logfile_entry("Entering POD section\n"); - $tokenizer_self->{_in_pod} = 1; + $write_logfile_entry->("Entering POD section\n"); + $tokenizer_self->[_in_pod_] = 1; return $line_of_tokens; } else { @@ -725,16 +922,16 @@ sub get_line { } # print line unchanged if we are in __END__ section - elsif ( $tokenizer_self->{_in_end} ) { + elsif ( $tokenizer_self->[_in_end_] ) { # ...but look for POD # Note that the _in_data and _in_end flags remain set # so that we return to that state after seeing the # end of a pod section - if ( $input_line =~ /^=(?!cut)/ ) { + if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - write_logfile_entry("Entering POD section\n"); - $tokenizer_self->{_in_pod} = 1; + $write_logfile_entry->("Entering POD section\n"); + $tokenizer_self->[_in_pod_] = 1; return $line_of_tokens; } else { @@ -744,21 +941,21 @@ sub get_line { } # check for a hash-bang line if we haven't seen one - if ( !$tokenizer_self->{_saw_hash_bang} ) { + if ( !$tokenizer_self->[_saw_hash_bang_] ) { if ( $input_line =~ /^\#\!.*perl\b/ ) { - $tokenizer_self->{_saw_hash_bang} = $input_line_number; + $tokenizer_self->[_saw_hash_bang_] = $input_line_number; # check for -w and -P flags if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) { - $tokenizer_self->{_saw_perl_dash_P} = 1; + $tokenizer_self->[_saw_perl_dash_P_] = 1; } if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) { - $tokenizer_self->{_saw_perl_dash_w} = 1; + $tokenizer_self->[_saw_perl_dash_w_] = 1; } if ( - ( $input_line_number > 1 ) + $input_line_number > 1 # leave any hash bang in a BEGIN block alone # i.e. see 'debugger-duck_type.t' @@ -766,13 +963,28 @@ sub get_line { $last_nonblank_block_type && $last_nonblank_block_type eq 'BEGIN' ) - && ( !$tokenizer_self->{_look_for_hash_bang} ) + && !$tokenizer_self->[_look_for_hash_bang_] + + # Try to avoid giving a false alarm at a simple comment. + # These look like valid hash-bang lines: + + #!/usr/bin/perl -w + #! /usr/bin/perl -w + #!c:\perl\bin\perl.exe + + # These are comments: + #! I love perl + #! 
sunos does not yet provide a /usr/bin/perl + + # Comments typically have multiple spaces, which suggests + # the filter + && $input_line =~ /^\#\!(\s+)?(\S+)?perl/ ) { # this is helpful for VMS systems; we may have accidentally # tokenized some DCL commands - if ( $tokenizer_self->{_started_tokenizing} ) { + if ( $tokenizer_self->[_started_tokenizing_] ) { warning( "There seems to be a hash-bang after line 1; do you need to run with -x ?\n" ); @@ -792,8 +1004,8 @@ sub get_line { } # wait for a hash-bang before parsing if the user invoked us with -x - if ( $tokenizer_self->{_look_for_hash_bang} - && !$tokenizer_self->{_saw_hash_bang} ) + if ( $tokenizer_self->[_look_for_hash_bang_] + && !$tokenizer_self->[_saw_hash_bang_] ) { $line_of_tokens->{_line_type} = 'SYSTEM'; return $line_of_tokens; @@ -808,106 +1020,120 @@ sub get_line { # now we know that it is ok to tokenize the line... # the line tokenizer will modify any of these private variables: - # _rhere_target_list - # _in_data - # _in_end - # _in_format - # _in_error - # _in_pod - # _in_quote - my $ending_in_quote_last = $tokenizer_self->{_in_quote}; + # _rhere_target_list_ + # _in_data_ + # _in_end_ + # _in_format_ + # _in_error_ + # _in_skipped_ + # _in_pod_ + # _in_quote_ + my $ending_in_quote_last = $tokenizer_self->[_in_quote_]; tokenize_this_line($line_of_tokens); # Now finish defining the return structure and return it - $line_of_tokens->{_ending_in_quote} = $tokenizer_self->{_in_quote}; + $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_]; # handle severe error (binary data in script) - if ( $tokenizer_self->{_in_error} ) { - $tokenizer_self->{_in_quote} = 0; # to avoid any more messages + if ( $tokenizer_self->[_in_error_] ) { + $tokenizer_self->[_in_quote_] = 0; # to avoid any more messages warning("Giving up after error\n"); $line_of_tokens->{_line_type} = 'ERROR'; - reset_indentation_level(0); # avoid error messages + reset_indentation_level(0); # avoid error messages return $line_of_tokens; } # handle start of pod documentation - if ( $tokenizer_self->{_in_pod} ) { + if ( $tokenizer_self->[_in_pod_] ) { # This gets tricky..above a __DATA__ or __END__ section, perl # accepts '=cut' as the start of pod section. But afterwards, # only pod utilities see it and they may ignore an =cut without # leading =head. In any case, this isn't good. if ( $input_line =~ /^=cut\b/ ) { - if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) { + if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) + { complain("=cut while not in pod ignored\n"); - $tokenizer_self->{_in_pod} = 0; + $tokenizer_self->[_in_pod_] = 0; $line_of_tokens->{_line_type} = 'POD_END'; } else { $line_of_tokens->{_line_type} = 'POD_START'; - complain( + warning( "=cut starts a pod section .. this can fool pod utilities.\n" ); - write_logfile_entry("Entering POD section\n"); + $write_logfile_entry->("Entering POD section\n"); } } else { $line_of_tokens->{_line_type} = 'POD_START'; - write_logfile_entry("Entering POD section\n"); + $write_logfile_entry->("Entering POD section\n"); } return $line_of_tokens; } - # update indentation levels for log messages - if ( $input_line !~ /^\s*$/ ) { + # handle start of skipped section + if ( $tokenizer_self->[_in_skipped_] ) { + + # NOTE: marked as the existing type 'FORMAT' to keep html working + $line_of_tokens->{_line_type} = 'FORMAT'; + $write_logfile_entry->("Entering code-skipping section\n"); + return $line_of_tokens; + } + + # Update indentation levels for log messages. 
+ # Skip blank lines and also block comments, unless a logfile is requested. + # Note that _line_of_text_ is the input line but trimmed from left to right. + my $lot = $tokenizer_self->[_line_of_text_]; + if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) { my $rlevels = $line_of_tokens->{_rlevels}; $line_of_tokens->{_guessed_indentation_level} = guess_old_indentation_level($input_line); } # see if this line contains here doc targets - my $rhere_target_list = $tokenizer_self->{_rhere_target_list}; + my $rhere_target_list = $tokenizer_self->[_rhere_target_list_]; if ( @{$rhere_target_list} ) { my ( $here_doc_target, $here_quote_character ) = @{ shift @{$rhere_target_list} }; - $tokenizer_self->{_in_here_doc} = 1; - $tokenizer_self->{_here_doc_target} = $here_doc_target; - $tokenizer_self->{_here_quote_character} = $here_quote_character; - write_logfile_entry("Entering HERE document $here_doc_target\n"); - $tokenizer_self->{_started_looking_for_here_target_at} = + $tokenizer_self->[_in_here_doc_] = 1; + $tokenizer_self->[_here_doc_target_] = $here_doc_target; + $tokenizer_self->[_here_quote_character_] = $here_quote_character; + $write_logfile_entry->("Entering HERE document $here_doc_target\n"); + $tokenizer_self->[_started_looking_for_here_target_at_] = $input_line_number; } # NOTE: __END__ and __DATA__ statements are written unformatted # because they can theoretically contain additional characters # which are not tokenized (and cannot be read with either!). - if ( $tokenizer_self->{_in_data} ) { + if ( $tokenizer_self->[_in_data_] ) { $line_of_tokens->{_line_type} = 'DATA_START'; - write_logfile_entry("Starting __DATA__ section\n"); - $tokenizer_self->{_saw_data} = 1; + $write_logfile_entry->("Starting __DATA__ section\n"); + $tokenizer_self->[_saw_data_] = 1; # keep parsing after __DATA__ if use SelfLoader was seen - if ( $tokenizer_self->{_saw_selfloader} ) { - $tokenizer_self->{_in_data} = 0; - write_logfile_entry( + if ( $tokenizer_self->[_saw_selfloader_] ) { + $tokenizer_self->[_in_data_] = 0; + $write_logfile_entry->( "SelfLoader seen, continuing; -nlsl deactivates\n"); } return $line_of_tokens; } - elsif ( $tokenizer_self->{_in_end} ) { + elsif ( $tokenizer_self->[_in_end_] ) { $line_of_tokens->{_line_type} = 'END_START'; - write_logfile_entry("Starting __END__ section\n"); - $tokenizer_self->{_saw_end} = 1; + $write_logfile_entry->("Starting __END__ section\n"); + $tokenizer_self->[_saw_end_] = 1; # keep parsing after __END__ if use AutoLoader was seen - if ( $tokenizer_self->{_saw_autoloader} ) { - $tokenizer_self->{_in_end} = 0; - write_logfile_entry( + if ( $tokenizer_self->[_saw_autoloader_] ) { + $tokenizer_self->[_in_end_] = 0; + $write_logfile_entry->( "AutoLoader seen, continuing; -nlal deactivates\n"); } return $line_of_tokens; @@ -917,41 +1143,42 @@ sub get_line { $line_of_tokens->{_line_type} = 'CODE'; # remember if we have seen any real code - if ( !$tokenizer_self->{_started_tokenizing} + if ( !$tokenizer_self->[_started_tokenizing_] && $input_line !~ /^\s*$/ && $input_line !~ /^\s*#/ ) { - $tokenizer_self->{_started_tokenizing} = 1; + $tokenizer_self->[_started_tokenizing_] = 1; } - if ( $tokenizer_self->{_debugger_object} ) { - $tokenizer_self->{_debugger_object}->write_debug_entry($line_of_tokens); + if ( $tokenizer_self->[_debugger_object_] ) { + $tokenizer_self->[_debugger_object_] + ->write_debug_entry($line_of_tokens); } # Note: if keyword 'format' occurs in this line code, it is still CODE # (keyword 'format' need not start a line) - 
if ( $tokenizer_self->{_in_format} ) { - write_logfile_entry("Entering format section\n"); + if ( $tokenizer_self->[_in_format_] ) { + $write_logfile_entry->("Entering format section\n"); } - if ( $tokenizer_self->{_in_quote} - and ( $tokenizer_self->{_line_start_quote} < 0 ) ) + if ( $tokenizer_self->[_in_quote_] + and ( $tokenizer_self->[_line_start_quote_] < 0 ) ) { #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) { - if ( - ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ ) + if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~ + /^\s*$/ ) { - $tokenizer_self->{_line_start_quote} = $input_line_number; - write_logfile_entry( + $tokenizer_self->[_line_start_quote_] = $input_line_number; + $write_logfile_entry->( "Start multi-line quote or pattern ending in $quote_target\n"); } } - elsif ( ( $tokenizer_self->{_line_start_quote} >= 0 ) - && !$tokenizer_self->{_in_quote} ) + elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 ) + && !$tokenizer_self->[_in_quote_] ) { - $tokenizer_self->{_line_start_quote} = -1; - write_logfile_entry("End of multi-line quote or pattern\n"); + $tokenizer_self->[_line_start_quote_] = -1; + $write_logfile_entry->("End of multi-line quote or pattern\n"); } # we are returning a line of CODE @@ -970,13 +1197,13 @@ sub find_starting_indentation_level { my $starting_level = 0; # use value if given as parameter - if ( $tokenizer_self->{_know_starting_level} ) { - $starting_level = $tokenizer_self->{_starting_level}; + if ( $tokenizer_self->[_know_starting_level_] ) { + $starting_level = $tokenizer_self->[_starting_level_]; } # if we know there is a hash_bang line, the level must be zero - elsif ( $tokenizer_self->{_look_for_hash_bang} ) { - $tokenizer_self->{_know_starting_level} = 1; + elsif ( $tokenizer_self->[_look_for_hash_bang_] ) { + $tokenizer_self->[_know_starting_level_] = 1; } # otherwise figure it out from the input file @@ -987,7 +1214,7 @@ sub find_starting_indentation_level { # keep looking at lines until we find a hash bang or piece of code my $msg = ""; while ( $line = - $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) + $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) ) { # if first line is #! then assume starting level is zero @@ -1003,7 +1230,7 @@ sub find_starting_indentation_level { $msg = "Line $i implies starting-indentation-level = $starting_level\n"; write_logfile_entry("$msg"); } - $tokenizer_self->{_starting_level} = $starting_level; + $tokenizer_self->[_starting_level_] = $starting_level; reset_indentation_level($starting_level); return; } @@ -1034,20 +1261,20 @@ sub guess_old_indentation_level { # If there are leading tabs, we use the tab scheme for this run, if # any, so that the code will remain stable when editing. - if ($1) { $spaces += length($1) * $tokenizer_self->{_tabsize} } + if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] } if ($2) { $spaces += length($2) } # correct for outdented labels - if ( $3 && $tokenizer_self->{'_outdent_labels'} ) { - $spaces += $tokenizer_self->{_continuation_indentation}; + if ( $3 && $tokenizer_self->[_outdent_labels_] ) { + $spaces += $tokenizer_self->[_continuation_indentation_]; } } # compute indentation using the value of -i for this run. # If -i=0 is used for this run (which is possible) it doesn't matter # what we do here but we'll guess that the old run used 4 spaces per level. 
- my $indent_columns = $tokenizer_self->{_indent_columns}; + my $indent_columns = $tokenizer_self->[_indent_columns_]; $indent_columns = 4 if ( !$indent_columns ); $level = int( $spaces / $indent_columns ); return ($level); @@ -1058,7 +1285,7 @@ sub dump_functions { my $fh = *STDOUT; foreach my $pkg ( keys %is_user_function ) { - print $fh "\nnon-constant subs in package $pkg\n"; + $fh->print("\nnon-constant subs in package $pkg\n"); foreach my $sub ( keys %{ $is_user_function{$pkg} } ) { my $msg = ""; @@ -1069,28 +1296,20 @@ sub dump_functions { if ( $is_block_function{$pkg}{$sub} ) { $msg = 'block'; } - print $fh "$sub $msg\n"; + $fh->print("$sub $msg\n"); } } foreach my $pkg ( keys %is_constant ) { - print $fh "\nconstants and constant subs in package $pkg\n"; + $fh->print("\nconstants and constant subs in package $pkg\n"); foreach my $sub ( keys %{ $is_constant{$pkg} } ) { - print $fh "$sub\n"; + $fh->print("$sub\n"); } } return; } -sub ones_count { - - # count number of 1's in a string of 1's and 0's - # example: ones_count("010101010101") gives 6 - my $str = shift; - return $str =~ tr/1/0/; -} - sub prepare_for_a_new_file { # previous tokens needed to determine what to expect next @@ -1113,17 +1332,33 @@ sub prepare_for_a_new_file { %is_block_function = (); %is_block_list_function = (); %saw_function_definition = (); + %saw_use_module = (); # variables used to track depths of various containers # and report nesting errors - $paren_depth = 0; - $brace_depth = 0; - $square_bracket_depth = 0; - @current_depth = (0) x scalar @closing_brace_names; - $total_depth = 0; - @total_depth = (); - @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 ); - @current_sequence_number = (); + $paren_depth = 0; + $brace_depth = 0; + $square_bracket_depth = 0; + @current_depth = (0) x scalar @closing_brace_names; + $total_depth = 0; + @total_depth = (); + @nesting_sequence_number = ( 0 .. 
@closing_brace_names - 1 ); + @current_sequence_number = (); + + @paren_type = (); + @paren_semicolon_count = (); + @paren_structural_type = (); + @brace_type = (); + @brace_structural_type = (); + @brace_context = (); + @brace_package = (); + @square_bracket_type = (); + @square_bracket_structural_type = (); + @depth_array = (); + @nested_ternary_flag = (); + @nested_statement_type = (); + @starting_line_of_current_depth = (); + $paren_type[$paren_depth] = ''; $paren_semicolon_count[$paren_depth] = 0; $paren_structural_type[$brace_depth] = ''; @@ -1138,7 +1373,7 @@ sub prepare_for_a_new_file { return; } -{ # begin tokenize_this_line +{ ## closure for sub tokenize_this_line use constant BRACE => 0; use constant SQUARE_BRACKET => 1; @@ -1390,7 +1625,7 @@ sub prepare_for_a_new_file { write_logfile_entry("scanning replacement text for here-doc targets\n"); # save the logger object for error messages - my $logger_object = $tokenizer_self->{_logger_object}; + my $logger_object = $tokenizer_self->[_logger_object_]; # localize all package variables local ( @@ -1421,9 +1656,11 @@ sub prepare_for_a_new_file { # make a new tokenizer my $rOpts = {}; my $rpending_logfile_message; - my $source_object = - Perl::Tidy::LineSource->new( \$replacement_text, $rOpts, - $rpending_logfile_message ); + my $source_object = Perl::Tidy::LineSource->new( + input_file => \$replacement_text, + rOpts => $rOpts, + rpending_logfile_message => $rpending_logfile_message, + ); my $tokenizer = Perl::Tidy::Tokenizer->new( source_object => $source_object, logger_object => $logger_object, @@ -1435,18 +1672,18 @@ sub prepare_for_a_new_file { # remove any here doc targets my $rht = undef; - if ( $tokenizer_self->{_in_here_doc} ) { + if ( $tokenizer_self->[_in_here_doc_] ) { $rht = []; push @{$rht}, [ - $tokenizer_self->{_here_doc_target}, - $tokenizer_self->{_here_quote_character} + $tokenizer_self->[_here_doc_target_], + $tokenizer_self->[_here_quote_character_] ]; - if ( $tokenizer_self->{_rhere_target_list} ) { - push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} }; - $tokenizer_self->{_rhere_target_list} = undef; + if ( $tokenizer_self->[_rhere_target_list_] ) { + push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] }; + $tokenizer_self->[_rhere_target_list_] = undef; } - $tokenizer_self->{_in_here_doc} = undef; + $tokenizer_self->[_in_here_doc_] = undef; } # now its safe to report errors @@ -1475,6 +1712,139 @@ sub prepare_for_a_new_file { return; } + use constant VERIFY_FASTSCAN => 0; + my %fast_scan_context; + + BEGIN { + %fast_scan_context = ( + '$' => SCALAR_CONTEXT, + '*' => SCALAR_CONTEXT, + '@' => LIST_CONTEXT, + '%' => LIST_CONTEXT, + '&' => UNKNOWN_CONTEXT, + ); + } + + sub scan_identifier_fast { + + # This is a wrapper for sub scan_identifier. It does a fast preliminary + # scan for certain common identifiers: + # '$var', '@var', %var, *var, &var, '@{...}', '%{...}' + # If it does not find one of these, or this is a restart, it calls the + # original scanner directly. + + # This gives the same results as the full scanner in about 1/4 the + # total runtime for a typical input stream. + + my $i_begin = $i; + my $tok_begin = $tok; + my $fast_scan_type; + + ############################### + # quick scan with leading sigil + ############################### + if ( !$id_scan_state + && $i + 1 <= $max_token_index + && $fast_scan_context{$tok} ) + { + $context = $fast_scan_context{$tok}; + + # look for $var, @var, ... 
+ if ( $rtoken_type->[ $i + 1 ] eq 'w' ) { + my $pretype_next = ""; + my $i_next = $i + 2; + if ( $i_next <= $max_token_index ) { + if ( $rtoken_type->[$i_next] eq 'b' + && $i_next < $max_token_index ) + { + $i_next += 1; + } + $pretype_next = $rtoken_type->[$i_next]; + } + if ( $pretype_next ne ':' && $pretype_next ne "'" ) { + + # Found type 'i' like '$var', '@var', or '%var' + $identifier = $tok . $rtokens->[ $i + 1 ]; + $tok = $identifier; + $type = 'i'; + $i = $i + 1; + $fast_scan_type = $type; + } + } + + # Look for @{ or %{ . + # But we must let the full scanner handle things ${ because it may + # keep going to get a complete identifier like '${#}' . + elsif ( + $rtoken_type->[ $i + 1 ] eq '{' + && ( $tok_begin eq '@' + || $tok_begin eq '%' ) + ) + { + + $identifier = $tok; + $type = 't'; + $fast_scan_type = $type; + } + } + + ############################ + # Quick scan with leading -> + # Look for ->[ and ->{ + ############################ + elsif ( + $tok eq '->' + && $i < $max_token_index + && ( $rtokens->[ $i + 1 ] eq '{' + || $rtokens->[ $i + 1 ] eq '[' ) + ) + { + $type = $tok; + $fast_scan_type = $type; + $identifier = $tok; + $context = UNKNOWN_CONTEXT; + } + + ####################################### + # Verify correctness during development + ####################################### + if ( VERIFY_FASTSCAN && $fast_scan_type ) { + + # We will call the full method + my $identifier_simple = $identifier; + my $tok_simple = $tok; + my $fast_scan_type = $type; + my $i_simple = $i; + my $context_simple = $context; + + $tok = $tok_begin; + $i = $i_begin; + scan_identifier(); + + if ( $tok ne $tok_simple + || $type ne $fast_scan_type + || $i != $i_simple + || $identifier ne $identifier_simple + || $id_scan_state + || $context ne $context_simple ) + { + print STDERR < 0; + + sub scan_number_fast { + + # This is a wrapper for sub scan_number. It does a fast preliminary + # scan for a simple integer. It calls the original scan_number if it + # does not find one. + + my $i_begin = $i; + my $tok_begin = $tok; + my $number; + + ################################## + # Quick check for (signed) integer + ################################## + + # This will be the string of digits: + my $i_d = $i; + my $tok_d = $tok; + my $typ_d = $rtoken_type->[$i_d]; + + # check for signed integer + my $sign = ""; + if ( $typ_d ne 'd' + && ( $typ_d eq '+' || $typ_d eq '-' ) + && $i_d < $max_token_index ) + { + $sign = $tok_d; + $i_d++; + $tok_d = $rtokens->[$i_d]; + $typ_d = $rtoken_type->[$i_d]; + } + + # Handle integers + if ( + $typ_d eq 'd' + && ( + $i_d == $max_token_index + || ( $i_d < $max_token_index + && $rtoken_type->[ $i_d + 1 ] ne '.' + && $rtoken_type->[ $i_d + 1 ] ne 'w' ) + ) + ) + { + # Let let full scanner handle multi-digit integers beginning with + # '0' because there could be error messages. For example, '009' is + # not a valid number. + + if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) { + $number = $sign . 
$tok_d; + $type = 'n'; + $i = $i_d; + } + } + + ####################################### + # Verify correctness during development + ####################################### + if ( VERIFY_FASTNUM && defined($number) ) { + + # We will call the full method + my $type_simple = $type; + my $i_simple = $i; + my $number_simple = $number; + + $tok = $tok_begin; + $i = $i_begin; + $number = scan_number(); + + if ( $type ne $type_simple + || ( $i != $i_simple && $i <= $max_token_index ) + || $number ne $number_simple ) + { + print STDERR <{_saw_perl_dash_w} = 1; + $tokenizer_self->[_saw_perl_dash_w_] = 1; } # Check for identifier in indirect object slot @@ -1603,10 +2067,10 @@ sub prepare_for_a_new_file { # /^(print|printf|sort|exec|system)$/ if ( $is_indirect_object_taker{$last_nonblank_token} - || ( ( $last_nonblank_token eq '(' ) && $is_indirect_object_taker{ $paren_type[$paren_depth] } ) - || ( $last_nonblank_type =~ /^[Uw]$/ ) # possible object + || ( $last_nonblank_type eq 'w' + || $last_nonblank_type eq 'U' ) # possible object ) { $type = 'Z'; @@ -1631,12 +2095,13 @@ sub prepare_for_a_new_file { if ( $expecting == OPERATOR - # be sure this is not a method call of the form + # Be sure this is not a method call of the form # &method(...), $method->(..), &{method}(...), # $ref[2](list) is ok & short for $ref[2]->(list) # NOTE: at present, braces in something like &{ xxx } - # are not marked as a block, we might have a method call - && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/ + # are not marked as a block, we might have a method call. + # Added ')' to fix case c017, something like ()()() + && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/ ) { @@ -1655,8 +2120,14 @@ sub prepare_for_a_new_file { my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); - if ( $next_nonblank_token ne ')' ) { + + # Patch for c029: give up error check if + # a side comment follows + if ( $next_nonblank_token ne ')' + && $next_nonblank_token ne '#' ) + { my $hint; + error_if_expecting_OPERATOR('('); if ( $last_nonblank_type eq 'C' ) { @@ -1760,12 +2231,16 @@ sub prepare_for_a_new_file { complain("Repeated ','s \n"); } + # Note that we have to check both token and type here because a + # comma following a qw list can have last token='(' but type = 'q' + elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) + { + warning("Unexpected leading ',' after a '('\n"); + } + # patch for operator_expected: note if we are in the list (use.t) if ( $statement_type eq 'use' ) { $statement_type = '_use' } -## FIXME: need to move this elsewhere, perhaps check after a '(' -## elsif ($last_nonblank_token eq '(') { -## warning("Leading ','s illegal in some versions of perl\n"); -## } + }, ';' => sub { $context = UNKNOWN_CONTEXT; @@ -1818,8 +2293,11 @@ sub prepare_for_a_new_file { # a pattern cannot follow certain keywords which take optional # arguments, like 'shift' and 'pop'. See also '?'. - if ( $last_nonblank_type eq 'k' - && $is_keyword_taking_optional_args{$last_nonblank_token} ) + if ( + $last_nonblank_type eq 'k' + && $is_keyword_rejecting_slash_as_pattern_delimiter{ + $last_nonblank_token} + ) { $is_pattern = 0; } @@ -1849,11 +2327,11 @@ sub prepare_for_a_new_file { $type = $tok; } - #DEBUG - collecting info on what tokens follow a divide - # for development of guessing algorithm - #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) { - # #write_diagnostics( "DIVIDE? 
$input_line\n" ); - #} + #DEBUG - collecting info on what tokens follow a divide + # for development of guessing algorithm + #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) { + # #write_diagnostics( "DIVIDE? $input_line\n" ); + #} } }, '{' => sub { @@ -1868,7 +2346,7 @@ sub prepare_for_a_new_file { # ATTRS: for a '{' following an attribute list, reset # things to look like we just saw the sub name - if ( $statement_type =~ /^sub/ ) { + if ( $statement_type =~ /^sub\b/ ) { $last_nonblank_token = $statement_type; $last_nonblank_type = 'i'; $statement_type = ""; @@ -1894,7 +2372,7 @@ sub prepare_for_a_new_file { # check for syntax error here; unless ( $is_blocktype_with_paren{$last_nonblank_token} ) { - if ( $tokenizer_self->{'_extended_syntax'} ) { + if ( $tokenizer_self->[_extended_syntax_] ) { # we append a trailing () to mark this as an unknown # block type. This allows perltidy to format some @@ -2025,7 +2503,7 @@ sub prepare_for_a_new_file { # For example we probably don't want & as sub call here: # Fcntl::S_IRUSR & $mode; if ( $expecting == TERM || $next_type ne 'b' ) { - scan_identifier(); + scan_identifier_fast(); } } else { @@ -2038,12 +2516,15 @@ sub prepare_for_a_new_file { find_angle_operator_termination( $input_line, $i, $rtoken_map, $expecting, $max_token_index ); - if ( $type eq '<' && $expecting == TERM ) { - error_if_expecting_TERM(); - interrupt_logfile(); - warning("Unterminated <> operator?\n"); - resume_logfile(); - } + ## This message is not very helpful and quite confusing if the above + ## routine decided not to write a message with the line number. + ## if ( $type eq '<' && $expecting == TERM ) { + ## error_if_expecting_TERM(); + ## interrupt_logfile(); + ## warning("Unterminated <> operator?\n"); + ## resume_logfile(); + ## } + } else { } @@ -2055,8 +2536,11 @@ sub prepare_for_a_new_file { # Patch for rt #126965 # a pattern cannot follow certain keywords which take optional # arguments, like 'shift' and 'pop'. See also '/'. - if ( $last_nonblank_type eq 'k' - && $is_keyword_taking_optional_args{$last_nonblank_token} ) + if ( + $last_nonblank_type eq 'k' + && $is_keyword_rejecting_question_as_pattern_delimiter{ + $last_nonblank_token} + ) { $is_pattern = 0; } @@ -2069,9 +2553,11 @@ sub prepare_for_a_new_file { elsif ( $expecting == UNKNOWN ) { # In older versions of Perl, a bare ? can be a pattern - # delimiter. Sometime after Perl 5.10 this seems to have - # been dropped, but we have to support it in order to format - # older programs. For example, the following line worked + # delimiter. In perl version 5.22 this was + # dropped, but we have to support it in order to format + # older programs. See: + ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html + # For example, the following line worked # at one time: # ?(.*)? && (print $1,"\n"); # In current versions it would have to be written with slashes: @@ -2098,8 +2584,16 @@ sub prepare_for_a_new_file { }, '*' => sub { # typeglob, or multiply? 
+ if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) { + if ( $next_type ne 'b' + && $next_type ne '(' + && $next_type ne '#' ) # Fix c036 + { + $expecting = TERM; + } + } if ( $expecting == TERM ) { - scan_identifier(); + scan_identifier_fast(); } else { @@ -2141,21 +2635,47 @@ sub prepare_for_a_new_file { } # ATTRS: check for a ':' which introduces an attribute list - # (this might eventually get its own token type) + # either after a 'sub' keyword or within a paren list elsif ( $statement_type =~ /^sub\b/ ) { $type = 'A'; $in_attribute_list = 1; } + # Within a signature, unless we are in a ternary. For example, + # from 't/filter_example.t': + # method foo4 ( $class: $bar ) { $class->bar($bar) } + elsif ( $paren_type[$paren_depth] =~ /^sub\b/ + && !is_balanced_closing_container(QUESTION_COLON) ) + { + $type = 'A'; + $in_attribute_list = 1; + } + # check for scalar attribute, such as # my $foo : shared = 1; - elsif ($is_my_our{$statement_type} + elsif ($is_my_our_state{$statement_type} && $current_depth[QUESTION_COLON] == 0 ) { $type = 'A'; $in_attribute_list = 1; } + # Look for Switch::Plain syntax if an error would otherwise occur + # here. Note that we do not need to check if the extended syntax + # flag is set because otherwise an error would occur, and we would + # then have to output a message telling the user to set the + # extended syntax flag to avoid the error. + # case 1: { + # default: { + # default: + # Note that the line 'default:' will be parsed as a label elsewhere. + elsif ( $is_case_default{$statement_type} + && !is_balanced_closing_container(QUESTION_COLON) ) + { + # mark it as a perltidy label type + $type = 'J'; + } + # otherwise, it should be part of a ?/: operator else { ( $type_sequence, $indent_flag ) = @@ -2169,7 +2689,7 @@ sub prepare_for_a_new_file { '+' => sub { # what kind of plus? if ( $expecting == TERM ) { - my $number = scan_number(); + my $number = scan_number_fast(); # unary plus is safest assumption if not a number if ( !defined($number) ) { $type = 'p'; } @@ -2184,16 +2704,18 @@ sub prepare_for_a_new_file { error_if_expecting_OPERATOR("Array") if ( $expecting == OPERATOR ); - scan_identifier(); + scan_identifier_fast(); }, '%' => sub { # hash or modulo? - # first guess is hash if no following blank + # first guess is hash if no following blank or paren if ( $expecting == UNKNOWN ) { - if ( $next_type ne 'b' ) { $expecting = TERM } + if ( $next_type ne 'b' && $next_type ne '(' ) { + $expecting = TERM; + } } if ( $expecting == TERM ) { - scan_identifier(); + scan_identifier_fast(); } }, '[' => sub { @@ -2248,7 +2770,7 @@ sub prepare_for_a_new_file { } } elsif ( $expecting == TERM ) { - my $number = scan_number(); + my $number = scan_number_fast(); # maybe part of bareword token? unary is safest if ( !defined($number) ) { $type = 'm'; } @@ -2272,12 +2794,13 @@ sub prepare_for_a_new_file { # FIXME: this should work but will not catch errors # because we also have to be sure that previous token is # a type character ($,@,%). - if ( $last_nonblank_token eq '{' - && ( $next_tok =~ /^[A-Za-z_]/ ) ) + if ( $last_nonblank_token eq '{' + && ( $next_tok !~ /^\d/ ) + && ( $next_tok =~ /^\w/ ) ) { if ( $next_tok eq 'W' ) { - $tokenizer_self->{_saw_perl_dash_w} = 1; + $tokenizer_self->[_saw_perl_dash_w_] = 1; } $tok = $tok . $next_tok; $i = $i + 1; @@ -2299,9 +2822,12 @@ sub prepare_for_a_new_file { scan_bare_identifier(); }, '<<' => sub { # maybe a here-doc? 
- return - unless ( $i < $max_token_index ) - ; # here-doc not possible if end of line + +## This check removed because it could be a deprecated here-doc with +## no specified target. See example in log 16 Sep 2020. +## return +## unless ( $i < $max_token_index ) +## ; # here-doc not possible if end of line if ( $expecting != OPERATOR ) { my ( $found_target, $here_doc_target, $here_quote_character, @@ -2321,6 +2847,11 @@ sub prepare_for_a_new_file { my $truncated = substr( $here_doc_target, 0, 80 ); complain("Long here-target: '$truncated' ...\n"); } + elsif ( !$here_doc_target ) { + warning( + 'Use of bare << to mean <<"" is deprecated' . "\n" ) + unless ($here_quote_character); + } elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) { complain( "Unconventional here-target: '$here_doc_target'\n"); @@ -2386,15 +2917,33 @@ sub prepare_for_a_new_file { # if -> points to a bare word, we must scan for an identifier, # otherwise something like ->y would look like the y operator - scan_identifier(); + + # NOTE: this will currently allow things like + # '->@array' '->*VAR' '->%hash' + # to get parsed as identifiers, even though these are not currently + # allowed syntax. To catch syntax errors like this we could first + # check that the next character and skip this call if it is one of + # ' @ % * '. A disadvantage with doing this is that this would + # have to be fixed if the perltidy syntax is ever extended to make + # any of these valid. So for now this check is not done. + scan_identifier_fast(); }, # type = 'pp' for pre-increment, '++' for post-increment '++' => sub { - if ( $expecting == TERM ) { $type = 'pp' } + if ( $expecting == TERM ) { $type = 'pp' } elsif ( $expecting == UNKNOWN ) { + my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); + + # Fix for c042: look past a side comment + if ( $next_nonblank_token eq '#' ) { + ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $max_token_index, + $rtokens, $max_token_index ); + } + if ( $next_nonblank_token eq '$' ) { $type = 'pp' } } }, @@ -2412,22 +2961,30 @@ sub prepare_for_a_new_file { # type = 'mm' for pre-decrement, '--' for post-decrement '--' => sub { - if ( $expecting == TERM ) { $type = 'mm' } + if ( $expecting == TERM ) { $type = 'mm' } elsif ( $expecting == UNKNOWN ) { my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); + + # Fix for c042: look past a side comment + if ( $next_nonblank_token eq '#' ) { + ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $max_token_index, + $rtokens, $max_token_index ); + } + if ( $next_nonblank_token eq '$' ) { $type = 'mm' } } }, '&&' => sub { error_if_expecting_TERM() - if ( $expecting == TERM ); + if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015 }, '||' => sub { error_if_expecting_TERM() - if ( $expecting == TERM ); + if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015 }, '//' => sub { @@ -2484,11 +3041,11 @@ sub prepare_for_a_new_file { @_ = qw(use require); @is_use_require{@_} = (1) x scalar(@_); - # This hash holds the hash key in $tokenizer_self for these keywords: - my %is_format_END_DATA = ( - 'format' => '_in_format', - '__END__' => '_in_end', - '__DATA__' => '_in_data', + # This hash holds the array index in $tokenizer_self for these keywords: + # Fix for issue c035: removed 'format' from this hash + my %is_END_DATA = ( + '__END__' => _in_end_, + '__DATA__' => _in_data_, ); # original ref: camel 3 p 147, @@ -2528,6 +3085,8 @@ sub 
prepare_for_a_new_file { 'qx' => 1, ); + use constant DEBUG_TOKENIZE => 0; + sub tokenize_this_line { # This routine breaks a line of perl code into tokens which are of use in @@ -2637,13 +3196,16 @@ sub prepare_for_a_new_file { $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q'; # check for pod documentation - if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) { + if ( substr( $untrimmed_input_line, 0, 1 ) eq '=' + && $untrimmed_input_line =~ /^=[A-Za-z_]/ ) + { # must not be in multi-line quote # and must not be in an equation - if ( !$in_quote && ( operator_expected( 'b', '=', 'b' ) == TERM ) ) + if ( !$in_quote + && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) ) { - $tokenizer_self->{_in_pod} = 1; + $tokenizer_self->[_in_pod_] = 1; return; } } @@ -2652,21 +3214,24 @@ sub prepare_for_a_new_file { chomp $input_line; + # Set a flag to indicate if we might be at an __END__ or __DATA__ line + # This will be used below to avoid quoting a bare word followed by + # a fat comma. + my $is_END_or_DATA; + # trim start of this line unless we are continuing a quoted line # do not trim end because we might end in a quote (test: deken4.pl) # Perl::Tidy::Formatter will delete needless trailing blanks unless ( $in_quote && ( $quote_type eq 'Q' ) ) { - $input_line =~ s/^\s*//; # trim left end - } + $input_line =~ s/^\s+//; # trim left end - # Set a flag to indicate if we might be at an __END__ or __DATA__ line - # This will be used below to avoid quoting a bare word followed by - # a fat comma. - my $is_END_or_DATA = $input_line =~ /^\s*__(END|DATA)__\s*$/; + $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_' + && $input_line =~ /^\s*__(END|DATA)__\s*$/; + } # update the copy of the line for use in error messages # This must be exactly what we give the pre_tokenizer - $tokenizer_self->{_line_text} = $input_line; + $tokenizer_self->[_line_of_text_] = $input_line; # re-initialize for the main loop $routput_token_list = []; # stack of output token indexes @@ -2691,9 +3256,17 @@ sub prepare_for_a_new_file { # stage 1 is a very simple pre-tokenization my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens - # a little optimization for a full-line comment - if ( !$in_quote && ( $input_line =~ /^#/ ) ) { - $max_tokens_wanted = 1 # no use tokenizing a comment + # optimize for a full-line comment + if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) { + $max_tokens_wanted = 1; # no use tokenizing a comment + + # and check for skipped section + if ( $rOpts_code_skipping + && $input_line =~ /$code_skipping_pattern_begin/ ) + { + $tokenizer_self->[_in_skipped_] = 1; + return; + } } # start by breaking the line into pre-tokens @@ -2733,7 +3306,7 @@ sub prepare_for_a_new_file { $routput_token_type->[$i] = $type; } - $tok = $quote_character unless ( $quote_character =~ /^\s*$/ ); + $tok = $quote_character if ($quote_character); # scan for the end of the quote or pattern ( @@ -2776,14 +3349,19 @@ sub prepare_for_a_new_file { } # For an 'e' quote modifier we must scan the replacement - # text for here-doc targets. - if ($saw_modifier_e) { + # text for here-doc targets... + # but if the modifier starts a new line we can skip + # this because either the here doc will be fully + # contained in the replacement text (so we can + # ignore it) or Perl will not find it. + # See test 'here2.in'. 
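+                        # Illustrative sketch (an assumed example, not from
+                        # the perltidy test suite):
+                        #     s/(\w+)/<<EOT/e;
+                        #     matched: $1
+                        #     EOT
+                        # Here the replacement code contains the here-doc
+                        # target 'EOT', which this scan must find.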
+ if ( $saw_modifier_e && $i_tok >= 0 ) { my $rht = scan_replacement_text($qs1); # Change type from 'Q' to 'h' for quotes with # here-doc targets so that the formatter (see sub - # print_line_of_tokens) will not make any line + # process_line_of_CODE) will not make any line # breaks after this point. if ($rht) { push @{$rhere_target_list}, @{$rht}; @@ -2844,7 +3422,7 @@ EOM } } - unless ( $tok =~ /^\s*$/ || $tok eq 'CORE::' ) { + unless ( $type eq 'b' || $tok eq 'CORE::' ) { # try to catch some common errors if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) { @@ -2871,6 +3449,21 @@ EOM $last_nonblank_container_type = $container_type; $last_nonblank_type_sequence = $type_sequence; $last_nonblank_i = $i_tok; + + # Patch for c030: Fix things in case a '->' got separated from + # the subsequent identifier by a side comment. We need the + # last_nonblank_token to have a leading -> to avoid triggering + # an operator expected error message at the next '('. See also + # fix for git #63. + if ( $last_last_nonblank_token eq '->' ) { + if ( $last_nonblank_type eq 'w' + || $last_nonblank_type eq 'i' + && substr( $last_nonblank_token, 0, 1 ) eq '$' ) + { + $last_nonblank_token = '->' . $last_nonblank_token; + $last_nonblank_type = 'i'; + } + } } # store previous token type @@ -2883,8 +3476,8 @@ EOM } my $pre_tok = $rtokens->[$i]; # get the next pre-token my $pre_type = $rtoken_type->[$i]; # and type - $tok = $pre_tok; - $type = $pre_type; # to be modified as necessary + $tok = $pre_tok; + $type = $pre_type; # to be modified as necessary $block_type = ""; # blank for all tokens except code block braces $container_type = ""; # blank for all tokens except some parens $type_sequence = ""; # blank for all tokens except ?/: @@ -2897,7 +3490,7 @@ EOM # continue gathering identifier if necessary # but do not start on blanks and comments - if ( $id_scan_state && $pre_type !~ /[b#]/ ) { + if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) { if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) { scan_id(); @@ -2948,7 +3541,7 @@ EOM if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) { my $next_type = $rtokens->[ $i + 1 ]; my $expecting = - operator_expected( $prev_type, $tok, $next_type ); + operator_expected( [ $prev_type, $tok, $next_type ] ); # Patched for RT#101547, was 'unless ($expecting==OPERATOR)' $combine_ok = 0 if ( $expecting == TERM ); @@ -3003,7 +3596,7 @@ EOM $next_tok = $rtokens->[ $i + 1 ]; $next_type = $rtoken_type->[ $i + 1 ]; - TOKENIZER_DEBUG_FLAG_TOKENIZE && do { + DEBUG_TOKENIZE && do { local $" = ')('; my @debug_list = ( $last_nonblank_token, $tok, @@ -3014,8 +3607,11 @@ EOM print STDOUT "TOKENIZE:(@debug_list)\n"; }; - # turn off attribute list on first non-blank, non-bareword - if ( $pre_type ne 'w' ) { $in_attribute_list = 0 } + # Turn off attribute list on first non-blank, non-bareword. + # Added '#' to fix c038. + if ( $pre_type ne 'w' && $pre_type ne '#' ) { + $in_attribute_list = 0; + } ############################################################### # We have the next token, $tok. @@ -3026,7 +3622,16 @@ EOM ############################################################### if ( $pre_type eq 'w' ) { - $expecting = operator_expected( $prev_type, $tok, $next_type ); + $expecting = + operator_expected( [ $prev_type, $tok, $next_type ] ); + + # Patch for c043, part 3: A bareword after '->' expects a TERM + # FIXME: It would be cleaner to give method calls a new type 'M' + # and update sub operator_expected to handle this. 
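+            # Illustrative example (assumed, not part of the original patch):
+            #     $obj->x2;
+            # Here 'x2' is a method name and must be treated as a TERM, not
+            # as the repetition operator 'x' followed by the number 2.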
+ if ( $last_nonblank_type eq '->' ) { + $expecting = TERM; + } + my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); @@ -3035,6 +3640,43 @@ EOM # treat bare word followed by open paren like qw( if ( $next_nonblank_token eq '(' ) { + + # For something like: + # : prototype($$) + # we should let do_scan_sub see it so that it can see + # the prototype. All other attributes get parsed as a + # quoted string. + if ( $tok eq 'prototype' ) { + $id_scan_state = 'prototype'; + + # start just after the word 'prototype' + my $i_beg = $i + 1; + ( $i, $tok, $type, $id_scan_state ) = do_scan_sub( + { + input_line => $input_line, + i => $i, + i_beg => $i_beg, + tok => $tok, + type => $type, + rtokens => $rtokens, + rtoken_map => $rtoken_map, + id_scan_state => $id_scan_state, + max_token_index => $max_token_index + } + ); + + # If successful, mark as type 'q' to be consistent with other + # attributes. Note that type 'w' would also work. + if ( $i > $i_beg ) { + $type = 'q'; + next; + } + + # If not successful, continue and parse as a quote. + } + + # All other attribute lists must be parsed as quotes + # (see 'signatures.t' for good examples) $in_quote = $quote_items{'q'}; $allowed_quote_modifiers = $quote_modifiers{'q'}; $type = 'q'; @@ -3067,7 +3709,36 @@ EOM $type = 'v'; report_v_string($tok); } - else { $type = 'w' } + else { + + # Bareword followed by a fat comma ... see 'git18.in' + # If tok is something like 'x17' then it could + # actually be operator x followed by number 17. + # For example, here: + # 123x17 => [ 792, 1224 ], + # (a key of 123 repeated 17 times, perhaps not + # what was intended). We will mark x17 as type + # 'n' and it will be split. If the previous token + # was also a bareword then it is not very clear what is + # going on. In this case we will not be sure that + # an operator is expected, so we just mark it as a + # bareword. Perl is a little murky in what it does + # with stuff like this, and its behavior can change + # over time. Something like + # a x18 => [792, 1224], will compile as + # a key with 18 a's. But something like + # push @array, a x18; + # is a syntax error. + if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) { + $type = 'n'; + } + else { + + # git #18 + $type = 'w'; + error_if_expecting_OPERATOR(); + } + } next; } @@ -3091,6 +3762,18 @@ EOM next; } + # Scan a bare word following a -> as an identifier; it could + # have a long package name. Fixes c037, c041. + if ( $last_nonblank_token eq '->' ) { + scan_bare_identifier(); + + # Patch for c043, part 4; use type 'w' after a '->'. + # This is just a safety check on sub scan_bare_identifier, + # which should get this case correct. + $type = 'w'; + next; + } + # a bare word immediately followed by :: is not a keyword; # use $tok_kw when testing for keywords to avoid a mistake my $tok_kw = $tok; @@ -3100,8 +3783,28 @@ EOM $tok_kw .= '::'; } + # Decide if 'sub :' can be the start of a sub attribute list. + # We will decide based on whether the colon is followed by a + # bareword which is not a keyword. 
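+            # Illustrative example (assumed): for an anonymous sub such as
+            #     my $code = sub : lvalue { ... };
+            # the ':' starts an attribute list because 'lvalue' is a bareword
+            # and not a keyword.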
+ my $sub_attribute_ok_here; + if ( $is_sub{$tok_kw} + && $expecting != OPERATOR + && $next_nonblank_token eq ':' ) + { + my ( $nn_nonblank_token, $i_nn ) = + find_next_nonblank_token( $i_next + 1, + $rtokens, $max_token_index ); + $sub_attribute_ok_here = + $nn_nonblank_token =~ /^\w/ + && $nn_nonblank_token !~ /^\d/ + && !$is_keyword{$nn_nonblank_token}; + } + # handle operator x (now we know it isn't $x=) - if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) { + if ( $expecting == OPERATOR + && substr( $tok, 0, 1 ) eq 'x' + && $tok =~ /^x\d*$/ ) + { if ( $tok eq 'x' ) { if ( $rtokens->[ $i + 1 ] eq '=' ) { # x= @@ -3114,7 +3817,7 @@ EOM } } - # FIXME: Patch: mark something like x4 as an integer for now + # NOTE: mark something like x4 as an integer for now # It gets fixed downstream. This is easier than # splitting the pretoken. else { @@ -3128,14 +3831,14 @@ EOM elsif ( ( $tok eq 'strict' ) and ( $last_nonblank_token eq 'use' ) ) { - $tokenizer_self->{_saw_use_strict} = 1; + $tokenizer_self->[_saw_use_strict_] = 1; scan_bare_identifier(); } elsif ( ( $tok eq 'warnings' ) and ( $last_nonblank_token eq 'use' ) ) { - $tokenizer_self->{_saw_perl_dash_w} = 1; + $tokenizer_self->[_saw_perl_dash_w_] = 1; # scan as identifier, so that we pick up something like: # use warnings::register @@ -3144,7 +3847,7 @@ EOM elsif ( $tok eq 'AutoLoader' - && $tokenizer_self->{_look_for_autoloader} + && $tokenizer_self->[_look_for_autoloader_] && ( $last_nonblank_token eq 'use' @@ -3156,22 +3859,22 @@ EOM ) { write_logfile_entry("AutoLoader seen, -nlal deactivates\n"); - $tokenizer_self->{_saw_autoloader} = 1; - $tokenizer_self->{_look_for_autoloader} = 0; + $tokenizer_self->[_saw_autoloader_] = 1; + $tokenizer_self->[_look_for_autoloader_] = 0; scan_bare_identifier(); } elsif ( $tok eq 'SelfLoader' - && $tokenizer_self->{_look_for_selfloader} + && $tokenizer_self->[_look_for_selfloader_] && ( $last_nonblank_token eq 'use' || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/ || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ ) ) { write_logfile_entry("SelfLoader seen, -nlsl deactivates\n"); - $tokenizer_self->{_saw_selfloader} = 1; - $tokenizer_self->{_look_for_selfloader} = 0; + $tokenizer_self->[_saw_selfloader_] = 1; + $tokenizer_self->[_look_for_selfloader_] = 0; scan_bare_identifier(); } @@ -3201,8 +3904,6 @@ EOM } } - # FIXME: could check for error in which next token is - # not a word (number, punctuation, ..) else { $is_constant{$current_package}{$next_nonblank_token} = 1; @@ -3250,7 +3951,7 @@ EOM # of leading and trailing whitespace. So they are given a # separate type, 'q', unless requested otherwise. $type = - ( $tok eq 'qw' && $tokenizer_self->{_trim_qw} ) + ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] ) ? 'q' : 'Q'; $quote_type = $type; @@ -3260,12 +3961,13 @@ EOM elsif ( ( $next_nonblank_token eq ':' ) && ( $rtokens->[ $i_next + 1 ] ne ':' ) - && ( $i_next <= $max_token_index ) # colon on same line + && ( $i_next <= $max_token_index ) # colon on same line + && !$sub_attribute_ok_here # like 'sub : lvalue' ? 
&& label_ok() ) { if ( $tok !~ /[A-Z]/ ) { - push @{ $tokenizer_self->{_rlower_case_labels_at} }, + push @{ $tokenizer_self->[_rlower_case_labels_at_] }, $input_line_number; } $type = 'J'; @@ -3274,20 +3976,38 @@ EOM next; } - # 'sub' || 'package' - elsif ( $is_sub{$tok_kw} || $is_package{$tok_kw} ) { + # 'sub' or alias + elsif ( $is_sub{$tok_kw} ) { + error_if_expecting_OPERATOR() + if ( $expecting == OPERATOR ); + initialize_subname(); + scan_id(); + } + + # 'package' + elsif ( $is_package{$tok_kw} ) { error_if_expecting_OPERATOR() if ( $expecting == OPERATOR ); scan_id(); } + # Fix for c035: split 'format' from 'is_format_END_DATA' to be + # more restrictive. Require a new statement to be ok here. + elsif ( $tok_kw eq 'format' && new_statement_ok() ) { + $type = ';'; # make tokenizer look for TERM next + $tokenizer_self->[_in_format_] = 1; + last; + } + # Note on token types for format, __DATA__, __END__: # It simplifies things to give these type ';', so that when we # start rescanning we will be expecting a token of type TERM. # We will switch to type 'k' before outputting the tokens. - elsif ( $is_format_END_DATA{$tok_kw} ) { + elsif ( $is_END_DATA{$tok_kw} ) { $type = ';'; # make tokenizer look for TERM next - $tokenizer_self->{ $is_format_END_DATA{$tok_kw} } = 1; + + # Remember that we are in one of these three sections + $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1; last; } @@ -3311,7 +4031,7 @@ EOM } # remember my and our to check for trailing ": shared" - elsif ( $is_my_our{$tok} ) { + elsif ( $is_my_our_state{$tok} ) { $statement_type = $tok; } @@ -3368,8 +4088,11 @@ EOM } # patch for SWITCH/CASE if 'case' and 'when are - # treated as keywords. - elsif ( $tok eq 'when' || $tok eq 'case' ) { + # treated as keywords. Also 'default' for Switch::Plain + elsif ($tok eq 'when' + || $tok eq 'case' + || $tok eq 'default' ) + { $statement_type = $tok; # next '{' is block } @@ -3408,10 +4131,38 @@ EOM else { scan_bare_identifier(); + + if ( $statement_type eq 'use' + && $last_nonblank_token eq 'use' ) + { + $saw_use_module{$current_package}->{$tok} = 1; + } + if ( $type eq 'w' ) { if ( $expecting == OPERATOR ) { + # Patch to avoid error message for RPerl overloaded + # operator functions: use overload + # '+' => \&sse_add, + # '-' => \&sse_sub, + # '*' => \&sse_mul, + # '/' => \&sse_div; + # FIXME: this should eventually be generalized + if ( $saw_use_module{$current_package}->{'RPerl'} + && $tok =~ /^sse_(mul|div|add|sub)$/ ) + { + + } + + # Fix part 1 for git #63 in which a comment falls + # between an -> and the following word. An + # alternate fix would be to change operator_expected + # to return an UNKNOWN for this type. + elsif ( $last_nonblank_type eq '->' ) { + + } + # don't complain about possible indirect object # notation. # For example: @@ -3422,7 +4173,7 @@ EOM # This will call A::new but we have a 'new' in # main:: which looks like a constant. # - if ( $last_nonblank_type eq 'C' ) { + elsif ( $last_nonblank_type eq 'C' ) { if ( $tok !~ /::$/ ) { complain(<[ $i + 1 ]; if ( $next_tok eq '(' ) { - $type = 'U'; + + # Fix part 2 for git #63. 
Leave type as 'w' to keep + # the type the same as if the -> were not separated + $type = 'U' unless ( $last_nonblank_type eq '->' ); } # underscore after file test operator is file handle @@ -3459,7 +4213,7 @@ EOM ) { $statement_type = $tok; # next '{' is block - $type = 'k'; # for keyword syntax coloring + $type = 'k'; # for keyword syntax coloring } # patch for SWITCH/CASE if switch and given not keywords @@ -3480,10 +4234,12 @@ EOM # section 2: strings of digits ############################################################### elsif ( $pre_type eq 'd' ) { - $expecting = operator_expected( $prev_type, $tok, $next_type ); + $expecting = + operator_expected( [ $prev_type, $tok, $next_type ] ); error_if_expecting_OPERATOR("Number") if ( $expecting == OPERATOR ); - my $number = scan_number(); + + my $number = scan_number_fast(); if ( !defined($number) ) { # shouldn't happen - we should always get a number @@ -3501,7 +4257,7 @@ EOM my $code = $tokenization_code->{$tok}; if ($code) { $expecting = - operator_expected( $prev_type, $tok, $next_type ); + operator_expected( [ $prev_type, $tok, $next_type ] ); $code->(); redo if $in_quote; } @@ -3562,7 +4318,9 @@ EOM my $container_environment = ''; my $im = -1; # previous $i value my $num; - my $ci_string_sum = ones_count($ci_string_in_tokenizer); + + # Count the number of '1's in the string (previously sub ones_count) + my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; # Computing Token Indentation # @@ -3737,7 +4495,7 @@ EOM my $val = ord($type); warning( "unexpected character decimal $val ($type) in script\n"); - $tokenizer_self->{_in_error} = 1; + $tokenizer_self->[_in_error_] = 1; } # ---------------------------------------------------------------- @@ -3829,6 +4587,11 @@ EOM push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer ); $level_in_tokenizer++; + if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] ) + { + $tokenizer_self->[_maximum_level_] = $level_in_tokenizer; + } + if ($forced_indentation_flag) { # break BEFORE '?' when there is forced indentation @@ -3885,7 +4648,8 @@ EOM $ci_string_in_tokenizer .= ( $intervening_secondary_structure != 0 ) ? '1' : '0'; - $ci_string_sum = ones_count($ci_string_in_tokenizer); + $ci_string_sum = + ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; $continuation_string_in_tokenizer .= ( $in_statement_continuation > 0 ) ? 
'1' : '0'; @@ -3914,7 +4678,7 @@ EOM ) { $total_ci += $in_statement_continuation - unless ( $ci_string_in_tokenizer =~ /1$/ ); + unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' ); } $ci_string_i = $total_ci; @@ -3935,12 +4699,15 @@ EOM if ( length($nesting_block_string) > 1 ) { # true for valid script chop $nesting_block_string; - $nesting_block_flag = ( $nesting_block_string =~ /1$/ ); + $nesting_block_flag = + substr( $nesting_block_string, -1 ) eq '1'; chop $nesting_list_string; - $nesting_list_flag = ( $nesting_list_string =~ /1$/ ); + $nesting_list_flag = + substr( $nesting_list_string, -1 ) eq '1'; chop $ci_string_in_tokenizer; - $ci_string_sum = ones_count($ci_string_in_tokenizer); + $ci_string_sum = + ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; $in_statement_continuation = chop $continuation_string_in_tokenizer; @@ -3967,7 +4734,8 @@ EOM elsif ( $is_zero_continuation_block_type{ $routput_block_type->[$i] - } ) + } + ) { $in_statement_continuation = 0; } @@ -3977,7 +4745,8 @@ EOM elsif ( $is_not_zero_continuation_block_type{ $routput_block_type->[$i] - } ) + } + ) { } @@ -4034,7 +4803,8 @@ EOM # commas, this simplifies the -lp indentation logic, which # counts commas. For ?: it makes them stand out. if ($nesting_list_flag) { - if ( $type =~ /^[,\?\:]$/ ) { + ## $type =~ /^[,\?\:]$/ + if ( $is_comma_question_colon{$type} ) { $in_statement_continuation = 0; } } @@ -4101,8 +4871,8 @@ EOM } if ( $level_in_tokenizer < 0 ) { - unless ( $tokenizer_self->{_saw_negative_indentation} ) { - $tokenizer_self->{_saw_negative_indentation} = 1; + unless ( $tokenizer_self->[_saw_negative_indentation_] ) { + $tokenizer_self->[_saw_negative_indentation_] = 1; warning("Starting negative indentation\n"); } } @@ -4165,11 +4935,11 @@ EOM push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) ); } - $tokenizer_self->{_in_attribute_list} = $in_attribute_list; - $tokenizer_self->{_in_quote} = $in_quote; - $tokenizer_self->{_quote_target} = + $tokenizer_self->[_in_attribute_list_] = $in_attribute_list; + $tokenizer_self->[_in_quote_] = $in_quote; + $tokenizer_self->[_quote_target_] = $in_quote ? matching_end_token($quote_character) : ""; - $tokenizer_self->{_rhere_target_list} = $rhere_target_list; + $tokenizer_self->[_rhere_target_list_] = $rhere_target_list; $line_of_tokens->{_rtoken_type} = \@token_type; $line_of_tokens->{_rtokens} = \@tokens; @@ -4191,14 +4961,65 @@ EOM # Tokenizer routines which assist in identifying token types ####################################################################### +# hash lookup table of operator expected values +my %op_expected_table; + +# exceptions to perl's weird parsing rules after type 'Z' +my %is_weird_parsing_rule_exception; + +BEGIN { + + # Always expecting TERM following these types: + # note: this is identical to '@value_requestor_type' defined later. + my @q = qw( + ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t + || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /= + &= // >> ~. &. |. ^. + ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~ + ); + push @q, ','; + push @q, '('; # for completeness, not currently a token type + @{op_expected_table}{@q} = (TERM) x scalar(@q); + + # Always UNKNOWN following these types: + # Fix for c030: added '->' to this list + @q = qw( w -> ); + @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q); + + # Always expecting OPERATOR ... 
+ # 'n' and 'v' are currently excluded because they might be VERSION numbers + # 'i' is currently excluded because it might be a package + # 'q' is currently excluded because it might be a prototype + # Fix for c030: removed '->' from this list: + @q = qw( -- C h R ++ ] Q <> ); ## n v q i ); + push @q, ')'; + @{op_expected_table}{@q} = (OPERATOR) x scalar(@q); + + # Fix for git #62: added '*' and '%' + @q = qw( < ? * % ); + @{is_weird_parsing_rule_exception}{@q} = (OPERATOR) x scalar(@q); + +} + +use constant DEBUG_OPERATOR_EXPECTED => 0; + sub operator_expected { + # Returns a parameter indicating what types of tokens can occur next + + # Call format: + # $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] ); + # where + # $prev_type is the type of the previous token (blank or not) + # $tok is the current token + # $next_type is the type of the next token (blank or not) + # Many perl symbols have two or more meanings. For example, '<<' # can be a shift operator or a here-doc operator. The # interpretation of these symbols depends on the current state of # the tokenizer, which may either be expecting a term or an - # operator. For this example, a << would be a shift if an operator - # is expected, and a here-doc if a term is expected. This routine + # operator. For this example, a << would be a shift if an OPERATOR + # is expected, and a here-doc if a TERM is expected. This routine # is called to make this decision for any current token. It returns # one of three possible values: # @@ -4218,7 +5039,7 @@ sub operator_expected { # UNKNOWN, because a wrong guess can spoil the formatting of a # script. # - # adding NEW_TOKENS: it is critically important that this routine be + # Adding NEW_TOKENS: it is critically important that this routine be # updated to allow it to determine if an operator or term is to be # expected after the new token. Doing this simply involves adding # the new token character to one of the regexes in this routine or @@ -4227,206 +5048,243 @@ sub operator_expected { # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token, # $statement_type - my ( $prev_type, $tok, $next_type ) = @_; + # When possible, token types should be selected such that we can determine + # the 'operator_expected' value by a simple hash lookup. If there are + # exceptions, that is an indication that a new type is needed. - my $op_expected = UNKNOWN; + my ($rarg) = @_; -##print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n"; + my $msg = ""; -# Note: function prototype is available for token type 'U' for future -# program development. It contains the leading and trailing parens, -# and no blanks. It might be used to eliminate token type 'C', for -# example (prototype = '()'). Thus: -# if ($last_nonblank_type eq 'U') { -# print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n"; -# } + ############## + # Table lookup + ############## - # A possible filehandle (or object) requires some care... - if ( $last_nonblank_type eq 'Z' ) { + # Many types can be obtained by a table lookup given the previous type. + # This typically handles half or more of the calls. 
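+    # Illustrative example (assumed, not part of the original patch): in
+    #     f( 1, -2 );
+    # the previous nonblank type at the '-' is ',', so the table returns
+    # TERM directly and the '-' is tokenized as a unary minus.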
+ my $op_expected = $op_expected_table{$last_nonblank_type}; + if ( defined($op_expected) ) { + $msg = "Table lookup"; + goto RETURN; + } - # angle.t - if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) { - $op_expected = UNKNOWN; - } + ###################### + # Handle special cases + ###################### - # For possible file handle like "$a", Perl uses weird parsing rules. - # For example: - # print $a/2,"/hi"; - division - # print $a / 2,"/hi"; - division - # print $a/ 2,"/hi"; - division - # print $a /2,"/hi"; - pattern (and error)! - elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) { - $op_expected = TERM; - } + $op_expected = UNKNOWN; + my ( $prev_type, $tok, $next_type ) = @{$rarg}; - # Note when an operation is being done where a - # filehandle might be expected, since a change in whitespace - # could change the interpretation of the statement. - else { - if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) { + # Types 'k', '}' and 'Z' depend on context + # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on + # context but that dependence could eventually be eliminated with better + # token type definition - # Do not complain in 'use' statements, which have special syntax. - # For example, from RT#130344: - # use lib $FindBin::Bin . '/lib'; - if ( $statement_type ne 'use' ) { - complain("operator in print statement not recommended\n"); - } - $op_expected = OPERATOR; - } + # identifier... + if ( $last_nonblank_type eq 'i' ) { + $op_expected = OPERATOR; + + # FIXME: it would be cleaner to make this a special type + # expecting VERSION or {} after package NAMESPACE + # TODO: maybe mark these words as type 'Y'? + if ( $statement_type =~ /^package\b/ + && $last_nonblank_token =~ /^package\b/ ) + { + $op_expected = TERM; } } - # Check for smartmatch operator before preceding brace or square bracket. - # For example, at the ? after the ] in the following expressions we are - # expecting an operator: - # - # qr/3/ ~~ ['1234'] ? 1 : 0; - # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a; - elsif ( $last_nonblank_type eq '}' && $last_nonblank_token eq '~~' ) { - $op_expected = OPERATOR; - } + # keyword... + elsif ( $last_nonblank_type eq 'k' ) { + $op_expected = TERM; + if ( $expecting_operator_token{$last_nonblank_token} ) { + $op_expected = OPERATOR; + } + elsif ( $expecting_term_token{$last_nonblank_token} ) { - # handle something after 'do' and 'eval' - elsif ( $is_block_operator{$last_nonblank_token} ) { + # Exceptions from TERM: - # something like $a = eval "expression"; - # ^ - if ( $last_nonblank_type eq 'k' ) { - $op_expected = TERM; # expression or list mode following keyword - } + # // may follow perl functions which may be unary operators + # see test file dor.t (defined or); + if ( + $tok eq '/' + && $next_type eq '/' + && $is_keyword_rejecting_slash_as_pattern_delimiter{ + $last_nonblank_token} + ) + { + $op_expected = OPERATOR; + } - # something like $a = do { BLOCK } / 2; - # or this ? after a smartmatch anonynmous hash or array reference: - # qr/3/ ~~ ['1234'] ? 1 : 0; - # ^ - else { - $op_expected = OPERATOR; # block mode following } + # Patch to allow a ? following 'split' to be a deprecated pattern + # delimiter. This patch is coordinated with the omission of split + # from the list + # %is_keyword_rejecting_question_as_pattern_delimiter. This patch + # will force perltidy to guess. + elsif ($tok eq '?' + && $last_nonblank_token eq 'split' ) + { + $op_expected = UNKNOWN; + } + } + } ## end type 'k' - # handle bare word.. 
- elsif ( $last_nonblank_type eq 'w' ) { + # closing container token... + + # Note that the actual token for type '}' may also be a ')'. + + # Also note that $last_nonblank_token is not the token corresponding to + # $last_nonblank_type when the type is a closing container. In that + # case it is the token before the corresponding opening container token. + # So for example, for this snippet + # $a = do { BLOCK } / 2; + # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'. - # unfortunately, we can't tell what type of token to expect next - # after most bare words + elsif ( $last_nonblank_type eq '}' ) { $op_expected = UNKNOWN; - } - # operator, but not term possible after these types - # Note: moved ')' from type to token because parens in list context - # get marked as '{' '}' now. This is a minor glitch in the following: - # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : (); - # - elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ ) - || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) ) - { - $op_expected = OPERATOR; + # handle something after 'do' and 'eval' + if ( $is_block_operator{$last_nonblank_token} ) { - # in a 'use' statement, numbers and v-strings are not true - # numbers, so to avoid incorrect error messages, we will - # mark them as unknown for now (use.t) - # TODO: it would be much nicer to create a new token V for VERSION - # number in a use statement. Then this could be a check on type V - # and related patches which change $statement_type for '=>' - # and ',' could be removed. Further, it would clean things up to - # scan the 'use' statement with a separate subroutine. - if ( ( $statement_type eq 'use' ) - && ( $last_nonblank_type =~ /^[nv]$/ ) ) - { - $op_expected = UNKNOWN; + # something like $a = do { BLOCK } / 2; + $op_expected = OPERATOR; # block mode following } } - # expecting VERSION or {} after package NAMESPACE - elsif ($statement_type =~ /^package\b/ - && $last_nonblank_token =~ /^package\b/ ) - { - $op_expected = TERM; - } - } + elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) { + $op_expected = OPERATOR; + if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN } - # no operator after many keywords, such as "die", "warn", etc - elsif ( $expecting_term_token{$last_nonblank_token} ) { + } - # // may follow perl functions which may be unary operators - # see test file dor.t (defined or); - if ( $tok eq '/' - && $next_type eq '/' - && $last_nonblank_type eq 'k' - && $is_keyword_taking_optional_args{$last_nonblank_token} ) - { + # Check for smartmatch operator before preceding brace or square + # bracket. For example, at the ? after the ] in the following + # expressions we are expecting an operator: + # + # qr/3/ ~~ ['1234'] ? 1 : 0; + # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a; + elsif ( $last_nonblank_token eq '~~' ) { $op_expected = OPERATOR; } + + # A right brace here indicates the end of a simple block. All + # non-structural right braces have type 'R' all braces associated with + # block operator keywords have been given those keywords as + # "last_nonblank_token" and caught above. (This statement is order + # dependent, and must come after checking $last_nonblank_token). else { - $op_expected = TERM; - } - } - # no operator after things like + - ** (i.e., other operators) - elsif ( $expecting_term_types{$last_nonblank_type} ) { - $op_expected = TERM; - } + # patch for dor.t (defined or). 
+ if ( $tok eq '/' + && $next_type eq '/' + && $last_nonblank_token eq ']' ) + { + $op_expected = OPERATOR; + } + + # Patch for RT #116344: misparse a ternary operator after an + # anonymous hash, like this: + # return ref {} ? 1 : 0; + # The right brace should really be marked type 'R' in this case, + # and it is safest to return an UNKNOWN here. Expecting a TERM will + # cause the '?' to always be interpreted as a pattern delimiter + # rather than introducing a ternary operator. + elsif ( $tok eq '?' ) { + $op_expected = UNKNOWN; + } + else { + $op_expected = TERM; + } + } + } ## end type '}' - # a few operators, like "time", have an empty prototype () and so - # take no parameters but produce a value to operate on - elsif ( $expecting_operator_token{$last_nonblank_token} ) { + # number or v-string... + # An exception is for VERSION numbers a 'use' statement. It has the format + # use Module VERSION LIST + # We could avoid this exception by writing a special sub to parse 'use' + # statements and perhaps mark these numbers with a new type V (for VERSION) + elsif ( $last_nonblank_type =~ /^[nv]$/ ) { $op_expected = OPERATOR; + if ( $statement_type eq 'use' ) { + $op_expected = UNKNOWN; + } } - # post-increment and decrement produce values to be operated on - elsif ( $expecting_operator_types{$last_nonblank_type} ) { + # quote... + # FIXME: labeled prototype words should probably be given type 'A' or maybe + # 'J'; not 'q'; or maybe mark as type 'Y' + elsif ( $last_nonblank_type eq 'q' ) { $op_expected = OPERATOR; + if ( $last_nonblank_token eq 'prototype' ) + ##|| $last_nonblank_token eq 'switch' ) + { + $op_expected = TERM; + } } - # no value to operate on after sub block - elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; } + # file handle or similar + elsif ( $last_nonblank_type eq 'Z' ) { - # a right brace here indicates the end of a simple block. - # all non-structural right braces have type 'R' - # all braces associated with block operator keywords have been given those - # keywords as "last_nonblank_token" and caught above. - # (This statement is order dependent, and must come after checking - # $last_nonblank_token). - elsif ( $last_nonblank_type eq '}' ) { + $op_expected = UNKNOWN; - # patch for dor.t (defined or). - if ( $tok eq '/' - && $next_type eq '/' - && $last_nonblank_token eq ']' ) - { - $op_expected = OPERATOR; + # angle.t + if ( $last_nonblank_token =~ /^\w/ ) { + $op_expected = UNKNOWN; } - # Patch for RT #116344: misparse a ternary operator after an anonymous - # hash, like this: - # return ref {} ? 1 : 0; - # The right brace should really be marked type 'R' in this case, and - # it is safest to return an UNKNOWN here. Expecting a TERM will - # cause the '?' to always be interpreted as a pattern delimiter - # rather than introducing a ternary operator. - elsif ( $tok eq '?' ) { + # The 'weird parsing rules' of next section do not work for '<' and '?' + # It is best to mark them as unknown. Test case: + # print $fh ; + elsif ( $is_weird_parsing_rule_exception{$tok} ) { $op_expected = UNKNOWN; } - else { + + # For possible file handle like "$a", Perl uses weird parsing rules. + # For example: + # print $a/2,"/hi"; - division + # print $a / 2,"/hi"; - division + # print $a/ 2,"/hi"; - division + # print $a /2,"/hi"; - pattern (and error)! + # Some examples where this logic works okay, for '&','*','+': + # print $fh &xsi_protos(@mods); + # my $x = new $CompressClass *FH; + # print $OUT +( $count % 15 ? 
", " : "\n\t" ); + elsif ($prev_type eq 'b' + && $next_type ne 'b' ) + { $op_expected = TERM; } + + # Note that '?' and '<' have been moved above + # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) { + elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) { + + # Do not complain in 'use' statements, which have special syntax. + # For example, from RT#130344: + # use lib $FindBin::Bin . '/lib'; + if ( $statement_type ne 'use' ) { + complain( +"operator in possible indirect object location not recommended\n" + ); + } + $op_expected = OPERATOR; + } } - # something else..what did I forget? + # anything else... else { - - # collecting diagnostics on unknown operator types..see what was missed $op_expected = UNKNOWN; - write_diagnostics( -"OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n" - ); } - TOKENIZER_DEBUG_FLAG_EXPECT && do { + RETURN: + + DEBUG_OPERATOR_EXPECTED && do { print STDOUT -"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; +"OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; }; + return $op_expected; -} + +} ## end of sub operator_expected sub new_statement_ok { @@ -4581,6 +5439,11 @@ sub code_block_type { # check bareword elsif ( $last_nonblank_type eq 'w' ) { + + # check for syntax 'use MODULE LIST' + # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031 + return "" if ( $statement_type eq 'use' ); + return decide_if_code_block( $i, $rtokens, $rtoken_type, $max_token_index ); } @@ -4732,6 +5595,15 @@ sub decide_if_code_block { $code_block_type = ""; } } + + if ($code_block_type) { + + # Patch for cases b1085 b1128: It is uncertain if this is a block. + # If this brace follows a bareword, then append a space as a signal + # to the formatter that this may not be a block brace. To find the + # corresponding code in Formatter.pm search for 'b1085'. + $code_block_type .= " " if ( $code_block_type =~ /^\w/ ); + } } return $code_block_type; @@ -4745,11 +5617,11 @@ sub report_unexpected { $rpretoken_type, $input_line ) = @_; - if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) { + if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) { my $msg = "found $found where $expecting expected"; my $pos = $rpretoken_map->[$i_tok]; interrupt_logfile(); - my $input_line_number = $tokenizer_self->{_last_line_number}; + my $input_line_number = $tokenizer_self->[_last_line_number_]; my ( $offset, $numbered_line, $underline ) = make_numbered_line( $input_line_number, $input_line, $pos ); $underline = write_on_underline( $underline, $pos - $offset, '^' ); @@ -4868,8 +5740,8 @@ sub increase_nesting_depth { $current_depth[$aa]++; $total_depth++; $total_depth[$aa][ $current_depth[$aa] ] = $total_depth; - my $input_line_number = $tokenizer_self->{_last_line_number}; - my $input_line = $tokenizer_self->{_line_text}; + my $input_line_number = $tokenizer_self->[_last_line_number_]; + my $input_line = $tokenizer_self->[_line_of_text_]; # Sequence numbers increment by number of items. 
This keeps # a unique set of numbers but still allows the relative location @@ -4905,6 +5777,27 @@ sub increase_nesting_depth { return ( $seqno, $indent ); } +sub is_balanced_closing_container { + + # Return true if a closing container can go here without error + # Return false if not + my ($aa) = @_; + + # cannot close if there was no opening + return unless ( $current_depth[$aa] > 0 ); + + # check that any other brace types $bb contained within would be balanced + for my $bb ( 0 .. @closing_brace_names - 1 ) { + next if ( $bb == $aa ); + return + unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] == + $current_depth[$bb] ); + } + + # OK, everything will be balanced + return 1; +} + sub decrease_nesting_depth { my ( $aa, $pos ) = @_; @@ -4913,8 +5806,8 @@ sub decrease_nesting_depth { # @current_sequence_number, @depth_array, @starting_line_of_current_depth # $statement_type my $seqno = 0; - my $input_line_number = $tokenizer_self->{_last_line_number}; - my $input_line = $tokenizer_self->{_line_text}; + my $input_line_number = $tokenizer_self->[_last_line_number_]; + my $input_line = $tokenizer_self->[_line_of_text_]; my $outdent = 0; $total_depth--; @@ -5000,6 +5893,10 @@ EOM indicate_error( $msg, $input_line_number, $input_line, $pos, '^' ); } increment_brace_error(); + + # keep track of errors in braces alone (ignoring ternary nesting errors) + $tokenizer_self->[_true_brace_error_count_]++ + if ( $closing_brace_names[$aa] ne "':'" ); } return ( $seqno, $outdent ); } @@ -5039,9 +5936,10 @@ sub peek_ahead_for_n_nonblank_pre_tokens { my $i = 0; my ( $rpre_tokens, $rmap, $rpre_types ); - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) + while ( $line = + $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) ) { - $line =~ s/^\s*//; # trim leading blanks + $line =~ s/^\s*//; # trim leading blanks next if ( length($line) <= 0 ); # skip blank next if ( $line =~ /^#/ ); # skip comment ( $rpre_tokens, $rmap, $rpre_types ) = @@ -5059,9 +5957,10 @@ sub peek_ahead_for_nonblank_token { my $line; my $i = 0; - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) + while ( $line = + $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) ) { - $line =~ s/^\s*//; # trim leading blanks + $line =~ s/^\s*//; # trim leading blanks next if ( length($line) <= 0 ); # skip blank next if ( $line =~ /^#/ ); # skip comment my ( $rtok, $rmap, $rtype ) = @@ -5074,7 +5973,7 @@ sub peek_ahead_for_nonblank_token { } last; } - return $rtokens; + return; } #########i############################################################# @@ -5092,8 +5991,6 @@ sub guess_if_pattern_or_conditional { # msg = a warning or diagnostic message # USES GLOBAL VARIABLES: $last_nonblank_token - # FIXME: this needs to be rewritten - my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_; my $is_pattern = 0; my $msg = "guessing that ? 
after $last_nonblank_token starts a "; @@ -5157,6 +6054,20 @@ sub guess_if_pattern_or_conditional { return ( $is_pattern, $msg ); } +my %is_known_constant; +my %is_known_function; + +BEGIN { + + # Constants like 'pi' in Trig.pm are common + my @q = qw(pi pi2 pi4 pip2 pip4); + @{is_known_constant}{@q} = (1) x scalar(@q); + + # parenless calls of 'ok' are common + @q = qw( ok ); + @{is_known_function}{@q} = (1) x scalar(@q); +} + sub guess_if_pattern_or_division { # this routine is called when we have encountered a / following an @@ -5177,11 +6088,32 @@ sub guess_if_pattern_or_division { } else { my $ibeg = $i; - my $divide_expected = - numerator_expected( $i, $rtokens, $max_token_index ); + my $divide_possible = + is_possible_numerator( $i, $rtokens, $max_token_index ); + + if ( $divide_possible < 0 ) { + $msg = "pattern (division not possible here)\n"; + $is_pattern = 1; + goto RETURN; + } + $i = $ibeg + 1; my $next_token = $rtokens->[$i]; # first token after slash + # One of the things we can look at is the spacing around the slash. + # There # are four possible spacings around the first slash: + # + # return pi/two;#/; -/- + # return pi/ two;#/; -/+ + # return pi / two;#/; +/+ + # return pi /two;#/; +/- <-- possible pattern + # + # Spacing rule: a space before the slash but not after the slash + # usually indicates a pattern. We can use this to break ties. + + my $is_pattern_by_spacing = + ( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' ); + # look for a possible ending / on this line.. my $in_quote = 1; my $quote_depth = 0; @@ -5197,54 +6129,104 @@ sub guess_if_pattern_or_division { if ($in_quote) { - # we didn't find an ending / on this line, - # so we bias towards division - if ( $divide_expected >= 0 ) { + # we didn't find an ending / on this line, so we bias towards + # division + if ( $divide_possible >= 0 ) { $is_pattern = 0; $msg .= "division (no ending / on this line)\n"; } else { + + # assuming a multi-line pattern ... this is risky, but division + # does not seem possible. If this fails, it would either be due + # to a syntax error in the code, or the division_expected logic + # needs to be fixed. $msg = "multi-line pattern (division not possible)\n"; $is_pattern = 1; } - } - # we found an ending /, so we bias towards a pattern + # we found an ending /, so we bias slightly towards a pattern else { - if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) { + my $pattern_expected = + pattern_expected( $i, $rtokens, $max_token_index ); + + if ( $pattern_expected >= 0 ) { + + # pattern looks possible... + if ( $divide_possible >= 0 ) { + + # Both pattern and divide can work here... 
- if ( $divide_expected >= 0 ) { + # Increase weight of divide if a pure number follows + $divide_possible += $next_token =~ /^\d+$/; - if ( $i - $ibeg > 60 ) { - $msg .= "division (matching / too distant)\n"; + # Check for known constants in the numerator, like 'pi' + if ( $is_known_constant{$last_nonblank_token} ) { + $msg .= +"division (pattern works too but saw known constant '$last_nonblank_token')\n"; $is_pattern = 0; } - else { - $msg .= "pattern (but division possible too)\n"; + + # A very common bare word in pattern expressions is 'ok' + elsif ( $is_known_function{$last_nonblank_token} ) { + $msg .= +"pattern (division works too but saw '$last_nonblank_token')\n"; + $is_pattern = 1; + } + + # If one rule is more definite, use it + elsif ( $divide_possible > $pattern_expected ) { + $msg .= + "division (more likely based on following tokens)\n"; + $is_pattern = 0; + } + + # otherwise, use the spacing rule + elsif ($is_pattern_by_spacing) { + $msg .= +"pattern (guess on spacing, but division possible too)\n"; $is_pattern = 1; } + else { + $msg .= +"division (guess on spacing, but pattern is possible too)\n"; + $is_pattern = 0; + } } + + # divide_possible < 0 means divide can not work here else { $is_pattern = 1; $msg .= "pattern (division not possible)\n"; } } + + # pattern does not look possible... else { - if ( $divide_expected >= 0 ) { + if ( $divide_possible >= 0 ) { $is_pattern = 0; $msg .= "division (pattern not possible)\n"; } + + # Neither pattern nor divide look possible...go by spacing else { - $is_pattern = 1; - $msg .= - "pattern (uncertain, but division would not work here)\n"; + if ($is_pattern_by_spacing) { + $msg .= "pattern (guess on spacing)\n"; + $is_pattern = 1; + } + else { + $msg .= "division (guess on spacing)\n"; + $is_pattern = 0; + } } } } } + + RETURN: return ( $is_pattern, $msg ); } @@ -5266,7 +6248,8 @@ sub guess_if_here_doc { my $k = 0; my $msg = "checking <<"; - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) ) + while ( $line = + $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) ) { chomp $line; @@ -5361,13 +6344,14 @@ sub scan_bare_identifier_do { else { $package = $current_package; - if ( $is_keyword{$tok} ) { + # patched for c043, part 1: keyword does not follow '->' + if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) { $type = 'k'; } } - # if it is a bareword.. - if ( $type eq 'w' ) { + # if it is a bareword.. patched for c043, part 2: not following '->' + if ( $type eq 'w' && $last_nonblank_type ne '->' ) { # check for v-string with leading 'v' type character # (This seems to have precedence over filehandle, type 'Y') @@ -5416,7 +6400,6 @@ sub scan_bare_identifier_do { elsif ( $is_block_function{$package}{$sub_name} ) { $type = 'G'; } - elsif ( $is_block_list_function{$package}{$sub_name} ) { $type = 'G'; } @@ -5449,8 +6432,12 @@ sub scan_bare_identifier_do { ) { - # may not be indirect object unless followed by a space - if ( $input_line =~ m/\G\s+/gc ) { + # may not be indirect object unless followed by a space; + # updated 2021-01-16 to consider newline to be a space. + # updated for case b990 to look for either ';' or space + if ( pos($input_line) == length($input_line) + || $input_line =~ m/\G[;\s]/gc ) + { $type = 'Y'; # Abandon Hope ... 
@@ -5528,6 +6515,7 @@ sub scan_id_do { my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state, $max_token_index ) = @_; + use constant DEBUG_NSCAN => 0; my $type = ''; my ( $i_beg, $pos_beg ); @@ -5579,9 +6567,17 @@ sub scan_id_do { if ( $is_sub{$id_scan_state} ) { ( $i, $tok, $type, $id_scan_state ) = do_scan_sub( - $input_line, $i, $i_beg, - $tok, $type, $rtokens, - $rtoken_map, $id_scan_state, $max_token_index + { + input_line => $input_line, + i => $i, + i_beg => $i_beg, + tok => $tok, + type => $type, + rtokens => $rtokens, + rtoken_map => $rtoken_map, + id_scan_state => $id_scan_state, + max_token_index => $max_token_index + } ); } @@ -5607,7 +6603,7 @@ sub scan_id_do { report_definite_bug(); } - TOKENIZER_DEBUG_FLAG_NSCAN && do { + DEBUG_NSCAN && do { print STDOUT "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n"; }; @@ -5678,7 +6674,7 @@ sub do_scan_package { pos($input_line) = $pos_beg; # handle non-blank line; package name, if any, must follow - if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) { + if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) { $package = $1; $package = ( defined($1) && $1 ) ? $1 : 'main'; $package =~ s/\'/::/g; @@ -5713,7 +6709,8 @@ sub do_scan_package { # knows that the number is in a package statement. # Examples of valid primitive tokens that might follow are: # 1235 . ; { } v3 v - if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) { + # FIX: added a '#' since a side comment may also follow + if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) { $statement_type = $tok; } else { @@ -5738,12 +6735,14 @@ sub scan_identifier_do { # scan state, id_scan_state. It updates id_scan_state based upon # current id_scan_state and token, and returns an updated # id_scan_state and the next index after the identifier. 
+ # USES GLOBAL VARIABLES: $context, $last_nonblank_token, # $last_nonblank_type my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index, $expecting, $container_type ) = @_; + use constant DEBUG_SCAN_ID => 0; my $i_begin = $i; my $type = ''; my $tok_begin = $rtokens->[$i_begin]; @@ -5752,18 +6751,23 @@ sub scan_identifier_do { my $identifier_begin = $identifier; my $tok = $tok_begin; my $message = ""; + my $tok_is_blank; # a flag to speed things up - my $in_prototype_or_signature = $container_type =~ /^sub/; + my $in_prototype_or_signature = + $container_type && $container_type =~ /^sub\b/; # these flags will be used to help figure out the type: - my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ ); + my $saw_alpha; my $saw_type; # allow old package separator (') except in 'use' statement my $allow_tick = ( $last_nonblank_token ne 'use' ); + ######################################################### # get started by defining a type and a state if necessary - unless ($id_scan_state) { + ######################################################### + + if ( !$id_scan_state ) { $context = UNKNOWN_CONTEXT; # fixup for digraph @@ -5792,8 +6796,9 @@ sub scan_identifier_do { elsif ( $tok eq '::' ) { $id_scan_state = 'A'; } - elsif ( $tok =~ /^[A-Za-z_]/ ) { + elsif ( $tok =~ /^\w/ ) { $id_scan_state = ':'; + $saw_alpha = 1; } elsif ( $tok eq '->' ) { $id_scan_state = '$'; @@ -5810,22 +6815,34 @@ sub scan_identifier_do { } else { $i--; - $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); + $saw_alpha = ( $tok =~ /^\w/ ); + $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); } - # now loop to gather the identifier + ############################### + # loop to gather the identifier + ############################### + my $i_save = $i; while ( $i < $max_token_index ) { - $i_save = $i unless ( $tok =~ /^\s*$/ ); - $tok = $rtokens->[ ++$i ]; + my $last_tok_is_blank = $tok_is_blank; + if ($tok_is_blank) { $tok_is_blank = undef } + else { $i_save = $i } + + $tok = $rtokens->[ ++$i ]; + # patch to make digraph :: if necessary if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) { $tok = '::'; $i++; } - if ( $id_scan_state eq '$' ) { # starting variable name + ######################## + # Starting variable name + ######################## + + if ( $id_scan_state eq '$' ) { if ( $tok eq '$' ) { @@ -5838,16 +6855,20 @@ sub scan_identifier_do { last; } } + elsif ( $tok =~ /^\w/ ) { # alphanumeric .. + $saw_alpha = 1; + $id_scan_state = ':'; # now need :: + $identifier .= $tok; + } + elsif ( $tok eq '::' ) { + $id_scan_state = 'A'; + $identifier .= $tok; + } # POSTDEFREF ->@ ->% ->& ->* elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) { $identifier .= $tok; } - elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric .. - $saw_alpha = 1; - $id_scan_state = ':'; # now need :: - $identifier .= $tok; - } elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. $saw_alpha = 1; $id_scan_state = ':'; # now need :: @@ -5861,32 +6882,53 @@ sub scan_identifier_do { # howdy::123::bubba(); # } - elsif ( $tok =~ /^[0-9]/ ) { # numeric - $saw_alpha = 1; - $id_scan_state = ':'; # now need :: - $identifier .= $tok; - } - elsif ( $tok eq '::' ) { - $id_scan_state = 'A'; - $identifier .= $tok; - } + elsif ( $tok eq '#' ) { - # $# and POSTDEFREF ->$# - elsif ( ( $tok eq '#' ) && ( $identifier =~ /\$$/ ) ) { # $#array - $identifier .= $tok; # keep same state, a $ could follow - } - elsif ( $tok eq '{' ) { - - # check for something like ${#} or ${©} + # side comment or identifier? 
if ( - ( - $identifier eq '$' - || $identifier eq '@' - || $identifier eq '$#' - ) - && $i + 2 <= $max_token_index - && $rtokens->[ $i + 2 ] eq '}' - && $rtokens->[ $i + 1 ] !~ /[\s\w]/ + + # A '#' starts a comment if it follows a space. For example, + # the following is equivalent to $ans=40. + # my $ # + # ans = 40; + !$last_tok_is_blank + + # a # inside a prototype or signature can only start a + # comment + && !$in_prototype_or_signature + + # these are valid punctuation vars: *# %# @# $# + # May also be '$#array' or POSTDEFREF ->$# + && ( $identifier =~ /^[\%\@\$\*]$/ || $identifier =~ /\$$/ ) + + ) + { + $identifier .= $tok; # keep same state, a $ could follow + } + else { + + # otherwise it is a side comment + if ( $identifier eq '->' ) { } + elsif ( $id_scan_state eq '$' ) { $type = 't' } + else { $type = 'i' } + $i = $i_save; + $id_scan_state = ''; + last; + } + } + + elsif ( $tok eq '{' ) { + + # check for something like ${#} or ${©} + if ( + ( + $identifier eq '$' + || $identifier eq '@' + || $identifier eq '$#' + ) + && $i + 2 <= $max_token_index + && $rtokens->[ $i + 2 ] eq '}' + && $rtokens->[ $i + 1 ] !~ /[\s\w]/ ) { my $next2 = $rtokens->[ $i + 2 ]; @@ -5910,6 +6952,8 @@ sub scan_identifier_do { # space ok after leading $ % * & @ elsif ( $tok =~ /^\s*$/ ) { + $tok_is_blank = 1; + if ( $identifier =~ /^[\$\%\*\&\@]/ ) { if ( length($identifier) > 1 ) { @@ -5961,10 +7005,33 @@ sub scan_identifier_do { } else { # something else - if ( $in_prototype_or_signature && $tok =~ /^[\),=]/ ) { + if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) { + + # We might be in an extrusion of + # sub foo2 ( $first, $, $third ) { + # looking at a line starting with a comma, like + # $ + # , + # in this case the comma ends the signature variable + # '$' which will have been previously marked type 't' + # rather than 'i'. + if ( $i == $i_begin ) { + $identifier = ""; + $type = ""; + } + + # at a # we have to mark as type 't' because more may + # follow, otherwise, in a signature we can let '$' be an + # identifier here for better formatting. + # See 'mangle4.in' for a test case. + else { + $type = 'i'; + if ( $id_scan_state eq '$' && $tok eq '#' ) { + $type = 't'; + } + $i = $i_save; + } $id_scan_state = ''; - $i = $i_save; - $type = 'i'; # probably punctuation variable last; } @@ -5974,7 +7041,9 @@ sub scan_identifier_do { } # POSTDEFREF: Postfix reference ->$* ->%* ->@* ->** ->&* ->$#* - elsif ( $tok eq '*' && $identifier =~ /([\@\%\$\*\&]|\$\#)$/ ) { + elsif ($tok eq '*' + && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ ) + { $identifier .= $tok; } @@ -6002,6 +7071,7 @@ sub scan_identifier_do { # You would have to use # $a = ${$:}; + # '$$' alone is punctuation variable for PID $i = $i_save; if ( $tok eq '{' ) { $type = 't' } else { $type = 'i' } @@ -6017,84 +7087,30 @@ sub scan_identifier_do { last; } } - elsif ( $id_scan_state eq '&' ) { # starting sub call? - if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric .. - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. 
- $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok =~ /^\s*$/ ) { # allow space - } - elsif ( $tok eq '::' ) { # leading :: - $id_scan_state = 'A'; # accept alpha next - $identifier .= $tok; - } - elsif ( $tok eq '{' ) { - if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } - $i = $i_save; - $id_scan_state = ''; - last; - } - else { + ################################### + # looking for alphanumeric after :: + ################################### - # punctuation variable? - # testfile: cunningham4.pl - # - # We have to be careful here. If we are in an unknown state, - # we will reject the punctuation variable. In the following - # example the '&' is a binary operator but we are in an unknown - # state because there is no sigil on 'Prima', so we don't - # know what it is. But it is a bad guess that - # '&~' is a function variable. - # $self->{text}->{colorMap}->[ - # Prima::PodView::COLOR_CODE_FOREGROUND - # & ~tb::COLOR_INDEX ] = - # $sec->{ColorCode} - if ( $identifier eq '&' && $expecting ) { - $identifier .= $tok; - } - else { - $identifier = ''; - $i = $i_save; - $type = '&'; - } - $id_scan_state = ''; - last; - } - } - elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::) + elsif ( $id_scan_state eq 'A' ) { + + $tok_is_blank = $tok =~ /^\s*$/; - if ( $tok =~ /^[A-Za-z_]/ ) { # found it + if ( $tok =~ /^\w/ ) { # found it $identifier .= $tok; - $id_scan_state = ':'; # now need :: + $id_scan_state = ':'; # now need :: $saw_alpha = 1; } elsif ( $tok eq "'" && $allow_tick ) { $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above - $identifier .= $tok; - $id_scan_state = ':'; # now need :: + $id_scan_state = ':'; # now need :: $saw_alpha = 1; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + elsif ( $tok_is_blank && $identifier =~ /^sub / ) { $id_scan_state = '('; $identifier .= $tok; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + elsif ( $tok eq '(' && $identifier =~ /^sub / ) { $id_scan_state = ')'; $identifier .= $tok; } @@ -6104,20 +7120,22 @@ sub scan_identifier_do { last; } } + + ################################### + # looking for :: after alphanumeric + ################################### + elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha + $tok_is_blank = $tok =~ /^\s*$/; + if ( $tok eq '::' ) { # got it $identifier .= $tok; $id_scan_state = 'A'; # now require alpha } - elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above + elsif ( $tok =~ /^\w/ ) { # more alphanumeric is ok here $identifier .= $tok; - $id_scan_state = ':'; # now need :: + $id_scan_state = ':'; # now need :: $saw_alpha = 1; } elsif ( $tok eq "'" && $allow_tick ) { # tick @@ -6130,11 +7148,11 @@ sub scan_identifier_do { $identifier .= $tok; } } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + elsif ( $tok_is_blank && $identifier =~ /^sub / ) { $id_scan_state = '('; $identifier .= $tok; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + elsif ( $tok eq '(' && $identifier =~ /^sub / ) { $id_scan_state = ')'; $identifier .= $tok; } @@ -6144,26 +7162,39 @@ sub scan_identifier_do { last; } } - elsif ( 
$id_scan_state eq '(' ) { # looking for ( of prototype - if ( $tok eq '(' ) { # got it + ############################## + # looking for '(' of prototype + ############################## + + elsif ( $id_scan_state eq '(' ) { + + if ( $tok eq '(' ) { # got it $identifier .= $tok; - $id_scan_state = ')'; # now find the end of it + $id_scan_state = ')'; # now find the end of it } - elsif ( $tok =~ /^\s*$/ ) { # blank - keep going + elsif ( $tok =~ /^\s*$/ ) { # blank - keep going $identifier .= $tok; + $tok_is_blank = 1; } else { - $id_scan_state = ''; # that's all - no prototype + $id_scan_state = ''; # that's all - no prototype $i = $i_save; last; } } - elsif ( $id_scan_state eq ')' ) { # looking for ) to end - if ( $tok eq ')' ) { # got it + ############################## + # looking for ')' of prototype + ############################## + + elsif ( $id_scan_state eq ')' ) { + + $tok_is_blank = $tok =~ /^\s*$/; + + if ( $tok eq ')' ) { # got it $identifier .= $tok; - $id_scan_state = ''; # all done + $id_scan_state = ''; # all done last; } elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) { @@ -6174,12 +7205,76 @@ sub scan_identifier_do { $identifier .= $tok; } } - else { # can get here due to error in initialization + + ################### + # Starting sub call + ################### + + elsif ( $id_scan_state eq '&' ) { + + if ( $tok =~ /^[\$\w]/ ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok =~ /^\s*$/ ) { # allow space + $tok_is_blank = 1; + } + elsif ( $tok eq '::' ) { # leading :: + $id_scan_state = 'A'; # accept alpha next + $identifier .= $tok; + } + elsif ( $tok eq '{' ) { + if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } + $i = $i_save; + $id_scan_state = ''; + last; + } + else { + + # punctuation variable? + # testfile: cunningham4.pl + # + # We have to be careful here. If we are in an unknown state, + # we will reject the punctuation variable. In the following + # example the '&' is a binary operator but we are in an unknown + # state because there is no sigil on 'Prima', so we don't + # know what it is. But it is a bad guess that + # '&~' is a function variable. + # $self->{text}->{colorMap}->[ + # Prima::PodView::COLOR_CODE_FOREGROUND + # & ~tb::COLOR_INDEX ] = + # $sec->{ColorCode} + + # Fix for case c033: a '#' here starts a side comment + if ( $identifier eq '&' && $expecting && $tok ne '#' ) { + $identifier .= $tok; + } + else { + $identifier = ''; + $i = $i_save; + $type = '&'; + } + $id_scan_state = ''; + last; + } + } + + ###################### + # unknown state - quit + ###################### + + else { # can get here due to error in initialization $id_scan_state = ''; $i = $i_save; last; } - } + } ## end of main loop if ( $id_scan_state eq ')' ) { warning("Hit end of line while seeking ) to end prototype\n"); @@ -6190,9 +7285,15 @@ sub scan_identifier_do { if ( $id_scan_state =~ /^[A\:\(\)]/ ) { $id_scan_state = ''; } + + # Patch: the deprecated variable $# does not combine with anything on the + # next line. + if ( $identifier eq '$#' ) { $id_scan_state = '' } + if ( $i < 0 ) { $i = 0 } - unless ($type) { + # Be sure a token type is defined + if ( !$type ) { if ($saw_type) { @@ -6230,16 +7331,19 @@ sub scan_identifier_do { } # this can happen on a restart } + # See if we formed an identifier... 
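+    # If one was formed, it becomes the returned token; otherwise $tok and
+    # $i are restored to their values on entry so that the caller can
+    # rescan the starting token as something else.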
if ($identifier) { $tok = $identifier; if ($message) { write_logfile_entry($message) } } + + # did not find an identifier, back up else { $tok = $tok_begin; $i = $i_begin; } - TOKENIZER_DEBUG_FLAG_SCAN_ID && do { + DEBUG_SCAN_ID && do { my ( $a, $b, $c ) = caller; print STDOUT "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n"; @@ -6249,16 +7353,66 @@ sub scan_identifier_do { return ( $i, $tok, $type, $id_scan_state, $identifier ); } -{ +{ ## closure for sub do_scan_sub + + my %warn_if_lexical; + + BEGIN { + + # lexical subs with these names can cause parsing errors in this version + my @q = qw( m q qq qr qw qx s tr y ); + @{warn_if_lexical}{@q} = (1) x scalar(@q); + } # saved package and subnames in case prototype is on separate line my ( $package_saved, $subname_saved ); + # initialize subname each time a new 'sub' keyword is encountered + sub initialize_subname { + $package_saved = ""; + $subname_saved = ""; + return; + } + + use constant { + SUB_CALL => 1, + PAREN_CALL => 2, + PROTOTYPE_CALL => 3, + }; + sub do_scan_sub { - # do_scan_sub parses a sub name and prototype - # it is called with $i_beg equal to the index of the first nonblank - # token following a 'sub' token. + # do_scan_sub parses a sub name and prototype. + + # At present there are three basic CALL TYPES which are + # distinguished by the starting value of '$tok': + # 1. $tok='sub', id_scan_state='sub' + # it is called with $i_beg equal to the index of the first nonblank + # token following a 'sub' token. + # 2. $tok='(', id_scan_state='sub', + # it is called with $i_beg equal to the index of a '(' which may + # start a prototype. + # 3. $tok='prototype', id_scan_state='prototype' + # it is called with $i_beg equal to the index of a '(' which is + # preceded by ': prototype' and has $id_scan_state eq 'prototype' + + # Examples: + + # A single type 1 call will get both the sub and prototype + # sub foo1 ( $$ ) { } + # ^ + + # The subname will be obtained with a 'sub' call + # The prototype on line 2 will be obtained with a '(' call + # sub foo1 + # ^ <---call type 1 + # ( $$ ) { } + # ^ <---call type 2 + + # The subname will be obtained with a 'sub' call + # The prototype will be obtained with a 'prototype' call + # sub foo1 ( $x, $y ) : prototype ( $$ ) { } + # ^ <---type 1 ^ <---type 3 # TODO: add future error checks to be sure we have a valid # sub name. For example, 'sub &doit' is wrong. Also, be sure @@ -6268,14 +7422,32 @@ sub scan_identifier_do { # $in_attribute_list, %saw_function_definition, # $statement_type - my ( - $input_line, $i, $i_beg, - $tok, $type, $rtokens, - $rtoken_map, $id_scan_state, $max_token_index - ) = @_; + my ($rinput_hash) = @_; + + my $input_line = $rinput_hash->{input_line}; + my $i = $rinput_hash->{i}; + my $i_beg = $rinput_hash->{i_beg}; + my $tok = $rinput_hash->{tok}; + my $type = $rinput_hash->{type}; + my $rtokens = $rinput_hash->{rtokens}; + my $rtoken_map = $rinput_hash->{rtoken_map}; + my $id_scan_state = $rinput_hash->{id_scan_state}; + my $max_token_index = $rinput_hash->{max_token_index}; + + my $i_entry = $i; + + # Determine the CALL TYPE + # 1=sub + # 2=( + # 3=prototype + my $call_type = + $tok eq 'prototype' ? PROTOTYPE_CALL + : $tok eq '(' ? 
PAREN_CALL
+      :                       SUB_CALL;
+
     $id_scan_state = "";    # normally we get everything in one call
-    my $subname = undef;
-    my $package = undef;
+    my $subname = $subname_saved;
+    my $package = $package_saved;
     my $proto   = undef;
     my $attrs   = undef;
     my $match;
@@ -6283,9 +7455,10 @@ sub scan_identifier_do {
     my $pos_beg = $rtoken_map->[$i_beg];
     pos($input_line) = $pos_beg;
 
-    # Look for the sub NAME
+    # Look for the sub NAME if this is a SUB call
     if (
-        $input_line =~ m/\G\s*
+           $call_type == SUB_CALL
+        && $input_line =~ m/\G\s*
         ((?:\w*(?:'|::))*)  # package - something that ends in :: or '
         (\w+)               # NAME    - required
         /gcx
@@ -6294,17 +7467,45 @@ sub scan_identifier_do {
         $match   = 1;
         $subname = $2;
 
-        $package = ( defined($1) && $1 ) ? $1 : $current_package;
-        $package =~ s/\'/::/g;
-        if ( $package =~ /^\:/ ) { $package = 'main' . $package }
-        $package =~ s/::$//;
+        my $is_lexical_sub =
+          $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
+        if ( $is_lexical_sub && $1 ) {
+            warning("'my' sub $subname cannot be in package '$1'\n");
+            $is_lexical_sub = 0;
+        }
+
+        if ($is_lexical_sub) {
+
+            # lexical subs use the block sequence number as a package name
+            my $seqno =
+              $current_sequence_number[BRACE][ $current_depth[BRACE] ];
+            $seqno = 1 unless ( defined($seqno) );
+            $package = $seqno;
+            if ( $warn_if_lexical{$subname} ) {
+                warning(
+"'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
+                );
+            }
+        }
+        else {
+            $package = ( defined($1) && $1 ) ? $1 : $current_package;
+            $package =~ s/\'/::/g;
+            if ( $package =~ /^\:/ ) { $package = 'main' . $package }
+            $package =~ s/::$//;
+        }
+
         my $pos  = pos($input_line);
         my $numc = $pos - $pos_beg;
         $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
         $type = 'i';
+
+        # remember the sub name in case another call is needed to
+        # get the prototype
+        $package_saved = $package;
+        $subname_saved = $subname;
     }
 
-    # Now look for PROTO ATTRS
+    # Now look for PROTO ATTRS for all call types
     # Look for prototype/attributes which are usually on the same
     # line as the sub name but which might be on a separate line.
     # For example, we might have an anonymous sub with attributes,
@@ -6314,9 +7515,15 @@ sub scan_identifier_do {
     # does not look like a prototype, we assume it is a SIGNATURE and we
     # will stop and let the standard tokenizer handle it.  In
     # particular, we stop if we see any nested parens, braces, or commas.
+    # Also note, a valid prototype cannot contain any alphabetic character
+    #  -- see https://perldoc.perl.org/perlsub
+    # But it appears that an underscore is valid in a prototype, so the
+    # regex below uses [A-Za-z] rather than \w
+    # This is the old regex which has been replaced:
+    # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))?  # PROTO
     my $saw_opening_paren = $input_line =~ /\G\s*\(/;
     if (
-        $input_line =~ m/\G(\s*\([^\)\(\}\{\,]*\))?  # PROTO
+        $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))?  # PROTO
        (\s*:)?                              # ATTRS leading ':'
        /gcx
        && ( $1 || $2 )
@@ -6325,21 +7532,27 @@ sub scan_identifier_do {
        $proto = $1;
        $attrs = $2;
 
-        # If we also found the sub name on this call then append PROTO.
-        # This is not necessary but for compatibility with previous
-        # versions when the -csc flag is used:
-        if ( $match && $proto ) {
+        # Append the prototype to the starting token if it is 'sub' or
+        # 'prototype'.  This is not necessary but for compatibility with
+        # previous versions when the -csc flag is used:
+        if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
             $tok .= $proto;
         }
-        $match ||= 1;
 
-        # Handle prototype on separate line from subname
-        if ($subname_saved) {
-            $package = $package_saved;
-            $subname = $subname_saved;
-            $tok     = $last_nonblank_token;
+        # If we just entered the sub at an opening paren on this call, not
+        # a following :prototype, label it with the previous token. This is
+        # necessary to propagate the sub name to its opening block.
+        elsif ( $call_type == PAREN_CALL ) {
+            $tok = $last_nonblank_token;
         }
+
+        $match ||= 1;
+
+        # Patch part #1 to fix cases b994 and b1053:
+        # Mark an anonymous sub keyword without prototype as type 'k', i.e.
+        # 'sub : lvalue { ...'
         $type = 'i';
+        if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
     }
 
     if ($match) {
@@ -6369,6 +7582,19 @@ sub scan_identifier_do {
             $max_token_index );
         if ($error) { warning("Possibly invalid sub\n") }
 
+        # Patch part #2 to fix cases b994 and b1053:
+        # Do not let spaces be part of the token of an anonymous sub keyword
+        # which we marked as type 'k' above...i.e. for something like:
+        # 'sub : lvalue { ...'
+        # Back up and let it be parsed as a blank
+        if (   $type eq 'k'
+            && $attrs
+            && $i > $i_entry
+            && substr( $rtokens->[$i], 0, 1 ) eq ' ' )
+        {
+            $i--;
+        }
+
        # check for multiple definitions of a sub
        ( $next_nonblank_token, my $i_next ) =
          find_next_nonblank_token_on_this_line( $i, $rtokens,
@@ -6386,8 +7612,6 @@ sub scan_identifier_do {
                $next_nonblank_token = '}';
            }
        }
-        $package_saved = "";
-        $subname_saved = "";
 
        # See what's next...
        if ( $next_nonblank_token eq '{' ) {
@@ -6396,16 +7620,24 @@ sub scan_identifier_do {
                # Check for multiple definitions of a sub, but
                # it is ok to have multiple sub BEGIN, etc,
                # so we do not complain if name is all caps
-                if (   $saw_function_definition{$package}{$subname}
+                if (   $saw_function_definition{$subname}{$package}
                    && $subname !~ /^[A-Z]+$/ )
                {
-                    my $lno = $saw_function_definition{$package}{$subname};
-                    warning(
+                    my $lno = $saw_function_definition{$subname}{$package};
+                    if ( $package =~ /^\d/ ) {
+                        warning(
+"already saw definition of lexical 'sub $subname' at line $lno\n"
+                        );
+
+                    }
+                    else {
+                        warning(
"already saw definition of 'sub $subname' in package '$package' at line $lno\n"
-                    );
+                        );
+                    }
                }
-                $saw_function_definition{$package}{$subname} =
-                  $tokenizer_self->{_last_line_number};
+                $saw_function_definition{$subname}{$package} =
+                  $tokenizer_self->[_last_line_number_];
            }
        }
        elsif ( $next_nonblank_token eq ';' ) {
@@ -6418,7 +7650,10 @@ sub scan_identifier_do {
        # Setting 'statement_type' causes any ':'s to introduce
        # attributes.
        elsif ( $next_nonblank_token eq ':' ) {
-            $statement_type = $tok;
+            if ( $call_type == SUB_CALL ) {
+                $statement_type =
+                  substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
+            }
        }
 
        # if we stopped before an open paren ...
@@ -6431,13 +7666,14 @@ sub scan_identifier_do {
            # Otherwise, we assume it is a SIGNATURE rather than a
            # PROTOTYPE and let the normal tokenizer handle it as a list
            if ( !$saw_opening_paren ) {
-                $id_scan_state = 'sub';    # we must come back to get proto
-                $package_saved = $package;
-                $subname_saved = $subname;
+                $id_scan_state = 'sub';    # we must come back to get proto
+            }
+            if ( $call_type == SUB_CALL ) {
+                $statement_type =
+                  substr( $tok, 0, 3 ) eq 'sub' ? 
$tok : 'sub'; } - $statement_type = $tok; } - elsif ($next_nonblank_token) { # EOF technically ok + elsif ($next_nonblank_token) { # EOF technically ok $subname = "" unless defined($subname); warning( "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n" @@ -6446,8 +7682,9 @@ sub scan_identifier_do { check_prototype( $proto, $package, $subname ); } - # no match but line not blank + # no match to either sub name or prototype, but line not blank else { + } return ( $i, $tok, $type, $id_scan_state ); } @@ -6460,87 +7697,113 @@ sub scan_identifier_do { sub find_next_nonblank_token { my ( $i, $rtokens, $max_token_index ) = @_; + # Returns the next nonblank token after the token at index $i + # To skip past a side comment, and any subsequent block comments + # and blank lines, call with i=$max_token_index + if ( $i >= $max_token_index ) { if ( !peeked_ahead() ) { peeked_ahead(1); - $rtokens = - peek_ahead_for_nonblank_token( $rtokens, $max_token_index ); + peek_ahead_for_nonblank_token( $rtokens, $max_token_index ); } } + my $next_nonblank_token = $rtokens->[ ++$i ]; + return ( " ", $i ) unless defined($next_nonblank_token); if ( $next_nonblank_token =~ /^\s*$/ ) { $next_nonblank_token = $rtokens->[ ++$i ]; + return ( " ", $i ) unless defined($next_nonblank_token); } return ( $next_nonblank_token, $i ); } -sub numerator_expected { +sub is_possible_numerator { - # this is a filter for a possible numerator, in support of guessing - # for the / pattern delimiter token. - # returns - + # Look at the next non-comment character and decide if it could be a + # numerator. Return # 1 - yes # 0 - can't tell # -1 - no - # Note: I am using the convention that variables ending in - # _expected have these 3 possible values. + my ( $i, $rtokens, $max_token_index ) = @_; - my $numerator_expected = 0; + my $is_possible_numerator = 0; my $next_token = $rtokens->[ $i + 1 ]; if ( $next_token eq '=' ) { $i++; } # handle /= my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); + if ( $next_nonblank_token eq '#' ) { + ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $max_token_index, $rtokens, + $max_token_index ); + } + if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) { - $numerator_expected = 1; + $is_possible_numerator = 1; + } + elsif ( $next_nonblank_token =~ /^\s*$/ ) { + $is_possible_numerator = 0; } else { - - if ( $next_nonblank_token =~ /^\s*$/ ) { - $numerator_expected = 0; - } - else { - $numerator_expected = -1; - } + $is_possible_numerator = -1; } - return $numerator_expected; + + return $is_possible_numerator; } -sub pattern_expected { +{ ## closure for sub pattern_expected + my %pattern_test; - # This is the start of a filter for a possible pattern. - # It looks at the token after a possible pattern and tries to - # determine if that token could end a pattern. - # returns - - # 1 - yes - # 0 - can't tell - # -1 - no - my ( $i, $rtokens, $max_token_index ) = @_; - my $is_pattern = 0; + BEGIN { - my $next_token = $rtokens->[ $i + 1 ]; - if ( $next_token =~ /^[msixpodualgc]/ ) { $i++; } # skip possible modifier - my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens, $max_token_index ); + # List of tokens which may follow a pattern. 
Note that we will not + # have formed digraphs at this point, so we will see '&' instead of + # '&&' and '|' instead of '||' - # list of tokens which may follow a pattern - # (can probably be expanded) - if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ ) - { - $is_pattern = 1; + # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ + my @q = qw( & && | || ? : + - * and or while if unless); + push @q, ')', '}', ']', '>', ',', ';'; + @{pattern_test}{@q} = (1) x scalar(@q); } - else { - if ( $next_nonblank_token =~ /^\s*$/ ) { - $is_pattern = 0; + sub pattern_expected { + + # This a filter for a possible pattern. + # It looks at the token after a possible pattern and tries to + # determine if that token could end a pattern. + # returns - + # 1 - yes + # 0 - can't tell + # -1 - no + my ( $i, $rtokens, $max_token_index ) = @_; + my $is_pattern = 0; + + my $next_token = $rtokens->[ $i + 1 ]; + if ( $next_token =~ /^[msixpodualgc]/ ) { + $i++; + } # skip possible modifier + my ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $i, $rtokens, $max_token_index ); + + if ( $pattern_test{$next_nonblank_token} ) { + $is_pattern = 1; } else { - $is_pattern = -1; + + # Added '#' to fix issue c044 + if ( $next_nonblank_token =~ /^\s*$/ + || $next_nonblank_token eq '#' ) + { + $is_pattern = 0; + } + else { + $is_pattern = -1; + } } + return $is_pattern; } - return $is_pattern; } sub find_next_nonblank_token_on_this_line { @@ -6649,16 +7912,29 @@ sub find_angle_operator_termination { report_possible_bug(); } + # count blanks on inside of brackets + my $blank_count = 0; + $blank_count++ if ( $str =~ /<\s+/ ); + $blank_count++ if ( $str =~ /\s+>/ ); + # Now let's see where we stand.... # OK if math op not possible if ( $expecting == TERM ) { } - # OK if there are no more than 2 pre-tokens inside + # OK if there are no more than 2 non-blank pre-tokens inside # (not possible to write 2 token math between < and >) # This catches most common cases - elsif ( $i <= $i_beg + 3 ) { - write_diagnostics("ANGLE(1 or 2 tokens): $str\n"); + elsif ( $i <= $i_beg + 3 + $blank_count ) { + + # No longer any need to document this common case + ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n"); + } + + # OK if there is some kind of identifier inside + # print $fh ; + elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) { + write_diagnostics("ANGLE (contains identifier): $str\n"); } # Not sure.. @@ -6735,9 +8011,10 @@ sub scan_number_do { # handle v-string without leading 'v' character ('Two Dot' rule) # (vstring.t) - # TODO: v-strings may contain underscores + # Here is the format prior to including underscores: + ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) { pos($input_line) = $pos_beg; - if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) { + if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) { $pos = pos($input_line); my $numc = $pos - $pos_beg; $number = substr( $input_line, $pos_beg, $numc ); @@ -6748,8 +8025,45 @@ sub scan_number_do { # handle octal, hex, binary if ( !defined($number) ) { pos($input_line) = $pos_beg; - if ( $input_line =~ - /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g ) + + # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0' + # For reference, the format prior to hex floating point is: + # /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g ) + # (hex) (octal) (binary) + if ( + $input_line =~ + + /\G[+-]?0( # leading [signed] 0 + + # a hex float, i.e. 
'0x0.b17217f7d1cf78p0' + ([xX][0-9a-fA-F_]* # X and optional leading digits + (\.([0-9a-fA-F][0-9a-fA-F_]*)?)? # optional decimal and fraction + [Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit + [0-9a-fA-F_]*) # optional Additional exponent digits + + # or hex integer + |([xX][0-9a-fA-F_]+) + + # or octal fraction + |([oO]?[0-7_]+ # string of octal digits + (\.([0-7][0-7_]*)?)? # optional decimal and fraction + [Pp][+-]?[0-7] # REQUIRED exponent, no underscore + [0-7_]*) # Additional exponent digits with underscores + + # or octal integer + |([oO]?[0-7_]+) # string of octal digits + + # or a binary float + |([bB][01_]* # 'b' with string of binary digits + (\.([01][01_]*)?)? # optional decimal and fraction + [Pp][+-]?[01] # Required exponent indicator, no underscore + [01_]*) # additional exponent bits + + # or binary integer + |([bB][01_]+) # 'b' with string of binary digits + + )/gx + ) { $pos = pos($input_line); my $numc = $pos - $pos_beg; @@ -7016,7 +8330,7 @@ sub follow_quoted_string { my $i = $i_beg - 1; my $quoted_string = ""; - TOKENIZER_DEBUG_FLAG_QUOTE && do { + 0 && do { print STDOUT "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n"; }; @@ -7115,7 +8429,9 @@ sub follow_quoted_string { } } else { - $quoted_string .= substr( $tok, $old_pos ); + if ( $old_pos <= length($tok) ) { + $quoted_string .= substr( $tok, $old_pos ); + } } } } @@ -7327,7 +8643,7 @@ sub show_tokens { return; } -{ +{ ## closure for sub matching end token my %matching_end_token; BEGIN { @@ -7355,7 +8671,7 @@ sub dump_token_types { # This should be the latest list of token types in use # adding NEW_TOKENS: add a comment here - print $fh <<'END_OF_LIST'; + $fh->print(<<'END_OF_LIST'); Here is a list of the token types currently used for lines of type 'CODE'. For the following tokens, the "type" of a token is just the token itself. @@ -7476,11 +8792,12 @@ BEGIN { # These tokens may precede a code block # patched for SWITCH/CASE/CATCH. Actually these could be removed - # now and we could let the extended-syntax coding handle them + # now and we could let the extended-syntax coding handle them. + # Added 'default' for Switch::Plain. @q = qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else unless do while until eval for foreach map grep sort - switch case given when catch try finally); + switch case given when default catch try finally); @is_code_block_token{@q} = (1) x scalar(@q); # I'll build the list of keywords incrementally @@ -7539,10 +8856,12 @@ BEGIN { elsif eof eq + evalbytes exec exists exit exp + fc fcntl fileno flock @@ -7669,6 +8988,7 @@ BEGIN { sqrt srand stat + state study substr symlink @@ -7705,19 +9025,20 @@ BEGIN { switch case + default given when err say + isa catch ); # patched above for SWITCH/CASE given/when err say # 'err' is a fairly safe addition. - # TODO: 'default' still needed if appropriate - # 'use feature' seen, but perltidy works ok without it. - # Concerned that 'default' could break code. + # Added 'default' for Switch::Plain. Note that we could also have + # a separate set of keywords to include if we see 'use Switch::Plain' push( @Keywords, @value_requestor ); # These are treated the same but are not keywords: @@ -7782,7 +9103,7 @@ BEGIN { my @value_requestor_type = qw# L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //= - <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ + <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~ f F pp mm Y p m U J G j >> << ^ t ~. ^. |. &. 
^.= |.= &.=
      #;
@@ -7810,12 +9131,19 @@ BEGIN {
     @q = qw(q qq qw qx qr s y tr m);
     @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
 
-    @q = qw(sub);
-    @is_sub{@q} = (1) x scalar(@q);
-
     @q = qw(package);
     @is_package{@q} = (1) x scalar(@q);
 
+    @q = qw( ? : );
+    push @q, ',';
+    @is_comma_question_colon{@q} = (1) x scalar(@q);
+
+    # Hash of other possible line endings which may occur.
+    # Keep these coordinated with the regex where this is used.
+    # Note: chr(13) = chr(015)="\r".
+    @q = ( chr(13), chr(29), chr(26) );
+    @other_line_endings{@q} = (1) x scalar(@q);
+
     # These keywords are handled specially in the tokenizer code:
     my @special_keywords = qw(
       do
@@ -7893,6 +9221,7 @@ BEGIN {
       splice
       split
       sprintf
+      state
       substr
       syscall
       sysopen
@@ -7915,19 +9244,104 @@ BEGIN {
     @is_keyword_taking_list{@keyword_taking_list} =
       (1) x scalar(@keyword_taking_list);
 
-    # perl functions which may be unary operators
-    my @keyword_taking_optional_args = qw(
+    # perl functions which may be unary operators.
+
+    # This list is used to decide if a pattern delimited by slashes, /pattern/,
+    # can follow one of these keywords.
+    @q = qw(
+      chomp eof eval fc lc pop shift uc undef
+    );
+
+    @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
+      (1) x scalar(@q);
+
+    # These are keywords for which an arg may optionally be omitted.  They are
+    # currently only used to disambiguate a ? used as a ternary from one used
+    # as a (deprecated) pattern delimiter.  In the future, they might be used
+    # to give a warning about ambiguous syntax before a /.
+    # Note: split has been omitted (see note below).
+    my @keywords_taking_optional_arg = qw(
+      abs
+      alarm
+      caller
+      chdir
       chomp
+      chop
+      chr
+      chroot
+      close
+      cos
+      defined
+      die
       eof
       eval
+      evalbytes
+      exit
+      exp
+      fc
+      getc
+      glob
+      gmtime
+      hex
+      int
+      last
       lc
+      lcfirst
+      length
+      localtime
+      log
+      lstat
+      mkdir
+      next
+      oct
+      ord
       pop
+      pos
+      print
+      printf
+      prototype
+      quotemeta
+      rand
+      readline
+      readlink
+      readpipe
+      redo
+      ref
+      require
+      reset
+      reverse
+      rmdir
+      say
+      select
       shift
+      sin
+      sleep
+      sqrt
+      srand
+      stat
+      study
+      tell
       uc
+      ucfirst
+      umask
       undef
+      unlink
+      warn
+      write
     );
-    @is_keyword_taking_optional_args{@keyword_taking_optional_args} =
-      (1) x scalar(@keyword_taking_optional_args);
+    @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
+      (1) x scalar(@keywords_taking_optional_arg);
+
+    # This list is used to decide if a pattern delimited by question marks,
+    # ?pattern?, can follow one of these keywords.  Note that from perl 5.22
+    # on, a ?pattern? is not recognized, so we can be much more strict than
+    # with a /pattern/. Note that 'split' is not in this list. In current
+    # versions of perl a question following split must be a ternary, but
+    # in older versions it could be a pattern.  The guessing algorithm will
+    # decide.  We are combining two lists here to simplify the test.
+    @q = ( @keywords_taking_optional_arg, @operator_requestor );
+    @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
+      (1) x scalar(@q);
 
     # These are not used in any way yet
     #    my @unused_keywords = qw(
@@ -7949,4 +9363,3 @@ BEGIN {
     @is_keyword{@Keywords} = (1) x scalar(@Keywords);
 }
 1;
-
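A minimal illustration of the slash ambiguity that the two "rejecting" hashes
above help resolve; the snippet is example input only and is not part of this
patch. After a keyword in the slash-rejecting list, such as 'shift', a
following '/' can only be the division operator, while after 'split' it
normally begins a pattern:

    sub halve { return shift / 2 }       # '/' is division after 'shift'
    my @parts = split /,/, 'a,b,c';      # '/' starts a pattern after 'split'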