X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=lib%2FPerl%2FTidy%2FTokenizer.pm;h=8d16dd2fdbffca6948225d48e2f83eebb6224fba;hb=effbe8e558790d5f5e4eb49a10b2ed020b0eaaee;hp=be828299e7b9bc3ad7b0ea0b84f7ddf6e5d8fdb3;hpb=c514d57dc8088e1f4d3f51857b1155c20085c296;p=perltidy.git diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index be82829..8d16dd2 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -23,14 +23,20 @@ use strict; use warnings; use English qw( -no_match_vars ); -our $VERSION = '20220613'; +our $VERSION = '20230309'; + +use Perl::Tidy::LineBuffer; +use Carp; use constant DEVEL_MODE => 0; use constant EMPTY_STRING => q{}; use constant SPACE => q{ }; -use Perl::Tidy::LineBuffer; -use Carp; +# Decimal values of some ascii characters for quick checks +use constant ORD_TAB => 9; +use constant ORD_SPACE => 32; +use constant ORD_PRINTABLE_MIN => 33; +use constant ORD_PRINTABLE_MAX => 126; # PACKAGE VARIABLES for processing an entire FILE. # These must be package variables because most may get localized during @@ -95,6 +101,7 @@ use vars qw{ %is_tetragraph %is_valid_token_type %is_keyword + %is_my_our_state %is_code_block_token %is_sort_map_grep_eval_do %is_sort_map_grep @@ -202,7 +209,7 @@ BEGIN { _rOpts_logfile_ => $i++, _rOpts_ => $i++, }; -} +} ## end BEGIN { ## closure for subs to count instances @@ -256,6 +263,8 @@ sub Fault { my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0); my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1); my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2); + my $pkg = __PACKAGE__; + my $input_stream_name = get_input_stream_name(); Die(<{'use-feature'} =~ /\bclass\b/; + + # These are the main updates for this option. There are additional + # changes elsewhere, usually indicated with a comment 'rt145706' + + # Update hash values for use_feature=class, added for rt145706 + # see 'perlclass.pod' + + # IMPORTANT: We are changing global hash values initially set in a BEGIN + # block. Values must be defined (true or false) for each of these new + # words whether true or false. Otherwise, programs using the module which + # change options between runs (such as test code) will have + # incorrect settings and fail. + + # There are 4 new keywords: + + # 'class' - treated specially as generalization of 'package' + # Note: we must not set 'class' to be a keyword to avoid problems + # with older uses. + $is_package{'class'} = $use_feature_class; + + # 'method' - treated like sub using the sub-alias-list option + # Note: we must not set 'method' to be a keyword to avoid problems + # with older uses. + + # 'field' - added as a keyword, and works like 'my' + $is_keyword{'field'} = $use_feature_class; + $is_my_our_state{'field'} = $use_feature_class; + + # 'ADJUST' - added as a keyword and works like 'BEGIN' + # TODO: if ADJUST gets a paren list, this will need to be updated + $is_keyword{'ADJUST'} = $use_feature_class; + $is_code_block_token{'ADJUST'} = $use_feature_class; + %is_grep_alias = (); if ( $rOpts->{'grep-alias-list'} ) { @@ -344,6 +390,7 @@ sub check_options { make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<>V' ); + return; } ## end sub check_options @@ -466,7 +513,7 @@ sub new { $tokenizer_self = $self; prepare_for_a_new_file(); - find_starting_indentation_level(); + $self->find_starting_indentation_level(); # This is not a full class yet, so die if an attempt is made to # create more than one object. @@ -488,7 +535,7 @@ sub warning { $logger_object->warning($msg); } return; -} +} ## end sub warning sub get_input_stream_name { my $input_stream_name = EMPTY_STRING; @@ -497,7 +544,7 @@ sub get_input_stream_name { $input_stream_name = $logger_object->get_input_stream_name(); } return $input_stream_name; -} +} ## end sub get_input_stream_name sub complain { my $msg = shift; @@ -517,7 +564,7 @@ sub write_logfile_entry { $logger_object->write_logfile_entry($msg); } return; -} +} ## end sub write_logfile_entry sub interrupt_logfile { my $logger_object = $tokenizer_self->[_logger_object_]; @@ -525,7 +572,7 @@ sub interrupt_logfile { $logger_object->interrupt_logfile(); } return; -} +} ## end sub interrupt_logfile sub resume_logfile { my $logger_object = $tokenizer_self->[_logger_object_]; @@ -533,7 +580,7 @@ sub resume_logfile { $logger_object->resume_logfile(); } return; -} +} ## end sub resume_logfile sub increment_brace_error { my $logger_object = $tokenizer_self->[_logger_object_]; @@ -541,7 +588,7 @@ sub increment_brace_error { $logger_object->increment_brace_error(); } return; -} +} ## end sub increment_brace_error sub report_definite_bug { $tokenizer_self->[_hit_bug_] = 1; @@ -550,7 +597,7 @@ sub report_definite_bug { $logger_object->report_definite_bug(); } return; -} +} ## end sub report_definite_bug sub brace_warning { my $msg = shift; @@ -559,7 +606,7 @@ sub brace_warning { $logger_object->brace_warning($msg); } return; -} +} ## end sub brace_warning sub get_saw_brace_error { my $logger_object = $tokenizer_self->[_logger_object_]; @@ -569,7 +616,7 @@ sub get_saw_brace_error { else { return 0; } -} +} ## end sub get_saw_brace_error sub get_unexpected_error_count { my ($self) = @_; @@ -583,7 +630,7 @@ sub write_diagnostics { $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg); } return; -} +} ## end sub write_diagnostics sub get_maximum_level { return $tokenizer_self->[_maximum_level_]; @@ -781,26 +828,29 @@ sub get_input_line_number { return $tokenizer_self->[_last_line_number_]; } +sub log_numbered_msg { + my ( $self, $msg ) = @_; + + # write input line number + message to logfile + my $input_line_number = $self->[_last_line_number_]; + write_logfile_entry("Line $input_line_number: $msg"); + return; +} ## end sub log_numbered_msg + # returns the next tokenized line sub get_line { my $self = shift; - # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth, - # $square_bracket_depth, $paren_depth + # USES GLOBAL VARIABLES: + # $brace_depth, $square_bracket_depth, $paren_depth - my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line(); - $tokenizer_self->[_line_of_text_] = $input_line; + my $input_line = $self->[_line_buffer_object_]->get_line(); + $self->[_line_of_text_] = $input_line; return unless ($input_line); - my $input_line_number = ++$tokenizer_self->[_last_line_number_]; - - my $write_logfile_entry = sub { - my ($msg) = @_; - write_logfile_entry("Line $input_line_number: $msg"); - return; - }; + my $input_line_number = ++$self->[_last_line_number_]; # Find and remove what characters terminate this line, including any # control r @@ -820,7 +870,7 @@ sub get_line { # for backwards compatibility we keep the line text terminated with # a newline character $input_line .= "\n"; - $tokenizer_self->[_line_of_text_] = $input_line; # update + $self->[_line_of_text_] = $input_line; # create a data structure describing this line which will be # returned to the caller. @@ -860,26 +910,23 @@ sub get_line { _square_bracket_depth => $square_bracket_depth, _paren_depth => $paren_depth, _quote_character => EMPTY_STRING, -## _rtoken_type => undef, -## _rtokens => undef, -## _rlevels => undef, -## _rblock_type => undef, -## _rcontainer_type => undef, -## _rcontainer_environment => undef, -## _rtype_sequence => undef, -## _rnesting_tokens => undef, -## _rci_levels => undef, -## _rnesting_blocks => undef, -## _starting_in_quote => 0, -## _ending_in_quote => 0, +## Skip these needless initializations for efficiency: +## _rtoken_type => undef, +## _rtokens => undef, +## _rlevels => undef, +## _rblock_type => undef, +## _rtype_sequence => undef, +## _rci_levels => undef, +## _starting_in_quote => 0, +## _ending_in_quote => 0, }; # must print line unchanged if we are in a here document - if ( $tokenizer_self->[_in_here_doc_] ) { + if ( $self->[_in_here_doc_] ) { $line_of_tokens->{_line_type} = 'HERE'; - my $here_doc_target = $tokenizer_self->[_here_doc_target_]; - my $here_quote_character = $tokenizer_self->[_here_quote_character_]; + my $here_doc_target = $self->[_here_doc_target_]; + my $here_quote_character = $self->[_here_quote_character_]; my $candidate_target = $input_line; chomp $candidate_target; @@ -889,27 +936,26 @@ sub get_line { $candidate_target =~ s/^\s*//; } if ( $candidate_target eq $here_doc_target ) { - $tokenizer_self->[_nearly_matched_here_target_at_] = undef; + $self->[_nearly_matched_here_target_at_] = undef; $line_of_tokens->{_line_type} = 'HERE_END'; - $write_logfile_entry->("Exiting HERE document $here_doc_target\n"); + $self->log_numbered_msg("Exiting HERE document $here_doc_target\n"); - my $rhere_target_list = $tokenizer_self->[_rhere_target_list_]; + my $rhere_target_list = $self->[_rhere_target_list_]; if ( @{$rhere_target_list} ) { # there can be multiple here targets ( $here_doc_target, $here_quote_character ) = @{ shift @{$rhere_target_list} }; - $tokenizer_self->[_here_doc_target_] = $here_doc_target; - $tokenizer_self->[_here_quote_character_] = - $here_quote_character; - $write_logfile_entry->( + $self->[_here_doc_target_] = $here_doc_target; + $self->[_here_quote_character_] = $here_quote_character; + $self->log_numbered_msg( "Entering HERE document $here_doc_target\n"); - $tokenizer_self->[_nearly_matched_here_target_at_] = undef; - $tokenizer_self->[_started_looking_for_here_target_at_] = + $self->[_nearly_matched_here_target_at_] = undef; + $self->[_started_looking_for_here_target_at_] = $input_line_number; } else { - $tokenizer_self->[_in_here_doc_] = 0; - $tokenizer_self->[_here_doc_target_] = EMPTY_STRING; - $tokenizer_self->[_here_quote_character_] = EMPTY_STRING; + $self->[_in_here_doc_] = 0; + $self->[_here_doc_target_] = EMPTY_STRING; + $self->[_here_quote_character_] = EMPTY_STRING; } } @@ -919,24 +965,23 @@ sub get_line { $candidate_target =~ s/\s*$//; $candidate_target =~ s/^\s*//; if ( $candidate_target eq $here_doc_target ) { - $tokenizer_self->[_nearly_matched_here_target_at_] = - $input_line_number; + $self->[_nearly_matched_here_target_at_] = $input_line_number; } } return $line_of_tokens; } # Print line unchanged if we are in a format section - elsif ( $tokenizer_self->[_in_format_] ) { + elsif ( $self->[_in_format_] ) { if ( $input_line =~ /^\.[\s#]*$/ ) { # Decrement format depth count at a '.' after a 'format' - $tokenizer_self->[_in_format_]--; + $self->[_in_format_]--; # This is the end when count reaches 0 - if ( !$tokenizer_self->[_in_format_] ) { - $write_logfile_entry->("Exiting format section\n"); + if ( !$self->[_in_format_] ) { + $self->log_numbered_msg("Exiting format section\n"); $line_of_tokens->{_line_type} = 'FORMAT_END'; } } @@ -946,22 +991,22 @@ sub get_line { # Increment format depth count at a 'format' within a 'format' # This is a simple way to handle nested formats (issue c019). - $tokenizer_self->[_in_format_]++; + $self->[_in_format_]++; } } return $line_of_tokens; } # must print line unchanged if we are in pod documentation - elsif ( $tokenizer_self->[_in_pod_] ) { + elsif ( $self->[_in_pod_] ) { $line_of_tokens->{_line_type} = 'POD'; if ( $input_line =~ /^=cut/ ) { $line_of_tokens->{_line_type} = 'POD_END'; - $write_logfile_entry->("Exiting POD section\n"); - $tokenizer_self->[_in_pod_] = 0; + $self->log_numbered_msg("Exiting POD section\n"); + $self->[_in_pod_] = 0; } - if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) { + if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) { warning( "Hash-bang in pod can cause older versions of perl to fail! \n" ); @@ -971,13 +1016,13 @@ sub get_line { } # print line unchanged if in skipped section - elsif ( $tokenizer_self->[_in_skipped_] ) { + elsif ( $self->[_in_skipped_] ) { $line_of_tokens->{_line_type} = 'SKIP'; if ( $input_line =~ /$code_skipping_pattern_end/ ) { $line_of_tokens->{_line_type} = 'SKIP_END'; - $write_logfile_entry->("Exiting code-skipping section\n"); - $tokenizer_self->[_in_skipped_] = 0; + $self->log_numbered_msg("Exiting code-skipping section\n"); + $self->[_in_skipped_] = 0; } return $line_of_tokens; } @@ -986,13 +1031,13 @@ sub get_line { # are seeing illegal tokens and cannot continue. Syntax errors do # not pass this route). Calling routine can decide what to do, but # the default can be to just pass all lines as if they were after __END__ - elsif ( $tokenizer_self->[_in_error_] ) { + elsif ( $self->[_in_error_] ) { $line_of_tokens->{_line_type} = 'ERROR'; return $line_of_tokens; } # print line unchanged if we are __DATA__ section - elsif ( $tokenizer_self->[_in_data_] ) { + elsif ( $self->[_in_data_] ) { # ...but look for POD # Note that the _in_data and _in_end flags remain set @@ -1000,8 +1045,8 @@ sub get_line { # end of a pod section if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); - $tokenizer_self->[_in_pod_] = 1; + $self->log_numbered_msg("Entering POD section\n"); + $self->[_in_pod_] = 1; return $line_of_tokens; } else { @@ -1011,7 +1056,7 @@ sub get_line { } # print line unchanged if we are in __END__ section - elsif ( $tokenizer_self->[_in_end_] ) { + elsif ( $self->[_in_end_] ) { # ...but look for POD # Note that the _in_data and _in_end flags remain set @@ -1019,8 +1064,8 @@ sub get_line { # end of a pod section if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); - $tokenizer_self->[_in_pod_] = 1; + $self->log_numbered_msg("Entering POD section\n"); + $self->[_in_pod_] = 1; return $line_of_tokens; } else { @@ -1030,17 +1075,17 @@ sub get_line { } # check for a hash-bang line if we haven't seen one - if ( !$tokenizer_self->[_saw_hash_bang_] ) { + if ( !$self->[_saw_hash_bang_] ) { if ( $input_line =~ /^\#\!.*perl\b/ ) { - $tokenizer_self->[_saw_hash_bang_] = $input_line_number; + $self->[_saw_hash_bang_] = $input_line_number; # check for -w and -P flags if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) { - $tokenizer_self->[_saw_perl_dash_P_] = 1; + $self->[_saw_perl_dash_P_] = 1; } if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) { - $tokenizer_self->[_saw_perl_dash_w_] = 1; + $self->[_saw_perl_dash_w_] = 1; } if ( @@ -1052,7 +1097,7 @@ sub get_line { $last_nonblank_block_type && $last_nonblank_block_type eq 'BEGIN' ) - && !$tokenizer_self->[_look_for_hash_bang_] + && !$self->[_look_for_hash_bang_] # Try to avoid giving a false alarm at a simple comment. # These look like valid hash-bang lines: @@ -1073,7 +1118,7 @@ sub get_line { # this is helpful for VMS systems; we may have accidentally # tokenized some DCL commands - if ( $tokenizer_self->[_started_tokenizing_] ) { + if ( $self->[_started_tokenizing_] ) { warning( "There seems to be a hash-bang after line 1; do you need to run with -x ?\n" ); @@ -1093,8 +1138,8 @@ sub get_line { } # wait for a hash-bang before parsing if the user invoked us with -x - if ( $tokenizer_self->[_look_for_hash_bang_] - && !$tokenizer_self->[_saw_hash_bang_] ) + if ( $self->[_look_for_hash_bang_] + && !$self->[_saw_hash_bang_] ) { $line_of_tokens->{_line_type} = 'SYSTEM'; return $line_of_tokens; @@ -1117,33 +1162,31 @@ sub get_line { # _in_skipped_ # _in_pod_ # _in_quote_ - my $ending_in_quote_last = $tokenizer_self->[_in_quote_]; - tokenize_this_line($line_of_tokens); + $self->tokenize_this_line($line_of_tokens); # Now finish defining the return structure and return it - $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_]; + $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_]; # handle severe error (binary data in script) - if ( $tokenizer_self->[_in_error_] ) { - $tokenizer_self->[_in_quote_] = 0; # to avoid any more messages + if ( $self->[_in_error_] ) { + $self->[_in_quote_] = 0; # to avoid any more messages warning("Giving up after error\n"); $line_of_tokens->{_line_type} = 'ERROR'; - reset_indentation_level(0); # avoid error messages + reset_indentation_level(0); # avoid error messages return $line_of_tokens; } # handle start of pod documentation - if ( $tokenizer_self->[_in_pod_] ) { + if ( $self->[_in_pod_] ) { # This gets tricky..above a __DATA__ or __END__ section, perl # accepts '=cut' as the start of pod section. But afterwards, # only pod utilities see it and they may ignore an =cut without # leading =head. In any case, this isn't good. if ( $input_line =~ /^=cut\b/ ) { - if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) - { + if ( $self->[_saw_data_] || $self->[_saw_end_] ) { complain("=cut while not in pod ignored\n"); - $tokenizer_self->[_in_pod_] = 0; + $self->[_in_pod_] = 0; $line_of_tokens->{_line_type} = 'POD_END'; } else { @@ -1151,67 +1194,66 @@ sub get_line { warning( "=cut starts a pod section .. this can fool pod utilities.\n" ) unless (DEVEL_MODE); - $write_logfile_entry->("Entering POD section\n"); + $self->log_numbered_msg("Entering POD section\n"); } } else { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); + $self->log_numbered_msg("Entering POD section\n"); } return $line_of_tokens; } # handle start of skipped section - if ( $tokenizer_self->[_in_skipped_] ) { + if ( $self->[_in_skipped_] ) { $line_of_tokens->{_line_type} = 'SKIP'; - $write_logfile_entry->("Entering code-skipping section\n"); + $self->log_numbered_msg("Entering code-skipping section\n"); return $line_of_tokens; } # see if this line contains here doc targets - my $rhere_target_list = $tokenizer_self->[_rhere_target_list_]; + my $rhere_target_list = $self->[_rhere_target_list_]; if ( @{$rhere_target_list} ) { my ( $here_doc_target, $here_quote_character ) = @{ shift @{$rhere_target_list} }; - $tokenizer_self->[_in_here_doc_] = 1; - $tokenizer_self->[_here_doc_target_] = $here_doc_target; - $tokenizer_self->[_here_quote_character_] = $here_quote_character; - $write_logfile_entry->("Entering HERE document $here_doc_target\n"); - $tokenizer_self->[_started_looking_for_here_target_at_] = - $input_line_number; + $self->[_in_here_doc_] = 1; + $self->[_here_doc_target_] = $here_doc_target; + $self->[_here_quote_character_] = $here_quote_character; + $self->log_numbered_msg("Entering HERE document $here_doc_target\n"); + $self->[_started_looking_for_here_target_at_] = $input_line_number; } # NOTE: __END__ and __DATA__ statements are written unformatted # because they can theoretically contain additional characters # which are not tokenized (and cannot be read with either!). - if ( $tokenizer_self->[_in_data_] ) { + if ( $self->[_in_data_] ) { $line_of_tokens->{_line_type} = 'DATA_START'; - $write_logfile_entry->("Starting __DATA__ section\n"); - $tokenizer_self->[_saw_data_] = 1; + $self->log_numbered_msg("Starting __DATA__ section\n"); + $self->[_saw_data_] = 1; # keep parsing after __DATA__ if use SelfLoader was seen - if ( $tokenizer_self->[_saw_selfloader_] ) { - $tokenizer_self->[_in_data_] = 0; - $write_logfile_entry->( + if ( $self->[_saw_selfloader_] ) { + $self->[_in_data_] = 0; + $self->log_numbered_msg( "SelfLoader seen, continuing; -nlsl deactivates\n"); } return $line_of_tokens; } - elsif ( $tokenizer_self->[_in_end_] ) { + elsif ( $self->[_in_end_] ) { $line_of_tokens->{_line_type} = 'END_START'; - $write_logfile_entry->("Starting __END__ section\n"); - $tokenizer_self->[_saw_end_] = 1; + $self->log_numbered_msg("Starting __END__ section\n"); + $self->[_saw_end_] = 1; # keep parsing after __END__ if use AutoLoader was seen - if ( $tokenizer_self->[_saw_autoloader_] ) { - $tokenizer_self->[_in_end_] = 0; - $write_logfile_entry->( + if ( $self->[_saw_autoloader_] ) { + $self->[_in_end_] = 0; + $self->log_numbered_msg( "AutoLoader seen, continuing; -nlal deactivates\n"); } return $line_of_tokens; @@ -1221,42 +1263,39 @@ sub get_line { $line_of_tokens->{_line_type} = 'CODE'; # remember if we have seen any real code - if ( !$tokenizer_self->[_started_tokenizing_] + if ( !$self->[_started_tokenizing_] && $input_line !~ /^\s*$/ && $input_line !~ /^\s*#/ ) { - $tokenizer_self->[_started_tokenizing_] = 1; + $self->[_started_tokenizing_] = 1; } - if ( $tokenizer_self->[_debugger_object_] ) { - $tokenizer_self->[_debugger_object_] - ->write_debug_entry($line_of_tokens); + if ( $self->[_debugger_object_] ) { + $self->[_debugger_object_]->write_debug_entry($line_of_tokens); } # Note: if keyword 'format' occurs in this line code, it is still CODE # (keyword 'format' need not start a line) - if ( $tokenizer_self->[_in_format_] ) { - $write_logfile_entry->("Entering format section\n"); + if ( $self->[_in_format_] ) { + $self->log_numbered_msg("Entering format section\n"); } - if ( $tokenizer_self->[_in_quote_] - and ( $tokenizer_self->[_line_start_quote_] < 0 ) ) + if ( $self->[_in_quote_] + and ( $self->[_line_start_quote_] < 0 ) ) { #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) { - if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~ - /^\s*$/ ) - { - $tokenizer_self->[_line_start_quote_] = $input_line_number; - $write_logfile_entry->( + if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) { + $self->[_line_start_quote_] = $input_line_number; + $self->log_numbered_msg( "Start multi-line quote or pattern ending in $quote_target\n"); } } - elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 ) - && !$tokenizer_self->[_in_quote_] ) + elsif ( ( $self->[_line_start_quote_] >= 0 ) + && !$self->[_in_quote_] ) { - $tokenizer_self->[_line_start_quote_] = -1; - $write_logfile_entry->("End of multi-line quote or pattern\n"); + $self->[_line_start_quote_] = -1; + $self->log_numbered_msg("End of multi-line quote or pattern\n"); } # we are returning a line of CODE @@ -1271,17 +1310,17 @@ sub find_starting_indentation_level { # example) it may not be zero. The user may specify this with the # -sil=n parameter but normally doesn't so we have to guess. # - # USES GLOBAL VARIABLES: $tokenizer_self + my ($self) = @_; my $starting_level = 0; # use value if given as parameter - if ( $tokenizer_self->[_know_starting_level_] ) { - $starting_level = $tokenizer_self->[_starting_level_]; + if ( $self->[_know_starting_level_] ) { + $starting_level = $self->[_starting_level_]; } # if we know there is a hash_bang line, the level must be zero - elsif ( $tokenizer_self->[_look_for_hash_bang_] ) { - $tokenizer_self->[_know_starting_level_] = 1; + elsif ( $self->[_look_for_hash_bang_] ) { + $self->[_know_starting_level_] = 1; } # otherwise figure it out from the input file @@ -1291,9 +1330,7 @@ sub find_starting_indentation_level { # keep looking at lines until we find a hash bang or piece of code my $msg = EMPTY_STRING; - while ( $line = - $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) ) - { + while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) { # if first line is #! then assume starting level is zero if ( $i == 1 && $line =~ /^\#\!/ ) { @@ -1308,7 +1345,7 @@ sub find_starting_indentation_level { $msg = "Line $i implies starting-indentation-level = $starting_level\n"; write_logfile_entry("$msg"); } - $tokenizer_self->[_starting_level_] = $starting_level; + $self->[_starting_level_] = $starting_level; reset_indentation_level($starting_level); return; } ## end sub find_starting_indentation_level @@ -1488,7 +1525,7 @@ sub prepare_for_a_new_file { # TV4: SCALARS for multi-line identifiers and # statements. These are initialized with a subroutine call # and continually updated as lines are processed. - my ( $id_scan_state, $identifier, $want_paren, $indented_if_level ); + my ( $id_scan_state, $identifier, $want_paren ); # TV5: SCALARS for tracking indentation level. # Initialized once and continually updated as lines are @@ -1531,10 +1568,9 @@ sub prepare_for_a_new_file { $allowed_quote_modifiers = EMPTY_STRING; # TV4: - $id_scan_state = EMPTY_STRING; - $identifier = EMPTY_STRING; - $want_paren = EMPTY_STRING; - $indented_if_level = 0; + $id_scan_state = EMPTY_STRING; + $identifier = EMPTY_STRING; + $want_paren = EMPTY_STRING; # TV5: $nesting_token_string = EMPTY_STRING; @@ -1587,8 +1623,7 @@ sub prepare_for_a_new_file { $quoted_string_2, $allowed_quote_modifiers, ]; - my $rTV4 = - [ $id_scan_state, $identifier, $want_paren, $indented_if_level ]; + my $rTV4 = [ $id_scan_state, $identifier, $want_paren ]; my $rTV5 = [ $nesting_token_string, $nesting_type_string, @@ -1636,8 +1671,7 @@ sub prepare_for_a_new_file { $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, ) = @{$rTV3}; - ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) = - @{$rTV4}; + ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4}; ( $nesting_token_string, $nesting_type_string, @@ -1746,9 +1780,6 @@ EOM } ## end sub split_pretoken sub get_indentation_level { - - # patch to avoid reporting error if indented if is not terminated - if ($indented_if_level) { return $level_in_tokenizer - 1 } return $level_in_tokenizer; } @@ -1863,25 +1894,24 @@ EOM ); my %is_for_foreach; - @_ = qw(for foreach); - @is_for_foreach{@_} = (1) x scalar(@_); - - my %is_my_our_state; - @_ = qw(my our state); - @is_my_our_state{@_} = (1) x scalar(@_); + @q = qw(for foreach); + @is_for_foreach{@q} = (1) x scalar(@q); # These keywords may introduce blocks after parenthesized expressions, # in the form: # keyword ( .... ) { BLOCK } # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when' + # NOTE for --use-feature=class: if ADJUST blocks eventually take a + # parameter list, then ADJUST might need to be added to this list (see + # perlclass.pod) my %is_blocktype_with_paren; - @_ = + @q = qw(if elsif unless while until for foreach switch case given when catch); - @is_blocktype_with_paren{@_} = (1) x scalar(@_); + @is_blocktype_with_paren{@q} = (1) x scalar(@q); my %is_case_default; - @_ = qw(case default); - @is_case_default{@_} = (1) x scalar(@_); + @q = qw(case default); + @is_case_default{@q} = (1) x scalar(@q); #------------------------ # end of tokenizer hashes @@ -1936,12 +1966,10 @@ EOM _decrement_count(); # avoid error check for multiple tokenizers # make a new tokenizer - my $rOpts = {}; - my $rpending_logfile_message; + my $rOpts = {}; my $source_object = Perl::Tidy::LineSource->new( - input_file => \$replacement_text, - rOpts => $rOpts, - rpending_logfile_message => $rpending_logfile_message, + input_file => \$replacement_text, + rOpts => $rOpts, ); my $tokenizer = Perl::Tidy::Tokenizer->new( source_object => $source_object, @@ -1985,7 +2013,7 @@ EOM scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype, $rtoken_map, $max_token_index ); return; - } + } ## end sub scan_bare_identifier sub scan_identifier { ( @@ -2034,7 +2062,7 @@ EOM '%' => LIST_CONTEXT, '&' => UNKNOWN_CONTEXT, ); - } + } ## end BEGIN sub scan_simple_identifier { @@ -2047,24 +2075,41 @@ EOM # This gives the same results as the full scanner in about 1/4 the # total runtime for a typical input stream. + # Notation: + # $var * 2 + # ^^ ^ + # || | + # || ---- $i_next [= next nonblank pretoken ] + # |----$i_plus_1 [= a bareword ] + # ---$i_begin [= a sigil] + my $i_begin = $i; my $tok_begin = $tok; + my $i_plus_1 = $i + 1; my $fast_scan_type; - ############################### + #------------------------------------------------------- + # Do full scan for anything following a pointer, such as + # $cref->&*; # a postderef + #------------------------------------------------------- + if ( $last_nonblank_token eq '->' ) { + + } + + #------------------------------ # quick scan with leading sigil - ############################### - if ( !$id_scan_state - && $i + 1 <= $max_token_index + #------------------------------ + elsif ( !$id_scan_state + && $i_plus_1 <= $max_token_index && $fast_scan_context{$tok} ) { $context = $fast_scan_context{$tok}; # look for $var, @var, ... - if ( $rtoken_type->[ $i + 1 ] eq 'w' ) { + if ( $rtoken_type->[$i_plus_1] eq 'w' ) { my $pretype_next = EMPTY_STRING; - my $i_next = $i + 2; - if ( $i_next <= $max_token_index ) { + if ( $i_plus_1 < $max_token_index ) { + my $i_next = $i_plus_1 + 1; if ( $rtoken_type->[$i_next] eq 'b' && $i_next < $max_token_index ) { @@ -2075,10 +2120,10 @@ EOM if ( $pretype_next ne ':' && $pretype_next ne "'" ) { # Found type 'i' like '$var', '@var', or '%var' - $identifier = $tok . $rtokens->[ $i + 1 ]; + $identifier = $tok . $rtokens->[$i_plus_1]; $tok = $identifier; $type = 'i'; - $i = $i + 1; + $i = $i_plus_1; $fast_scan_type = $type; } } @@ -2087,7 +2132,7 @@ EOM # But we must let the full scanner handle things ${ because it may # keep going to get a complete identifier like '${#}' . elsif ( - $rtoken_type->[ $i + 1 ] eq '{' + $rtoken_type->[$i_plus_1] eq '{' && ( $tok_begin eq '@' || $tok_begin eq '%' ) ) @@ -2099,15 +2144,15 @@ EOM } } - ############################ + #--------------------------- # Quick scan with leading -> # Look for ->[ and ->{ - ############################ + #--------------------------- elsif ( $tok eq '->' && $i < $max_token_index - && ( $rtokens->[ $i + 1 ] eq '{' - || $rtokens->[ $i + 1 ] eq '[' ) + && ( $rtokens->[$i_plus_1] eq '{' + || $rtokens->[$i_plus_1] eq '[' ) ) { $type = $tok; @@ -2116,9 +2161,9 @@ EOM $context = UNKNOWN_CONTEXT; } - ####################################### + #-------------------------------------- # Verify correctness during development - ####################################### + #-------------------------------------- if ( VERIFY_FASTSCAN && $fast_scan_type ) { # We will call the full method @@ -2146,21 +2191,167 @@ EOM } } - ################################################### + #------------------------------------------------- # call full scanner if fast method did not succeed - ################################################### + #------------------------------------------------- if ( !$fast_scan_type ) { scan_identifier(); } return; } ## end sub scan_simple_identifier + sub method_ok_here { + + # Return: + # false if this is definitely an invalid method declaration + # true otherwise (even if not sure) + + # We are trying to avoid problems with old uses of 'method' + # when --use-feature=class is set (rt145706). + # For example, this should cause a return of 'false': + + # method paint => sub { + # return; + # }; + + # from do_scan_sub: + my $i_beg = $i + 1; + my $pos_beg = $rtoken_map->[$i_beg]; + pos($input_line) = $pos_beg; + + # TEST 1: look a valid sub NAME + if ( + $input_line =~ m/\G\s* + ((?:\w*(?:'|::))*) # package - something that ends in :: or ' + (\w+) # NAME - required + /gcx + ) + { + # For possible future use.. + my $subname = $2; + my $package = $1 ? $1 : EMPTY_STRING; + } + else { + return; + } + + # TEST 2: look for invalid characters after name, such as here: + # method paint => sub { + # ... + # } + my $next_char = EMPTY_STRING; + if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 } + if ( !$next_char || $next_char eq '#' ) { + ( $next_char, my $i_next ) = + find_next_nonblank_token( $max_token_index, + $rtokens, $max_token_index ); + } + + if ( !$next_char ) { + + # out of characters - give up + return; + } + + # Possibly valid next token types: + # '(' could start prototype or signature + # ':' could start ATTRIBUTE + # '{' cold start BLOCK + # ';' or '}' could end a statement + if ( $next_char !~ /^[\(\:\{\;\}]/ ) { + + # This does not match use feature 'class' syntax + return; + } + + # We will stop here and assume that this is valid syntax for + # use feature 'class'. + return 1; + } ## end sub method_ok_here + + sub class_ok_here { + + # Return: + # false if this is definitely an invalid class declaration + # true otherwise (even if not sure) + + # We are trying to avoid problems with old uses of 'class' + # when --use-feature=class is set (rt145706). We look ahead + # see if this use of 'class' is obviously inconsistent with + # the syntax of use feature 'class'. This allows the default + # setting --use-feature=class to work for old syntax too. + + # Valid class declarations look like + # class NAME ?ATTRS ?VERSION ?BLOCK + # where ATTRS VERSION and BLOCK are optional + + # For example, this should produce a return of 'false': + # + # class ExtendsBasicAttributes is BasicAttributes{ + + # TEST 1: class stmt can only go where a new statment can start + if ( !new_statement_ok() ) { return } + + my $i_beg = $i + 1; + my $pos_beg = $rtoken_map->[$i_beg]; + pos($input_line) = $pos_beg; + + # TEST 2: look for a valid NAME + if ( + $input_line =~ m/\G\s* + ((?:\w*(?:'|::))*) # package - something that ends in :: or ' + (\w+) # NAME - required + /gcx + ) + { + # For possible future use.. + my $subname = $2; + my $package = $1 ? $1 : EMPTY_STRING; + } + else { + return; + } + + # TEST 3: look for valid characters after NAME + my $next_char = EMPTY_STRING; + if ( $input_line =~ m/\s*(\S)/gcx ) { $next_char = $1 } + if ( !$next_char || $next_char eq '#' ) { + ( $next_char, my $i_next ) = + find_next_nonblank_token( $max_token_index, + $rtokens, $max_token_index ); + } + if ( !$next_char ) { + + # out of characters - give up + return; + } + + # Must see one of: ATTRIBUTE, VERSION, BLOCK, or end stmt + + # Possibly valid next token types: + # ':' could start ATTRIBUTE + # '\d' could start VERSION + # '{' cold start BLOCK + # ';' could end a statement + # '}' could end statement but would be strange + + if ( $next_char !~ /^[\:\d\{\;\}]/ ) { + + # This does not match use feature 'class' syntax + return; + } + + # We will stop here and assume that this is valid syntax for + # use feature 'class'. + return 1; + } ## end sub class_ok_here + sub scan_id { ( $i, $tok, $type, $id_scan_state ) = scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state, $max_token_index ); return; - } + } ## end sub scan_id sub scan_number { my $number; @@ -2168,7 +2359,7 @@ EOM scan_number_do( $input_line, $i, $rtoken_map, $type, $max_token_index ); return $number; - } + } ## end sub scan_number use constant VERIFY_FASTNUM => 0; @@ -2182,9 +2373,9 @@ EOM my $tok_begin = $tok; my $number; - ################################## + #--------------------------------- # Quick check for (signed) integer - ################################## + #--------------------------------- # This will be the string of digits: my $i_d = $i; @@ -2225,9 +2416,9 @@ EOM } } - ####################################### + #-------------------------------------- # Verify correctness during development - ####################################### + #-------------------------------------- if ( VERIFY_FASTNUM && defined($number) ) { # We will call the full method @@ -2251,9 +2442,9 @@ EOM } } - ######################################### + #---------------------------------------- # call full scanner if may not be integer - ######################################### + #---------------------------------------- if ( !defined($number) ) { $number = scan_number(); } @@ -2302,7 +2493,7 @@ EOM error_if_expecting_TERM() if ( $expecting == TERM ); return; - } + } ## end sub do_GREATER_THAN_SIGN sub do_VERTICAL_LINE { @@ -2310,7 +2501,7 @@ EOM error_if_expecting_TERM() if ( $expecting == TERM ); return; - } + } ## end sub do_VERTICAL_LINE sub do_DOLLAR_SIGN { @@ -2328,7 +2519,8 @@ EOM # (vorboard.pl, sort.t). Something like: # /^(print|printf|sort|exec|system)$/ if ( - $is_indirect_object_taker{$last_nonblank_token} + $is_indirect_object_taker{$last_nonblank_token} + && $last_nonblank_type eq 'k' || ( ( $last_nonblank_token eq '(' ) && $is_indirect_object_taker{ $paren_type[$paren_depth] } ) || ( $last_nonblank_type eq 'w' @@ -2372,7 +2564,6 @@ EOM # are not marked as a block, we might have a method call. # Added ')' to fix case c017, something like ()()() && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/ - ) { @@ -2421,7 +2612,15 @@ EOM } ## end else [ if ( $last_last_nonblank_token... } ## end if ( $expecting == OPERATOR... } - $paren_type[$paren_depth] = $container_type; + + # Do not update container type at ') ('; fix for git #105. This will + # propagate the container type onward so that any subsequent brace gets + # correctly marked. I have implemented this as a general rule, which + # should be safe, but if necessary it could be restricted to certain + # container statement types such as 'for'. + $paren_type[$paren_depth] = $container_type + if ( $last_nonblank_token ne ')' ); + ( $type_sequence, $indent_flag ) = increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] ); @@ -2644,7 +2843,8 @@ EOM # ATTRS: for a '{' following an attribute list, reset # things to look like we just saw the sub name - if ( $statement_type =~ /^sub\b/ ) { + # Added 'package' (can be 'class') for --use-feature=class (rt145706) + if ( $statement_type =~ /^(sub|package)\b/ ) { $last_nonblank_token = $statement_type; $last_nonblank_type = 'i'; $statement_type = EMPTY_STRING; @@ -2755,8 +2955,15 @@ EOM } } - $brace_type[ ++$brace_depth ] = $block_type; - $brace_package[$brace_depth] = $current_package; + $brace_type[ ++$brace_depth ] = $block_type; + + # Patch for CLASS BLOCK definitions: do not update the package for the + # current depth if this is a BLOCK type definition. + # TODO: should make 'class' separate from 'package' and only do + # this for 'class' + $brace_package[$brace_depth] = $current_package + if ( substr( $block_type, 0, 8 ) ne 'package ' ); + $brace_structural_type[$brace_depth] = $type; $brace_context[$brace_depth] = $context; ( $type_sequence, $indent_flag ) = @@ -2957,7 +3164,8 @@ EOM # ATTRS: check for a ':' which introduces an attribute list # either after a 'sub' keyword or within a paren list - elsif ( $statement_type =~ /^sub\b/ ) { + # Added 'package' (can be 'class') for --use-feature=class (rt145706) + elsif ( $statement_type =~ /^(sub|package)\b/ ) { $type = 'A'; $in_attribute_list = 1; } @@ -3032,7 +3240,7 @@ EOM if ( $expecting == OPERATOR ); scan_simple_identifier(); return; - } + } ## end sub do_AT_SIGN sub do_PERCENT_SIGN { @@ -3178,7 +3386,7 @@ EOM # '::' = probably a sub call scan_bare_identifier(); return; - } + } ## end sub do_DOUBLE_COLON sub do_LEFT_SHIFT { @@ -3300,20 +3508,8 @@ EOM sub do_POINTER { # '->' - # if -> points to a bare word, we must scan for an identifier, - # otherwise something like ->y would look like the y operator - - # NOTE: this will currently allow things like - # '->@array' '->*VAR' '->%hash' - # to get parsed as identifiers, even though these are not currently - # allowed syntax. To catch syntax errors like this we could first - # check that the next character and skip this call if it is one of - # ' @ % * '. A disadvantage with doing this is that this would - # have to be fixed if the perltidy syntax is ever extended to make - # any of these valid. So for now this check is not done. - scan_simple_identifier(); return; - } ## end sub do_POINTER + } sub do_PLUS_PLUS { @@ -3378,7 +3574,7 @@ EOM error_if_expecting_TERM() if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015 return; - } + } ## end sub do_LOGICAL_AND sub do_LOGICAL_OR { @@ -3386,7 +3582,7 @@ EOM error_if_expecting_TERM() if ( $expecting == TERM && $last_nonblank_token ne ',' ); #c015 return; - } + } ## end sub do_LOGICAL_OR sub do_SLASH_SLASH { @@ -3394,7 +3590,7 @@ EOM error_if_expecting_TERM() if ( $expecting == TERM ); return; - } + } ## end sub do_SLASH_SLASH sub do_DIGITS { @@ -3451,7 +3647,7 @@ EOM rtokens => $rtokens, rtoken_map => $rtoken_map, id_scan_state => $id_scan_state, - max_token_index => $max_token_index + max_token_index => $max_token_index, } ); @@ -3660,24 +3856,6 @@ EOM ); } } - elsif ( $tok eq 'continue' ) { - if ( $last_nonblank_token ne ';' - && $last_nonblank_block_type !~ - /(^(\{|\}|;|while|until|for|foreach)|:$)/ ) - { - - # note: ';' '{' and '}' in list above - # because continues can follow bare blocks; - # ':' is labeled block - # - ############################################ - # NOTE: This check has been deactivated because - # continue has an alternative usage for given/when - # blocks in perl 5.10 - ## warning("'$tok' should follow a block\n"); - ############################################ - } - } # patch for SWITCH/CASE if 'case' and 'when are # treated as keywords. Also 'default' for Switch::Plain @@ -3688,31 +3866,17 @@ EOM $statement_type = $tok; # next '{' is block } - # - # indent trailing if/unless/while/until - # outdenting will be handled by later indentation loop -## DEACTIVATED: unfortunately this can cause some unwanted indentation like: -##$opt_o = 1 -## if !( -## $opt_b -## || $opt_c -## || $opt_d -## || $opt_f -## || $opt_i -## || $opt_l -## || $opt_o -## || $opt_x -## ); -## if ( $tok =~ /^(if|unless|while|until)$/ -## && $next_nonblank_token ne '(' ) -## { -## $indent_flag = 1; -## } + # feature 'err' was removed in Perl 5.10. So mark this as + # a bareword unless an operator is expected (see c158). + elsif ( $tok eq 'err' ) { + if ( $expecting != OPERATOR ) { $type = 'w' } + } + return; } ## end sub do_KEYWORD sub do_QUOTE_OPERATOR { -##NICOL PATCH + if ( $expecting == OPERATOR ) { # Be careful not to call an error for a qw quote @@ -3779,7 +3943,7 @@ EOM # '-' => \&sse_sub, # '*' => \&sse_mul, # '/' => \&sse_div; - # FIXME: this should eventually be generalized + # TODO: this could eventually be generalized if ( $saw_use_module{$current_package}->{'RPerl'} && $tok =~ /^sse_(mul|div|add|sub)$/ ) { @@ -3822,9 +3986,17 @@ EOM $next_tok = $rtokens->[ $i + 1 ]; if ( $next_tok eq '(' ) { + # Patch for issue c151, where we are processing a snippet and + # have not seen that SPACE is a constant. In this case 'x' is + # probably an operator. The only disadvantage with an incorrect + # guess is that the space after it may be incorrect. For example + # $str .= SPACE x ( 16 - length($str) ); See also b1410. + if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' } + # Fix part 2 for git #63. Leave type as 'w' to keep # the type the same as if the -> were not separated - $type = 'U' unless ( $last_nonblank_type eq '->' ); + elsif ( $last_nonblank_type ne '->' ) { $type = 'U' } + } # underscore after file test operator is file handle @@ -3843,16 +4015,22 @@ EOM $statement_type = $tok; # next '{' is block $type = 'k'; # for keyword syntax coloring } + if ( $next_nonblank_token eq '(' ) { - # patch for SWITCH/CASE if switch and given not keywords - # Switch is not a perl 5 keyword, but we will gamble - # and mark switch followed by paren as a keyword. This - # is only necessary to get html syntax coloring nice, - # and does not commit this as being a switch/case. - if ( $next_nonblank_token eq '(' - && ( $tok eq 'switch' || $tok eq 'given' ) ) - { - $type = 'k'; # for keyword syntax coloring + # patch for SWITCH/CASE if switch and given not keywords + # Switch is not a perl 5 keyword, but we will gamble + # and mark switch followed by paren as a keyword. This + # is only necessary to get html syntax coloring nice, + # and does not commit this as being a switch/case. + if ( $tok eq 'switch' || $tok eq 'given' ) { + $type = 'k'; # for keyword syntax coloring + } + + # mark 'x' as operator for something like this (see b1410) + # my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths ); + elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { + $type = 'x'; + } } } return; @@ -3890,13 +4068,6 @@ EOM # true if this token ends the current line # false otherwise - # Patch for c043, part 3: A bareword after '->' expects a TERM - # FIXME: It would be cleaner to give method calls a new type 'M' - # and update sub operator_expected to handle this. - if ( $last_nonblank_type eq '->' ) { - $expecting = TERM; - } - my ( $next_nonblank_token, $i_next ) = find_next_nonblank_token( $i, $rtokens, $max_token_index ); @@ -3932,10 +4103,19 @@ EOM # They may also need to check and set various flags + # Scan a bare word following a -> as an identifier; it could + # have a long package name. Fixes c037, c041. + if ( $last_nonblank_token eq '->' ) { + scan_bare_identifier(); + + # a bareward after '->' gets type 'i' + $type = 'i'; + } + # Quote a word followed by => operator # unless the word __END__ or __DATA__ and the only word on # the line. - if ( !$is_END_or_DATA + elsif ( !$is_END_or_DATA && $next_nonblank_token eq '=' && $rtokens->[ $i_next + 1 ] eq '>' ) { @@ -3960,17 +4140,6 @@ EOM $type = 'w'; } - # Scan a bare word following a -> as an identifier; it could - # have a long package name. Fixes c037, c041. - elsif ( $last_nonblank_token eq '->' ) { - scan_bare_identifier(); - - # Patch for c043, part 4; use type 'w' after a '->'. - # This is just a safety check on sub scan_bare_identifier, - # which should get this case correct. - $type = 'w'; - } - # handle operator x (now we know it isn't $x=) elsif ( $expecting == OPERATOR @@ -4053,7 +4222,6 @@ EOM && ( $i_next <= $max_token_index ) # colon on same line # like 'sub : lvalue' ? - ##&& !$sub_attribute_ok_here # like 'sub : lvalue' ? && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next ) && label_ok() ) @@ -4067,19 +4235,54 @@ EOM $i = $i_next; } - # 'sub' or alias + # 'sub' or other sub alias elsif ( $is_sub{$tok_kw} ) { - error_if_expecting_OPERATOR() - if ( $expecting == OPERATOR ); - initialize_subname(); - scan_id(); + + # Update for --use-feature=class (rt145706): + # We have to be extra careful to avoid misparsing other uses of + # 'method' in older scripts. + if ( $tok_kw eq 'method' ) { + if ( $expecting == OPERATOR + || $next_nonblank_token !~ /^(\w|\:)/ + || !method_ok_here() ) + { + do_UNKNOWN_BAREWORD($next_nonblank_token); + } + else { + initialize_subname(); + scan_id(); + } + } + else { + error_if_expecting_OPERATOR() + if ( $expecting == OPERATOR ); + initialize_subname(); + scan_id(); + } } - # 'package' + # 'package' elsif ( $is_package{$tok_kw} ) { - error_if_expecting_OPERATOR() - if ( $expecting == OPERATOR ); - scan_id(); + + # Update for --use-feature=class (rt145706): + # We have to be extra careful because 'class' may be used for other + # purposes on older code; i.e. + # class($x) - valid sub call + # package($x) - error + if ( $tok_kw eq 'class' ) { + if ( $expecting == OPERATOR + || $next_nonblank_token !~ /^(\w|\:)/ + || !class_ok_here() ) + { + do_UNKNOWN_BAREWORD($next_nonblank_token); + } + else { scan_id() } + } + else { + error_if_expecting_OPERATOR() + if ( $expecting == OPERATOR ); + scan_id(); + } } # Fix for c035: split 'format' from 'is_format_END_DATA' to be @@ -4134,22 +4337,30 @@ EOM } - # Removed to fix b1280. This is not needed and was causing the - # starting type 'qw' to be lost, leading to mis-tokenization of - # a trailing block brace in a parenless for stmt 'for .. qw.. {' - ##$tok = $quote_character if ($quote_character); - # scan for the end of the quote or pattern ( - $i, $in_quote, $quote_character, $quote_pos, $quote_depth, - $quoted_string_1, $quoted_string_2 - ) - = do_quote( - $i, $in_quote, $quote_character, - $quote_pos, $quote_depth, $quoted_string_1, - $quoted_string_2, $rtokens, $rtoken_map, - $max_token_index - ); + $i, + $in_quote, + $quote_character, + $quote_pos, + $quote_depth, + $quoted_string_1, + $quoted_string_2, + + ) = do_quote( + + $i, + $in_quote, + $quote_character, + $quote_pos, + $quote_depth, + $quoted_string_1, + $quoted_string_2, + $rtokens, + $rtoken_map, + $max_token_index, + + ); # all done if we didn't find it if ($in_quote) { return } @@ -4430,30 +4641,23 @@ EOM # # ----------------------------------------------------------------------- - my $line_of_tokens = shift; + my ( $self, $line_of_tokens ) = @_; my ($untrimmed_input_line) = $line_of_tokens->{_line_text}; - # patch while coding change is underway - # make callers private data to allow access - # $tokenizer_self = $caller_tokenizer_self; - - # extract line number for use in error messages + # Extract line number for use in error messages $input_line_number = $line_of_tokens->{_line_number}; - # reinitialize for multi-line quote - $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q'; - - # check for pod documentation + # Check for pod documentation if ( substr( $untrimmed_input_line, 0, 1 ) eq '=' && $untrimmed_input_line =~ /^=[A-Za-z_]/ ) { - # must not be in multi-line quote + # Must not be in multi-line quote # and must not be in an equation if ( !$in_quote && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) ) { - $tokenizer_self->[_in_pod_] = 1; + $self->[_in_pod_] = 1; return; } } @@ -4467,27 +4671,33 @@ EOM # a fat comma. my $is_END_or_DATA; - # trim start of this line unless we are continuing a quoted line - # do not trim end because we might end in a quote (test: deken4.pl) - # Perl::Tidy::Formatter will delete needless trailing blanks - unless ( $in_quote && ( $quote_type eq 'Q' ) ) { - $input_line =~ s/^(\s+)//; # trim left end + # Reinitialize the multi-line quote flag + if ( $in_quote && $quote_type eq 'Q' ) { + $line_of_tokens->{_starting_in_quote} = 1; + } + else { + $line_of_tokens->{_starting_in_quote} = 0; + + # Trim start of this line unless we are continuing a quoted line. + # Do not trim end because we might end in a quote (test: deken4.pl) + # Perl::Tidy::Formatter will delete needless trailing blanks + $input_line =~ s/^(\s+)//; - # calculate a guessed level for nonblank lines to avoid calls to + # Calculate a guessed level for nonblank lines to avoid calls to # sub guess_old_indentation_level() - if ( $input_line && $1 ) { + if ( length($input_line) && $1 ) { my $leading_spaces = $1; my $spaces = length($leading_spaces); # handle leading tabs - if ( ord( substr( $leading_spaces, 0, 1 ) ) == 9 + if ( ord( substr( $leading_spaces, 0, 1 ) ) == ORD_TAB && $leading_spaces =~ /^(\t+)/ ) { - my $tabsize = $tokenizer_self->[_tabsize_]; + my $tabsize = $self->[_tabsize_]; $spaces += length($1) * ( $tabsize - 1 ); } - my $indent_columns = $tokenizer_self->[_indent_columns_]; + my $indent_columns = $self->[_indent_columns_]; $line_of_tokens->{_guessed_indentation_level} = int( $spaces / $indent_columns ); } @@ -4496,9 +4706,50 @@ EOM && $input_line =~ /^__(END|DATA)__\s*$/; } + # Optimize for a full-line comment. + if ( !$in_quote ) { + if ( substr( $input_line, 0, 1 ) eq '#' ) { + + # and check for skipped section + if ( $rOpts_code_skipping + && $input_line =~ /$code_skipping_pattern_begin/ ) + { + $self->[_in_skipped_] = 1; + return; + } + + # Optional fast processing of a block comment + my $ci_string_sum = + ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; + my $ci_string_i = $ci_string_sum + $in_statement_continuation; + $line_of_tokens->{_line_type} = 'CODE'; + $line_of_tokens->{_rtokens} = [$input_line]; + $line_of_tokens->{_rtoken_type} = ['#']; + $line_of_tokens->{_rlevels} = [$level_in_tokenizer]; + $line_of_tokens->{_rci_levels} = [$ci_string_i]; + $line_of_tokens->{_rblock_type} = [EMPTY_STRING]; + $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string; + $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string; + return; + } + + # Optimize handling of a blank line + if ( !length($input_line) ) { + $line_of_tokens->{_line_type} = 'CODE'; + $line_of_tokens->{_rtokens} = []; + $line_of_tokens->{_rtoken_type} = []; + $line_of_tokens->{_rlevels} = []; + $line_of_tokens->{_rci_levels} = []; + $line_of_tokens->{_rblock_type} = []; + $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string; + $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string; + return; + } + } + # update the copy of the line for use in error messages # This must be exactly what we give the pre_tokenizer - $tokenizer_self->[_line_of_text_] = $input_line; + $self->[_line_of_text_] = $input_line; # re-initialize for the main loop $routput_token_list = []; # stack of output token indexes @@ -4519,63 +4770,37 @@ EOM $indent_flag = 0; $peeked_ahead = 0; - # This variable signals pre_tokenize to get all tokens. - # But note that it is no longer needed with fast block comment - # option below. - my $max_tokens_wanted = 0; - - # optimize for a full-line comment - if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) { - $max_tokens_wanted = 1; # no use tokenizing a comment - - # and check for skipped section - if ( $rOpts_code_skipping - && $input_line =~ /$code_skipping_pattern_begin/ ) - { - $tokenizer_self->[_in_skipped_] = 1; - return; - } - - # Optional fast processing of a block comment - my $ci_string_sum = - ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; - my $ci_string_i = $ci_string_sum + $in_statement_continuation; - $line_of_tokens->{_line_type} = 'CODE'; - $line_of_tokens->{_rtokens} = [$input_line]; - $line_of_tokens->{_rtoken_type} = ['#']; - $line_of_tokens->{_rlevels} = [$level_in_tokenizer]; - $line_of_tokens->{_rci_levels} = [$ci_string_i]; - $line_of_tokens->{_rblock_type} = [EMPTY_STRING]; - $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string; - $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string; - return; - } - - tokenizer_main_loop( $max_tokens_wanted, $is_END_or_DATA ); + $self->tokenizer_main_loop($is_END_or_DATA); #----------------------------------------------- # all done tokenizing this line ... # now prepare the final list of tokens and types #----------------------------------------------- - tokenizer_wrapup_line($line_of_tokens); + $self->tokenizer_wrapup_line($line_of_tokens); return; } ## end sub tokenize_this_line sub tokenizer_main_loop { - my ( $max_tokens_wanted, $is_END_or_DATA ) = @_; - # tokenization is done in two stages.. - # stage 1 is a very simple pre-tokenization + my ( $self, $is_END_or_DATA ) = @_; + + #--------------------------------- + # Break one input line into tokens + #--------------------------------- + + # Input parameter: + # $is_END_or_DATA is true for a __END__ or __DATA__ line # start by breaking the line into pre-tokens + my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens ( $rtokens, $rtoken_map, $rtoken_type ) = pre_tokenize( $input_line, $max_tokens_wanted ); $max_token_index = scalar( @{$rtokens} ) - 1; push( @{$rtokens}, SPACE, SPACE, SPACE ) - ; # extra whitespace simplifies logic + ; # extra whitespace simplifies logic push( @{$rtoken_map}, 0, 0, 0 ); # shouldn't be referenced push( @{$rtoken_type}, 'b', 'b', 'b' ); @@ -4593,9 +4818,9 @@ EOM $i = -1; $i_tok = -1; - # ------------------------------------------------------------ + #----------------------------- # begin main tokenization loop - # ------------------------------------------------------------ + #----------------------------- # we are looking at each pre-token of one line and combining them # into tokens @@ -4649,8 +4874,7 @@ EOM # fix for git #63. if ( $last_last_nonblank_token eq '->' ) { if ( $last_nonblank_type eq 'w' - || $last_nonblank_type eq 'i' - && substr( $last_nonblank_token, 0, 1 ) eq '$' ) + || $last_nonblank_type eq 'i' ) { $last_nonblank_token = '->' . $last_nonblank_token; $last_nonblank_type = 'i'; @@ -4731,7 +4955,7 @@ EOM $tok = $pre_tok; } - my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE; +## my $prev_tok = $i > 0 ? $rtokens->[ $i - 1 ] : SPACE; my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b'; #----------------------------------------------------------- @@ -4843,13 +5067,13 @@ EOM $in_attribute_list = 0; } - ############################################################### + #-------------------------------------------------------- # We have the next token, $tok. # Now we have to examine this token and decide what it is # and define its $type # # section 1: bare words - ############################################################### + #-------------------------------------------------------- if ( $pre_type eq 'w' ) { $expecting = @@ -4858,18 +5082,18 @@ EOM last if ($is_last); } - ############################################################### + #----------------------------- # section 2: strings of digits - ############################################################### + #----------------------------- elsif ( $pre_type eq 'd' ) { $expecting = operator_expected( [ $prev_type, $tok, $next_type ] ); do_DIGITS(); } - ############################################################### + #---------------------------- # section 3: all other tokens - ############################################################### + #---------------------------- else { my $code = $tokenization_code->{$tok}; if ($code) { @@ -4895,7 +5119,7 @@ EOM } # Remember last nonblank values - unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) { + if ( $type ne 'b' && $type ne '#' ) { $last_last_nonblank_token = $last_nonblank_token; $last_last_nonblank_type = $last_nonblank_type; $last_last_nonblank_block_type = $last_nonblank_block_type; @@ -4918,23 +5142,135 @@ EOM } } - $tokenizer_self->[_in_attribute_list_] = $in_attribute_list; - $tokenizer_self->[_in_quote_] = $in_quote; - $tokenizer_self->[_quote_target_] = + $self->[_in_attribute_list_] = $in_attribute_list; + $self->[_in_quote_] = $in_quote; + $self->[_quote_target_] = $in_quote ? matching_end_token($quote_character) : EMPTY_STRING; - $tokenizer_self->[_rhere_target_list_] = $rhere_target_list; + $self->[_rhere_target_list_] = $rhere_target_list; return; } ## end sub tokenizer_main_loop sub tokenizer_wrapup_line { - my ($line_of_tokens) = @_; + my ( $self, $line_of_tokens ) = @_; + + #--------------------------------------------------------- + # Package a line of tokens for shipping back to the caller + #--------------------------------------------------------- + + # Most of the remaining work involves defining the two indentation + # parameters that the formatter needs for each token: + # - $level = structural indentation level and + # - $ci_level = continuation indentation level + + # The method for setting the indentation level is straightforward. + # But the method used to define the continuation indentation is + # complicated because it has evolved over a long time by trial and + # error. It could undoubtedly be simplified but it works okay as is. - # We have broken the current line into tokens. Now we have to wrap up - # the result for shipping. Most of the remaining work involves - # defining the various indentation parameters that the formatter needs - # (indentation level and continuation indentation). This turns out to - # be somewhat complicated. + # Here is a brief description of how indentation is computed. + # Perl::Tidy computes indentation as the sum of 2 terms: + # + # (1) structural indentation, such as if/else/elsif blocks + # (2) continuation indentation, such as long parameter call lists. + # + # These are occasionally called primary and secondary indentation. + # + # Structural indentation is introduced by tokens of type '{', + # although the actual tokens might be '{', '(', or '['. Structural + # indentation is of two types: BLOCK and non-BLOCK. Default + # structural indentation is 4 characters if the standard indentation + # scheme is used. + # + # Continuation indentation is introduced whenever a line at BLOCK + # level is broken before its termination. Default continuation + # indentation is 2 characters in the standard indentation scheme. + # + # Both types of indentation may be nested arbitrarily deep and + # interlaced. The distinction between the two is somewhat arbitrary. + # + # For each token, we will define two variables which would apply if + # the current statement were broken just before that token, so that + # that token started a new line: + # + # $level = the structural indentation level, + # $ci_level = the continuation indentation level + # + # The total indentation will be $level * (4 spaces) + $ci_level * (2 + # spaces), assuming defaults. However, in some special cases it is + # customary to modify $ci_level from this strict value. + # + # The total structural indentation is easy to compute by adding and + # subtracting 1 from a saved value as types '{' and '}' are seen. + # The running value of this variable is $level_in_tokenizer. + # + # The total continuation is much more difficult to compute, and + # requires several variables. These variables are: + # + # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for + # each indentation level, if there are intervening open secondary + # structures just prior to that level. + # $continuation_string_in_tokenizer = a string of 1's and 0's + # indicating if the last token at that level is "continued", meaning + # that it is not the first token of an expression. + # $nesting_block_string = a string of 1's and 0's indicating, for each + # indentation level, if the level is of type BLOCK or not. + # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string + # $nesting_list_string = a string of 1's and 0's indicating, for each + # indentation level, if it is appropriate for list formatting. + # If so, continuation indentation is used to indent long list items. + # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string + # @{$rslevel_stack} = a stack of total nesting depths at each + # structural indentation level, where "total nesting depth" means + # the nesting depth that would occur if every nesting token + # -- '{', '[', # and '(' -- , regardless of context, is used to + # compute a nesting depth. + + # Notes on the Continuation Indentation + # + # There is a sort of chicken-and-egg problem with continuation + # indentation. The formatter can't make decisions on line breaks + # without knowing what 'ci' will be at arbitrary locations. + # + # But a problem with setting the continuation indentation (ci) here + # in the tokenizer is that we do not know where line breaks will + # actually be. As a result, we don't know if we should propagate + # continuation indentation to higher levels of structure. + # + # For nesting of only structural indentation, we never need to do + # this. For example, in a long if statement, like this + # + # if ( !$output_block_type[$i] + # && ($in_statement_continuation) ) + # { <--outdented + # do_something(); + # } + # + # the second line has ci but we do normally give the lines within + # the BLOCK any ci. This would be true if we had blocks nested + # arbitrarily deeply. + # + # But consider something like this, where we have created a break + # after an opening paren on line 1, and the paren is not (currently) + # a structural indentation token: + # + # my $file = $menubar->Menubutton( + # qw/-text File -underline 0 -menuitems/ => [ + # [ + # Cascade => '~View', + # -menuitems => [ + # ... + # + # The second line has ci, so it would seem reasonable to propagate + # it down, giving the third line 1 ci + 1 indentation. This + # suggests the following rule, which is currently used to + # propagating ci down: if there are any non-structural opening + # parens (or brackets, or braces), before an opening structural + # brace, then ci is propagated down, and otherwise + # not. The variable $intervening_secondary_structure contains this + # information for the current token, and the string + # "$ci_string_in_tokenizer" is a stack of previous values of this + # variable. my @token_type = (); # stack of output token types my @block_type = (); # stack of output code block types @@ -4942,95 +5278,34 @@ EOM my @tokens = (); # output tokens my @levels = (); # structural brace levels of output tokens my @ci_string = (); # string needed to compute continuation indentation - my $container_environment = EMPTY_STRING; - my $im = -1; # previous $i value - my $num; # Count the number of '1's in the string (previously sub ones_count) my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; -# Computing Token Indentation -# -# The final section of the tokenizer forms tokens and also computes -# parameters needed to find indentation. It is much easier to do it -# in the tokenizer than elsewhere. Here is a brief description of how -# indentation is computed. Perl::Tidy computes indentation as the sum -# of 2 terms: -# -# (1) structural indentation, such as if/else/elsif blocks -# (2) continuation indentation, such as long parameter call lists. -# -# These are occasionally called primary and secondary indentation. -# -# Structural indentation is introduced by tokens of type '{', although -# the actual tokens might be '{', '(', or '['. Structural indentation -# is of two types: BLOCK and non-BLOCK. Default structural indentation -# is 4 characters if the standard indentation scheme is used. -# -# Continuation indentation is introduced whenever a line at BLOCK level -# is broken before its termination. Default continuation indentation -# is 2 characters in the standard indentation scheme. -# -# Both types of indentation may be nested arbitrarily deep and -# interlaced. The distinction between the two is somewhat arbitrary. -# -# For each token, we will define two variables which would apply if -# the current statement were broken just before that token, so that -# that token started a new line: -# -# $level = the structural indentation level, -# $ci_level = the continuation indentation level -# -# The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces), -# assuming defaults. However, in some special cases it is customary -# to modify $ci_level from this strict value. -# -# The total structural indentation is easy to compute by adding and -# subtracting 1 from a saved value as types '{' and '}' are seen. The -# running value of this variable is $level_in_tokenizer. -# -# The total continuation is much more difficult to compute, and requires -# several variables. These variables are: -# -# $ci_string_in_tokenizer = a string of 1's and 0's indicating, for -# each indentation level, if there are intervening open secondary -# structures just prior to that level. -# $continuation_string_in_tokenizer = a string of 1's and 0's indicating -# if the last token at that level is "continued", meaning that it -# is not the first token of an expression. -# $nesting_block_string = a string of 1's and 0's indicating, for each -# indentation level, if the level is of type BLOCK or not. -# $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string -# $nesting_list_string = a string of 1's and 0's indicating, for each -# indentation level, if it is appropriate for list formatting. -# If so, continuation indentation is used to indent long list items. -# $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string -# @{$rslevel_stack} = a stack of total nesting depths at each -# structural indentation level, where "total nesting depth" means -# the nesting depth that would occur if every nesting token -- '{', '[', -# and '(' -- , regardless of context, is used to compute a nesting -# depth. - $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string; my ( $ci_string_i, $level_i ); - # loop over the list of pre-tokens indexes + #----------------- + # Loop over tokens + #----------------- + my $rtoken_map_im; foreach my $i ( @{$routput_token_list} ) { - # Get $tok_i, the PRE-token. It only equals the token for symbols my $type_i = $routput_token_type->[$i]; - my $tok_i = $rtokens->[$i]; + $level_i = $level_in_tokenizer; # Quick handling of indentation levels for blanks and comments if ( $type_i eq 'b' || $type_i eq '#' ) { $ci_string_i = $ci_string_sum + $in_statement_continuation; - $level_i = $level_in_tokenizer; } # All other types else { + # $tok_i is the PRE-token. It only equals the token for symbols + my $tok_i = $rtokens->[$i]; + # Check for an invalid token type.. # This can happen by running perltidy on non-scripts although # it could also be bug introduced by programming change. Perl @@ -5040,113 +5315,28 @@ EOM warning( "unexpected character decimal $val ($type_i) in script\n" ); - $tokenizer_self->[_in_error_] = 1; + $self->[_in_error_] = 1; } - # See if we should undo the $forced_indentation_flag. - # Forced indentation after 'if', 'unless', 'while' and 'until' - # expressions without trailing parens is optional and doesn't - # always look good. It is usually okay for a trailing logical - # expression, but if the expression is a function call, code block, - # or some kind of list it puts in an unwanted extra indentation - # level which is hard to remove. - # - # Example where extra indentation looks ok: - # return 1 - # if $det_a < 0 and $det_b > 0 - # or $det_a > 0 and $det_b < 0; - # - # Example where extra indentation is not needed because - # the eval brace also provides indentation: - # print "not " if defined eval { - # reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4; - # }; - # - # The following rule works fairly well: - # Undo the flag if the end of this line, or start of the next - # line, is an opening container token or a comma. - # This almost always works, but if not after another pass it will - # be stable. - my $forced_indentation_flag = $routput_indent_flag->[$i]; - if ( $forced_indentation_flag && $type_i eq 'k' ) { - my $ixlast = -1; - my $ilast = $routput_token_list->[$ixlast]; - my $toklast = $routput_token_type->[$ilast]; - if ( $toklast eq '#' ) { - $ixlast--; - $ilast = $routput_token_list->[$ixlast]; - $toklast = $routput_token_type->[$ilast]; - } - if ( $toklast eq 'b' ) { - $ixlast--; - $ilast = $routput_token_list->[$ixlast]; - $toklast = $routput_token_type->[$ilast]; - } - if ( $toklast =~ /^[\{,]$/ ) { - $forced_indentation_flag = 0; - } - else { - ( $toklast, my $i_next ) = - find_next_nonblank_token( $max_token_index, $rtokens, - $max_token_index ); - if ( $toklast =~ /^[\{,]$/ ) { - $forced_indentation_flag = 0; - } - } - } ## end if ( $forced_indentation_flag...) - - # if we are already in an indented if, see if we should outdent - if ($indented_if_level) { - - # don't try to nest trailing if's - shouldn't happen - if ( $type_i eq 'k' ) { - $forced_indentation_flag = 0; - } - - # check for the normal case - outdenting at next ';' - elsif ( $type_i eq ';' ) { - if ( $level_in_tokenizer == $indented_if_level ) { - $forced_indentation_flag = -1; - $indented_if_level = 0; - } - } - - # handle case of missing semicolon - elsif ( $type_i eq '}' ) { - if ( $level_in_tokenizer == $indented_if_level ) { - $indented_if_level = 0; - - $level_in_tokenizer--; - if ( @{$rslevel_stack} > 1 ) { - pop( @{$rslevel_stack} ); - } - if ( length($nesting_block_string) > 1 ) - { # true for valid script - chop $nesting_block_string; - chop $nesting_list_string; - } - } - } - } ## end if ($indented_if_level) - - # Now we have the first approximation to the level - $level_i = $level_in_tokenizer; + # $ternary_indentation_flag indicates that we need a change + # in level at a nested ternary, as follows + # 1 => at a nested ternary ? + # -1 => at a nested ternary : + # 0 => otherwise + my $ternary_indentation_flag = $routput_indent_flag->[$i]; + #------------------------------------------- + # Section 1: handle a level-increasing token + #------------------------------------------- # set primary indentation levels based on structural braces # Note: these are set so that the leading braces have a HIGHER # level than their CONTENTS, which is convenient for indentation # Also, define continuation indentation for each token. if ( $type_i eq '{' || $type_i eq 'L' - || $forced_indentation_flag > 0 ) + || $ternary_indentation_flag > 0 ) { - # use environment before updating - $container_environment = - $nesting_block_flag ? 'BLOCK' - : $nesting_list_flag ? 'LIST' - : EMPTY_STRING; - # if the difference between total nesting levels is not 1, # there are intervening non-structural nesting types between # this '{' and the previous unclosed '{' @@ -5156,85 +5346,23 @@ EOM $slevel_in_tokenizer - $rslevel_stack->[-1]; } - # Continuation Indentation - # - # Having tried setting continuation indentation both in the formatter and - # in the tokenizer, I can say that setting it in the tokenizer is much, - # much easier. The formatter already has too much to do, and can't - # make decisions on line breaks without knowing what 'ci' will be at - # arbitrary locations. - # - # But a problem with setting the continuation indentation (ci) here - # in the tokenizer is that we do not know where line breaks will actually - # be. As a result, we don't know if we should propagate continuation - # indentation to higher levels of structure. - # - # For nesting of only structural indentation, we never need to do this. - # For example, in a long if statement, like this - # - # if ( !$output_block_type[$i] - # && ($in_statement_continuation) ) - # { <--outdented - # do_something(); - # } - # - # the second line has ci but we do normally give the lines within the BLOCK - # any ci. This would be true if we had blocks nested arbitrarily deeply. - # - # But consider something like this, where we have created a break after - # an opening paren on line 1, and the paren is not (currently) a - # structural indentation token: - # - # my $file = $menubar->Menubutton( - # qw/-text File -underline 0 -menuitems/ => [ - # [ - # Cascade => '~View', - # -menuitems => [ - # ... - # - # The second line has ci, so it would seem reasonable to propagate it - # down, giving the third line 1 ci + 1 indentation. This suggests the - # following rule, which is currently used to propagating ci down: if there - # are any non-structural opening parens (or brackets, or braces), before - # an opening structural brace, then ci is propagated down, and otherwise - # not. The variable $intervening_secondary_structure contains this - # information for the current token, and the string - # "$ci_string_in_tokenizer" is a stack of previous values of this - # variable. - # save the current states push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer ); $level_in_tokenizer++; - if ( $level_in_tokenizer > - $tokenizer_self->[_maximum_level_] ) - { - $tokenizer_self->[_maximum_level_] = - $level_in_tokenizer; + if ( $level_in_tokenizer > $self->[_maximum_level_] ) { + $self->[_maximum_level_] = $level_in_tokenizer; } - if ($forced_indentation_flag) { + if ($ternary_indentation_flag) { - # break BEFORE '?' when there is forced indentation + # break BEFORE '?' in a nested ternary if ( $type_i eq '?' ) { $level_i = $level_in_tokenizer; } - if ( $type_i eq 'k' ) { - $indented_if_level = $level_in_tokenizer; - } - - # do not change container environment here if we are not - # at a real list. Adding this check prevents "blinkers" - # often near 'unless" clauses, such as in the following - # code: -## next -## unless -e ( -## $archive = -## File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" ) -## ); $nesting_block_string .= "$nesting_block_flag"; - } ## end if ($forced_indentation_flag) + } ## end if ($ternary_indentation_flag) else { if ( $routput_block_type->[$i] ) { @@ -5276,28 +5404,31 @@ EOM $continuation_string_in_tokenizer .= ( $in_statement_continuation > 0 ) ? '1' : '0'; - # Sometimes we want to give an opening brace continuation indentation, - # and sometimes not. For code blocks, we don't do it, so that the leading - # '{' gets outdented, like this: - # - # if ( !$output_block_type[$i] - # && ($in_statement_continuation) ) - # { <--outdented - # - # For other types, we will give them continuation indentation. For example, - # here is how a list looks with the opening paren indented: - # - # @LoL = - # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ], - # [ "homer", "marge", "bart" ], ); - # - # This looks best when 'ci' is one-half of the indentation (i.e., 2 and 4) + # Sometimes we want to give an opening brace + # continuation indentation, and sometimes not. For code + # blocks, we don't do it, so that the leading '{' gets + # outdented, like this: + # + # if ( !$output_block_type[$i] + # && ($in_statement_continuation) ) + # { <--outdented + # + # For other types, we will give them continuation + # indentation. For example, here is how a list looks + # with the opening paren indented: + # + # @LoL = + # ( [ "fred", "barney" ], [ "george", "jane", "elroy" ], + # [ "homer", "marge", "bart" ], ); + # + # This looks best when 'ci' is one-half of the + # indentation (i.e., 2 and 4) my $total_ci = $ci_string_sum; if ( !$routput_block_type->[$i] # patch: skip for BLOCK && ($in_statement_continuation) - && !( $forced_indentation_flag && $type_i eq ':' ) + && !( $ternary_indentation_flag && $type_i eq ':' ) ) { $total_ci += $in_statement_continuation @@ -5309,16 +5440,27 @@ EOM $in_statement_continuation = 0; } ## end if ( $type_i eq '{' ||...}) + #------------------------------------------- + # Section 2: handle a level-decreasing token + #------------------------------------------- elsif ($type_i eq '}' || $type_i eq 'R' - || $forced_indentation_flag < 0 ) + || $ternary_indentation_flag < 0 ) { - # only a nesting error in the script would prevent popping here + # only a nesting error in the script would prevent + # popping here if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); } $level_i = --$level_in_tokenizer; + if ( $level_in_tokenizer < 0 ) { + unless ( $self->[_saw_negative_indentation_] ) { + $self->[_saw_negative_indentation_] = 1; + warning("Starting negative indentation\n"); + } + } + # restore previous level values if ( length($nesting_block_string) > 1 ) { # true for valid script @@ -5343,13 +5485,14 @@ EOM # ...These include non-anonymous subs # note: could be sub ::abc { or sub 'abc - if ( $block_type_i =~ m/^sub\s*/gc ) { + if ( substr( $block_type_i, 0, 3 ) eq 'sub' + && $block_type_i =~ m/^sub\s*/gc ) + { # note: older versions of perl require the /gc # modifier here or else the \G does not work. - if ( $block_type_i =~ /\G('|::|\w)/gc ) { - $in_statement_continuation = 0; - } + $in_statement_continuation = 0 + if ( $block_type_i =~ /\G('|::|\w)/gc ); } # ...and include all block types except user subs @@ -5400,42 +5543,36 @@ EOM ); ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/; } - - elsif ( $tok_i eq ';' ) { - $in_statement_continuation = 0; - } } ## end if ( length($nesting_block_string...)) - # use environment after updating - $container_environment = - $nesting_block_flag ? 'BLOCK' - : $nesting_list_flag ? 'LIST' - : EMPTY_STRING; $ci_string_i = $ci_string_sum + $in_statement_continuation; } ## end elsif ( $type_i eq '}' ||...{) - # not a structural indentation type.. + #----------------------------------------- + # Section 3: handle a constant level token + #----------------------------------------- else { - $container_environment = - $nesting_block_flag ? 'BLOCK' - : $nesting_list_flag ? 'LIST' - : EMPTY_STRING; - # zero the continuation indentation at certain tokens so # that they will be at the same level as its container. For # commas, this simplifies the -lp indentation logic, which # counts commas. For ?: it makes them stand out. - if ($nesting_list_flag) { + if ( + $nesting_list_flag ## $type_i =~ /^[,\?\:]$/ - if ( $is_comma_question_colon{$type_i} ) { - $in_statement_continuation = 0; - } + && $is_comma_question_colon{$type_i} + ) + { + $in_statement_continuation = 0; } - # be sure binary operators get continuation indentation + # Be sure binary operators get continuation indentation. + # Note: the check on $nesting_block_flag is only needed + # to add ci to binary operators following a 'try' block, + # or similar extended syntax block operator (see c158). if ( - $container_environment + !$in_statement_continuation + && ( $nesting_block_flag || $nesting_list_flag ) && ( $type_i eq 'k' && $is_binary_keyword{$tok_i} || $is_binary_type{$type_i} ) ) @@ -5449,8 +5586,6 @@ EOM # update continuation flag ... - ## if ( $type_i ne 'b' && $type_i ne '#' ) { # moved above - # if we are in a BLOCK if ($nesting_block_flag) { @@ -5492,16 +5627,11 @@ EOM } } ## end else [ if ($nesting_block_flag)] - ##} ## end if ( $type_i ne 'b' ... # (old moved above) - } ## end else [ if ( $type_i eq '{' ||...})] - if ( $level_in_tokenizer < 0 ) { - unless ( $tokenizer_self->[_saw_negative_indentation_] ) { - $tokenizer_self->[_saw_negative_indentation_] = 1; - warning("Starting negative indentation\n"); - } - } + #------------------------------------------- + # Section 4: operations common to all levels + #------------------------------------------- # set secondary nesting levels based on all containment token # types Note: these are set so that the nesting depth is the @@ -5543,21 +5673,30 @@ EOM } } ## end else [ if ( $type_i eq 'b' ||...)] + #-------------------------------- # Store the values for this token + #-------------------------------- push( @ci_string, $ci_string_i ); push( @levels, $level_i ); push( @block_type, $routput_block_type->[$i] ); push( @type_sequence, $routput_type_sequence->[$i] ); push( @token_type, $type_i ); - # Form and store the previous token - if ( $im >= 0 ) { - $num = - $rtoken_map->[$i] - $rtoken_map->[$im]; # how many characters + # Form and store the PREVIOUS token + if ( defined($rtoken_map_im) ) { + my $numc = + $rtoken_map->[$i] - $rtoken_map_im; # how many characters - if ( $num > 0 ) { + if ( $numc > 0 ) { push( @tokens, - substr( $input_line, $rtoken_map->[$im], $num ) ); + substr( $input_line, $rtoken_map_im, $numc ) ); + } + else { + + # Should not happen unless @{$rtoken_map} is corrupted + DEVEL_MODE + && Fault( + "number of characters is '$numc' but should be >0\n"); } } @@ -5566,15 +5705,31 @@ EOM $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string; } - $im = $i; + $rtoken_map_im = $rtoken_map->[$i]; } ## end foreach my $i ( @{$routput_token_list...}) - # Form and store the final token - $num = length($input_line) - $rtoken_map->[$im]; # make the last token - if ( $num > 0 ) { - push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) ); + #------------------------ + # End loop to over tokens + #------------------------ + + # Form and store the final token of this line + if ( defined($rtoken_map_im) ) { + my $numc = length($input_line) - $rtoken_map_im; + if ( $numc > 0 ) { + push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) ); + } + else { + + # Should not happen unless @{$rtoken_map} is corrupted + DEVEL_MODE + && Fault( + "Number of Characters is '$numc' but should be >0\n"); + } } + #---------------------------------------------------------- + # Wrap up this line of tokens for shipping to the Formatter + #---------------------------------------------------------- $line_of_tokens->{_rtoken_type} = \@token_type; $line_of_tokens->{_rtokens} = \@tokens; $line_of_tokens->{_rblock_type} = \@block_type; @@ -5586,7 +5741,7 @@ EOM } ## end sub tokenizer_wrapup_line } ## end tokenize_this_line -#########i############################################################# +####################################################################### # Tokenizer routines which assist in identifying token types ####################################################################### @@ -5611,12 +5766,13 @@ BEGIN { ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~ ); push @q, ','; - push @q, '('; # for completeness, not currently a token type + push @q, '('; # for completeness, not currently a token type + push @q, '->'; # was previously in UNKNOWN @{op_expected_table}{@q} = (TERM) x scalar(@q); - # Always UNKNOWN following these types: - # Fix for c030: added '->' to this list - @q = qw( w -> ); + # Always UNKNOWN following these types; + # previously had '->' in this list for c030 + @q = qw( w ); @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q); # Always expecting OPERATOR ... @@ -5638,7 +5794,7 @@ BEGIN { @q = qw( n v ); @{is_n_v}{@q} = (1) x scalar(@q); -} +} ## end BEGIN use constant DEBUG_OPERATOR_EXPECTED => 0; @@ -5693,39 +5849,37 @@ sub operator_expected { my ($rarg) = @_; - my $msg = EMPTY_STRING; - - ############## + #------------- # Table lookup - ############## + #------------- # Many types are can be obtained by a table lookup given the previous type. # This typically handles half or more of the calls. my $op_expected = $op_expected_table{$last_nonblank_type}; if ( defined($op_expected) ) { - $msg = "Table lookup"; - goto RETURN; + DEBUG_OPERATOR_EXPECTED + && print STDOUT +"OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; + return $op_expected; } - ###################### + #--------------------- # Handle special cases - ###################### + #--------------------- $op_expected = UNKNOWN; my ( $prev_type, $tok, $next_type ) = @{$rarg}; # Types 'k', '}' and 'Z' depend on context - # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on - # context but that dependence could eventually be eliminated with better - # token type definition + # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context. # identifier... if ( $last_nonblank_type eq 'i' ) { $op_expected = OPERATOR; - # FIXME: it would be cleaner to make this a special type - # expecting VERSION or {} after package NAMESPACE - # TODO: maybe mark these words as type 'Y'? + # TODO: it would be cleaner to make this a special type + # expecting VERSION or {} after package NAMESPACE; + # maybe mark these words as type 'Y'? if ( substr( $last_nonblank_token, 0, 7 ) eq 'package' && $statement_type =~ /^package\b/ && $last_nonblank_token =~ /^package\b/ ) @@ -5790,7 +5944,7 @@ sub operator_expected { $op_expected = OPERATOR; # block mode following } } - ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) { + # $last_nonblank_token =~ /^(\)|\$|\-\>)/ elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) } || substr( $last_nonblank_token, 0, 2 ) eq '->' ) { @@ -5853,13 +6007,18 @@ sub operator_expected { } # quote... - # FIXME: labeled prototype words should probably be given type 'A' or maybe - # 'J'; not 'q'; or maybe mark as type 'Y' + # TODO: labeled prototype words would better be given type 'A' or maybe + # 'J'; not 'q'; or maybe mark as type 'Y'? elsif ( $last_nonblank_type eq 'q' ) { $op_expected = OPERATOR; - if ( $last_nonblank_token eq 'prototype' ) - ##|| $last_nonblank_token eq 'switch' ) - { + if ( $last_nonblank_token eq 'prototype' ) { + $op_expected = TERM; + } + + # update for --use-feature=class (rt145706): + # Look for class VERSION after possible attribute, as in + # class Example::Subclass : isa(Example::Base) 1.345 { ... } + elsif ( $statement_type =~ /^package\b/ ) { $op_expected = TERM; } } @@ -5926,12 +6085,9 @@ sub operator_expected { $op_expected = UNKNOWN; } - RETURN: - - DEBUG_OPERATOR_EXPECTED && do { - print STDOUT -"OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; - }; + DEBUG_OPERATOR_EXPECTED + && print STDOUT +"OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; return $op_expected; @@ -6030,11 +6186,11 @@ sub code_block_type { } } - ################################################################ + #-------------------------------------------------------------- # NOTE: braces after type characters start code blocks, but for # simplicity these are not identified as such. See also # sub is_non_structural_brace. - ################################################################ + #-------------------------------------------------------------- ## elsif ( $last_nonblank_type eq 't' ) { ## return $last_nonblank_token; @@ -6219,7 +6375,7 @@ sub decide_if_code_block { foreach my $k ( $j + 1 .. @pre_types - 2 ) { if ( $pre_types[$k] eq $quote_mark ) { $j = $k + 1; - my $next = $pre_types[$j]; + ##my $next = $pre_types[$j]; last; } } @@ -6320,7 +6476,7 @@ BEGIN { @q = qw(R ]); @{is_R_closing_sb}{@q} = (1) x scalar(@q); -} +} ## end BEGIN sub is_non_structural_brace { @@ -6337,11 +6493,11 @@ sub is_non_structural_brace { # return 0; # } - ################################################################ + #-------------------------------------------------------------- # NOTE: braces after type characters start code blocks, but for # simplicity these are not identified as such. See also # sub code_block_type - ################################################################ + #-------------------------------------------------------------- ##if ($last_nonblank_type eq 't') {return 0} @@ -6363,7 +6519,7 @@ sub is_non_structural_brace { ); } ## end sub is_non_structural_brace -#########i############################################################# +####################################################################### # Tokenizer routines for tracking container nesting depths ####################################################################### @@ -6423,16 +6579,8 @@ sub increase_nesting_depth { # a unique set of numbers but still allows the relative location # of any type to be determined. - ######################################################################## - # OLD SEQNO METHOD for incrementing sequence numbers. - # Keep this coding awhile for possible testing. - ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names); - ## my $seqno = $nesting_sequence_number[$aa]; - - # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence - # numbers to be used as array indexes, and allows them to be compared. + # make a new unique sequence number my $seqno = $next_sequence_number++; - ######################################################################## $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno; @@ -6623,7 +6771,7 @@ EOM return; } ## end sub check_final_nesting_depths -#########i############################################################# +####################################################################### # Tokenizer routines for looking ahead in input stream ####################################################################### @@ -6678,7 +6826,7 @@ sub peek_ahead_for_nonblank_token { return; } ## end sub peek_ahead_for_nonblank_token -#########i############################################################# +####################################################################### # Tokenizer guessing routines for ambiguous situations ####################################################################### @@ -6712,11 +6860,25 @@ sub guess_if_pattern_or_conditional { my $quote_pos = 0; my $quoted_string; ( - $i, $in_quote, $quote_character, $quote_pos, $quote_depth, - $quoted_string - ) - = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, - $quote_pos, $quote_depth, $max_token_index ); + + $i, + $in_quote, + $quote_character, + $quote_pos, + $quote_depth, + $quoted_string, + + ) = follow_quoted_string( + + $ibeg, + $in_quote, + $rtokens, + $quote_character, + $quote_pos, + $quote_depth, + $max_token_index, + + ); if ($in_quote) { @@ -6768,7 +6930,7 @@ BEGIN { # parenless calls of 'ok' are common @q = qw( ok ); @{is_known_function}{@q} = (1) x scalar(@q); -} +} ## end BEGIN sub guess_if_pattern_or_division { @@ -6796,7 +6958,7 @@ sub guess_if_pattern_or_division { if ( $divide_possible < 0 ) { $msg = "pattern (division not possible here)\n"; $is_pattern = 1; - goto RETURN; + return ( $is_pattern, $msg ); } $i = $ibeg + 1; @@ -6927,8 +7089,6 @@ sub guess_if_pattern_or_division { } } } - - RETURN: return ( $is_pattern, $msg ); } ## end sub guess_if_pattern_or_division @@ -6987,7 +7147,7 @@ sub guess_if_here_doc { return $here_doc_expected; } ## end sub guess_if_here_doc -#########i############################################################# +####################################################################### # Tokenizer Routines for scanning identifiers and related items ####################################################################### @@ -7097,7 +7257,7 @@ sub scan_bare_identifier_do { # $tok='eval'; # patch to do braces like eval - doesn't work # $type = 'k'; #} - # FIXME: This could become a separate type to allow for different + # TODO: This could become a separate type to allow for different # future behavior: elsif ( $is_block_function{$package}{$sub_name} ) { $type = 'G'; @@ -7278,7 +7438,7 @@ sub scan_id_do { rtokens => $rtokens, rtoken_map => $rtoken_map, id_scan_state => $id_scan_state, - max_token_index => $max_token_index + max_token_index => $max_token_index, } ); } @@ -7417,7 +7577,8 @@ sub do_scan_package { # Examples of valid primitive tokens that might follow are: # 1235 . ; { } v3 v # FIX: added a '#' since a side comment may also follow - if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) { + # Added ':' for class attributes (for --use-feature=class, rt145706) + if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#\:])|v\d|\d+$/ ) { $statement_type = $tok; } else { @@ -7445,7 +7606,7 @@ BEGIN { my @q = qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ }; @{is_special_variable_char}{@q} = (1) x scalar(@q); -} +} ## end BEGIN { ## begin closure for sub scan_complex_identifier @@ -7527,7 +7688,6 @@ BEGIN { sub do_id_scan_state_dollar { # We saw a sigil, now looking to start a variable name - if ( $tok eq '$' ) { $identifier .= $tok; @@ -7608,7 +7768,7 @@ BEGIN { elsif ( $tok eq '{' ) { - # check for something like ${#} or ${©} + # check for something like ${#} or ${?}, where ? is a special char if ( ( $identifier eq '$' @@ -8053,15 +8213,15 @@ BEGIN { # return flag telling caller to split the pretoken my $split_pretoken_flag; - #################### + #------------------- # Initialize my vars - #################### + #------------------- initialize_my_scan_id_vars(); - ######################################################### + #-------------------------------------------------------- # get started by defining a type and a state if necessary - ######################################################### + #-------------------------------------------------------- if ( !$id_scan_state ) { $context = UNKNOWN_CONTEXT; @@ -8073,7 +8233,11 @@ BEGIN { } $identifier = $tok; - if ( $tok eq '$' || $tok eq '*' ) { + if ( $last_nonblank_token eq '->' ) { + $identifier = '->' . $identifier; + $id_scan_state = $scan_state_SIGIL; + } + elsif ( $tok eq '$' || $tok eq '*' ) { $id_scan_state = $scan_state_SIGIL; $context = SCALAR_CONTEXT; } @@ -8111,6 +8275,8 @@ BEGIN { $tokenizer_self->[_in_error_] = 1; } $id_scan_state = EMPTY_STRING; + + # emergency return goto RETURN; } $saw_type = !$saw_alpha; @@ -8128,9 +8294,9 @@ EOM } } - ############################### + #------------------------------ # loop to gather the identifier - ############################### + #------------------------------ $i_save = $i; @@ -8181,9 +8347,9 @@ EOM } ## end of main loop - ############## + #------------- # Check result - ############## + #------------- # Be sure a valid state is returned if ($id_scan_state) { @@ -8302,7 +8468,7 @@ EOM # lexical subs with these names can cause parsing errors in this version my @q = qw( m q qq qr qw qx s tr y ); @{warn_if_lexical}{@q} = (1) x scalar(@q); - } + } ## end BEGIN # saved package and subnames in case prototype is on separate line my ( $package_saved, $subname_saved ); @@ -8615,10 +8781,24 @@ EOM } } elsif ($next_nonblank_token) { # EOF technically ok - $subname = EMPTY_STRING unless defined($subname); - warning( + + if ( $rinput_hash->{tok} eq 'method' && $call_type == SUB_CALL ) + { + # For a method call, silently ignore this error (rt145706) + # to avoid needless warnings. Example which can produce it: + # test(method Pack (), "method"); + + # TODO: scan for use feature 'class' and: + # - if we saw 'use feature 'class' then issue the warning. + # - if we did not see use feature 'class' then issue the + # warning and suggest turning off --use-feature=class + } + else { + $subname = EMPTY_STRING unless defined($subname); + warning( "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n" - ); + ); + } } check_prototype( $proto, $package, $subname ); } @@ -8631,7 +8811,7 @@ EOM } ## end sub do_scan_sub } -#########i############################################################### +######################################################################### # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS ######################################################################### @@ -8650,12 +8830,31 @@ sub find_next_nonblank_token { } my $next_nonblank_token = $rtokens->[ ++$i ]; - return ( SPACE, $i ) unless defined($next_nonblank_token); + return ( SPACE, $i ) + unless ( defined($next_nonblank_token) && length($next_nonblank_token) ); + + # Quick test for nonblank ascii char. Note that we just have to + # examine the first character here. + my $ord = ord( substr( $next_nonblank_token, 0, 1 ) ); + if ( $ord >= ORD_PRINTABLE_MIN + && $ord <= ORD_PRINTABLE_MAX ) + { + return ( $next_nonblank_token, $i ); + } - if ( $next_nonblank_token =~ /^\s*$/ ) { + # Quick test to skip over an ascii space or tab + elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) { $next_nonblank_token = $rtokens->[ ++$i ]; return ( SPACE, $i ) unless defined($next_nonblank_token); } + + # Slow test to skip over something else identified as whitespace + elsif ( $next_nonblank_token =~ /^\s*$/ ) { + $next_nonblank_token = $rtokens->[ ++$i ]; + return ( SPACE, $i ) unless defined($next_nonblank_token); + } + + # We should be at a nonblank now return ( $next_nonblank_token, $i ); } ## end sub find_next_nonblank_token @@ -8675,23 +8874,27 @@ sub find_next_noncomment_type { find_next_nonblank_token( $i_next, $rtokens, $max_token_index ); } - goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq SPACE ); - - # check for possible a digraph - goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) ); - my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ]; - goto RETURN if ( !$is_digraph{$test2} ); - $next_nonblank_token = $test2; - $i_next = $i_next + 1; - - # check for possible a trigraph - goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) ); - my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ]; - goto RETURN if ( !$is_trigraph{$test3} ); - $next_nonblank_token = $test3; - $i_next = $i_next + 1; + # check for a digraph + if ( $next_nonblank_token + && $next_nonblank_token ne SPACE + && defined( $rtokens->[ $i_next + 1 ] ) ) + { + my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ]; + if ( $is_digraph{$test2} ) { + $next_nonblank_token = $test2; + $i_next = $i_next + 1; + + # check for a trigraph + if ( defined( $rtokens->[ $i_next + 1 ] ) ) { + my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ]; + if ( $is_trigraph{$test3} ) { + $next_nonblank_token = $test3; + $i_next = $i_next + 1; + } + } + } + } - RETURN: return ( $next_nonblank_token, $i_next ); } ## end sub find_next_noncomment_type @@ -8743,7 +8946,7 @@ sub is_possible_numerator { my @q = qw( & && | || ? : + - * and or while if unless); push @q, ')', '}', ']', '>', ',', ';'; @{pattern_test}{@q} = (1) x scalar(@q); - } + } ## end BEGIN sub pattern_expected { @@ -9269,13 +9472,19 @@ sub do_quote { # $quoted_string_1 = quoted string seen while in_quote=1 # $quoted_string_2 = quoted string seen while in_quote=2 my ( - $i, $in_quote, $quote_character, - $quote_pos, $quote_depth, $quoted_string_1, - $quoted_string_2, $rtokens, $rtoken_map, - $max_token_index - ) = @_; - my $in_quote_starting = $in_quote; + $i, + $in_quote, + $quote_character, + $quote_pos, + $quote_depth, + $quoted_string_1, + $quoted_string_2, + $rtokens, + $rtoken_map, + $max_token_index, + + ) = @_; my $quoted_string; if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow @@ -9284,7 +9493,7 @@ sub do_quote { $i, $in_quote, $quote_character, $quote_pos, $quote_depth, $quoted_string ) - = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character, + = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, $quote_pos, $quote_depth, $max_token_index ); $quoted_string_2 .= $quoted_string; if ( $in_quote == 1 ) { @@ -9309,8 +9518,17 @@ sub do_quote { $quoted_string_1 .= "\n"; } } - return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth, - $quoted_string_1, $quoted_string_2 ); + return ( + + $i, + $in_quote, + $quote_character, + $quote_pos, + $quote_depth, + $quoted_string_1, + $quoted_string_2, + + ); } ## end sub do_quote sub follow_quoted_string { @@ -9330,9 +9548,18 @@ sub follow_quoted_string { # $quote_pos = index to check next for alphanumeric delimiter # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested. # $quoted_string = the text of the quote (without quotation tokens) - my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth, - $max_token_index ) - = @_; + my ( + + $i_beg, + $in_quote, + $rtokens, + $beginning_tok, + $quote_pos, + $quote_depth, + $max_token_index, + + ) = @_; + my ( $tok, $end_tok ); my $i = $i_beg - 1; my $quoted_string = EMPTY_STRING; @@ -9387,10 +9614,10 @@ sub follow_quoted_string { # characters, whereas for a non-alphanumeric delimiter, only tokens of # length 1 can match. - ################################################################### + #---------------------------------------------------------------- # Case 1 (rare): loop for case of alphanumeric quote delimiter.. # "quote_pos" is the position the current word to begin searching - ################################################################### + #---------------------------------------------------------------- if ( $beginning_tok =~ /\w/ ) { # Note this because it is not recommended practice except @@ -9449,9 +9676,9 @@ sub follow_quoted_string { } } - ######################################################################## + #----------------------------------------------------------------------- # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter.. - ######################################################################## + #----------------------------------------------------------------------- else { while ( $i < $max_token_index ) { @@ -9479,8 +9706,16 @@ sub follow_quoted_string { } } if ( $i > $max_token_index ) { $i = $max_token_index } - return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth, - $quoted_string ); + return ( + + $i, + $in_quote, + $beginning_tok, + $quote_pos, + $quote_depth, + $quoted_string, + + ); } ## end sub follow_quoted_string sub indicate_error { @@ -9490,7 +9725,7 @@ sub indicate_error { write_error_indicator_pair( $line_number, $input_line, $pos, $carrat ); resume_logfile(); return; -} +} ## end sub indicate_error sub write_error_indicator_pair { my ( $line_number, $input_line, $pos, $carrat ) = @_; @@ -9598,6 +9833,12 @@ sub write_on_underline { sub pre_tokenize { + my ( $str, $max_tokens_wanted ) = @_; + + # Input parameter: + # $max_tokens_wanted > 0 to stop on reaching this many tokens. + # = 0 means get all tokens + # Break a string, $str, into a sequence of preliminary tokens. We # are interested in these types of tokens: # words (type='w'), example: 'max_tokens_wanted' @@ -9611,9 +9852,8 @@ sub pre_tokenize { # An advantage of doing this pre-tokenization step is that it keeps almost # all of the regex work highly localized. A disadvantage is that in some # very rare instances we will have to go back and split a pre-token. - my ( $str, $max_tokens_wanted ) = @_; - # we return references to these 3 arrays: + # Return parameters: my @tokens = (); # array of the tokens themselves my @token_map = (0); # string position of start of each token my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct @@ -9670,7 +9910,7 @@ sub show_tokens { '[' => ']', '<' => '>', ); - } + } ## end BEGIN sub matching_end_token { @@ -9680,7 +9920,7 @@ sub show_tokens { return $matching_end_token{$beginning_token}; } return ($beginning_token); - } + } ## end sub matching_end_token } sub dump_token_types { @@ -9814,10 +10054,15 @@ BEGIN { @q = qw( print printf sort exec system say); @is_indirect_object_taker{@q} = (1) x scalar(@q); + # Note: 'field' will be added by sub check_options if --use-feature=class + @q = qw(my our state); + @is_my_our_state{@q} = (1) x scalar(@q); + # These tokens may precede a code block # patched for SWITCH/CASE/CATCH. Actually these could be removed # now and we could let the extended-syntax coding handle them. # Added 'default' for Switch::Plain. + # Note: 'ADJUST' will be added by sub check_options if --use-feature=class @q = qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else unless do while until eval for foreach map grep sort @@ -10067,8 +10312,12 @@ BEGIN { isa catch + ); + # Note: 'ADJUST', 'field' are added by sub check_options + # if --use-feature=class + # patched above for SWITCH/CASE given/when err say # 'err' is a fairly safe addition. # Added 'default' for Switch::Plain. Note that we could also have @@ -10168,6 +10417,7 @@ BEGIN { @q = qw(q qq qw qx qr s y tr m); @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q); + # Note: 'class' will be added by sub check_options if -use-feature=class @q = qw(package); @is_package{@q} = (1) x scalar(@q); @@ -10407,5 +10657,5 @@ BEGIN { # __DATA__ __END__ @is_keyword{@Keywords} = (1) x scalar(@Keywords); -} +} ## end BEGIN 1;