package Perl::Tidy::Tokenizer;
use strict;
use warnings;
-our $VERSION = '20210717';
+our $VERSION = '20220217';
+
+# this can be turned on for extra checking during development
+use constant DEVEL_MODE => 0;
use Perl::Tidy::LineBuffer;
use Carp;
@current_depth
@total_depth
$total_depth
+ $next_sequence_number
@nesting_sequence_number
@current_sequence_number
@paren_type
%is_valid_token_type
%is_keyword
%is_code_block_token
+ %is_sort_map_grep_eval_do
+ %is_grep_alias
%really_want_term
@opening_brace_names
@closing_brace_names
BEGIN {
- # Array index names for $self
+ # Array index names for $self.
+ # Do not combine with other BEGIN blocks (c101).
my $i = 0;
use constant {
_rhere_target_list_ => $i++,
croak "unexpected return from Perl::Tidy::Die";
}
+sub Fault {
+ my ($msg) = @_;
+
+ # This routine is called for errors that really should not occur
+ # except if there has been a bug introduced by a recent program change.
+ # Please add comments at calls to Fault to explain why the call
+ # should not occur, and where to look to fix it.
+ my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
+ my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
+ my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
+ my $input_stream_name = get_input_stream_name();
+
+ Die(<<EOM);
+==============================================================================
+While operating on input stream with name: '$input_stream_name'
+A fault was detected at line $line0 of sub '$subroutine1'
+in file '$filename1'
+which was called from line $line1 of sub '$subroutine2'
+Message: '$msg'
+This is probably an error introduced by a recent programming change.
+Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
+==============================================================================
+EOM
+
+ # We shouldn't get here, but this return is to keep Perl-Critic from
+ # complaining.
+ return;
+}
+
sub bad_pattern {
# See if a pattern will compile. We have to use a string eval here,
}
}
+ %is_grep_alias = ();
+ if ( $rOpts->{'grep-alias-list'} ) {
+
+ # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
+ # space-separated list
+ my @q = split /\s+/, $rOpts->{'grep-alias-list'};
+ @{is_grep_alias}{@q} = (1) x scalar(@q);
+ }
+
$rOpts_code_skipping = $rOpts->{'code-skipping'};
$code_skipping_pattern_begin =
make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
return;
}
+sub get_input_stream_name {
+ my $input_stream_name = "";
+ my $logger_object = $tokenizer_self->[_logger_object_];
+ if ($logger_object) {
+ $input_stream_name = $logger_object->get_input_stream_name();
+ }
+ return $input_stream_name;
+}
+
sub complain {
my $msg = shift;
my $logger_object = $tokenizer_self->[_logger_object_];
);
}
else {
- warning(
-"hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
- );
+ warning(<<EOM);
+Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
+ (Perl will match to the end of file but this may not be intended).
+EOM
}
my $nearly_matched_here_target_at =
$tokenizer_self->[_nearly_matched_here_target_at_];
return;
}
+sub is_valid_token_type {
+ my ($type) = @_;
+ return $is_valid_token_type{$type};
+}
+
sub get_input_line_number {
return $tokenizer_self->[_last_line_number_];
}
my $write_logfile_entry = sub {
my ($msg) = @_;
write_logfile_entry("Line $input_line_number: $msg");
+ return;
};
# Find and remove what characters terminate this line, including any
# HERE_END - last line of here-doc (target word)
# FORMAT - format section
# FORMAT_END - last line of format section, '.'
+ # SKIP - code skipping section
+ # SKIP_END - last line of code skipping section, '#>>V'
# DATA_START - __DATA__ line
# DATA - unidentified text following __DATA__
# END_START - __END__ line
# print line unchanged if in skipped section
elsif ( $tokenizer_self->[_in_skipped_] ) {
- # NOTE: marked as the existing type 'FORMAT' to keep html working
- $line_of_tokens->{_line_type} = 'FORMAT';
+ $line_of_tokens->{_line_type} = 'SKIP';
if ( $input_line =~ /$code_skipping_pattern_end/ ) {
+ $line_of_tokens->{_line_type} = 'SKIP_END';
$write_logfile_entry->("Exiting code-skipping section\n");
$tokenizer_self->[_in_skipped_] = 0;
}
$line_of_tokens->{_line_type} = 'POD_START';
warning(
"=cut starts a pod section .. this can fool pod utilities.\n"
- );
+ ) unless (DEVEL_MODE);
$write_logfile_entry->("Entering POD section\n");
}
}
# handle start of skipped section
if ( $tokenizer_self->[_in_skipped_] ) {
- # NOTE: marked as the existing type 'FORMAT' to keep html working
- $line_of_tokens->{_line_type} = 'FORMAT';
+ $line_of_tokens->{_line_type} = 'SKIP';
$write_logfile_entry->("Entering code-skipping section\n");
return $line_of_tokens;
}
@total_depth = ();
@nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
@current_sequence_number = ();
+ $next_sequence_number = 2; # The value 1 is reserved for SEQ_ROOT
@paren_type = ();
@paren_semicolon_count = ();
(
$routput_token_list, $routput_token_type,
$routput_block_type, $routput_container_type,
- $routput_type_sequence, $routput_type_sequence,
+ $routput_type_sequence, $routput_indent_flag,
) = @{$rTV2};
(
return;
}
+ sub split_pretoken {
+
+ my ($numc) = @_;
+
+ # Split the leading $numc characters from the current token (at index=$i)
+ # which is pre-type 'w' and insert the remainder back into the pretoken
+ # stream with appropriate settings. Since we are splitting a pre-type 'w',
+ # there are three cases, depending on if the remainder starts with a digit:
+ # Case 1: remainder is type 'd', all digits
+ # Case 2: remainder is type 'd' and type 'w': digits and other characters
+ # Case 3: remainder is type 'w'
+
+ # Examples, for $numc=1:
+ # $tok => $tok_0 $tok_1 $tok_2
+ # 'x10' => 'x' '10' # case 1
+ # 'x10if' => 'x' '10' 'if' # case 2
+ # '0ne => 'O' 'ne' # case 3
+
+ # where:
+ # $tok_1 is a possible string of digits (pre-type 'd')
+ # $tok_2 is a possible word (pre-type 'w')
+
+ # return 1 if successful
+ # return undef if error (shouldn't happen)
+
+ # Calling routine should update '$type' and '$tok' if successful.
+
+ my $pretoken = $rtokens->[$i];
+ if ( $pretoken
+ && length($pretoken) > $numc
+ && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
+ {
+
+ # Split $tok into up to 3 tokens:
+ my $tok_0 = substr( $pretoken, 0, $numc );
+ my $tok_1 = defined($1) ? $1 : "";
+ my $tok_2 = defined($2) ? $2 : "";
+
+ my $len_0 = length($tok_0);
+ my $len_1 = length($tok_1);
+ my $len_2 = length($tok_2);
+
+ my $pre_type_0 = 'w';
+ my $pre_type_1 = 'd';
+ my $pre_type_2 = 'w';
+
+ my $pos_0 = $rtoken_map->[$i];
+ my $pos_1 = $pos_0 + $len_0;
+ my $pos_2 = $pos_1 + $len_1;
+
+ my $isplice = $i + 1;
+
+ # Splice in any digits
+ if ($len_1) {
+ splice @{$rtoken_map}, $isplice, 0, $pos_1;
+ splice @{$rtokens}, $isplice, 0, $tok_1;
+ splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
+ $max_token_index++;
+ $isplice++;
+ }
+
+ # Splice in any trailing word
+ if ($len_2) {
+ splice @{$rtoken_map}, $isplice, 0, $pos_2;
+ splice @{$rtokens}, $isplice, 0, $tok_2;
+ splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
+ $max_token_index++;
+ }
+
+ $rtokens->[$i] = $tok_0;
+ return 1;
+ }
+ else {
+
+ # Shouldn't get here
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
+EOM
+ }
+ }
+ return;
+ }
+
sub get_indentation_level {
# patch to avoid reporting error if indented if is not terminated
@brace_package, @square_bracket_type,
@square_bracket_structural_type, @depth_array,
@starting_line_of_current_depth, @nested_ternary_flag,
- @nested_statement_type,
+ @nested_statement_type, $next_sequence_number,
);
# save all lexical variables
( $i, $tok, $type, $id_scan_state, $identifier ) =
scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
$max_token_index, $expecting, $paren_type[$paren_depth] );
+
+ # Check for signal to fix a special variable adjacent to a keyword,
+ # such as '$^One$0'.
+ if ( $id_scan_state eq '^' ) {
+
+ # Try to fix it by splitting the pretoken
+ if ( $i > 0
+ && $rtokens->[ $i - 1 ] eq '^'
+ && split_pretoken(1) )
+ {
+ $identifier = substr( $identifier, 0, 3 );
+ $tok = $identifier;
+ }
+ else {
+
+ # This shouldn't happen ...
+ my $var = substr( $tok, 0, 3 );
+ my $excess = substr( $tok, 3 );
+ interrupt_logfile();
+ warning(<<EOM);
+$input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
+A space may be needed after '$var'.
+EOM
+ resume_logfile();
+ }
+ $id_scan_state = "";
+ }
return;
}
|| $last_nonblank_type eq 'U' ) # possible object
)
{
- $type = 'Z';
+
+ # An identifier followed by '->' is not indirect object;
+ # fixes b1175, b1176
+ my ( $next_nonblank_type, $i_next ) =
+ find_next_noncomment_type( $i, $rtokens, $max_token_index );
+ $type = 'Z' if ( $next_nonblank_type ne '->' );
}
},
'(' => sub {
&& $last_nonblank_i >= 0 )
{
if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
- $routput_token_type->[$last_nonblank_i] = 'G';
+ $routput_token_type->[$last_nonblank_i] =
+ $is_grep_alias{$block_type} ? 'k' : 'G';
}
}
# check for special variables like ${^WARNING_BITS}
if ( $expecting == TERM ) {
- # FIXME: this should work but will not catch errors
- # because we also have to be sure that previous token is
- # a type character ($,@,%).
if ( $last_nonblank_token eq '{'
&& ( $next_tok !~ /^\d/ )
&& ( $next_tok =~ /^\w/ ) )
$tok = $tok . $next_tok;
$i = $i + 1;
$type = 'w';
+
+ # Optional coding to try to catch syntax errors. This can
+ # be removed if it ever causes incorrect warning messages.
+ # The '{^' should be preceded by either by a type or '$#'
+ # Examples:
+ # $#{^CAPTURE} ok
+ # *${^LAST_FH}{NAME} ok
+ # @{^HOWDY} ok
+ # $hash{^HOWDY} error
+
+ # Note that a type sigil '$' may be tokenized as 'Z'
+ # after something like 'print', so allow type 'Z'
+ if ( $last_last_nonblank_type ne 't'
+ && $last_last_nonblank_type ne 'Z'
+ && $last_last_nonblank_token ne '$#' )
+ {
+ warning("Possible syntax error near '{^'\n");
+ }
}
else {
elsif ( $expecting == TERM ) {
unless ($saw_error) {
- # shouldn't happen..
- warning("Program bug; didn't find here doc target\n");
+ # shouldn't happen..arriving here implies an error in
+ # the logic in sub 'find_here_doc'
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+Program bug; didn't find here doc target
+EOM
+ }
+ warning(
+"Possible program error: didn't find here doc target\n"
+ );
report_definite_bug();
}
}
elsif ( $expecting == TERM ) {
unless ($saw_error) {
- # shouldn't happen..
- warning("Program bug; didn't find here doc target\n");
+ # shouldn't happen..arriving here implies an error in
+ # the logic in sub 'find_here_doc'
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+Program bug; didn't find here doc target
+EOM
+ }
+ warning(
+"Possible program error: didn't find here doc target\n"
+ );
report_definite_bug();
}
}
}
else {
+ error_if_expecting_OPERATOR();
}
},
'->' => sub {
if elsif else unless while until for foreach switch case given when);
@is_zero_continuation_block_type{@_} = (1) x scalar(@_);
- my %is_not_zero_continuation_block_type;
- @_ = qw(sort grep map do eval);
- @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
-
my %is_logical_container;
@_ = qw(if elsif unless while and or err not && ! || for foreach);
@is_logical_container{@_} = (1) x scalar(@_);
$routput_token_type->[$i] = $type;
}
- $tok = $quote_character if ($quote_character);
+
+ # Removed to fix b1280. This is not needed and was causing the
+ # starting type 'qw' to be lost, leading to mis-tokenization of
+ # a trailing block brace in a parenless for stmt 'for .. qw.. {'
+ ##$tok = $quote_character if ($quote_character);
# scan for the end of the quote or pattern
(
}
}
- $last_last_nonblank_token = $last_nonblank_token;
- $last_last_nonblank_type = $last_nonblank_type;
- $last_last_nonblank_block_type = $last_nonblank_block_type;
- $last_last_nonblank_container_type =
- $last_nonblank_container_type;
- $last_last_nonblank_type_sequence =
- $last_nonblank_type_sequence;
- $last_nonblank_token = $tok;
- $last_nonblank_type = $type;
- $last_nonblank_prototype = $prototype;
- $last_nonblank_block_type = $block_type;
- $last_nonblank_container_type = $container_type;
- $last_nonblank_type_sequence = $type_sequence;
- $last_nonblank_i = $i_tok;
+ # fix c090, only rotate vars if a new token will be stored
+ if ( $i_tok >= 0 ) {
+ $last_last_nonblank_token = $last_nonblank_token;
+ $last_last_nonblank_type = $last_nonblank_type;
+ $last_last_nonblank_block_type = $last_nonblank_block_type;
+ $last_last_nonblank_container_type =
+ $last_nonblank_container_type;
+ $last_last_nonblank_type_sequence =
+ $last_nonblank_type_sequence;
+
+ # Fix part #3 for git82: propagate type 'Z' though L-R pair
+ unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
+ $last_nonblank_token = $tok;
+ $last_nonblank_type = $type;
+ }
+ $last_nonblank_prototype = $prototype;
+ $last_nonblank_block_type = $block_type;
+ $last_nonblank_container_type = $container_type;
+ $last_nonblank_type_sequence = $type_sequence;
+ $last_nonblank_i = $i_tok;
+ }
# Patch for c030: Fix things in case a '->' got separated from
# the subsequent identifier by a side comment. We need the
scan_identifier();
}
- last if ($id_scan_state);
+ if ($id_scan_state) {
+
+ # Still scanning ...
+ # Check for side comment between sub and prototype (c061)
+
+ # done if nothing left to scan on this line
+ last if ( $i > $max_token_index );
+
+ my ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token_on_this_line( $i, $rtokens,
+ $max_token_index );
+
+ # done if it was just some trailing space
+ last if ( $i_next > $max_token_index );
+
+ # something remains on the line ... must be a side comment
+ next;
+ }
+
next if ( ( $i > 0 ) || $type );
# didn't find any token; start over
# a key with 18 a's. But something like
# push @array, a x18;
# is a syntax error.
- if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
+ if (
+ $expecting == OPERATOR
+ && substr( $tok, 0, 1 ) eq 'x'
+ && ( length($tok) == 1
+ || substr( $tok, 1, 1 ) =~ /^\d/ )
+ )
+ {
$type = 'n';
+ if ( split_pretoken(1) ) {
+ $type = 'x';
+ $tok = 'x';
+ }
}
else {
# Decide if 'sub :' can be the start of a sub attribute list.
# We will decide based on if the colon is followed by a
# bareword which is not a keyword.
+ # Changed inext+1 to inext to fixed case b1190.
my $sub_attribute_ok_here;
if ( $is_sub{$tok_kw}
&& $expecting != OPERATOR
&& $next_nonblank_token eq ':' )
{
my ( $nn_nonblank_token, $i_nn ) =
- find_next_nonblank_token( $i_next + 1,
+ find_next_nonblank_token( $i_next,
$rtokens, $max_token_index );
$sub_attribute_ok_here =
$nn_nonblank_token =~ /^\w/
}
# handle operator x (now we know it isn't $x=)
- if ( $expecting == OPERATOR
+ if (
+ $expecting == OPERATOR
&& substr( $tok, 0, 1 ) eq 'x'
- && $tok =~ /^x\d*$/ )
+ && ( length($tok) == 1
+ || substr( $tok, 1, 1 ) =~ /^\d/ )
+ )
{
- if ( $tok eq 'x' ) {
+ if ( $tok eq 'x' ) {
if ( $rtokens->[ $i + 1 ] eq '=' ) { # x=
$tok = 'x=';
$type = $tok;
$type = 'x';
}
}
-
- # NOTE: mark something like x4 as an integer for now
- # It gets fixed downstream. This is easier than
- # splitting the pretoken.
else {
+
+ # Split a pretoken like 'x10' into 'x' and '10'.
+ # Note: In previous versions of perltidy it was marked
+ # as a number, $type = 'n', and fixed downstream by the
+ # Formatter.
$type = 'n';
+ if ( split_pretoken(1) ) {
+ $type = 'x';
+ $tok = 'x';
+ }
}
}
elsif ( $tok_kw eq 'CORE::' ) {
if ( !defined($number) ) {
# shouldn't happen - we should always get a number
- warning("non-number beginning with digit--program bug\n");
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+non-number beginning with digit--program bug
+EOM
+ }
+ warning(
+"Unexpected error condition: non-number beginning with digit\n"
+ );
report_definite_bug();
}
}
# zero continuation flag at terminal BLOCK '}' which
# ends a statement.
- if ( $routput_block_type->[$i] ) {
+ my $block_type_i = $routput_block_type->[$i];
+ if ($block_type_i) {
# ...These include non-anonymous subs
# note: could be sub ::abc { or sub 'abc
- if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
+ if ( $block_type_i =~ m/^sub\s*/gc ) {
# note: older versions of perl require the /gc modifier
# here or else the \G does not work.
- if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
- {
+ if ( $block_type_i =~ /\G('|::|\w)/gc ) {
$in_statement_continuation = 0;
}
}
# block prototypes and these: (sort|grep|map|do|eval)
# /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
elsif (
- $is_zero_continuation_block_type{
- $routput_block_type->[$i]
- }
- )
+ $is_zero_continuation_block_type{$block_type_i} )
{
$in_statement_continuation = 0;
}
# ..but these are not terminal types:
# /^(sort|grep|map|do|eval)$/ )
- elsif (
- $is_not_zero_continuation_block_type{
- $routput_block_type->[$i]
- }
- )
+ elsif ($is_sort_map_grep_eval_do{$block_type_i}
+ || $is_grep_alias{$block_type_i} )
{
}
# ..and a block introduced by a label
# /^\w+\s*:$/gc ) {
- elsif ( $routput_block_type->[$i] =~ /:$/ ) {
+ elsif ( $block_type_i =~ /:$/ ) {
$in_statement_continuation = 0;
}
return;
}
-} # end tokenize_this_line
+} ## end tokenize_this_line
#########i#############################################################
# Tokenizer routines which assist in identifying token types
# exceptions to perl's weird parsing rules after type 'Z'
my %is_weird_parsing_rule_exception;
+my %is_paren_dollar;
+
+my %is_n_v;
+
BEGIN {
# Always expecting TERM following these types:
# Fix for git #62: added '*' and '%'
@q = qw( < ? * % );
- @{is_weird_parsing_rule_exception}{@q} = (OPERATOR) x scalar(@q);
+ @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
+
+ @q = qw<) $>;
+ @{is_paren_dollar}{@q} = (1) x scalar(@q);
+
+ @q = qw( n v );
+ @{is_n_v}{@q} = (1) x scalar(@q);
}
# FIXME: it would be cleaner to make this a special type
# expecting VERSION or {} after package NAMESPACE
# TODO: maybe mark these words as type 'Y'?
- if ( $statement_type =~ /^package\b/
+ if ( substr( $last_nonblank_token, 0, 7 ) eq 'package'
+ && $statement_type =~ /^package\b/
&& $last_nonblank_token =~ /^package\b/ )
{
$op_expected = TERM;
$op_expected = OPERATOR; # block mode following }
}
- elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
+ ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
+ elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
+ || substr( $last_nonblank_token, 0, 2 ) eq '->' )
+ {
$op_expected = OPERATOR;
if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
-
}
# Check for smartmatch operator before preceding brace or square
# use Module VERSION LIST
# We could avoid this exception by writing a special sub to parse 'use'
# statements and perhaps mark these numbers with a new type V (for VERSION)
- elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
+ ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
+ elsif ( $is_n_v{$last_nonblank_type} ) {
$op_expected = OPERATOR;
if ( $statement_type eq 'use' ) {
$op_expected = UNKNOWN;
$op_expected = UNKNOWN;
}
+ # Exception to weird parsing rules for 'x(' ... see case b1205:
+ # In something like 'print $vv x(...' the x is an operator;
+ # Likewise in 'print $vv x$ww' the x is an operatory (case b1207)
+ # otherwise x follows the weird parsing rules.
+ elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
+ $op_expected = OPERATOR;
+ }
+
# The 'weird parsing rules' of next section do not work for '<' and '?'
# It is best to mark them as unknown. Test case:
# print $fh <DATA>;
# otherwise, look at previous token. This must be a code block if
# it follows any of these:
# /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
- elsif ( $is_code_block_token{$last_nonblank_token} ) {
+ elsif ($is_code_block_token{$last_nonblank_token}
+ || $is_grep_alias{$last_nonblank_token} )
+ {
# Bug Patch: Note that the opening brace after the 'if' in the following
# snippet is an anonymous hash ref and not a code block!
return;
}
+my %is_sigil_or_paren;
+my %is_R_closing_sb;
+
+BEGIN {
+
+ my @q = qw< $ & % * @ ) >;
+ @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
+
+ @q = qw(R ]);
+ @{is_R_closing_sb}{@q} = (1) x scalar(@q);
+}
+
sub is_non_structural_brace {
# Decide if a brace or bracket is structural or non-structural
# otherwise, it is non-structural if it is decorated
# by type information.
# For example, the '{' here is non-structural: ${xxx}
+ # Removed '::' to fix c074
+ ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
return (
- $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
+ ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
+ $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
+ || substr( $last_nonblank_token, 0, 2 ) eq '->'
# or if we follow a hash or array closing curly brace or bracket
# For example, the second '{' in this is non-structural: $a{'x'}{'y'}
# because the first '}' would have been given type 'R'
- || $last_nonblank_type =~ /^([R\]])$/
+ ##|| $last_nonblank_type =~ /^([R\]])$/
+ || $is_R_closing_sb{$last_nonblank_type}
);
}
# Sequence numbers increment by number of items. This keeps
# a unique set of numbers but still allows the relative location
# of any type to be determined.
- $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
- my $seqno = $nesting_sequence_number[$aa];
+
+ ########################################################################
+ # OLD SEQNO METHOD for incrementing sequence numbers.
+ # Keep this coding awhile for possible testing.
+ ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
+ ## my $seqno = $nesting_sequence_number[$aa];
+
+ # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence
+ # numbers to be used as array indexes, and allows them to be compared.
+ my $seqno = $next_sequence_number++;
+ ########################################################################
+
$current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
$starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
}
}
}
- $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
+
+ # Fix part #1 for git82: save last token type for propagation of type 'Z'
+ $nested_statement_type[$aa][ $current_depth[$aa] ] =
+ [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
$statement_type = "";
return ( $seqno, $indent );
}
if ( $aa == QUESTION_COLON ) {
$outdent = $nested_ternary_flag[ $current_depth[$aa] ];
}
- $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
+
+ # Fix part #2 for git82: use saved type for propagation of type 'Z'
+ # through type L-R braces. Perl seems to allow ${bareword}
+ # as an indirect object, but nothing much more complex than that.
+ ( $statement_type, my $saved_type, my $saved_token ) =
+ @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
+ if ( $aa == BRACE
+ && $saved_type eq 'Z'
+ && $last_nonblank_type eq 'w'
+ && $brace_structural_type[$brace_depth] eq 'L' )
+ {
+ $last_nonblank_type = $saved_type;
+ }
# check that any brace types $bb contained within are balanced
for my $bb ( 0 .. @closing_brace_names - 1 ) {
$line =~ s/^\s*//; # trim leading blanks
next if ( length($line) <= 0 ); # skip blank
next if ( $line =~ /^#/ ); # skip comment
- my ( $rtok, $rmap, $rtype ) =
- pre_tokenize( $line, 2 ); # only need 2 pre-tokens
+
+ # Updated from 2 to 3 to get trigraphs, added for case b1175
+ my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
my $j = $max_token_index + 1;
foreach my $tok ( @{$rtok} ) {
# usually indicates a pattern. We can use this to break ties.
my $is_pattern_by_spacing =
- ( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' );
+ ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
# look for a possible ending / on this line..
my $in_quote = 1;
if ( $id_scan_state && ( !defined($type) || !$type ) ) {
# shouldn't happen:
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+Program bug in scan_id: undefined type but scan_state=$id_scan_state
+EOM
+ }
warning(
-"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
+"Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
);
report_definite_bug();
}
return ( $i, $tok, $type );
}
+my %is_special_variable_char;
+
+BEGIN {
+
+ # These are the only characters which can (currently) form special
+ # variables, like $^W: (issue c066).
+ my @q =
+ qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
+ @{is_special_variable_char}{@q} = (1) x scalar(@q);
+}
+
sub scan_identifier_do {
# This routine assembles tokens into identifiers. It maintains a
}
else {
- # shouldn't happen
- my ( $a, $b, $c ) = caller;
- warning("Program Bug: scan_identifier given bad token = $tok \n");
- warning(" called from sub $a line: $c\n");
- report_definite_bug();
+ # shouldn't happen: bad call parameter
+ my $msg =
+"Program bug detected: scan_identifier received bad starting token = '$tok'\n";
+ if (DEVEL_MODE) { Fault($msg) }
+ if ( !$tokenizer_self->[_in_error_] ) {
+ warning($msg);
+ $tokenizer_self->[_in_error_] = 1;
+ }
+ $id_scan_state = '';
+ goto RETURN;
}
$saw_type = !$saw_alpha;
}
}
elsif ( $tok eq '^' ) {
- # check for some special variables like $^W
+ # check for some special variables like $^ $^W
if ( $identifier =~ /^[\$\*\@\%]$/ ) {
$identifier .= $tok;
- $id_scan_state = 'A';
+ $type = 'i';
- # Perl accepts '$^]' or '@^]', but
- # there must not be a space before the ']'.
+ # There may be one more character, not a space, after the ^
my $next1 = $rtokens->[ $i + 1 ];
- if ( $next1 eq ']' ) {
+ my $chr = substr( $next1, 0, 1 );
+ if ( $is_special_variable_char{$chr} ) {
+
+ # It is something like $^W
+ # Test case (c066) : $^Oeq'linux'
$i++;
$identifier .= $next1;
+
+ # If pretoken $next1 is more than one character long,
+ # set a flag indicating that it needs to be split.
+ $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
+ last;
+ }
+ else {
+
+ # it is just $^
+ # Simple test case (c065): '$aa=$^if($bb)';
$id_scan_state = "";
last;
}
}
else {
$id_scan_state = '';
+ $i = $i_save;
+ last; # c106
}
}
- else { # something else
+ else { # something else
if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
$id_scan_state = '';
last;
}
+ elsif ( $tok eq '^' ) {
+ if ( $identifier eq '&' ) {
+
+ # Special variable (c066)
+ $identifier .= $tok;
+ $type = '&';
+
+ # There may be one more character, not a space, after the ^
+ my $next1 = $rtokens->[ $i + 1 ];
+ my $chr = substr( $next1, 0, 1 );
+ if ( $is_special_variable_char{$chr} ) {
+
+ # It is something like &^O
+ $i++;
+ $identifier .= $next1;
+
+ # If pretoken $next1 is more than one character long,
+ # set a flag indicating that it needs to be split.
+ $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
+ }
+ else {
+
+ # it is &^
+ $id_scan_state = "";
+ }
+ last;
+ }
+ else {
+ $identifier = '';
+ $i = $i_save;
+ }
+ last;
+ }
else {
# punctuation variable?
if ($saw_type) {
if ($saw_alpha) {
- if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
+
+ # The type without the -> should be the same as with the -> so
+ # that if they get separated we get the same bond strengths,
+ # etc. See b1234
+ if ( $identifier =~ /^->/
+ && $last_nonblank_type eq 'w'
+ && substr( $identifier, 2, 1 ) =~ /^\w/ )
+ {
$type = 'w';
}
else { $type = 'i' }
$i = $i_begin;
}
+ RETURN:
+
DEBUG_SCAN_ID && do {
my ( $a, $b, $c ) = caller;
print STDOUT
$max_token_index );
if ($error) { warning("Possibly invalid sub\n") }
- # Patch part #2 to fixes cases b994 and b1053:
- # Do not let spaces be part of the token of an anonymous sub keyword
- # which we marked as type 'k' above...i.e. for something like:
- # 'sub : lvalue { ...'
- # Back up and let it be parsed as a blank
+ # Patch part #2 to fixes cases b994 and b1053:
+ # Do not let spaces be part of the token of an anonymous sub
+ # keyword which we marked as type 'k' above...i.e. for
+ # something like:
+ # 'sub : lvalue { ...'
+ # Back up and let it be parsed as a blank
if ( $type eq 'k'
&& $attrs
&& $i > $i_entry
- && substr( $rtokens->[$i], 0, 1 ) eq ' ' )
+ && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
{
$i--;
}
else {
warning(
"already saw definition of 'sub $subname' in package '$package' at line $lno\n"
- );
+ ) unless (DEVEL_MODE);
}
}
$saw_function_definition{$subname}{$package} =
return ( $next_nonblank_token, $i );
}
+sub find_next_noncomment_type {
+ my ( $i, $rtokens, $max_token_index ) = @_;
+
+ # Given the current character position, look ahead past any comments
+ # and blank lines and return the next token, including digraphs and
+ # trigraphs.
+
+ my ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
+
+ # skip past any side comment
+ if ( $next_nonblank_token eq '#' ) {
+ ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
+ }
+
+ goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq " " );
+
+ # check for possible a digraph
+ goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
+ my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
+ goto RETURN if ( !$is_digraph{$test2} );
+ $next_nonblank_token = $test2;
+ $i_next = $i_next + 1;
+
+ # check for possible a trigraph
+ goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
+ my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
+ goto RETURN if ( !$is_trigraph{$test3} );
+ $next_nonblank_token = $test3;
+ $i_next = $i_next + 1;
+
+ RETURN:
+ return ( $next_nonblank_token, $i_next );
+}
+
sub is_possible_numerator {
# Look at the next non-comment character and decide if it could be a
elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
# shouldn't happen - we shouldn't be here if operator is expected
- else { warning("Program Bug in find_angle_operator_termination\n") }
+ else {
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+Bad call to find_angle_operator_termination
+EOM
+ }
+ return ( $i, $type );
+ }
# To illustrate what we might be looking at, in case we are
# guessing, here are some examples of valid angle operators
my $pos_beg = $rtoken_map->[$i];
my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
+ # Test for '<' after possible filehandle, issue c103
+ # print $fh <>; # syntax error
+ # print $fh <DATA>; # ok
+ # print $fh < DATA>; # syntax error at '>'
+ # print STDERR < DATA>; # ok, prints word 'DATA'
+ # print BLABLA <DATA>; # ok; does nothing unless BLABLA is defined
+ if ( $last_nonblank_type eq 'Z' ) {
+
+ # $str includes brackets; something like '<DATA>'
+ if ( substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
+ && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
+ {
+ return ( $i, $type );
+ }
+ }
+
# Reject if the closing '>' follows a '-' as in:
# if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
if ( $expecting eq UNKNOWN ) {
# It may be possible that a quote ends midway in a pretoken.
# If this happens, it may be necessary to split the pretoken.
if ($error) {
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+unexpected error condition returned by inverse_pretoken_map
+EOM
+ }
warning(
"Possible tokinization error..please check this line\n");
- report_possible_bug();
}
# count blanks on inside of brackets
# Look for bad starting characters; Shouldn't happen..
if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
- warning("Program bug - scan_number given character $first_char\n");
- report_definite_bug();
+ if (DEVEL_MODE) {
+ Fault(<<EOM);
+Program bug - scan_number given bad first character = '$first_char'
+EOM
+ }
return ( $i, $type, $number );
}
[Pp][+-]?[0-9a-fA-F] # REQUIRED exponent with digit
[0-9a-fA-F_]*) # optional Additional exponent digits
- # or hex integer
+ # or hex integer
|([xX][0-9a-fA-F_]+)
- # or octal fraction
+ # or octal fraction
|([oO]?[0-7_]+ # string of octal digits
(\.([0-7][0-7_]*)?)? # optional decimal and fraction
[Pp][+-]?[0-7] # REQUIRED exponent, no underscore
[Pp][+-]?[01] # Required exponent indicator, no underscore
[01_]*) # additional exponent bits
- # or binary integer
+ # or binary integer
|([bB][01_]+) # 'b' with string of binary digits
)/gx
"Note: alphanumeric quote delimiter ($beginning_tok) \n");
}
- while ( $i < $max_token_index ) {
+ # Note: changed < to <= here to fix c109. Relying on extra end blanks.
+ while ( $i <= $max_token_index ) {
if ( $quote_pos == 0 || ( $i < 0 ) ) {
$tok = $rtokens->[ ++$i ];
$quoted_string .=
substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
+ # NOTE: any quote modifiers will be at the end of '$tok'. If we
+ # wanted to check them, this is the place to get them. But
+ # this quote form is rarely used in practice, so it isn't
+ # worthwhile.
+
$quote_depth--;
if ( $quote_depth == 0 ) {
HERE_END - last line of here-doc (target word)
FORMAT - format section
FORMAT_END - last line of format section, '.'
+ SKIP - code skipping section
+ SKIP_END - last line of code skipping section, '#>>V'
DATA_START - __DATA__ line
DATA - unidentified text following __DATA__
END_START - __END__ line
switch case given when default catch try finally);
@is_code_block_token{@q} = (1) x scalar(@q);
+ # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
+ # to contrast it with the block types in '%is_zero_continuation_block_type'
+ @q = qw( sort map grep eval do );
+ @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
+
+ %is_grep_alias = ();
+
# I'll build the list of keywords incrementally
my @Keywords = ();