New upstream version 20220217

[perltidy.git] / lib / Perl / Tidy / Tokenizer.pm
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm

index c012e47f9d89b0a0c068a375f21c8ece343ced89..b5305063b3cb9fa406f98614f06b342ee9eb3163 100644 (file)
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -21,7 +21,10 @@
  package Perl::Tidy::Tokenizer;
  use strict;
  use warnings;
-our $VERSION = '20210717';
+our $VERSION = '20220217';
+
+# this can be turned on for extra checking during development
+use constant DEVEL_MODE => 0;
  
  use Perl::Tidy::LineBuffer;
  use Carp;
@@ -55,6 +58,7 @@ use vars qw{
    @current_depth
    @total_depth
    $total_depth
+  $next_sequence_number
    @nesting_sequence_number
    @current_sequence_number
    @paren_type
@@ -88,6 +92,8 @@ use vars qw{
    %is_valid_token_type
    %is_keyword
    %is_code_block_token
+  %is_sort_map_grep_eval_do
+  %is_grep_alias
    %really_want_term
    @opening_brace_names
    @closing_brace_names
@@ -128,7 +134,8 @@ use constant MAX_NAG_MESSAGES => 6;
  
  BEGIN {
  
-    # Array index names for $self
+    # Array index names for $self.
+    # Do not combine with other BEGIN blocks (c101).
      my $i = 0;
      use constant {
          _rhere_target_list_                  => $i++,
@@ -229,6 +236,35 @@ sub Die {
      croak "unexpected return from Perl::Tidy::Die";
  }
  
+sub Fault {
+    my ($msg) = @_;
+
+    # This routine is called for errors that really should not occur
+    # except if there has been a bug introduced by a recent program change.
+    # Please add comments at calls to Fault to explain why the call
+    # should not occur, and where to look to fix it.
+    my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
+    my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
+    my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
+    my $input_stream_name = get_input_stream_name();
+
+    Die(<<EOM);
+==============================================================================
+While operating on input stream with name: '$input_stream_name'
+A fault was detected at line $line0 of sub '$subroutine1'
+in file '$filename1'
+which was called from line $line1 of sub '$subroutine2'
+Message: '$msg'
+This is probably an error introduced by a recent programming change.
+Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
+==============================================================================
+EOM
+
+    # We shouldn't get here, but this return is to keep Perl-Critic from
+    # complaining.
+    return;
+}
+
  sub bad_pattern {
  
      # See if a pattern will compile. We have to use a string eval here,
@@ -276,6 +312,15 @@ sub check_options {
          }
      }
  
+    %is_grep_alias = ();
+    if ( $rOpts->{'grep-alias-list'} ) {
+
+        # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
+        # space-separated list
+        my @q = split /\s+/, $rOpts->{'grep-alias-list'};
+        @{is_grep_alias}{@q} = (1) x scalar(@q);
+    }
+
      $rOpts_code_skipping = $rOpts->{'code-skipping'};
      $code_skipping_pattern_begin =
        make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
@@ -422,6 +467,15 @@ sub warning {
      return;
  }
  
+sub get_input_stream_name {
+    my $input_stream_name = "";
+    my $logger_object     = $tokenizer_self->[_logger_object_];
+    if ($logger_object) {
+        $input_stream_name = $logger_object->get_input_stream_name();
+    }
+    return $input_stream_name;
+}
+
  sub complain {
      my $msg           = shift;
      my $logger_object = $tokenizer_self->[_logger_object_];
@@ -601,9 +655,10 @@ EOM
              );
          }
          else {
-            warning(
-"hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
-            );
+            warning(<<EOM);
+Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
+  (Perl will match to the end of file but this may not be intended).
+EOM
          }
          my $nearly_matched_here_target_at =
            $tokenizer_self->[_nearly_matched_here_target_at_];
@@ -692,6 +747,11 @@ sub report_v_string {
      return;
  }
  
+sub is_valid_token_type {
+    my ($type) = @_;
+    return $is_valid_token_type{$type};
+}
+
  sub get_input_line_number {
      return $tokenizer_self->[_last_line_number_];
  }
@@ -714,6 +774,7 @@ sub get_line {
      my $write_logfile_entry = sub {
          my ($msg) = @_;
          write_logfile_entry("Line $input_line_number: $msg");
+        return;
      };
  
      # Find and remove what characters terminate this line, including any
@@ -747,6 +808,8 @@ sub get_line {
      #   HERE_END       - last line of here-doc (target word)
      #   FORMAT         - format section
      #   FORMAT_END     - last line of format section, '.'
+    #   SKIP           - code skipping section
+    #   SKIP_END       - last line of code skipping section, '#>>V'
      #   DATA_START     - __DATA__ line
      #   DATA           - unidentified text following __DATA__
      #   END_START      - __END__ line
@@ -884,9 +947,9 @@ sub get_line {
      # print line unchanged if in skipped section
      elsif ( $tokenizer_self->[_in_skipped_] ) {
  
-        # NOTE: marked as the existing type 'FORMAT' to keep html working
-        $line_of_tokens->{_line_type} = 'FORMAT';
+        $line_of_tokens->{_line_type} = 'SKIP';
          if ( $input_line =~ /$code_skipping_pattern_end/ ) {
+            $line_of_tokens->{_line_type} = 'SKIP_END';
              $write_logfile_entry->("Exiting code-skipping section\n");
              $tokenizer_self->[_in_skipped_] = 0;
          }
@@ -1061,7 +1124,7 @@ sub get_line {
                  $line_of_tokens->{_line_type} = 'POD_START';
                  warning(
  "=cut starts a pod section .. this can fool pod utilities.\n"
-                );
+                ) unless (DEVEL_MODE);
                  $write_logfile_entry->("Entering POD section\n");
              }
          }
@@ -1077,8 +1140,7 @@ sub get_line {
      # handle start of skipped section
      if ( $tokenizer_self->[_in_skipped_] ) {
  
-        # NOTE: marked as the existing type 'FORMAT' to keep html working
-        $line_of_tokens->{_line_type} = 'FORMAT';
+        $line_of_tokens->{_line_type} = 'SKIP';
          $write_logfile_entry->("Entering code-skipping section\n");
          return $line_of_tokens;
      }
@@ -1344,6 +1406,7 @@ sub prepare_for_a_new_file {
      @total_depth             = ();
      @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
      @current_sequence_number = ();
+    $next_sequence_number    = 2;    # The value 1 is reserved for SEQ_ROOT
  
      @paren_type                     = ();
      @paren_semicolon_count          = ();
@@ -1549,7 +1612,7 @@ sub prepare_for_a_new_file {
          (
              $routput_token_list,    $routput_token_type,
              $routput_block_type,    $routput_container_type,
-            $routput_type_sequence, $routput_type_sequence,
+            $routput_type_sequence, $routput_indent_flag,
          ) = @{$rTV2};
  
          (
@@ -1582,6 +1645,90 @@ sub prepare_for_a_new_file {
          return;
      }
  
+    sub split_pretoken {
+
+        my ($numc) = @_;
+
+     # Split the leading $numc characters from the current token (at index=$i)
+     # which is pre-type 'w' and insert the remainder back into the pretoken
+     # stream with appropriate settings.  Since we are splitting a pre-type 'w',
+     # there are three cases, depending on if the remainder starts with a digit:
+     # Case 1: remainder is type 'd', all digits
+     # Case 2: remainder is type 'd' and type 'w': digits and other characters
+     # Case 3: remainder is type 'w'
+
+        # Examples, for $numc=1:
+        #   $tok    => $tok_0 $tok_1 $tok_2
+        #   'x10'   => 'x'    '10'                # case 1
+        #   'x10if' => 'x'    '10'   'if'         # case 2
+        #   '0ne    => 'O'            'ne'        # case 3
+
+        # where:
+        #   $tok_1 is a possible string of digits (pre-type 'd')
+        #   $tok_2 is a possible word (pre-type 'w')
+
+        # return 1 if successful
+        # return undef if error (shouldn't happen)
+
+        # Calling routine should update '$type' and '$tok' if successful.
+
+        my $pretoken = $rtokens->[$i];
+        if (   $pretoken
+            && length($pretoken) > $numc
+            && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
+        {
+
+            # Split $tok into up to 3 tokens:
+            my $tok_0 = substr( $pretoken, 0, $numc );
+            my $tok_1 = defined($1) ? $1 : "";
+            my $tok_2 = defined($2) ? $2 : "";
+
+            my $len_0 = length($tok_0);
+            my $len_1 = length($tok_1);
+            my $len_2 = length($tok_2);
+
+            my $pre_type_0 = 'w';
+            my $pre_type_1 = 'd';
+            my $pre_type_2 = 'w';
+
+            my $pos_0 = $rtoken_map->[$i];
+            my $pos_1 = $pos_0 + $len_0;
+            my $pos_2 = $pos_1 + $len_1;
+
+            my $isplice = $i + 1;
+
+            # Splice in any digits
+            if ($len_1) {
+                splice @{$rtoken_map},  $isplice, 0, $pos_1;
+                splice @{$rtokens},     $isplice, 0, $tok_1;
+                splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
+                $max_token_index++;
+                $isplice++;
+            }
+
+            # Splice in any trailing word
+            if ($len_2) {
+                splice @{$rtoken_map},  $isplice, 0, $pos_2;
+                splice @{$rtokens},     $isplice, 0, $tok_2;
+                splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
+                $max_token_index++;
+            }
+
+            $rtokens->[$i] = $tok_0;
+            return 1;
+        }
+        else {
+
+            # Shouldn't get here
+            if (DEVEL_MODE) {
+                Fault(<<EOM);
+While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
+EOM
+            }
+        }
+        return;
+    }
+
      sub get_indentation_level {
  
          # patch to avoid reporting error if indented if is not terminated
@@ -1646,7 +1793,7 @@ sub prepare_for_a_new_file {
              @brace_package,                  @square_bracket_type,
              @square_bracket_structural_type, @depth_array,
              @starting_line_of_current_depth, @nested_ternary_flag,
-            @nested_statement_type,
+            @nested_statement_type,          $next_sequence_number,
          );
  
          # save all lexical variables
@@ -1709,6 +1856,33 @@ sub prepare_for_a_new_file {
          ( $i, $tok, $type, $id_scan_state, $identifier ) =
            scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
              $max_token_index, $expecting, $paren_type[$paren_depth] );
+
+        # Check for signal to fix a special variable adjacent to a keyword,
+        # such as '$^One$0'.
+        if ( $id_scan_state eq '^' ) {
+
+            # Try to fix it by splitting the pretoken
+            if (   $i > 0
+                && $rtokens->[ $i - 1 ] eq '^'
+                && split_pretoken(1) )
+            {
+                $identifier = substr( $identifier, 0, 3 );
+                $tok        = $identifier;
+            }
+            else {
+
+                # This shouldn't happen ...
+                my $var    = substr( $tok, 0, 3 );
+                my $excess = substr( $tok, 3 );
+                interrupt_logfile();
+                warning(<<EOM);
+$input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
+A space may be needed after '$var'. 
+EOM
+                resume_logfile();
+            }
+            $id_scan_state = "";
+        }
          return;
      }
  
@@ -2073,7 +2247,12 @@ EOM
                      || $last_nonblank_type eq 'U' )    # possible object
                )
              {
-                $type = 'Z';
+
+                # An identifier followed by '->' is not indirect object;
+                # fixes b1175, b1176
+                my ( $next_nonblank_type, $i_next ) =
+                  find_next_noncomment_type( $i, $rtokens, $max_token_index );
+                $type = 'Z' if ( $next_nonblank_type ne '->' );
              }
          },
          '(' => sub {
@@ -2441,7 +2620,8 @@ EOM
                      && $last_nonblank_i >= 0 )
                  {
                      if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
-                        $routput_token_type->[$last_nonblank_i] = 'G';
+                        $routput_token_type->[$last_nonblank_i] =
+                          $is_grep_alias{$block_type} ? 'k' : 'G';
                      }
                  }
  
@@ -2791,9 +2971,6 @@ EOM
              # check for special variables like ${^WARNING_BITS}
              if ( $expecting == TERM ) {
  
-                # FIXME: this should work but will not catch errors
-                # because we also have to be sure that previous token is
-                # a type character ($,@,%).
                  if (   $last_nonblank_token eq '{'
                      && ( $next_tok !~ /^\d/ )
                      && ( $next_tok =~ /^\w/ ) )
@@ -2805,6 +2982,24 @@ EOM
                      $tok  = $tok . $next_tok;
                      $i    = $i + 1;
                      $type = 'w';
+
+                    # Optional coding to try to catch syntax errors. This can
+                    # be removed if it ever causes incorrect warning messages.
+                    # The '{^' should be preceded by either by a type or '$#'
+                    # Examples:
+                    #   $#{^CAPTURE}       ok
+                    #   *${^LAST_FH}{NAME} ok
+                    #   @{^HOWDY}          ok
+                    #   $hash{^HOWDY}      error
+
+                    # Note that a type sigil '$' may be tokenized as 'Z'
+                    # after something like 'print', so allow type 'Z'
+                    if (   $last_last_nonblank_type ne 't'
+                        && $last_last_nonblank_type ne 'Z'
+                        && $last_last_nonblank_token ne '$#' )
+                    {
+                        warning("Possible syntax error near '{^'\n");
+                    }
                  }
  
                  else {
@@ -2860,8 +3055,16 @@ EOM
                  elsif ( $expecting == TERM ) {
                      unless ($saw_error) {
  
-                        # shouldn't happen..
-                        warning("Program bug; didn't find here doc target\n");
+                        # shouldn't happen..arriving here implies an error in
+                        # the logic in sub 'find_here_doc'
+                        if (DEVEL_MODE) {
+                            Fault(<<EOM);
+Program bug; didn't find here doc target
+EOM
+                        }
+                        warning(
+"Possible program error: didn't find here doc target\n"
+                        );
                          report_definite_bug();
                      }
                  }
@@ -2904,13 +3107,22 @@ EOM
                  elsif ( $expecting == TERM ) {
                      unless ($saw_error) {
  
-                        # shouldn't happen..
-                        warning("Program bug; didn't find here doc target\n");
+                        # shouldn't happen..arriving here implies an error in
+                        # the logic in sub 'find_here_doc'
+                        if (DEVEL_MODE) {
+                            Fault(<<EOM);
+Program bug; didn't find here doc target
+EOM
+                        }
+                        warning(
+"Possible program error: didn't find here doc target\n"
+                        );
                          report_definite_bug();
                      }
                  }
              }
              else {
+                error_if_expecting_OPERATOR();
              }
          },
          '->' => sub {
@@ -3007,10 +3219,6 @@ EOM
        if elsif else unless while until for foreach switch case given when);
      @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
  
-    my %is_not_zero_continuation_block_type;
-    @_ = qw(sort grep map do eval);
-    @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
-
      my %is_logical_container;
      @_ = qw(if elsif unless while and or err not && !  || for foreach);
      @is_logical_container{@_} = (1) x scalar(@_);
@@ -3306,7 +3514,11 @@ EOM
                      $routput_token_type->[$i] = $type;
  
                  }
-                $tok = $quote_character if ($quote_character);
+
+                # Removed to fix b1280.  This is not needed and was causing the
+                # starting type 'qw' to be lost, leading to mis-tokenization of
+                # a trailing block brace in a parenless for stmt 'for .. qw.. {'
+                ##$tok = $quote_character if ($quote_character);
  
                  # scan for the end of the quote or pattern
                  (
@@ -3435,20 +3647,27 @@ EOM
                      }
                  }
  
-                $last_last_nonblank_token      = $last_nonblank_token;
-                $last_last_nonblank_type       = $last_nonblank_type;
-                $last_last_nonblank_block_type = $last_nonblank_block_type;
-                $last_last_nonblank_container_type =
-                  $last_nonblank_container_type;
-                $last_last_nonblank_type_sequence =
-                  $last_nonblank_type_sequence;
-                $last_nonblank_token          = $tok;
-                $last_nonblank_type           = $type;
-                $last_nonblank_prototype      = $prototype;
-                $last_nonblank_block_type     = $block_type;
-                $last_nonblank_container_type = $container_type;
-                $last_nonblank_type_sequence  = $type_sequence;
-                $last_nonblank_i              = $i_tok;
+                # fix c090, only rotate vars if a new token will be stored
+                if ( $i_tok >= 0 ) {
+                    $last_last_nonblank_token      = $last_nonblank_token;
+                    $last_last_nonblank_type       = $last_nonblank_type;
+                    $last_last_nonblank_block_type = $last_nonblank_block_type;
+                    $last_last_nonblank_container_type =
+                      $last_nonblank_container_type;
+                    $last_last_nonblank_type_sequence =
+                      $last_nonblank_type_sequence;
+
+                    # Fix part #3 for git82: propagate type 'Z' though L-R pair
+                    unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
+                        $last_nonblank_token = $tok;
+                        $last_nonblank_type  = $type;
+                    }
+                    $last_nonblank_prototype      = $prototype;
+                    $last_nonblank_block_type     = $block_type;
+                    $last_nonblank_container_type = $container_type;
+                    $last_nonblank_type_sequence  = $type_sequence;
+                    $last_nonblank_i              = $i_tok;
+                }
  
                  # Patch for c030: Fix things in case a '->' got separated from
                  # the subsequent identifier by a side comment.  We need the
@@ -3499,7 +3718,25 @@ EOM
                      scan_identifier();
                  }
  
-                last if ($id_scan_state);
+                if ($id_scan_state) {
+
+                    # Still scanning ...
+                    # Check for side comment between sub and prototype (c061)
+
+                    # done if nothing left to scan on this line
+                    last if ( $i > $max_token_index );
+
+                    my ( $next_nonblank_token, $i_next ) =
+                      find_next_nonblank_token_on_this_line( $i, $rtokens,
+                        $max_token_index );
+
+                    # done if it was just some trailing space
+                    last if ( $i_next > $max_token_index );
+
+                    # something remains on the line ... must be a side comment
+                    next;
+                }
+
                  next if ( ( $i > 0 ) || $type );
  
                  # didn't find any token; start over
@@ -3729,8 +3966,18 @@ EOM
                             # a key with 18 a's.  But something like
                             #    push @array, a x18;
                             # is a syntax error.
-                            if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
+                            if (
+                                   $expecting == OPERATOR
+                                && substr( $tok, 0, 1 ) eq 'x'
+                                && ( length($tok) == 1
+                                    || substr( $tok, 1, 1 ) =~ /^\d/ )
+                              )
+                            {
                                  $type = 'n';
+                                if ( split_pretoken(1) ) {
+                                    $type = 'x';
+                                    $tok  = 'x';
+                                }
                              }
                              else {
  
@@ -3786,13 +4033,14 @@ EOM
                  # Decide if 'sub :' can be the start of a sub attribute list.
                  # We will decide based on if the colon is followed by a
                  # bareword which is not a keyword.
+                # Changed inext+1 to inext to fixed case b1190.
                  my $sub_attribute_ok_here;
                  if (   $is_sub{$tok_kw}
                      && $expecting != OPERATOR
                      && $next_nonblank_token eq ':' )
                  {
                      my ( $nn_nonblank_token, $i_nn ) =
-                      find_next_nonblank_token( $i_next + 1,
+                      find_next_nonblank_token( $i_next,
                          $rtokens, $max_token_index );
                      $sub_attribute_ok_here =
                           $nn_nonblank_token =~ /^\w/
@@ -3801,12 +4049,15 @@ EOM
                  }
  
                  # handle operator x (now we know it isn't $x=)
-                if (   $expecting == OPERATOR
+                if (
+                       $expecting == OPERATOR
                      && substr( $tok, 0, 1 ) eq 'x'
-                    && $tok =~ /^x\d*$/ )
+                    && ( length($tok) == 1
+                        || substr( $tok, 1, 1 ) =~ /^\d/ )
+                  )
                  {
-                    if ( $tok eq 'x' ) {
  
+                    if ( $tok eq 'x' ) {
                          if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
                              $tok  = 'x=';
                              $type = $tok;
@@ -3816,12 +4067,17 @@ EOM
                              $type = 'x';
                          }
                      }
-
-                    # NOTE: mark something like x4 as an integer for now
-                    # It gets fixed downstream.  This is easier than
-                    # splitting the pretoken.
                      else {
+
+                        # Split a pretoken like 'x10' into 'x' and '10'.
+                        # Note: In previous versions of perltidy it was marked
+                        # as a number, $type = 'n', and fixed downstream by the
+                        # Formatter.
                          $type = 'n';
+                        if ( split_pretoken(1) ) {
+                            $type = 'x';
+                            $tok  = 'x';
+                        }
                      }
                  }
                  elsif ( $tok_kw eq 'CORE::' ) {
@@ -4243,7 +4499,14 @@ EOM
                  if ( !defined($number) ) {
  
                      # shouldn't happen - we should always get a number
-                    warning("non-number beginning with digit--program bug\n");
+                    if (DEVEL_MODE) {
+                        Fault(<<EOM);
+non-number beginning with digit--program bug
+EOM
+                    }
+                    warning(
+"Unexpected error condition: non-number beginning with digit\n"
+                    );
                      report_definite_bug();
                  }
              }
@@ -4714,16 +4977,16 @@ EOM
  
                      # zero continuation flag at terminal BLOCK '}' which
                      # ends a statement.
-                    if ( $routput_block_type->[$i] ) {
+                    my $block_type_i = $routput_block_type->[$i];
+                    if ($block_type_i) {
  
                          # ...These include non-anonymous subs
                          # note: could be sub ::abc { or sub 'abc
-                        if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
+                        if ( $block_type_i =~ m/^sub\s*/gc ) {
  
                           # note: older versions of perl require the /gc modifier
                           # here or else the \G does not work.
-                            if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
-                            {
+                            if ( $block_type_i =~ /\G('|::|\w)/gc ) {
                                  $in_statement_continuation = 0;
                              }
                          }
@@ -4732,27 +4995,21 @@ EOM
  # block prototypes and these: (sort|grep|map|do|eval)
  # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
                          elsif (
-                            $is_zero_continuation_block_type{
-                                $routput_block_type->[$i]
-                            }
-                          )
+                            $is_zero_continuation_block_type{$block_type_i} )
                          {
                              $in_statement_continuation = 0;
                          }
  
                          # ..but these are not terminal types:
                          #     /^(sort|grep|map|do|eval)$/ )
-                        elsif (
-                            $is_not_zero_continuation_block_type{
-                                $routput_block_type->[$i]
-                            }
-                          )
+                        elsif ($is_sort_map_grep_eval_do{$block_type_i}
+                            || $is_grep_alias{$block_type_i} )
                          {
                          }
  
                          # ..and a block introduced by a label
                          # /^\w+\s*:$/gc ) {
-                        elsif ( $routput_block_type->[$i] =~ /:$/ ) {
+                        elsif ( $block_type_i =~ /:$/ ) {
                              $in_statement_continuation = 0;
                          }
  
@@ -4955,7 +5212,7 @@ EOM
  
          return;
      }
-}    # end tokenize_this_line
+} ## end tokenize_this_line
  
  #########i#############################################################
  # Tokenizer routines which assist in identifying token types
@@ -4967,6 +5224,10 @@ my %op_expected_table;
  # exceptions to perl's weird parsing rules after type 'Z'
  my %is_weird_parsing_rule_exception;
  
+my %is_paren_dollar;
+
+my %is_n_v;
+
  BEGIN {
  
      # Always expecting TERM following these types:
@@ -4997,7 +5258,13 @@ BEGIN {
  
      # Fix for git #62: added '*' and '%'
      @q = qw( < ? * % );
-    @{is_weird_parsing_rule_exception}{@q} = (OPERATOR) x scalar(@q);
+    @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
+
+    @q = qw<) $>;
+    @{is_paren_dollar}{@q} = (1) x scalar(@q);
+
+    @q = qw( n v );
+    @{is_n_v}{@q} = (1) x scalar(@q);
  
  }
  
@@ -5087,7 +5354,8 @@ sub operator_expected {
          # FIXME: it would be cleaner to make this a special type
          # expecting VERSION or {} after package NAMESPACE
          # TODO: maybe mark these words as type 'Y'?
-        if (   $statement_type =~ /^package\b/
+        if (   substr( $last_nonblank_token, 0, 7 ) eq 'package'
+            && $statement_type      =~ /^package\b/
              && $last_nonblank_token =~ /^package\b/ )
          {
              $op_expected = TERM;
@@ -5150,10 +5418,12 @@ sub operator_expected {
              $op_expected = OPERATOR;    # block mode following }
          }
  
-        elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
+        ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
+        elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
+            || substr( $last_nonblank_token, 0, 2 ) eq '->' )
+        {
              $op_expected = OPERATOR;
              if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
-
          }
  
          # Check for smartmatch operator before preceding brace or square
@@ -5202,7 +5472,8 @@ sub operator_expected {
      #     use Module VERSION LIST
      # We could avoid this exception by writing a special sub to parse 'use'
      # statements and perhaps mark these numbers with a new type V (for VERSION)
-    elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
+    ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
+    elsif ( $is_n_v{$last_nonblank_type} ) {
          $op_expected = OPERATOR;
          if ( $statement_type eq 'use' ) {
              $op_expected = UNKNOWN;
@@ -5231,6 +5502,14 @@ sub operator_expected {
              $op_expected = UNKNOWN;
          }
  
+        # Exception to weird parsing rules for 'x(' ... see case b1205:
+        # In something like 'print $vv x(...' the x is an operator;
+        # Likewise in 'print $vv x$ww' the x is an operatory (case b1207)
+        # otherwise x follows the weird parsing rules.
+        elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
+            $op_expected = OPERATOR;
+        }
+
          # The 'weird parsing rules' of next section do not work for '<' and '?'
          # It is best to mark them as unknown.  Test case:
          #  print $fh <DATA>;
@@ -5397,7 +5676,9 @@ sub code_block_type {
  # otherwise, look at previous token.  This must be a code block if
  # it follows any of these:
  # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
-    elsif ( $is_code_block_token{$last_nonblank_token} ) {
+    elsif ($is_code_block_token{$last_nonblank_token}
+        || $is_grep_alias{$last_nonblank_token} )
+    {
  
          # Bug Patch: Note that the opening brace after the 'if' in the following
          # snippet is an anonymous hash ref and not a code block!
@@ -5651,6 +5932,18 @@ sub report_unexpected {
      return;
  }
  
+my %is_sigil_or_paren;
+my %is_R_closing_sb;
+
+BEGIN {
+
+    my @q = qw< $ & % * @ ) >;
+    @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
+
+    @q = qw(R ]);
+    @{is_R_closing_sb}{@q} = (1) x scalar(@q);
+}
+
  sub is_non_structural_brace {
  
      # Decide if a brace or bracket is structural or non-structural
@@ -5677,13 +5970,18 @@ sub is_non_structural_brace {
      # otherwise, it is non-structural if it is decorated
      # by type information.
      # For example, the '{' here is non-structural:   ${xxx}
+    # Removed '::' to fix c074
+    ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
      return (
-        $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
+        ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
+        $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
+          || substr( $last_nonblank_token, 0, 2 ) eq '->'
  
            # or if we follow a hash or array closing curly brace or bracket
            # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
            # because the first '}' would have been given type 'R'
-          || $last_nonblank_type =~ /^([R\]])$/
+          ##|| $last_nonblank_type =~ /^([R\]])$/
+          || $is_R_closing_sb{$last_nonblank_type}
      );
  }
  
@@ -5746,8 +6044,18 @@ sub increase_nesting_depth {
      # Sequence numbers increment by number of items.  This keeps
      # a unique set of numbers but still allows the relative location
      # of any type to be determined.
-    $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
-    my $seqno = $nesting_sequence_number[$aa];
+
+    ########################################################################
+    # OLD SEQNO METHOD for incrementing sequence numbers.
+    # Keep this coding awhile for possible testing.
+    ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
+    ## my $seqno = $nesting_sequence_number[$aa];
+
+    # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence
+    # numbers to be used as array indexes, and allows them to be compared.
+    my $seqno = $next_sequence_number++;
+    ########################################################################
+
      $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
  
      $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
@@ -5772,7 +6080,10 @@ sub increase_nesting_depth {
              }
          }
      }
-    $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
+
+    # Fix part #1 for git82: save last token type for propagation of type 'Z'
+    $nested_statement_type[$aa][ $current_depth[$aa] ] =
+      [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
      $statement_type = "";
      return ( $seqno, $indent );
  }
@@ -5818,7 +6129,19 @@ sub decrease_nesting_depth {
          if ( $aa == QUESTION_COLON ) {
              $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
          }
-        $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
+
+        # Fix part #2 for git82: use saved type for propagation of type 'Z'
+        # through type L-R braces.  Perl seems to allow ${bareword}
+        # as an indirect object, but nothing much more complex than that.
+        ( $statement_type, my $saved_type, my $saved_token ) =
+          @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
+        if (   $aa == BRACE
+            && $saved_type eq 'Z'
+            && $last_nonblank_type eq 'w'
+            && $brace_structural_type[$brace_depth] eq 'L' )
+        {
+            $last_nonblank_type = $saved_type;
+        }
  
          # check that any brace types $bb contained within are balanced
          for my $bb ( 0 .. @closing_brace_names - 1 ) {
@@ -5963,8 +6286,9 @@ sub peek_ahead_for_nonblank_token {
          $line =~ s/^\s*//;                 # trim leading blanks
          next if ( length($line) <= 0 );    # skip blank
          next if ( $line =~ /^#/ );         # skip comment
-        my ( $rtok, $rmap, $rtype ) =
-          pre_tokenize( $line, 2 );        # only need 2 pre-tokens
+
+        # Updated from 2 to 3 to get trigraphs, added for case b1175
+        my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
          my $j = $max_token_index + 1;
  
          foreach my $tok ( @{$rtok} ) {
@@ -6112,7 +6436,7 @@ sub guess_if_pattern_or_division {
          # usually indicates a pattern.  We can use this to break ties.
  
          my $is_pattern_by_spacing =
-          ( $i > 1 && $next_token ne ' ' && $rtokens->[ $i - 2 ] eq ' ' );
+          ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
  
          # look for a possible ending / on this line..
          my $in_quote        = 1;
@@ -6597,8 +6921,13 @@ sub scan_id_do {
      if ( $id_scan_state && ( !defined($type) || !$type ) ) {
  
          # shouldn't happen:
+        if (DEVEL_MODE) {
+            Fault(<<EOM);
+Program bug in scan_id: undefined type but scan_state=$id_scan_state
+EOM
+        }
          warning(
-"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
+"Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
          );
          report_definite_bug();
      }
@@ -6729,6 +7058,17 @@ sub do_scan_package {
      return ( $i, $tok, $type );
  }
  
+my %is_special_variable_char;
+
+BEGIN {
+
+    # These are the only characters which can (currently) form special
+    # variables, like $^W: (issue c066).
+    my @q =
+      qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
+    @{is_special_variable_char}{@q} = (1) x scalar(@q);
+}
+
  sub scan_identifier_do {
  
      # This routine assembles tokens into identifiers.  It maintains a
@@ -6805,11 +7145,16 @@ sub scan_identifier_do {
          }
          else {
  
-            # shouldn't happen
-            my ( $a, $b, $c ) = caller;
-            warning("Program Bug: scan_identifier given bad token = $tok \n");
-            warning("   called from sub $a  line: $c\n");
-            report_definite_bug();
+            # shouldn't happen: bad call parameter
+            my $msg =
+"Program bug detected: scan_identifier received bad starting token = '$tok'\n";
+            if (DEVEL_MODE) { Fault($msg) }
+            if ( !$tokenizer_self->[_in_error_] ) {
+                warning($msg);
+                $tokenizer_self->[_in_error_] = 1;
+            }
+            $id_scan_state = '';
+            goto RETURN;
          }
          $saw_type = !$saw_alpha;
      }
@@ -6984,26 +7329,41 @@ sub scan_identifier_do {
              }
              elsif ( $tok eq '^' ) {
  
-                # check for some special variables like $^W
+                # check for some special variables like $^ $^W
                  if ( $identifier =~ /^[\$\*\@\%]$/ ) {
                      $identifier .= $tok;
-                    $id_scan_state = 'A';
+                    $type = 'i';
  
-                    # Perl accepts '$^]' or '@^]', but
-                    # there must not be a space before the ']'.
+                    # There may be one more character, not a space, after the ^
                      my $next1 = $rtokens->[ $i + 1 ];
-                    if ( $next1 eq ']' ) {
+                    my $chr   = substr( $next1, 0, 1 );
+                    if ( $is_special_variable_char{$chr} ) {
+
+                        # It is something like $^W
+                        # Test case (c066) : $^Oeq'linux'
                          $i++;
                          $identifier .= $next1;
+
+                        # If pretoken $next1 is more than one character long,
+                        # set a flag indicating that it needs to be split.
+                        $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
+                        last;
+                    }
+                    else {
+
+                        # it is just $^
+                        # Simple test case (c065): '$aa=$^if($bb)';
                          $id_scan_state = "";
                          last;
                      }
                  }
                  else {
                      $id_scan_state = '';
+                    $i             = $i_save;
+                    last;    # c106
                  }
              }
-            else {    # something else
+            else {           # something else
  
                  if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
  
@@ -7235,6 +7595,39 @@ sub scan_identifier_do {
                  $id_scan_state = '';
                  last;
              }
+            elsif ( $tok eq '^' ) {
+                if ( $identifier eq '&' ) {
+
+                    # Special variable (c066)
+                    $identifier .= $tok;
+                    $type = '&';
+
+                    # There may be one more character, not a space, after the ^
+                    my $next1 = $rtokens->[ $i + 1 ];
+                    my $chr   = substr( $next1, 0, 1 );
+                    if ( $is_special_variable_char{$chr} ) {
+
+                        # It is something like &^O
+                        $i++;
+                        $identifier .= $next1;
+
+                        # If pretoken $next1 is more than one character long,
+                        # set a flag indicating that it needs to be split.
+                        $id_scan_state = ( length($next1) > 1 ) ? '^' : "";
+                    }
+                    else {
+
+                        # it is &^
+                        $id_scan_state = "";
+                    }
+                    last;
+                }
+                else {
+                    $identifier = '';
+                    $i          = $i_save;
+                }
+                last;
+            }
              else {
  
                  # punctuation variable?
@@ -7298,7 +7691,14 @@ sub scan_identifier_do {
          if ($saw_type) {
  
              if ($saw_alpha) {
-                if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
+
+                # The type without the -> should be the same as with the -> so
+                # that if they get separated we get the same bond strengths,
+                # etc.  See b1234
+                if (   $identifier =~ /^->/
+                    && $last_nonblank_type eq 'w'
+                    && substr( $identifier, 2, 1 ) =~ /^\w/ )
+                {
                      $type = 'w';
                  }
                  else { $type = 'i' }
@@ -7343,6 +7743,8 @@ sub scan_identifier_do {
          $i   = $i_begin;
      }
  
+  RETURN:
+
      DEBUG_SCAN_ID && do {
          my ( $a, $b, $c ) = caller;
          print STDOUT
@@ -7582,15 +7984,16 @@ sub scan_identifier_do {
                      $max_token_index );
                  if ($error) { warning("Possibly invalid sub\n") }
  
-            # Patch part #2 to fixes cases b994 and b1053:
-            # Do not let spaces be part of the token of an anonymous sub keyword
-            # which we marked as type 'k' above...i.e. for something like:
-            #    'sub : lvalue { ...'
-            # Back up and let it be parsed as a blank
+                # Patch part #2 to fixes cases b994 and b1053:
+                # Do not let spaces be part of the token of an anonymous sub
+                # keyword which we marked as type 'k' above...i.e. for
+                # something like:
+                #    'sub : lvalue { ...'
+                # Back up and let it be parsed as a blank
                  if (   $type eq 'k'
                      && $attrs
                      && $i > $i_entry
-                    && substr( $rtokens->[$i], 0, 1 ) eq ' ' )
+                    && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
                  {
                      $i--;
                  }
@@ -7633,7 +8036,7 @@ sub scan_identifier_do {
                          else {
                              warning(
  "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
-                            );
+                            ) unless (DEVEL_MODE);
                          }
                      }
                      $saw_function_definition{$subname}{$package} =
@@ -7718,6 +8121,42 @@ sub find_next_nonblank_token {
      return ( $next_nonblank_token, $i );
  }
  
+sub find_next_noncomment_type {
+    my ( $i, $rtokens, $max_token_index ) = @_;
+
+    # Given the current character position, look ahead past any comments
+    # and blank lines and return the next token, including digraphs and
+    # trigraphs.
+
+    my ( $next_nonblank_token, $i_next ) =
+      find_next_nonblank_token( $i, $rtokens, $max_token_index );
+
+    # skip past any side comment
+    if ( $next_nonblank_token eq '#' ) {
+        ( $next_nonblank_token, $i_next ) =
+          find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
+    }
+
+    goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq " " );
+
+    # check for possible a digraph
+    goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
+    my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
+    goto RETURN if ( !$is_digraph{$test2} );
+    $next_nonblank_token = $test2;
+    $i_next              = $i_next + 1;
+
+    # check for possible a trigraph
+    goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
+    my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
+    goto RETURN if ( !$is_trigraph{$test3} );
+    $next_nonblank_token = $test3;
+    $i_next              = $i_next + 1;
+
+  RETURN:
+    return ( $next_nonblank_token, $i_next );
+}
+
  sub is_possible_numerator {
  
      # Look at the next non-comment character and decide if it could be a
@@ -7846,7 +8285,14 @@ sub find_angle_operator_termination {
      elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
  
      # shouldn't happen - we shouldn't be here if operator is expected
-    else { warning("Program Bug in find_angle_operator_termination\n") }
+    else {
+        if (DEVEL_MODE) {
+            Fault(<<EOM);
+Bad call to find_angle_operator_termination
+EOM
+        }
+        return ( $i, $type );
+    }
  
      # To illustrate what we might be looking at, in case we are
      # guessing, here are some examples of valid angle operators
@@ -7886,6 +8332,22 @@ sub find_angle_operator_termination {
              my $pos_beg = $rtoken_map->[$i];
              my $str     = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
  
+            # Test for '<' after possible filehandle, issue c103
+            # print $fh <>;          # syntax error
+            # print $fh <DATA>;      # ok
+            # print $fh < DATA>;     # syntax error at '>'
+            # print STDERR < DATA>;  # ok, prints word 'DATA'
+            # print BLABLA <DATA>;   # ok; does nothing unless BLABLA is defined
+            if ( $last_nonblank_type eq 'Z' ) {
+
+                # $str includes brackets; something like '<DATA>'
+                if (   substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
+                    && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
+                {
+                    return ( $i, $type );
+                }
+            }
+
              # Reject if the closing '>' follows a '-' as in:
              # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
              if ( $expecting eq UNKNOWN ) {
@@ -7907,9 +8369,13 @@ sub find_angle_operator_termination {
              # It may be possible that a quote ends midway in a pretoken.
              # If this happens, it may be necessary to split the pretoken.
              if ($error) {
+                if (DEVEL_MODE) {
+                    Fault(<<EOM);
+unexpected error condition returned by inverse_pretoken_map
+EOM
+                }
                  warning(
                      "Possible tokinization error..please check this line\n");
-                report_possible_bug();
              }
  
              # count blanks on inside of brackets
@@ -8004,8 +8470,11 @@ sub scan_number_do {
  
      # Look for bad starting characters; Shouldn't happen..
      if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
-        warning("Program bug - scan_number given character $first_char\n");
-        report_definite_bug();
+        if (DEVEL_MODE) {
+            Fault(<<EOM);
+Program bug - scan_number given bad first character = '$first_char'
+EOM
+        }
          return ( $i, $type, $number );
      }
  
@@ -8041,10 +8510,10 @@ sub scan_number_do {
             [Pp][+-]?[0-9a-fA-F]          # REQUIRED exponent with digit
             [0-9a-fA-F_]*)                # optional Additional exponent digits
  
-          # or hex integer
+           # or hex integer
             |([xX][0-9a-fA-F_]+)        
  
-          # or octal fraction
+           # or octal fraction
             |([oO]?[0-7_]+          # string of octal digits
             (\.([0-7][0-7_]*)?)?    # optional decimal and fraction
             [Pp][+-]?[0-7]          # REQUIRED exponent, no underscore
@@ -8059,7 +8528,7 @@ sub scan_number_do {
             [Pp][+-]?[01]           # Required exponent indicator, no underscore
             [01_]*)                 # additional exponent bits
  
-          # or binary integer
+           # or binary integer
             |([bB][01_]+)           # 'b' with string of binary digits 
  
             )/gx
@@ -8393,7 +8862,8 @@ sub follow_quoted_string {
                  "Note: alphanumeric quote delimiter ($beginning_tok) \n");
          }
  
-        while ( $i < $max_token_index ) {
+        # Note: changed < to <= here to fix c109. Relying on extra end blanks.
+        while ( $i <= $max_token_index ) {
  
              if ( $quote_pos == 0 || ( $i < 0 ) ) {
                  $tok = $rtokens->[ ++$i ];
@@ -8421,6 +8891,11 @@ sub follow_quoted_string {
                  $quoted_string .=
                    substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
  
+                # NOTE: any quote modifiers will be at the end of '$tok'. If we
+                # wanted to check them, this is the place to get them.  But
+                # this quote form is rarely used in practice, so it isn't
+                # worthwhile.
+
                  $quote_depth--;
  
                  if ( $quote_depth == 0 ) {
@@ -8732,6 +9207,8 @@ The following additional token types are defined:
      HERE_END       - last line of here-doc (target word)
      FORMAT         - format section
      FORMAT_END     - last line of format section, '.'
+    SKIP           - code skipping section
+    SKIP_END       - last line of code skipping section, '#>>V'
      DATA_START     - __DATA__ line
      DATA           - unidentified text following __DATA__
      END_START      - __END__ line
@@ -8800,6 +9277,13 @@ BEGIN {
        switch case given when default catch try finally);
      @is_code_block_token{@q} = (1) x scalar(@q);
  
+    # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
+    # to contrast it with the block types in '%is_zero_continuation_block_type'
+    @q = qw( sort map grep eval do );
+    @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
+
+    %is_grep_alias = ();
+
      # I'll build the list of keywords incrementally
      my @Keywords = ();