From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Sat, 23 Apr 2022 17:07:50 +0000 (-0700)
Subject: tokenizer optimizations
X-Git-Tag: 20220613~50
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=2b333a0ff1939ea5451502c0a685d0b4c9df2281;p=perltidy.git

tokenizer optimizations
---

diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index 9ebdda8b..79ab63b1 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -3161,7 +3161,7 @@ EOM
         @is_for_foreach{@q} = (1) x scalar(@q);
 
         @q = qw(
-          .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
+          .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
           <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
         );
         @is_digraph{@q} = (1) x scalar(@q);
@@ -5128,6 +5128,7 @@ EOM
             my $seqno = $rtype_sequence->[$j];
             my $token = $rtokens->[$j];
             my $type  = $rtoken_type->[$j];
+            $seqno = "" unless ( defined($seqno) );
             my $err_msg =
 "Error at j=$j, line number $input_line_no, seqno='$seqno', type='$type', tok='$token':\n";
 
@@ -5415,6 +5416,9 @@ EOM
                         push @{$rSS}, $sign * $seqno;
 
                     }
+                    else {
+                        $seqno = "" unless ( defined($seqno) );
+                    }
 
                     my @tokary;
                     @tokary[
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 4ffad0a7..eb7465c7 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -86,6 +86,7 @@ use vars qw{
   %expecting_term_types
   %expecting_term_token
   %is_digraph
+  %can_start_digraph
   %is_file_test_operator
   %is_trigraph
   %is_tetragraph
@@ -438,6 +439,11 @@ sub new {
       $rOpts->{'maximum-unexpected-errors'};
     $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
     $self->[_rOpts_]         = $rOpts;
+
+    # These vars are used for guessing indentation and must be positive
+    $self->[_tabsize_]        = 8 if ( !$self->[_tabsize_] );
+    $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
+
     bless $self, $class;
 
     $tokenizer_self = $self;
@@ -3437,16 +3443,15 @@ EOM
                 my $leading_spaces = $1;
                 my $spaces         = length($leading_spaces);
 
-                # handle any tabs
+                # handle leading tabs
                 if ( ord( substr( $leading_spaces, 0, 1 ) ) == 9
                     && $leading_spaces =~ /^(\t+)/ )
                 {
-                    $spaces +=
-                      length($1) * ( $tokenizer_self->[_tabsize_] - 1 );
+                    my $tabsize = $tokenizer_self->[_tabsize_];
+                    $spaces += length($1) * ( $tabsize - 1 );
                 }
 
                 my $indent_columns = $tokenizer_self->[_indent_columns_];
-                $indent_columns = 4 if ( !$indent_columns );
                 $line_of_tokens->{_guessed_indentation_level} =
                   int( $spaces / $indent_columns );
             }
@@ -3505,6 +3510,7 @@ EOM
         push( @{$rtoken_type}, 'b', 'b', 'b' );
 
         # initialize for main loop
+        if (0) { #<<< this is not necessary
         foreach my $ii ( 0 .. $max_token_index + 3 ) {
             $routput_token_type->[$ii]     = "";
             $routput_block_type->[$ii]     = "";
@@ -3512,6 +3518,8 @@ EOM
             $routput_type_sequence->[$ii]  = "";
             $routput_indent_flag->[$ii]    = 0;
         }
+        }
+
         $i     = -1;
         $i_tok = -1;
 
@@ -3711,16 +3719,21 @@ EOM
                 $routput_type_sequence->[$i_tok]  = $type_sequence;
                 $routput_indent_flag->[$i_tok]    = $indent_flag;
             }
-            my $pre_tok  = $rtokens->[$i];        # get the next pre-token
-            my $pre_type = $rtoken_type->[$i];    # and type
-            $tok        = $pre_tok;
-            $type       = $pre_type;              # to be modified as necessary
-            $block_type = "";    # blank for all tokens except code block braces
-            $container_type = "";    # blank for all tokens except some parens
-            $type_sequence  = "";    # blank for all tokens except ?/:
-            $indent_flag    = 0;
-            $prototype = "";    # blank for all tokens except user defined subs
-            $i_tok     = $i;
+
+            # get the next pre-token and type
+            # $tok and $type will be modified to make the output token
+            my $pre_tok  = $tok  = $rtokens->[$i];      # get the next pre-token
+            my $pre_type = $type = $rtoken_type->[$i];  # and type
+
+            # remember the starting index of this token; we will be updating $i
+            $i_tok = $i;
+
+            # re-initialize various flags for the next output token
+            $block_type     &&= "";
+            $container_type &&= "";
+            $type_sequence  &&= "";
+            $indent_flag    &&= 0;
+            $prototype      &&= "";
 
             # this pre-token will start an output token
             push( @{$routput_token_list}, $i_tok );
@@ -3777,7 +3790,10 @@ EOM
             # I have allowed tokens starting with <, such as <=,
             # because I don't think these could be valid angle operators.
             # test file: storrs4.pl
-            if ( $is_digraph{ $tok . $rtokens->[ $i + 1 ] } ) {
+            if (   $can_start_digraph{$tok}
+                && $i < $max_token_index
+                && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
+            {
 
                 my $combine_ok = 1;
                 my $test_tok   = $tok . $rtokens->[ $i + 1 ];
@@ -3788,12 +3804,12 @@ EOM
                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
                 # could be migrated here for clarity
 
-              # Patch for RT#102371, misparsing a // in the following snippet:
-              #     state $b //= ccc();
-              # The solution is to always accept the digraph (or trigraph) after
-              # token type 'Z' (possible file handle).  The reason is that
-              # sub operator_expected gives TERM expected here, which is
-              # wrong in this case.
+                # Patch for RT#102371, misparsing a // in the following snippet:
+                #     state $b //= ccc();
+                # The solution is to always accept the digraph (or trigraph)
+                # after type 'Z' (possible file handle).  The reason is that
+                # sub operator_expected gives TERM expected here, which is
+                # wrong in this case.
                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
                     my $next_type = $rtokens->[ $i + 1 ];
                     my $expecting =
@@ -3923,8 +3939,8 @@ EOM
                                 }
                             );
 
-                   # If successful, mark as type 'q' to be consistent with other
-                   # attributes.  Note that type 'w' would also work.
+                            # If successful, mark as type 'q' to be consistent
+                            # with other attributes.  Type 'w' would also work.
                             if ( $i > $i_beg ) {
                                 $type = 'q';
                                 next;
@@ -3969,24 +3985,24 @@ EOM
                         }
                         else {
 
-                           # Bareword followed by a fat comma ... see 'git18.in'
-                           # If tok is something like 'x17' then it could
-                           # actually be operator x followed by number 17.
-                           # For example, here:
-                           #     123x17 => [ 792, 1224 ],
-                           # (a key of 123 repeated 17 times, perhaps not
-                           # what was intended). We will mark x17 as type
-                           # 'n' and it will be split. If the previous token
-                           # was also a bareword then it is not very clear is
-                           # going on.  In this case we will not be sure that
-                           # an operator is expected, so we just mark it as a
-                           # bareword.  Perl is a little murky in what it does
-                           # with stuff like this, and its behavior can change
-                           # over time.  Something like
-                           #    a x18 => [792, 1224], will compile as
-                           # a key with 18 a's.  But something like
-                           #    push @array, a x18;
-                           # is a syntax error.
+                            # Bareword followed by a fat comma - see 'git18.in'
+                            # If tok is something like 'x17' then it could
+                            # actually be operator x followed by number 17.
+                            # For example, here:
+                            #     123x17 => [ 792, 1224 ],
+                            # (a key of 123 repeated 17 times, perhaps not
+                            # what was intended). We will mark x17 as type
+                            # 'n' and it will be split. If the previous token
+                            # was also a bareword then it is not very clear is
+                            # going on.  In this case we will not be sure that
+                            # an operator is expected, so we just mark it as a
+                            # bareword.  Perl is a little murky in what it does
+                            # with stuff like this, and its behavior can change
+                            # over time.  Something like
+                            #    a x18 => [792, 1224], will compile as
+                            # a key with 18 a's.  But something like
+                            #    push @array, a x18;
+                            # is a syntax error.
                             if (
                                    $expecting == OPERATOR
                                 && substr( $tok, 0, 1 ) eq 'x'
@@ -4012,11 +4028,12 @@ EOM
                     }
                 }
 
-     # quote a bare word within braces..like xxx->{s}; note that we
-     # must be sure this is not a structural brace, to avoid
-     # mistaking {s} in the following for a quoted bare word:
-     #     for(@[){s}bla}BLA}
-     # Also treat q in something like var{-q} as a bare word, not qoute operator
+                # quote a bare word within braces..like xxx->{s}; note that we
+                # must be sure this is not a structural brace, to avoid
+                # mistaking {s} in the following for a quoted bare word:
+                #     for(@[){s}bla}BLA}
+                # Also treat q in something like var{-q} as a bare word, not
+                # a qoute operator
                 if (
                     $next_nonblank_token eq '}'
                     && (
@@ -9281,11 +9298,16 @@ BEGIN {
     my @q;
 
     my @digraphs = qw(
-      .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
+      .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
       <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
     );
     @is_digraph{@digraphs} = (1) x scalar(@digraphs);
 
+    @q = qw(
+      . : < > * & | / - = + -  %  ^ !  x ~
+    );
+    @can_start_digraph{@q} = (1) x scalar(@q);
+
     my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
     @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);