From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Tue, 24 May 2022 18:16:18 +0000 (-0700)
Subject: improve tokenizer efficiency
X-Git-Tag: 20220613~15
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=07186fb049d916028dbcc9c154569f77d781ff23;p=perltidy.git

improve tokenizer efficiency
---

diff --git a/CHANGES.md b/CHANGES.md
index 3d78ed6e..71da9a71 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -31,8 +31,12 @@
       Now, these control side comments are retained when -dsc is set unless
       a -nnib flag is also set to deactivate them.
 
+    - This version runs 15 to 20 percent faster on large files than the
+      previous release due to optimizations made with the help of Devel::NYTProf.
+
     - Fixed and reactivated two failing tests (were reading local .perltidyrc file)
 
+
 ## 2022 02 17
 
     - A new flag, --encode-output-strings, or -eos, has been added to resolve
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index e8c3d626..9f364870 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -114,6 +114,8 @@ use vars qw{
   %is_if_elsif_unless
   %is_if_elsif_unless_case_when
   %other_line_endings
+  %is_END_DATA_format_sub
+  %is_semicolon_or_t
   $code_skipping_pattern_begin
   $code_skipping_pattern_end
 };
@@ -308,6 +310,13 @@ sub check_options {
     %is_sub = ();
     $is_sub{'sub'} = 1;
 
+    %is_END_DATA_format_sub = (
+        '__END__'  => 1,
+        '__DATA__' => 1,
+        'format'   => 1,
+        'sub'      => 1,
+    );
+
     # Install any aliases to 'sub'
     if ( $rOpts->{'sub-alias-list'} ) {
 
@@ -316,7 +325,8 @@ sub check_options {
         # for example, it might be 'sub method fun'
         my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
         foreach my $word (@sub_alias_list) {
-            $is_sub{$word} = 1;
+            $is_sub{$word}                 = 1;
+            $is_END_DATA_format_sub{$word} = 1;
         }
     }
 
@@ -4911,6 +4921,14 @@ EOM
     sub tokenizer_finish {
         my ($line_of_tokens) = @_;
 
+        # We have broken the current line into tokens. Now we have to package
+        # the result up for shipping.  Most of the remaining work involves
+        # defining the various indentation parameters that the formatter needs
+        # (indentation level and continuation indentation).  This turns out to
+        # be rather complicated.
+
+        # TODO: variable 'slevel' is no longer needed and can be removed
+
         my @token_type    = ();    # stack of output token types
         my @block_type    = ();    # stack of output code block types
         my @type_sequence = ();    # stack of output type sequence numbers
@@ -4987,17 +5005,30 @@ EOM
 #       and '(' -- , regardless of context, is used to compute a nesting
 #       depth.
 
-        my ( $ci_string_i, $level_i, $nesting_token_string_i, );
+        $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
 
-        foreach my $i ( @{$routput_token_list} )
-        {    # scan the list of pre-tokens indexes
+        my $ci_string_i;
 
-            # self-checking for valid token types
-            # NOTE: would prefer 'my $type' here but that will cause
-            #    the PC error 'Reused variable name in lexical scope'
-            # TODO: change to 'my $type_i'
-            $type = $routput_token_type->[$i];
-            my $forced_indentation_flag = $routput_indent_flag->[$i];
+        # loop over the list of pre-tokens indexes
+        foreach my $i ( @{$routput_token_list} ) {
+
+            # We store the slevel value before it is updated for this token
+            push( @slevels, $slevel_in_tokenizer );
+
+            # Get $tok_i, the PRE-token.  It only equals the token for symbols
+            my $tok_i  = $rtokens->[$i];
+            my $type_i = $routput_token_type->[$i];
+
+            # Check for an invalid token type..
+            # This can happen by running perltidy on non-scripts
+            # although it could also be bug introduced by programming change.
+            # Perl silently accepts a 032 (^Z) and takes it as the end
+            if ( !$is_valid_token_type{$type_i} ) {
+                my $val = ord($type_i);
+                warning(
+                    "unexpected character decimal $val ($type_i) in script\n");
+                $tokenizer_self->[_in_error_] = 1;
+            }
 
             # See if we should undo the $forced_indentation_flag.
             # Forced indentation after 'if', 'unless', 'while' and 'until'
@@ -5023,7 +5054,8 @@ EOM
             #   line, is an opening container token or a comma.
             # This almost always works, but if not after another pass it will
             # be stable.
-            if ( $forced_indentation_flag && $type eq 'k' ) {
+            my $forced_indentation_flag = $routput_indent_flag->[$i];
+            if ( $forced_indentation_flag && $type_i eq 'k' ) {
                 my $ixlast  = -1;
                 my $ilast   = $routput_token_list->[$ixlast];
                 my $toklast = $routput_token_type->[$ilast];
@@ -5054,12 +5086,12 @@ EOM
             if ($indented_if_level) {
 
                 # don't try to nest trailing if's - shouldn't happen
-                if ( $type eq 'k' ) {
+                if ( $type_i eq 'k' ) {
                     $forced_indentation_flag = 0;
                 }
 
                 # check for the normal case - outdenting at next ';'
-                elsif ( $type eq ';' ) {
+                elsif ( $type_i eq ';' ) {
                     if ( $level_in_tokenizer == $indented_if_level ) {
                         $forced_indentation_flag = -1;
                         $indented_if_level       = 0;
@@ -5067,7 +5099,7 @@ EOM
                 }
 
                 # handle case of missing semicolon
-                elsif ( $type eq '}' ) {
+                elsif ( $type_i eq '}' ) {
                     if ( $level_in_tokenizer == $indented_if_level ) {
                         $indented_if_level = 0;
 
@@ -5084,40 +5116,16 @@ EOM
                 }
             }
 
-            # NOTE: would prefer 'my $tok' here but that will cause
-            #    the PC error 'Reused variable name in lexical scope'
-            $tok     = $rtokens->[$i]; # the token, but ONLY if same as pretoken
-            $level_i = $level_in_tokenizer;
-
-            # This can happen by running perltidy on non-scripts
-            # although it could also be bug introduced by programming change.
-            # Perl silently accepts a 032 (^Z) and takes it as the end
-            if ( !$is_valid_token_type{$type} ) {
-                my $val = ord($type);
-                warning(
-                    "unexpected character decimal $val ($type) in script\n");
-                $tokenizer_self->[_in_error_] = 1;
-            }
-
-            # ----------------------------------------------------------------
-            # TOKEN TYPE PATCHES
-            #  output __END__, __DATA__, and format as type 'k' instead of ';'
-            # to make html colors correct, etc.
-            my $fix_type = $type;
-            if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
-
-            # output anonymous 'sub' as keyword
-            if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
-
-            # -----------------------------------------------------------------
-
-            $nesting_token_string_i = $nesting_token_string;
+            # Now we have the first approximation to the level
+            my $level_i = $level_in_tokenizer;
 
             # set primary indentation levels based on structural braces
             # Note: these are set so that the leading braces have a HIGHER
             # level than their CONTENTS, which is convenient for indentation
             # Also, define continuation indentation for each token.
-            if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
+            if (   $type_i eq '{'
+                || $type_i eq 'L'
+                || $forced_indentation_flag > 0 )
             {
 
                 # use environment before updating
@@ -5193,8 +5201,8 @@ EOM
                 if ($forced_indentation_flag) {
 
                     # break BEFORE '?' when there is forced indentation
-                    if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
-                    if ( $type eq 'k' ) {
+                    if ( $type_i eq '?' ) { $level_i = $level_in_tokenizer; }
+                    if ( $type_i eq 'k' ) {
                         $indented_if_level = $level_in_tokenizer;
                     }
 
@@ -5272,7 +5280,7 @@ EOM
                 if (
                     !$routput_block_type->[$i]    # patch: skip for BLOCK
                     && ($in_statement_continuation)
-                    && !( $forced_indentation_flag && $type eq ':' )
+                    && !( $forced_indentation_flag && $type_i eq ':' )
                   )
                 {
                     $total_ci += $in_statement_continuation
@@ -5283,8 +5291,8 @@ EOM
                 $in_statement_continuation = 0;
             }
 
-            elsif ($type eq '}'
-                || $type eq 'R'
+            elsif ($type_i eq '}'
+                || $type_i eq 'R'
                 || $forced_indentation_flag < 0 )
             {
 
@@ -5326,9 +5334,8 @@ EOM
                             }
                         }
 
-# ...and include all block types except user subs with
-# block prototypes and these: (sort|grep|map|do|eval)
-# /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
+                        # ...and include all block types except user subs with
+                        # block prototypes and these: (sort|grep|map|do|eval)
                         elsif (
                             $is_zero_continuation_block_type{$block_type_i} )
                         {
@@ -5364,7 +5371,7 @@ EOM
                     #           or $check eq "new"
                     #           or $check eq "old",
                     #     );
-                    elsif ( $tok eq ')' ) {
+                    elsif ( $tok_i eq ')' ) {
                         $in_statement_continuation = 1
                           if (
                             $is_list_end_type{ $routput_container_type->[$i] }
@@ -5372,7 +5379,7 @@ EOM
                         ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
                     }
 
-                    elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
+                    elsif ( $tok_i eq ';' ) { $in_statement_continuation = 0 }
                 }
 
                 # use environment after updating
@@ -5396,8 +5403,8 @@ EOM
                 # commas, this simplifies the -lp indentation logic, which
                 # counts commas.  For ?: it makes them stand out.
                 if ($nesting_list_flag) {
-                    ##      $type =~ /^[,\?\:]$/
-                    if ( $is_comma_question_colon{$type} ) {
+                    ##      $type_i =~ /^[,\?\:]$/
+                    if ( $is_comma_question_colon{$type_i} ) {
                         $in_statement_continuation = 0;
                     }
                 }
@@ -5405,8 +5412,8 @@ EOM
                 # be sure binary operators get continuation indentation
                 if (
                     $container_environment
-                    && (   $type eq 'k' && $is_binary_keyword{$tok}
-                        || $is_binary_type{$type} )
+                    && (   $type_i eq 'k' && $is_binary_keyword{$tok_i}
+                        || $is_binary_type{$type_i} )
                   )
                 {
                     $in_statement_continuation = 1;
@@ -5418,13 +5425,13 @@ EOM
 
                 # update continuation flag ...
                 # if this isn't a blank or comment..
-                if ( $type ne 'b' && $type ne '#' ) {
+                if ( $type_i ne 'b' && $type_i ne '#' ) {
 
                     # and we are in a BLOCK
                     if ($nesting_block_flag) {
 
                         # the next token after a ';' and label starts a new stmt
-                        if ( $type eq ';' || $type eq 'J' ) {
+                        if ( $type_i eq ';' || $type_i eq 'J' ) {
                             $in_statement_continuation = 0;
                         }
 
@@ -5451,7 +5458,7 @@ EOM
                         # as a non block, to simplify formatting. But these
                         # are actually blocks and can have semicolons.
                         # See code_block_type() and is_non_structural_brace().
-                        elsif ( $type eq ',' || $type eq ';' ) {
+                        elsif ( $type_i eq ',' || $type_i eq ';' ) {
                             $in_statement_continuation = 0;
                         }
 
@@ -5474,37 +5481,54 @@ EOM
             # Note: these are set so that the nesting depth is the depth
             # of the PREVIOUS TOKEN, which is convenient for setting
             # the strength of token bonds
-            my $slevel_i = $slevel_in_tokenizer;
 
             #    /^[L\{\(\[]$/
-            if ( $is_opening_type{$type} ) {
+            if ( $is_opening_type{$type_i} ) {
                 $slevel_in_tokenizer++;
-                $nesting_token_string .= $tok;
-                $nesting_type_string  .= $type;
+                $nesting_token_string .= $tok_i;
+                $nesting_type_string  .= $type_i;
             }
 
             #       /^[R\}\)\]]$/
-            elsif ( $is_closing_type{$type} ) {
+            elsif ( $is_closing_type{$type_i} ) {
                 $slevel_in_tokenizer--;
                 my $char = chop $nesting_token_string;
 
-                if ( $char ne $matching_start_token{$tok} ) {
-                    $nesting_token_string .= $char . $tok;
-                    $nesting_type_string  .= $type;
+                if ( $char ne $matching_start_token{$tok_i} ) {
+                    $nesting_token_string .= $char . $tok_i;
+                    $nesting_type_string  .= $type_i;
                 }
                 else {
                     chop $nesting_type_string;
                 }
             }
 
+            # Store the values for this token. Note that @slevel was
+            # stored at the top of the loop and @tokens is handled below.
             push( @block_type,    $routput_block_type->[$i] );
             push( @ci_string,     $ci_string_i );
             push( @levels,        $level_i );
-            push( @slevels,       $slevel_i );
-            push( @token_type,    $fix_type );
             push( @type_sequence, $routput_type_sequence->[$i] );
+            push( @token_type,    $type_i );
+
+            #------------------
+            # TOKEN TYPE PATCH:
+            #------------------
+            # - output __END__, __DATA__, and format as type 'k' instead of ';'
+            #   to make html colors correct, etc.
+            # - output anonymous 'sub' as keyword
+            # The following hash tests are equivalent to these previous tests:
+            #   if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
+            #   if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
+            # This is seldom needed and profiling showed that it is fastest to
+            # do it as follows:
+            if (   $is_END_DATA_format_sub{$tok_i}
+                && $is_semicolon_or_t{$type_i} )
+            {
+                $token_type[-1] = 'k';
+            }
 
-            # now form the previous token
+            # Form and store the previous token
             if ( $im >= 0 ) {
                 $num =
                   $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
@@ -5517,12 +5541,13 @@ EOM
 
             # or grab some values for the leading token (needed for log output)
             else {
-                $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string_i;
                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
             }
+
             $im = $i;
         }
 
+        # Form and store the final token
         $num = length($input_line) - $rtoken_map->[$im];   # make the last token
         if ( $num > 0 ) {
             push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
@@ -10142,6 +10167,9 @@ BEGIN {
     @q = qw( if elsif unless );
     @is_if_elsif_unless{@q} = (1) x scalar(@q);
 
+    @q = qw( ; t );
+    @is_semicolon_or_t{@q} = (1) x scalar(@q);
+
     @q = qw( if elsif unless case when );
     @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);