From 71b8a13f4d4557104eaf623f3bd105bd31d0c671 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Thu, 30 Jun 2022 06:08:16 -0700
Subject: [PATCH] speed up tokenizer by several percent

---
 lib/Perl/Tidy/Tokenizer.pm | 165 +++++++++++++++++++++----------------
 1 file changed, 92 insertions(+), 73 deletions(-)

diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index b178e19c..596076be 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -4440,22 +4440,15 @@ EOM
         my $line_of_tokens = shift;
         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
 
-        # patch while coding change is underway
-        # make callers private data to allow access
-        # $tokenizer_self = $caller_tokenizer_self;
-
-        # extract line number for use in error messages
+        # Extract line number for use in error messages
         $input_line_number = $line_of_tokens->{_line_number};
 
-        # reinitialize for multi-line quote
-        $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
-
-        # check for pod documentation
+        # Check for pod documentation
         if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
         {
 
-            # must not be in multi-line quote
+            # Must not be in multi-line quote
             # and must not be in an equation
             if ( !$in_quote
                 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
@@ -4474,15 +4467,21 @@ EOM
         # a fat comma.
         my $is_END_or_DATA;
 
-        # trim start of this line unless we are continuing a quoted line
-        # do not trim end because we might end in a quote (test: deken4.pl)
-        # Perl::Tidy::Formatter will delete needless trailing blanks
-        unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
-            $input_line =~ s/^(\s+)//;    # trim left end
+        # Reinitialize the multi-line quote flag
+        if ( $in_quote && $quote_type eq 'Q' ) {
+            $line_of_tokens->{_starting_in_quote} = 1;
+        }
+        else {
+            $line_of_tokens->{_starting_in_quote} = 0;
+
+            # Trim start of this line unless we are continuing a quoted line.
+            # Do not trim end because we might end in a quote (test: deken4.pl)
+            # Perl::Tidy::Formatter will delete needless trailing blanks
+            $input_line =~ s/^(\s+)//;
 
-            # calculate a guessed level for nonblank lines to avoid calls to
+            # Calculate a guessed level for nonblank lines to avoid calls to
             #    sub guess_old_indentation_level()
-            if ( $input_line && $1 ) {
+            if ( length($input_line) && $1 ) {
                 my $leading_spaces = $1;
                 my $spaces         = length($leading_spaces);
 
@@ -4503,6 +4502,47 @@ EOM
               && $input_line =~ /^__(END|DATA)__\s*$/;
         }
 
+        # Optimize for a full-line comment.
+        if ( !$in_quote ) {
+            if ( substr( $input_line, 0, 1 ) eq '#' ) {
+
+                # Check for the start of a code-skipping section
+                if (   $rOpts_code_skipping
+                    && $input_line =~ /$code_skipping_pattern_begin/ )
+                {
+                    $tokenizer_self->[_in_skipped_] = 1;
+                    return;
+                }
+
+                # Fast path: store the block comment as a single '#' token
+                my $ci_string_sum =
+                  ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
+                my $ci_string_i = $ci_string_sum + $in_statement_continuation;
+                $line_of_tokens->{_line_type}        = 'CODE';
+                $line_of_tokens->{_rtokens}          = [$input_line];
+                $line_of_tokens->{_rtoken_type}      = ['#'];
+                $line_of_tokens->{_rlevels}          = [$level_in_tokenizer];
+                $line_of_tokens->{_rci_levels}       = [$ci_string_i];
+                $line_of_tokens->{_rblock_type}      = [EMPTY_STRING];
+                $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
+                $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
+                return;
+            }
+
+            # Optimize handling of a blank line
+            if ( !length($input_line) ) {
+                $line_of_tokens->{_line_type}        = 'CODE';
+                $line_of_tokens->{_rtokens}          = [];
+                $line_of_tokens->{_rtoken_type}      = [];
+                $line_of_tokens->{_rlevels}          = [];
+                $line_of_tokens->{_rci_levels}       = [];
+                $line_of_tokens->{_rblock_type}      = [];
+                $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
+                $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
+                return;
+            }
+        }
+
         # update the copy of the line for use in error messages
         # This must be exactly what we give the pre_tokenizer
         $tokenizer_self->[_line_of_text_] = $input_line;
@@ -4526,39 +4566,7 @@ EOM
         $indent_flag     = 0;
         $peeked_ahead    = 0;
 
-        # This variable signals pre_tokenize to get all tokens.
-        # But note that it is no longer needed with fast block comment
-        # option below.
-        my $max_tokens_wanted = 0;
-
-        # optimize for a full-line comment
-        if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
-            $max_tokens_wanted = 1;    # no use tokenizing a comment
-
-            # and check for skipped section
-            if (   $rOpts_code_skipping
-                && $input_line =~ /$code_skipping_pattern_begin/ )
-            {
-                $tokenizer_self->[_in_skipped_] = 1;
-                return;
-            }
-
-            # Optional fast processing of a block comment
-            my $ci_string_sum =
-              ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
-            my $ci_string_i = $ci_string_sum + $in_statement_continuation;
-            $line_of_tokens->{_line_type}        = 'CODE';
-            $line_of_tokens->{_rtokens}          = [$input_line];
-            $line_of_tokens->{_rtoken_type}      = ['#'];
-            $line_of_tokens->{_rlevels}          = [$level_in_tokenizer];
-            $line_of_tokens->{_rci_levels}       = [$ci_string_i];
-            $line_of_tokens->{_rblock_type}      = [EMPTY_STRING];
-            $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
-            $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
-            return;
-        }
-
-        tokenizer_main_loop( $max_tokens_wanted, $is_END_or_DATA );
+        tokenizer_main_loop($is_END_or_DATA);
 
         #-----------------------------------------------
         # all done tokenizing this line ...
@@ -4571,18 +4579,16 @@ EOM
     } ## end sub tokenize_this_line
 
     sub tokenizer_main_loop {
-        my ( $max_tokens_wanted, $is_END_or_DATA ) = @_;
-
-        # tokenization is done in two stages..
-        # stage 1 is a very simple pre-tokenization
+        my ($is_END_or_DATA) = @_;
 
         # start by breaking the line into pre-tokens
+        my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
         ( $rtokens, $rtoken_map, $rtoken_type ) =
           pre_tokenize( $input_line, $max_tokens_wanted );
 
         $max_token_index = scalar( @{$rtokens} ) - 1;
         push( @{$rtokens}, SPACE, SPACE, SPACE )
-          ;    # extra whitespace simplifies logic
+          ;                        # extra whitespace simplifies logic
         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
         push( @{$rtoken_type}, 'b', 'b', 'b' );
 
@@ -4950,7 +4956,6 @@ EOM
         my @levels        = ();    # structural brace levels of output tokens
         my @ci_string = ();  # string needed to compute continuation indentation
         my $container_environment = EMPTY_STRING;
-        my $im                    = -1;             # previous $i value
 
         # Count the number of '1's in the string (previously sub ones_count)
         my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
@@ -5022,6 +5027,7 @@ EOM
         my ( $ci_string_i, $level_i );
 
         # loop over the list of pre-tokens indexes
+        my $rtoken_map_im;
         foreach my $i ( @{$routput_token_list} ) {
 
             # Get $tok_i, the PRE-token.  It only equals the token for symbols
@@ -5552,13 +5558,20 @@ EOM
             push( @token_type,    $type_i );
 
             # Form and store the previous token
-            if ( $im >= 0 ) {
+            if ( defined($rtoken_map_im) ) {
                 my $numc =
-                  $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
+                  $rtoken_map->[$i] - $rtoken_map_im;    # how many characters
 
                 if ( $numc > 0 ) {
                     push( @tokens,
-                        substr( $input_line, $rtoken_map->[$im], $numc ) );
+                        substr( $input_line, $rtoken_map_im, $numc ) );
+                }
+                else {
+
+                    # Should not happen unless @{$rtoken_map} is corrupted
+                    DEVEL_MODE
+                      && Fault(
+                        "number of characters is '$numc' but should be >0\n");
                 }
             }
 
@@ -5567,13 +5580,22 @@ EOM
                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
             }
 
-            $im = $i;
+            $rtoken_map_im = $rtoken_map->[$i];
         } ## end foreach my $i ( @{$routput_token_list...})
 
         # Form and store the final token
-        my $numc = length($input_line) - $rtoken_map->[$im];
-        if ( $numc > 0 ) {
-            push( @tokens, substr( $input_line, $rtoken_map->[$im], $numc ) );
+        if ( defined($rtoken_map_im) ) {
+            my $numc = length($input_line) - $rtoken_map_im;
+            if ( $numc > 0 ) {
+                push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) );
+            }
+            else {
+
+                # Should not happen unless @{$rtoken_map} is corrupted
+                DEVEL_MODE
+                  && Fault(
+                    "Number of Characters is '$numc' but should be >0\n");
+            }
         }
 
         $line_of_tokens->{_rtoken_type}    = \@token_type;
@@ -5694,8 +5716,6 @@ sub operator_expected {
 
     my ($rarg) = @_;
 
-    my $msg = EMPTY_STRING;
-
     ##############
     # Table lookup
     ##############
@@ -5704,8 +5724,10 @@ sub operator_expected {
     # This typically handles half or more of the calls.
     my $op_expected = $op_expected_table{$last_nonblank_type};
     if ( defined($op_expected) ) {
-        $msg = "Table lookup";
-        goto RETURN;
+        DEBUG_OPERATOR_EXPECTED
+          && print STDOUT
+"OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
+        return $op_expected;
     }
 
     ######################
@@ -5927,12 +5949,9 @@ sub operator_expected {
         $op_expected = UNKNOWN;
     }
 
-  RETURN:
-
-    DEBUG_OPERATOR_EXPECTED && do {
-        print STDOUT
-"OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
-    };
+    DEBUG_OPERATOR_EXPECTED
+      && print STDOUT
+"OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
 
     return $op_expected;
 
-- 
2.39.5