From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Tue, 3 Nov 2020 15:19:13 +0000 (-0800)
Subject: added some tokenizer speedups
X-Git-Tag: 20201001.03~10
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5847ced819fbea1b075c292aaedd6a3a092ade1a;p=perltidy.git

added some tokenizer speedups
---

diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 5454ee52..d96be7b3 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -1,4 +1,4 @@
-######################################################################
+#####################################################################
 #
 # The Perl::Tidy::Tokenizer package is essentially a filter which
 # reads lines of perl source code from a source object and provides
@@ -97,6 +97,7 @@ use vars qw{
   %is_q_qq_qw_qx_qr_s_y_tr_m
   %is_sub
   %is_package
+  %is_comma_question_colon
 };
 
 # possible values of operator_expected()
@@ -2800,7 +2801,9 @@ sub prepare_for_a_new_file {
         $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
 
         # check for pod documentation
-        if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
+        if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
+            && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
+        {
 
             # must not be in multi-line quote
             # and must not be in an equation
@@ -2820,7 +2823,7 @@ sub prepare_for_a_new_file {
         # do not trim end because we might end in a quote (test: deken4.pl)
         # Perl::Tidy::Formatter will delete needless trailing blanks
         unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
-            $input_line =~ s/^\s*//;    # trim left end
+            $input_line =~ s/^\s+//;    # trim left end
         }
 
         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
@@ -2856,7 +2859,7 @@ sub prepare_for_a_new_file {
         my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
 
         # a little optimization for a full-line comment
-        if ( !$in_quote && ( $input_line =~ /^#/ ) ) {
+        if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
             $max_tokens_wanted = 1    # no use tokenizing a comment
         }
 
@@ -3066,7 +3069,7 @@ EOM
 
             # continue gathering identifier if necessary
             # but do not start on blanks and comments
-            if ( $id_scan_state && $pre_type !~ /[b#]/ ) {
+            if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {
 
                 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
                     scan_id();
@@ -3337,7 +3340,10 @@ EOM
                 }
 
                 # handle operator x (now we know it isn't $x=)
-                if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) {
+                if (   $expecting == OPERATOR
+                    && substr( $tok, 0, 1 ) eq 'x'
+                    && $tok =~ /^x\d*$/ )
+                {
                     if ( $tok eq 'x' ) {
 
                         if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
@@ -4169,7 +4175,7 @@ EOM
                   )
                 {
                     $total_ci += $in_statement_continuation
-                      unless ( $ci_string_in_tokenizer =~ /1$/ );
+                      unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
                 }
 
                 $ci_string_i               = $total_ci;
@@ -4190,9 +4196,11 @@ EOM
                 if ( length($nesting_block_string) > 1 )
                 {    # true for valid script
                     chop $nesting_block_string;
-                    $nesting_block_flag = ( $nesting_block_string =~ /1$/ );
+                    $nesting_block_flag =
+                      substr( $nesting_block_string, -1 ) eq '1';
                     chop $nesting_list_string;
-                    $nesting_list_flag = ( $nesting_list_string =~ /1$/ );
+                    $nesting_list_flag =
+                      substr( $nesting_list_string, -1 ) eq '1';
 
                     chop $ci_string_in_tokenizer;
                     $ci_string_sum = ones_count($ci_string_in_tokenizer);
@@ -4287,7 +4295,8 @@ EOM
                 # commas, this simplifies the -lp indentation logic, which
                 # counts commas.  For ?: it makes them stand out.
                 if ($nesting_list_flag) {
-                    if ( $type =~ /^[,\?\:]$/ ) {
+                    ## hash lookup replaces the regex: $type =~ /^[,\?\:]$/
+                    if ( $is_comma_question_colon{$type} ) {
                         $in_statement_continuation = 0;
                     }
                 }
@@ -8464,6 +8473,10 @@ BEGIN {
     @q = qw(package);
     @is_package{@q} = (1) x scalar(@q);
 
+    @q = qw( ? : );
+    push @q, ',';
+    @is_comma_question_colon{@q} = (1) x scalar(@q);
+
     # These keywords are handled specially in the tokenizer code:
     my @special_keywords = qw(
       do