From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Fri, 3 Sep 2021 13:14:14 +0000 (-0700)
Subject: eliminate some needless regex calls
X-Git-Tag: 20210717.02~11
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=402f2466a1260414786ea44bcbc1b160d78c1158;p=perltidy.git

eliminate some needless regex calls
---

diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index 30ac1e6e..475d4a5a 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -495,6 +495,11 @@ BEGIN {
     # Maximum number of little messages; probably need not be changed.
     use constant MAX_NAG_MESSAGES => 6;
 
+    # This is the decimal range of printable characters in ASCII.  It is used to
+    # make quick preliminary checks before resorting to using a regex.
+    use constant ORD_PRINTABLE_MIN => 33;
+    use constant ORD_PRINTABLE_MAX => 126;
+
     # Initialize constant hashes ...
     my @q;
 
@@ -853,7 +858,9 @@ sub check_rLL {
     my $rLL    = $self->[_rLL_];
     my $Klimit = $self->[_Klimit_];
     my $num    = @{$rLL};
-    if ( $num > 0 && !defined($Klimit) || $Klimit != $num - 1 ) {
+    if (   ( defined($Klimit) && $Klimit != $num - 1 )
+        || ( !defined($Klimit) && $num > 0 ) )
+    {
 
         # This fault can occur if the array has been accessed for an index
         # greater than $Klimit, which is the last token index.  Just accessing
@@ -861,7 +868,7 @@ sub check_rLL {
         # increase beyond $Klimit.  If this occurs, the problem can be located
         # by making calls to this routine at different locations in
         # sub 'finish_formatting'.
-        $Klimit = '' if ( !defined($Klimit) );
+        $Klimit = 'undef' if ( !defined($Klimit) );
         $msg    = "" unless $msg;
         Fault("$msg ERROR: rLL has num=$num but Klimit='$Klimit'\n");
     }
@@ -902,6 +909,44 @@ EOM
     return;
 }
 
+sub check_token_array {
+    my $self = shift;
+
+    # Check for errors in the array of tokens. This is only called
+    # when the DEVEL_MODE flag is set, so this Fault will only occur
+    # during code development.
+    my $rLL = $self->[_rLL_];
+    for ( my $KK = 0 ; $KK < @{$rLL} ; $KK++ ) {
+        my $nvars = @{ $rLL->[$KK] };
+        if ( $nvars != _NVARS ) {
+            my $NVARS = _NVARS;
+            my $type  = $rLL->[$KK]->[_TYPE_];
+            $type = '*' unless defined($type);
+
+            # The number of variables per token node is _NVARS and was set when
+            # the array indexes were generated. So if the number of variables
+            # is different we have done something wrong, like not store all of
+            # them in sub 'write_line' when they were received from the
+            # tokenizer.
+            Fault(
+"number of vars for node $KK, type '$type', is $nvars but should be $NVARS"
+            );
+        }
+        foreach my $var ( _TOKEN_, _TYPE_ ) {
+            if ( !defined( $rLL->[$KK]->[$var] ) ) {
+                my $iline = $rLL->[$KK]->[_LINE_INDEX_];
+
+                # This is a simple check that each token has some basic
+                # variables.  In other words, that there are no holes in the
+                # array of tokens.  Sub 'write_line' pushes tokens into the
+                # $rLL array, so this should guarantee no gaps.
+                Fault("Undefined variable $var for K=$KK, line=$iline\n");
+            }
+        }
+    }
+    return;
+}
+
 {    ## begin closure check_line_hashes
 
     # This code checks that no autovivification occurs in the 'line' hash
@@ -1050,44 +1095,6 @@ sub get_output_line_number {
     return $vao->get_output_line_number();
 }
 
-sub check_token_array {
-    my $self = shift;
-
-    # Check for errors in the array of tokens. This is only called now
-    # when the DEVEL_MODE flag is set, so this Fault will only occur
-    # during code development.
-    my $rLL = $self->[_rLL_];
-    for ( my $KK = 0 ; $KK < @{$rLL} ; $KK++ ) {
-        my $nvars = @{ $rLL->[$KK] };
-        if ( $nvars != _NVARS ) {
-            my $NVARS = _NVARS;
-            my $type  = $rLL->[$KK]->[_TYPE_];
-            $type = '*' unless defined($type);
-
-            # The number of variables per token node is _NVARS and was set when
-            # the array indexes were generated. So if the number of variables
-            # is different we have done something wrong, like not store all of
-            # them in sub 'write_line' when they were received from the
-            # tokenizer.
-            Fault(
-"number of vars for node $KK, type '$type', is $nvars but should be $NVARS"
-            );
-        }
-        foreach my $var ( _TOKEN_, _TYPE_ ) {
-            if ( !defined( $rLL->[$KK]->[$var] ) ) {
-                my $iline = $rLL->[$KK]->[_LINE_INDEX_];
-
-                # This is a simple check that each token has some basic
-                # variables.  In other words, that there are no holes in the
-                # array of tokens.  Sub 'write_line' pushes tokens into the
-                # $rLL array, so this should guarantee no gaps.
-                Fault("Undefined variable $var for K=$KK, line=$iline\n");
-            }
-        }
-    }
-    return;
-}
-
 sub want_blank_line {
     my $self = shift;
     $self->flush();
@@ -2805,8 +2812,8 @@ EOM
 
         # Note2: The -mangle option causes large numbers of calls to this
         # routine and therefore is a good test. So if a change is made, be sure
-        # to run a large number of files with the -mangle option and check for
-        # differences.
+        # to use nytprof to profile with both old and reviesed coding using the
+        # -mangle option and check differences.
 
         my ( $tokenll, $typell, $tokenl, $typel, $tokenr, $typer ) = @_;
 
@@ -2900,11 +2907,14 @@ EOM
 
                 # keep a space between a token ending in '$' and any word;
                 # this caused trouble:  "die @$ if $@"
-                || $typel eq 'i' && $tokenl =~ /\$$/
+                ##|| $typel eq 'i' && $tokenl =~ /\$$/
+                || $typel eq 'i' && substr( $tokenl, -1, 1 ) eq '$'
 
                 # don't combine $$ or $# with any alphanumeric
                 # (testfile mangle.t with --mangle)
-                || $tokenl =~ /^\$[\$\#]$/
+                ##|| $tokenl =~ /^\$[\$\#]$/
+                || $tokenl eq '$$'
+                || $tokenl eq '$#'
 
             )
           )    ## end $tokenr_is_bareword
@@ -2928,7 +2938,8 @@ EOM
           || $typel eq 'w' && ( $tokenr eq '-' || $typer eq 'Q' )
 
           # perl is very fussy about spaces before <<
-          || $tokenr =~ /^\<\</
+          || substr( $tokenr, 0, 2 ) eq '<<'
+          ##|| $tokenr =~ /^\<\</
 
           # avoid combining tokens to create new meanings. Example:
           #     $a+ +$b must not become $a++$b
@@ -2977,8 +2988,11 @@ EOM
 
           # be careful with a space around ++ and --, to avoid ambiguity as to
           # which token it applies
-          || $typer =~ /^(pp|mm)$/     && $tokenl !~ /^[\;\{\(\[]/
-          || $typel =~ /^(\+\+|\-\-)$/ && $tokenr !~ /^[\;\}\)\]]/
+          ##|| $typer =~ /^(pp|mm)$/     && $tokenl !~ /^[\;\{\(\[]/
+          || ( $typer eq 'pp' || $typer eq 'mm' ) && $tokenl !~ /^[\;\{\(\[]/
+          || ( $typel eq '++' || $typel eq '--' )
+          && $tokenr !~ /^[\;\}\)\]]/
+          ##|| $typel =~ /^(\+\+|\-\-)$/ && $tokenr !~ /^[\;\}\)\]]/
 
           # need space after foreach my; for example, this will fail in
           # older versions of Perl:
@@ -2986,9 +3000,11 @@ EOM
           || (
             $tokenl eq 'my'
 
+            && substr( $tokenr, 0, 1 ) eq '$'
+            ##&& $tokenr =~ /^\$/
+
             #  /^(for|foreach)$/
             && $is_for_foreach{$tokenll}
-            && $tokenr =~ /^\$/
           )
 
           # We must be sure that a space between a ? and a quoted string
@@ -5402,6 +5418,8 @@ my %is_sigil;
 my %is_nonlist_keyword;
 my %is_nonlist_type;
 my %is_special_check_type;
+my %is_s_y_m_slash;
+my %is_unexpected_equals;
 
 BEGIN {
 
@@ -5428,6 +5446,12 @@ BEGIN {
     @q = qw( && || );
     @is_nonlist_type{@q} = (1) x scalar(@q);
 
+    @q = qw( s y m / );
+    @is_s_y_m_slash{@q} = (1) x scalar(@q);
+
+    @q = qw( = == != );
+    @is_unexpected_equals{@q} = (1) x scalar(@q);
+
 }
 
 sub respace_tokens {
@@ -5653,7 +5677,14 @@ sub respace_tokens {
         if ($is_comment) {
 
             # trim comments if necessary
-            if ( $token =~ s/\s+$// ) {
+            my $ord = ord( substr( $token, -1, 1 ) );
+            if (
+                $ord > 0
+                && (   $ord < ORD_PRINTABLE_MIN
+                    || $ord > ORD_PRINTABLE_MAX )
+                && $token =~ s/\s+$//
+              )
+            {
                 $token_length = $length_function->($token);
                 $item->[_TOKEN_] = $token;
             }
@@ -5670,7 +5701,6 @@ sub respace_tokens {
             {
                 $set_permanently_broken->($seqno);
             }
-
         }
 
         $item->[_TOKEN_LENGTH_] = $token_length;
@@ -5902,9 +5932,20 @@ sub respace_tokens {
         my $token = $rLL->[$KK]->[_TOKEN_];
         $self->note_embedded_tab($line_number) if ( $token =~ "\t" );
 
+        # The remainder of this routine looks for something like
+        #        '$var = s/xxx/yyy/;'
+        # in case it should have been '$var =~ s/xxx/yyy/;'
+
+        # Start by looking for a token begining with one of: s y m / tr
+        return
+          unless ( $is_s_y_m_slash{ substr( $token, 0, 1 ) }
+            || substr( $token, 0, 2 ) eq 'tr' );
+
+        # ... and preceded by one of: = == !=
         my $Kp = $self->K_previous_nonblank( undef, $rLL_new );
         return unless ( defined($Kp) );
-        my $previous_nonblank_type  = $rLL_new->[$Kp]->[_TYPE_];
+        my $previous_nonblank_type = $rLL_new->[$Kp]->[_TYPE_];
+        return unless ( $is_unexpected_equals{$previous_nonblank_type} );
         my $previous_nonblank_token = $rLL_new->[$Kp]->[_TOKEN_];
 
         my $previous_nonblank_type_2  = 'b';
@@ -5925,11 +5966,10 @@ sub respace_tokens {
         my $token_0 = $rLL->[$Kfirst]->[_TOKEN_];
         my $type_0  = $rLL->[$Kfirst]->[_TYPE_];
 
-        # make note of something like '$var = s/xxx/yyy/;'
-        # in case it should have been '$var =~ s/xxx/yyy/;'
         if (
-               $token =~ /^(s|tr|y|m|\/)/
-            && $previous_nonblank_token =~ /^(=|==|!=)$/
+            ##$token =~ /^(s|tr|y|m|\/)/
+            ##&& $previous_nonblank_token =~ /^(=|==|!=)$/
+            1
 
             # preceded by simple scalar
             && $previous_nonblank_type_2 eq 'i'
@@ -6213,21 +6253,24 @@ sub respace_tokens {
             #     ( $type =~ /^[wit]$/ )
             elsif ( $is_wit{$type} ) {
 
-                my $leading_char = substr( $token, 0, 1 );
-
-                # $sigil =~ /^[\$\&\%\*\@]$/ )
-                if ( $is_sigil{$leading_char} ) {
+                # change '$  var'  to '$var' etc
+                # change '@    '   to '@'
+                # Examples: <<snippets/space1.in>>
+                my $ord = ord( substr( $token, 1, 1 ) );
+                if (
 
-                    # change '$  var'  to '$var' etc
-                    # change '@    '   to '@'
-                    # Examples: <<snippets/space1.in>>
+                    # quick test for possible blank at second char
+                    $ord > 0 && ( $ord < ORD_PRINTABLE_MIN
+                        || $ord > ORD_PRINTABLE_MAX )
+                  )
+                {
                     my ( $sigil, $word ) = split /\s+/, $token, 2;
-                    if ( length($sigil) == 1 ) {
-                        {
-                            $token = $sigil;
-                            $token .= $word if ($word);
-                            $rtoken_vars->[_TOKEN_] = $token;
-                        }
+
+                    # $sigil =~ /^[\$\&\%\*\@]$/ )
+                    if ( $is_sigil{$sigil} ) {
+                        $token = $sigil;
+                        $token .= $word if ($word);
+                        $rtoken_vars->[_TOKEN_] = $token;
                     }
                 }
 
@@ -6237,7 +6280,8 @@ sub respace_tokens {
                 # and 'new' with a possible blank between.
                 #
                 # Note: there is a related patch in sub set_whitespace_flags
-                elsif ($leading_char eq '-'
+                elsif (length($token) > 2
+                    && substr( $token, 0, 2 ) eq '->'
                     && $token =~ /^\-\>(.*)$/
                     && $1 )
                 {
@@ -6323,8 +6367,17 @@ sub respace_tokens {
                     # witch
                     # ()   # prototype may be on new line ...
                     # ...
-                    $token =~ s/\s+$//g;
-                    $rtoken_vars->[_TOKEN_] = $token;
+                    my $ord = ord( substr( $token, -1, 1 ) );
+                    if (
+
+                        # quick check for possible ending space
+                        $ord > 0 && ( $ord < ORD_PRINTABLE_MIN
+                            || $ord > ORD_PRINTABLE_MAX )
+                      )
+                    {
+                        $token =~ s/\s+$//g;
+                        $rtoken_vars->[_TOKEN_] = $token;
+                    }
                 }
             }
 
@@ -6396,7 +6449,7 @@ sub respace_tokens {
             # patch to add space to something like "x10"
             # This avoids having to split this token in the pre-tokenizer
             elsif ( $type eq 'n' ) {
-                if ( $token =~ /^x\d+/ ) {
+                if ( substr( $token, 0, 1 ) eq 'x' && $token =~ /^x\d+/ ) {
                     $token =~ s/x/x /;
                     $rtoken_vars->[_TOKEN_] = $token;
                 }
@@ -6637,8 +6690,7 @@ sub respace_tokens {
     if ( @{$rLL_new} ) { $Klimit = @{$rLL_new} - 1 }
     $self->[_Klimit_] = $Klimit;
 
-    # DEBUG OPTION: make sure the new array looks okay.
-    # This is no longer needed but should be retained for future development.
+    # During development, verify that the new array still looks okay.
     DEVEL_MODE && $self->check_token_array();
 
     # reset the token limits of each line
@@ -11016,7 +11068,14 @@ EOM
         # end the current batch, EXCEPT for a few special cases
         my ($self) = @_;
 
-        return unless ( $max_index_to_go >= 0 );
+        if ( $max_index_to_go < 0 ) {
+
+            # This is harmless but should be elimintated in development
+            if (DEVEL_MODE) {
+                Fault("End batch called with nothing to do; please fix\n");
+            }
+            return;
+        }
 
         # Exceptions when a line does not end with a comment... (fixes c058)
         if ( $types_to_go[$max_index_to_go] ne '#' ) {
@@ -11057,7 +11116,9 @@ EOM
         # Exception: if we are flushing within the code stream only to insert
         # blank line(s), then we can keep the batch intact at a weld. This
         # improves formatting of -ce.  See test 'ce1.ce'
-        if ( $CODE_type && $CODE_type eq 'BL' ) { $self->end_batch() }
+        if ( $CODE_type && $CODE_type eq 'BL' ) {
+            $self->end_batch() if ( $max_index_to_go >= 0 );
+        }
 
         # otherwise, we have to shut things down completely.
         else { $self->flush_batch_of_CODE() }
@@ -11181,7 +11242,7 @@ EOM
             }
 
             destroy_one_line_block();
-            $self->end_batch();
+            $self->end_batch() if ( $max_index_to_go >= 0 );
 
             # output a blank line before block comments
             if (
@@ -11354,7 +11415,7 @@ EOM
             if ( $rbrace_follower && $type ne 'b' ) {
 
                 unless ( $rbrace_follower->{$token} ) {
-                    $self->end_batch();
+                    $self->end_batch() if ( $max_index_to_go >= 0 );
                 }
                 $rbrace_follower = undef;
             }
@@ -11485,7 +11546,7 @@ EOM
                         $self->unstore_token_to_go();
 
                         # then output the line
-                        $self->end_batch();
+                        $self->end_batch() if ( $max_index_to_go >= 0 );
 
                         # and now store this token at the start of a new line
                         $self->store_token_to_go( $Ktoken_vars, $rtoken_vars );
@@ -11497,7 +11558,7 @@ EOM
 
                 # now output this line
                 unless ($no_internal_newlines) {
-                    $self->end_batch();
+                    $self->end_batch() if ( $max_index_to_go >= 0 );
                 }
             }
 
@@ -11530,7 +11591,7 @@ EOM
                 {
 
                     # write out everything before this closing curly brace
-                    $self->end_batch();
+                    $self->end_batch() if ( $max_index_to_go >= 0 );
                 }
 
                 # Now update for side comment
@@ -11669,14 +11730,16 @@ EOM
 
                     unless ( $rbrace_follower->{$next_nonblank_token} ) {
                         $self->end_batch()
-                          unless ($no_internal_newlines);
+                          unless ( $no_internal_newlines
+                            || $max_index_to_go < 0 );
                     }
                     $rbrace_follower = undef;
                 }
 
                 else {
                     $self->end_batch()
-                      unless ($no_internal_newlines);
+                      unless ( $no_internal_newlines
+                        || $max_index_to_go < 0 );
                 }
 
             }    # end treatment of closing block token
@@ -11697,7 +11760,8 @@ EOM
                   )
                 {
                     destroy_one_line_block();
-                    $self->end_batch() if ($break_before_semicolon);
+                    $self->end_batch()
+                      if ( $break_before_semicolon && $max_index_to_go >= 0 );
                 }
 
                 $self->store_token_to_go( $Ktoken_vars, $rtoken_vars );
@@ -11788,7 +11852,7 @@ EOM
           )
         {
             destroy_one_line_block();
-            $self->end_batch();
+            $self->end_batch() if ( $max_index_to_go >= 0 );
         }
 
         # Check for a soft break request