From 365afb34b419349c36f5cfbcaaaabe6f25b7c685 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Tue, 15 Aug 2023 09:00:07 -0700
Subject: [PATCH] rewrite sub pre_tokenize for improved efficiency

This is a time-critical routine. It runs over 2x faster now compared to the
previous release.
---
 lib/Perl/Tidy/Tokenizer.pm | 55 ++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 9e38e794..83c81075 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -9943,23 +9943,26 @@ sub pre_tokenize {
 
     my ( $str, $max_tokens_wanted ) = @_;
 
-    # Input parameter:
+    # Input parameters:
+    #  $str = string to be parsed
     #  $max_tokens_wanted > 0  to stop on reaching this many tokens.
     #                     = 0 means get all tokens
 
-    # Break a string, $str, into a sequence of preliminary tokens.  We
-    # are interested in these types of tokens:
-    #   words       (type='w'),            example: 'max_tokens_wanted'
-    #   digits      (type = 'd'),          example: '0755'
-    #   whitespace  (type = 'b'),          example: '   '
-    #   any other single character (i.e. punct; type = the character itself).
-    # We cannot do better than this yet because we might be in a quoted
-    # string or pattern.  Caller sets $max_tokens_wanted to 0 to get all
-    # tokens.
+    # Break a string, $str, into a sequence of preliminary tokens (pre-tokens).
+    # We look for these types of tokens:
+    #   words       (type='w'),               example: 'max_tokens_wanted'
+    #   digits      (type = 'd'),             example: '0755'
+    #   whitespace  (type = 'b'),             example: '   '
+    #   single-character punct (type = char), example: '='
+
+    # Later operations will combine one or more of these pre-tokens into final
+    # tokens.  We cannot do better than this yet because we might be in a
+    # quoted string or pattern.
 
     # An advantage of doing this pre-tokenization step is that it keeps almost
-    # all of the regex work highly localized.  A disadvantage is that in some
-    # very rare instances we will have to go back and split a pre-token.
+    # all of the regex parsing very simple and localized right here.  A
+    # disadvantage is that in some extremely rare instances we will have to go
+    # back and split a pre-token.
 
     # Return parameters:
     my @tokens    = ();     # array of the tokens themselves
@@ -9968,26 +9971,26 @@ sub pre_tokenize {
 
     do {
 
-        # whitespace - this must come before \W
-        if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
-
-        # non-whitespace single-character punctuation
-        elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
-
-        # sequence of digits - this must come before \w
-        elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
-
-        # words not starting with a digit
-        elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
+        if (
+            $str =~ /\G(
+             (\s+) #     type 'b'  = whitespace - this must come before \W
+            |(\W)  #  or type=char = single-character, non-whitespace punct
+            |(\d+) #  or type 'd'  = sequence of digits - must come before \w
+            |(\w+) #  or type 'w'  = words not starting with a digit
+            )/gcx
+          )
+        {
+            push @tokens, $1;
+            push @type,
+              defined($2) ? 'b' : defined($3) ? $1 : defined($4) ? 'd' : 'w';
+            push @token_map, pos($str);
+        }
 
         # that's all..
         else {
             return ( \@tokens, \@token_map, \@type );
         }
 
-        push @tokens,    $1;
-        push @token_map, pos($str);
-
     } while ( --$max_tokens_wanted != 0 );
 
     return ( \@tokens, \@token_map, \@type );
-- 
2.39.5