From 7e936e8c642bc6e49a5fd69a94bff53fba74b962 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Mon, 25 Sep 2023 06:40:06 -0700
Subject: [PATCH] revise method for finding leading spaces

---
 lib/Perl/Tidy/Formatter.pm |   9 +-
 lib/Perl/Tidy/Tokenizer.pm | 172 +++++++++++++++++++++++++++++--------
 2 files changed, 139 insertions(+), 42 deletions(-)

diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index 191f61b9..5d0b9055 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -9159,10 +9159,11 @@ EOM
         # check for a qw quote
         elsif ( $type eq 'q' ) {
 
-            # trim blanks from right of qw quotes
-            # (To avoid trimming qw quotes use -ntqw; the tokenizer handles
-            # this)
-            $token =~ s/\s*$//;
+            # Trim spaces from right of qw quotes.  Also trim from the left for
+            # safety (the tokenizer should have done this).
+            # To avoid trimming qw quotes use -ntqw; this causes the
+            # tokenizer to set them as type 'Q' instead of 'q'.
+            $token =~ s/^ \s+ | \s+ $//x;
             $rtoken_vars->[_TOKEN_] = $token;
             if ( $self->[_save_logfile_] && $token =~ /\t/ ) {
                 $self->note_embedded_tab($input_line_number);
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 2a8603f7..4bd3bc8f 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -20,15 +20,17 @@
 package Perl::Tidy::Tokenizer;
 use strict;
 use warnings;
-use English qw( -no_match_vars );
+use English    qw( -no_match_vars );
+use List::Util qw( min max );          # min, max first are in Perl 5.8
 
 our $VERSION = '20230912.01';
 
 use Carp;
 
-use constant DEVEL_MODE   => 0;
-use constant EMPTY_STRING => q{};
-use constant SPACE        => q{ };
+use constant USE_FAST_TRIM => 1;
+use constant DEVEL_MODE    => 0;
+use constant EMPTY_STRING  => q{};
+use constant SPACE         => q{ };
 
 # Decimal values of some ascii characters for quick checks
 use constant ORD_TAB           => 9;
@@ -214,6 +216,7 @@ BEGIN {
         _rOpts_                              => $i++,
         _rinput_lines_                       => $i++,
         _input_line_index_next_              => $i++,
+        _rleading_space_char_count_          => $i++,
     };
 } ## end BEGIN
 
@@ -598,6 +601,7 @@ sub make_source_array {
     my $rinput_lines = [];
 
     my $rsource = ref($line_source_object);
+    my $source_string;
 
     if ( !$rsource ) {
 
@@ -610,12 +614,14 @@ EOM
 
     # handle an ARRAY ref
     elsif ( $rsource eq 'ARRAY' ) {
-        $rinput_lines = $line_source_object;
+        $rinput_lines  = $line_source_object;
+        $source_string = join( EMPTY_STRING, @{$line_source_object} );
     }
 
     # handle a SCALAR ref
     elsif ( $rsource eq 'SCALAR' ) {
-        my @lines = split /^/, ${$line_source_object};
+        $source_string = ${$line_source_object};
+        my @lines = split /^/, $source_string;
         $rinput_lines = \@lines;
     }
 
@@ -624,10 +630,48 @@ EOM
         while ( my $line = $line_source_object->get_line() ) {
             push( @{$rinput_lines}, $line );
         }
+        $source_string = join( EMPTY_STRING, @{$rinput_lines} );
     }
 
-    $self->[_rinput_lines_]          = $rinput_lines;
-    $self->[_input_line_index_next_] = 0;
+    # Optional optimization. It is much faster to find leading whitespace on
+    # the whole input file than line-by-line.  If we define an array @spaces
+    # with the count of leading space characters for each line, they will be
+    # used. If @spaces is an empty array, spaces will be found line-by-line in
+    # sub 'tokenize_this_line'.
+    my @spaces;
+    if (USE_FAST_TRIM) {
+
+        # we remove all whitespace from left, but stop at a newline
+        $source_string =~ s/^ [^\S\n]+ //gxm;
+        my @trimmed_lines = split /^/, $source_string;
+
+        # The change in line length gives the number of space characters
+        if ( @trimmed_lines == @{$rinput_lines} ) {
+            my $i = -1;
+            foreach my $line (@trimmed_lines) {
+                push @spaces, length( $rinput_lines->[ ++$i ] ) - length($line);
+            }
+
+            # Be sure there are no negative spaces (shouldn't happen)
+            my $min_space = List::Util::min(@spaces);
+            if ( defined($min_space) && $min_space < 0 ) {
+
+                # shouldn't happen - safely continue with undefined spaces
+                DEVEL_MODE
+                  && $self->Fault(
+                    "Expecting min spaces >=0 but is $min_space\n");
+                @spaces = ();
+            }
+        }
+        else {
+            # Shouldn't happen - safely continue with undefined spaces
+            DEVEL_MODE && $self->Fault("line counts differ\n");
+        }
+    }
+
+    $self->[_rinput_lines_]              = $rinput_lines;
+    $self->[_rleading_space_char_count_] = \@spaces;
+    $self->[_input_line_index_next_]     = 0;
     return;
 } ## end sub make_source_array
 
@@ -996,9 +1040,12 @@ sub get_line {
 
     # get the next line from the input array
     my $input_line;
-    my $rinput_lines = $self->[_rinput_lines_];
+    my $leading_space_char_count;
     my $line_index   = $self->[_input_line_index_next_];
+    my $rinput_lines = $self->[_rinput_lines_];
     if ( $line_index < @{$rinput_lines} ) {
+        $leading_space_char_count =
+          $self->[_rleading_space_char_count_]->[$line_index];
         $input_line = $rinput_lines->[ $line_index++ ];
         $self->[_input_line_index_next_] = $line_index;
     }
@@ -1021,10 +1068,15 @@ sub get_line {
     if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
         if ( $input_line =~ s/([\r\035\032])+$// ) {
             $input_line_separator = $1 . $input_line_separator;
+
+            # This could make the old leading space count incorrect, so the
+            # safe thing to do is to make it undef. This will cause the slow
+            # method to be used to find the leading space.
+            $leading_space_char_count = undef;
         }
     }
 
-    # for backwards compatibility we keep the line text terminated with
+    # For backwards compatibility we keep the line text terminated with
     # a newline character
     $input_line .= "\n";
     $self->[_line_of_text_] = $input_line;
@@ -1061,6 +1113,7 @@ sub get_line {
     my $line_of_tokens = {
         _line_type                 => 'EOF',
         _line_text                 => $input_line,
+        _leading_space_char_count  => $leading_space_char_count,
         _line_number               => $input_line_number,
         _guessed_indentation_level => 0,
         _curly_brace_depth         => $brace_depth,
@@ -1073,7 +1126,6 @@ sub get_line {
 ##      _rlevels                   => undef,
 ##      _rblock_type               => undef,
 ##      _rtype_sequence            => undef,
-##      _rci_levels                => undef,
 ##      _starting_in_quote         => 0,
 ##      _ending_in_quote           => 0,
     };
@@ -5097,7 +5149,7 @@ EOM
   # -----------------------------------------------------------------------
 
         my ( $self, $line_of_tokens ) = @_;
-        my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
+        my $untrimmed_input_line = $line_of_tokens->{_line_text};
 
         # Extract line number for use in error messages
         $input_line_number = $line_of_tokens->{_line_number};
@@ -5122,32 +5174,87 @@ EOM
         }
 
         $input_line = $untrimmed_input_line;
-
         chomp $input_line;
 
         # Reinitialize the multi-line quote flag
         if ( $in_quote && $quote_type eq 'Q' ) {
             $line_of_tokens->{_starting_in_quote} = 1;
         }
+
+        # Trim start of this line unless we are continuing a quoted line.
+        # Do not trim end because we might end in a quote (test: deken4.pl)
+        # Perl::Tidy::Formatter will delete needless trailing blanks
         else {
             $line_of_tokens->{_starting_in_quote} = 0;
 
-            # Trim start of this line unless we are continuing a quoted line.
-            # Do not trim end because we might end in a quote (test: deken4.pl)
-            # Perl::Tidy::Formatter will delete needless trailing blanks
             if ( !length($input_line) ) {
 
                 # line is empty
             }
-            elsif ( $input_line =~ m/\S/g ) {
+            else {
+
+                # Trim the leading spaces..
+
+                #----------------------------------------------
+                # Option 1: Use saved leading spaces if defined
+                #----------------------------------------------
+                my $spaces = $line_of_tokens->{_leading_space_char_count};
+                if ( defined($spaces) ) {
+
+                    if ( $spaces >= length($input_line) ) {
 
-                # There are $spaces blank characters before a nonblank character
-                my $spaces = pos($input_line) - 1;
+                        # line has all blank characters
+                        $input_line = EMPTY_STRING;
+                        $spaces     = 0;
+                    }
+                }
+
+                #----------------------------------------------------------
+                # Option 2: Otherwise, find leading whitespace with a regex
+                #----------------------------------------------------------
+                else {
+
+                    # otherwise use slow method
+                    if ( $input_line =~ m/\S/g ) {
+
+                        # line has non-space with possible leading spaces
+                        $spaces = pos($input_line) - 1;
+                    }
+                    else {
+
+                        # line has all blank characters
+                        $input_line = EMPTY_STRING;
+                        $spaces     = 0;
+                    }
+                }
+
+                # Any leading whitespace?
                 if ( $spaces > 0 ) {
 
-                    # Trim the leading spaces
+                    if (DEVEL_MODE) {
+                        my $leading_space = substr( $input_line, 0, $spaces );
+                        if ( $leading_space =~ /\S/ ) {
+                            $self->Fault(<<EOM);
+space count $spaces caused non-whitespace in trimmed leading string: '$leading_space'
+Untrimmed line is:
+$untrimmed_input_line
+EOM
+                        }
+                    }
+
+                    # Trim the line
                     $input_line = substr( $input_line, $spaces );
 
+                    if (DEVEL_MODE) {
+                        if ( $input_line =~ /^\s/ ) {
+                            $self->Fault(<<EOM);
+space count $spaces caused whitespace in trimmed line: '$input_line'
+Untrimmed line is:
+$untrimmed_input_line
+EOM
+                        }
+                    }
+
                     # Find actual space count if there are leading tabs
                     if (
                         ord( substr( $untrimmed_input_line, 0, 1 ) ) == ORD_TAB
@@ -5161,11 +5268,6 @@ EOM
                       int( $spaces / $rOpts_indent_columns );
                 }
             }
-            else {
-
-                # line has all blank characters
-                $input_line = EMPTY_STRING;
-            }
         }
 
         if ( !$in_quote ) {
@@ -5176,7 +5278,6 @@ EOM
                 $line_of_tokens->{_rtokens}          = [];
                 $line_of_tokens->{_rtoken_type}      = [];
                 $line_of_tokens->{_rlevels}          = [];
-                $line_of_tokens->{_rci_levels}       = [];
                 $line_of_tokens->{_rblock_type}      = [];
                 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
@@ -5205,7 +5306,6 @@ EOM
                 $line_of_tokens->{_rtokens}          = [$input_line];
                 $line_of_tokens->{_rtoken_type}      = ['#'];
                 $line_of_tokens->{_rlevels}          = [$level_in_tokenizer];
-                $line_of_tokens->{_rci_levels}       = [0];
                 $line_of_tokens->{_rblock_type}      = [EMPTY_STRING];
                 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
@@ -5855,17 +5955,12 @@ EOM
             }
         }
 
-        # NOTE: This routine returns ci=0. Eventually '_rci_levels' can be
-        # removed. The ci values are computed later by sub Formatter::set_ci.
-        my @ci_levels = (0) x scalar(@levels);
-
         # Wrap up this line of tokens for shipping to the Formatter
         $line_of_tokens->{_rtoken_type}    = \@token_type;
         $line_of_tokens->{_rtokens}        = \@tokens;
         $line_of_tokens->{_rblock_type}    = \@block_type;
         $line_of_tokens->{_rtype_sequence} = \@type_sequence;
         $line_of_tokens->{_rlevels}        = \@levels;
-        $line_of_tokens->{_rci_levels}     = \@ci_levels;
 
         return;
     } ## end sub tokenizer_wrapup_line
@@ -6017,13 +6112,8 @@ sub operator_expected {
     # keyword...
     if ( $last_nonblank_type eq 'k' ) {
 
-        # keywords expecting OPERATOR:
-        if ( $expecting_operator_token{$last_nonblank_token} ) {
-            $op_expected = OPERATOR;
-        }
-
         # keywords expecting TERM:
-        elsif ( $expecting_term_token{$last_nonblank_token} ) {
+        if ( $expecting_term_token{$last_nonblank_token} ) {
 
             # Exceptions from TERM:
 
@@ -6053,6 +6143,12 @@ sub operator_expected {
                 $op_expected = TERM;
             }
         }
+
+        # keywords expecting OPERATOR:
+        elsif ( $expecting_operator_token{$last_nonblank_token} ) {
+            $op_expected = OPERATOR;
+        }
+
         else {
             $op_expected = TERM;
         }
-- 
2.39.5