From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Mon, 14 Aug 2023 17:26:32 +0000 (-0700)
Subject: simplify coding for GCString length function
X-Git-Tag: 20230701.03~20
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3d11d1a7405921527ea4c627b4e71222adbef245;p=perltidy.git

simplify coding for GCString length function
---

diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm
index 79ba7df9..6738b08c 100644
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -1590,7 +1590,7 @@ EOM
 
     # Define the function to determine the display width of character
     # strings
-    my $length_function = sub { return length( $_[0] ) };
+    my $length_function;
     if ($is_encoded_data) {
 
         # Try to load Unicode::GCString for defining text display width, if
diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index 68d79cb1..dc9d51b6 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -234,7 +234,6 @@ my (
     $rOpts_tee_block_comments,
     $rOpts_tee_pod,
     $rOpts_tee_side_comments,
-    $rOpts_use_unicode_gcstring,
     $rOpts_variable_maximum_line_length,
     $rOpts_valign_code,
     $rOpts_valign_side_comments,
@@ -854,7 +853,7 @@ sub new {
         sink_object        => undef,
         diagnostics_object => undef,
         logger_object      => undef,
-        length_function    => sub { return length( $_[0] ) },
+        length_function    => undef,
         is_encoded_data    => EMPTY_STRING,
         fh_tee             => undef,
     );
@@ -890,7 +889,6 @@ sub new {
         file_writer_object => $file_writer_object,
         logger_object      => $logger_object,
         diagnostics_object => $diagnostics_object,
-        length_function    => $length_function,
     );
 
     write_logfile_entry("\nStarting tokenization pass...\n");
@@ -2479,7 +2477,6 @@ sub initialize_global_option_vars {
     $rOpts_tee_block_comments        = $rOpts->{'tee-block-comments'};
     $rOpts_tee_pod                   = $rOpts->{'tee-pod'};
     $rOpts_tee_side_comments         = $rOpts->{'tee-side-comments'};
-    $rOpts_use_unicode_gcstring      = $rOpts->{'use-unicode-gcstring'};
     $rOpts_valign_code               = $rOpts->{'valign-code'};
     $rOpts_valign_side_comments      = $rOpts->{'valign-side-comments'};
     $rOpts_valign_if_unless          = $rOpts->{'valign-if-unless'};
@@ -8084,6 +8081,7 @@ my %is_nonlist_keyword;
 my %is_nonlist_type;
 my %is_s_y_m_slash;
 my %is_unexpected_equals;
+my %is_ascii_type;
 
 BEGIN {
 
@@ -8116,6 +8114,18 @@ BEGIN {
     @q = qw( = == != );
     @is_unexpected_equals{@q} = (1) x scalar(@q);
 
+    # We can always skip the expensive $length_function->() calls for
+    # these token types because their tokens are always ascii.
+    @q = qw#
+      b k L R ; ( { [ ? : ] } ) f t n v F p m pp mm
+      .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
+      ( ) <= >= == =~ !~ != ++ -- /= x=
+      ... **= <<= >>= &&= ||= //= <=>
+      + - / * | % ! x ~ = \ ? : . < > ^ &
+      #;
+    push @q, ',';
+    @is_ascii_type{@q} = (1) x scalar(@q);
+
 } ## end BEGIN
 
 { #<<< begin closure respace_tokens
@@ -8125,7 +8135,6 @@ my $rLL_new;    # This will be the new array of tokens
 # These are variables in $self
 my $rLL;
 my $length_function;
-my $is_encoded_data;
 
 my $K_closing_ternary;
 my $K_opening_ternary;
@@ -8180,10 +8189,9 @@ sub initialize_respace_tokens_closure {
 
     $rLL_new = [];    # This is the new array
 
-    $rLL             = $self->[_rLL_];
-    $length_function = $self->[_length_function_];
-    $is_encoded_data = $self->[_is_encoded_data_];
+    $rLL = $self->[_rLL_];
 
+    $length_function           = $self->[_length_function_];
     $K_closing_ternary         = $self->[_K_closing_ternary_];
     $K_opening_ternary         = $self->[_K_opening_ternary_];
     $rchildren_of_seqno        = $self->[_rchildren_of_seqno_];
@@ -8262,12 +8270,12 @@ sub respace_tokens {
         # though these values are not actually needed for option --indent-only.
 
         $rLL               = $self->[_rLL_];
-        $length_function   = $self->[_length_function_];
         $cumulative_length = 0;
 
         foreach my $item ( @{$rLL} ) {
-            my $token        = $item->[_TOKEN_];
-            my $token_length = $length_function->($token);
+            my $token = $item->[_TOKEN_];
+            my $token_length =
+              $length_function ? $length_function->($token) : length($token);
             $cumulative_length += $token_length;
             $item->[_TOKEN_LENGTH_]      = $token_length;
             $item->[_CUMULATIVE_LENGTH_] = $cumulative_length;
@@ -9078,22 +9086,6 @@ sub set_permanently_broken {
     return;
 } ## end sub set_permanently_broken
 
-# We do not need to call the unicode GCstring length function for these types.
-# This speeds up perltidy about 4% on large utf8 files.
-my %is_non_encoded_type;
-
-BEGIN {
-    my @q = qw#
-      b k L R ; ( { [ ? : ] } ) f t n v F p m pp mm
-      .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
-      ( ) <= >= == =~ !~ != ++ -- /= x=
-      ... **= <<= >>= &&= ||= //= <=>
-      + - / * | % ! x ~ = \ ? : . < > ^ &
-      #;
-    push @q, ',';
-    @is_non_encoded_type{@q} = (1) x scalar(@q);
-}
-
 sub store_token {
 
     my ( $self, $item ) = @_;
@@ -9147,15 +9139,13 @@ sub store_token {
 
     # Set the token length.  Later it may be adjusted again if phantom or
     # ignoring side comment lengths. It is always okay to calculate the length
-    # with $length_function->(), and necessary for wide characters, but it is
-    # very slow so we avoid it and use length() when possible.  This reduces
-    # run time by several percent.  Printable ascii can use the builtin
-    # length function, but non-printable ascii characters (like tab) may get
-    # different lengths by the two methods.
+    # with $length_function->() if it is defined, but it is extremely slow so
+    # we avoid it and use the builtin length() for printable ascii tokens.
+    # Note: non-printable ascii characters (like tab) may get different lengths
+    # by the two methods, so we have to use $length_function for them.
     my $token_length =
-      (      $is_encoded_data
-          && $rOpts_use_unicode_gcstring
-          && !$is_non_encoded_type{$type}
+      (      $length_function
+          && !$is_ascii_type{$type}
           && $token =~ /[[:^ascii:][:^print:]]/ )
       ? $length_function->($token)
       : length($token);
@@ -9183,7 +9173,8 @@ sub store_token {
             && $token =~ s/\s+$//
           )
         {
-            $token_length = $length_function->($token);
+            $token_length =
+              $length_function ? $length_function->($token) : length($token);
             $item->[_TOKEN_] = $token;
         }
 
@@ -9575,9 +9566,13 @@ sub add_trailing_comma {
         my $rlines         = $self->[_rlines_];
         my $line_of_tokens = $rlines->[$line_index];
         my $input_line     = $line_of_tokens->{_line_text};
-        my $len            = $self->[_length_function_]->($input_line) - 1;
-        my $level          = $rLL->[$Kfirst]->[_LEVEL_];
-        my $max_len        = $maximum_line_length_at_level[$level];
+        my $len =
+            $length_function
+          ? $length_function->($input_line) - 1
+          : length($input_line) - 1;
+        my $level   = $rLL->[$Kfirst]->[_LEVEL_];
+        my $max_len = $maximum_line_length_at_level[$level];
+
         if ( $len >= $max_len ) {
             $match = 0;
         }
diff --git a/lib/Perl/Tidy/VerticalAligner.pm b/lib/Perl/Tidy/VerticalAligner.pm
index ba326b2c..0c3e7cb9 100644
--- a/lib/Perl/Tidy/VerticalAligner.pm
+++ b/lib/Perl/Tidy/VerticalAligner.pm
@@ -167,7 +167,6 @@ BEGIN {
         _file_writer_object_ => $i++,
         _logger_object_      => $i++,
         _diagnostics_object_ => $i++,
-        _length_function_    => $i++,
 
         _rOpts_                             => $i++,
         _rOpts_indent_columns_              => $i++,
@@ -318,7 +317,6 @@ sub new {
         file_writer_object => undef,
         logger_object      => undef,
         diagnostics_object => undef,
-        length_function    => sub { return length( $_[0] ) },
     );
     my %args = ( %defaults, @args );
 
@@ -337,7 +335,6 @@ sub new {
     $self->[_file_writer_object_] = $args{file_writer_object};
     $self->[_logger_object_]      = $args{logger_object};
     $self->[_diagnostics_object_] = $args{diagnostics_object};
-    $self->[_length_function_]    = $args{length_function};
 
     # shortcuts to user options
     my $rOpts = $args{rOpts};