From: Steve Hancock Date: Mon, 14 Aug 2023 03:27:02 +0000 (-0700) Subject: avoid calling GCString length function when possible X-Git-Tag: 20230701.03~21 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b123dc6e3057a1f963a78609f7c3c7340d253e65;p=perltidy.git avoid calling GCString length function when possible This speeds up processing large utf8 files about 4% --- diff --git a/CHANGES.md b/CHANGES.md index be5a6d98..b628eb25 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,8 +6,8 @@ to limit tidy operations to a limited line range. Line numbers start with 1. The man pages have details. - - This version runs about 7% faster than the previous release on - large files. + - This version runs 7 to 10 percent faster than the previous release on + large files, depending on options and file type. ## 2023 07 01 diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index 18e8169a..68d79cb1 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -234,6 +234,7 @@ my ( $rOpts_tee_block_comments, $rOpts_tee_pod, $rOpts_tee_side_comments, + $rOpts_use_unicode_gcstring, $rOpts_variable_maximum_line_length, $rOpts_valign_code, $rOpts_valign_side_comments, @@ -2478,6 +2479,7 @@ sub initialize_global_option_vars { $rOpts_tee_block_comments = $rOpts->{'tee-block-comments'}; $rOpts_tee_pod = $rOpts->{'tee-pod'}; $rOpts_tee_side_comments = $rOpts->{'tee-side-comments'}; + $rOpts_use_unicode_gcstring = $rOpts->{'use-unicode-gcstring'}; $rOpts_valign_code = $rOpts->{'valign-code'}; $rOpts_valign_side_comments = $rOpts->{'valign-side-comments'}; $rOpts_valign_if_unless = $rOpts->{'valign-if-unless'}; @@ -9076,6 +9078,22 @@ sub set_permanently_broken { return; } ## end sub set_permanently_broken +# We do not need to call the unicode GCstring length function for these types. +# This speeds up perltidy about 4% on large utf8 files. +my %is_non_encoded_type; + +BEGIN { + my @q = qw# + b k L R ; ( { [ ? : ] } ) f t n v F p m pp mm + .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <> + ( ) <= >= == =~ !~ != ++ -- /= x= + ... **= <<= >>= &&= ||= //= <=> + + - / * | % ! x ~ = \ ? : . < > ^ & + #; + push @q, ','; + @is_non_encoded_type{@q} = (1) x scalar(@q); +} + sub store_token { my ( $self, $item ) = @_; @@ -9128,9 +9146,19 @@ sub store_token { ]; # Set the token length. Later it may be adjusted again if phantom or - # ignoring side comment lengths. + # ignoring side comment lengths. It is always okay to calculate the length + # with $length_function->(), and necessary for wide characters, but it is + # very slow so we avoid it and use length() when possible. This reduces + # run time by several percent. Printable ascii can use the builtin + # length function, but non-printable ascii characters (like tab) may get + # different lengths by the two methods. my $token_length = - $is_encoded_data ? $length_function->($token) : length($token); + ( $is_encoded_data + && $rOpts_use_unicode_gcstring + && !$is_non_encoded_type{$type} + && $token =~ /[[:^ascii:][:^print:]]/ ) + ? $length_function->($token) + : length($token); # handle blanks if ( $type eq 'b' ) {