From 41797ca5f978b8664016c6c1493e89861692ba88 Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Thu, 21 Sep 2023 20:33:11 -0700 Subject: [PATCH] improve tokenizer efficiency --- lib/Perl/Tidy.pm | 42 ++++++++++++---------- lib/Perl/Tidy/Formatter.pm | 4 +-- lib/Perl/Tidy/Tokenizer.pm | 71 ++++++++++++++++++++++---------------- 3 files changed, 66 insertions(+), 51 deletions(-) diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm index 6880b2ad..8b47cedf 100644 --- a/lib/Perl/Tidy.pm +++ b/lib/Perl/Tidy.pm @@ -3480,25 +3480,29 @@ sub generate_options { $add_option->( 'stack-opening-hash-brace', 'sohb', '!' ); $add_option->( 'stack-opening-paren', 'sop', '!' ); $add_option->( 'stack-opening-square-bracket', 'sosb', '!' ); - $add_option->( 'vertical-tightness', 'vt', '=i' ); - $add_option->( 'vertical-tightness-closing', 'vtc', '=i' ); - $add_option->( 'want-break-after', 'wba', '=s' ); - $add_option->( 'want-break-before', 'wbb', '=s' ); - $add_option->( 'break-after-all-operators', 'baao', '!' ); - $add_option->( 'break-before-all-operators', 'bbao', '!' ); - $add_option->( 'keep-interior-semicolons', 'kis', '!' ); - $add_option->( 'one-line-block-semicolons', 'olbs', '=i' ); - $add_option->( 'one-line-block-nesting', 'olbn', '=i' ); - $add_option->( 'one-line-block-exclusion-list', 'olbxl', '=s' ); - $add_option->( 'break-before-hash-brace', 'bbhb', '=i' ); - $add_option->( 'break-before-hash-brace-and-indent', 'bbhbi', '=i' ); - $add_option->( 'break-before-square-bracket', 'bbsb', '=i' ); - $add_option->( 'break-before-square-bracket-and-indent', 'bbsbi', '=i' ); - $add_option->( 'break-before-paren', 'bbp', '=i' ); - $add_option->( 'break-before-paren-and-indent', 'bbpi', '=i' ); - $add_option->( 'brace-left-list', 'bll', '=s' ); - $add_option->( 'brace-left-exclusion-list', 'blxl', '=s' ); - $add_option->( 'break-after-labels', 'bal', '=i' ); + + # FIXME: --vt and --vtc are actually expansions now, so these two lines + # should eventually be removed. + $add_option->( 'vertical-tightness', 'vt', '=i' ); + $add_option->( 'vertical-tightness-closing', 'vtc', '=i' ); + + $add_option->( 'want-break-after', 'wba', '=s' ); + $add_option->( 'want-break-before', 'wbb', '=s' ); + $add_option->( 'break-after-all-operators', 'baao', '!' ); + $add_option->( 'break-before-all-operators', 'bbao', '!' ); + $add_option->( 'keep-interior-semicolons', 'kis', '!' ); + $add_option->( 'one-line-block-semicolons', 'olbs', '=i' ); + $add_option->( 'one-line-block-nesting', 'olbn', '=i' ); + $add_option->( 'one-line-block-exclusion-list', 'olbxl', '=s' ); + $add_option->( 'break-before-hash-brace', 'bbhb', '=i' ); + $add_option->( 'break-before-hash-brace-and-indent', 'bbhbi', '=i' ); + $add_option->( 'break-before-square-bracket', 'bbsb', '=i' ); + $add_option->( 'break-before-square-bracket-and-indent', 'bbsbi', '=i' ); + $add_option->( 'break-before-paren', 'bbp', '=i' ); + $add_option->( 'break-before-paren-and-indent', 'bbpi', '=i' ); + $add_option->( 'brace-left-list', 'bll', '=s' ); + $add_option->( 'brace-left-exclusion-list', 'blxl', '=s' ); + $add_option->( 'break-after-labels', 'bal', '=i' ); # This was an experiment mentioned in git #78, originally named -bopl. I # expanded it to also open logical blocks, based on git discussion #100, diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index 95e866ec..c4f28800 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -6941,7 +6941,7 @@ sub follow_if_chain { push @seqno_list, $seqno; # Update info for this block - my $block_type = $rblock_type_of_seqno->{$seqno}; + $block_type = $rblock_type_of_seqno->{$seqno}; if ( $block_type eq 'elsif' ) { $elsif_count++ } my $item = $rlevel_info->{$seqno}; if ( defined($item) ) { @@ -7014,7 +7014,7 @@ sub follow_if_chain { } # check count - return unless ( $elsif_count >= $elsif_count_min ); + return if ( $elsif_count < $elsif_count_min ); # Store the chain my $K_opening = $K_opening_container->{$seqno_if}; diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index 63005061..9275bb38 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -120,6 +120,7 @@ my ( %is_keyword, %is_my_our_state, %is_package, + %matching_end_token, # INITIALIZER: sub check_options $code_skipping_pattern_begin, @@ -5609,7 +5610,11 @@ EOM $self->[_in_quote_] = $in_quote; $self->[_quote_target_] = - $in_quote ? matching_end_token($quote_character) : EMPTY_STRING; + $in_quote + ? $matching_end_token{$quote_character} + ? $matching_end_token{$quote_character} + : $quote_character + : EMPTY_STRING; $self->[_rhere_target_list_] = $rhere_target_list; return; @@ -9781,6 +9786,17 @@ sub do_quote { ); } ## end sub do_quote +# Some possible non-word quote delimiters, for preliminary checking +my %is_punct_char; + +BEGIN { + + my @q = qw# / " ' { } ( ) [ ] < > ; + - * | % ! x ~ = ? : . ^ & #; + push @q, '#'; + push @q, ','; + @is_punct_char{@q} = (1) x scalar(@q); +} + sub follow_quoted_string { # scan for a specific token, skipping escaped characters @@ -9820,12 +9836,20 @@ sub follow_quoted_string { "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n"; }; - # get the corresponding end token - if ( $beginning_tok !~ /^\s*$/ ) { - $end_tok = matching_end_token($beginning_tok); + # for a non-blank token, get the corresponding end token + if ( + $is_punct_char{$beginning_tok} + || ( length($beginning_tok) + && $beginning_tok !~ /^\s+$/ ) + ) + { + $end_tok = + $matching_end_token{$beginning_tok} + ? $matching_end_token{$beginning_tok} + : $beginning_tok; } - # a blank token means we must find and use the first non-blank one + # for a blank token, find and use the first non-blank one else { my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a @@ -9847,7 +9871,10 @@ sub follow_quoted_string { $beginning_tok = $tok; $quote_pos = 0; } - $end_tok = matching_end_token($beginning_tok); + $end_tok = + $matching_end_token{$beginning_tok} + ? $matching_end_token{$beginning_tok} + : $beginning_tok; $quote_depth = 1; last; } @@ -9869,7 +9896,7 @@ sub follow_quoted_string { # Case 1 (rare): loop for case of alphanumeric quote delimiter.. # "quote_pos" is the position the current word to begin searching #---------------------------------------------------------------- - if ( $beginning_tok =~ /\w/ ) { + if ( !$is_punct_char{$beginning_tok} && $beginning_tok =~ /\w/ ) { # Note this because it is not recommended practice except # for obfuscated perl contests @@ -10156,29 +10183,6 @@ sub show_tokens { return; } ## end sub show_tokens -{ ## closure for sub matching end token - my %matching_end_token; - - BEGIN { - %matching_end_token = ( - '{' => '}', - '(' => ')', - '[' => ']', - '<' => '>', - ); - } ## end BEGIN - - sub matching_end_token { - - # return closing character for a pattern - my $beginning_token = shift; - if ( $matching_end_token{$beginning_token} ) { - return $matching_end_token{$beginning_token}; - } - return ($beginning_token); - } ## end sub matching_end_token -} - sub dump_token_types { my ( $class, $fh ) = @_; @@ -10917,5 +10921,12 @@ BEGIN { # __DATA__ __END__ @is_keyword{@Keywords} = (1) x scalar(@Keywords); + + %matching_end_token = ( + '{' => '}', + '(' => ')', + '[' => ']', + '<' => '>', + ); } ## end BEGIN 1; -- 2.39.5