From: Steve Hancock Date: Tue, 17 Nov 2020 21:00:49 +0000 (-0800) Subject: faster scanning of numbers and identifiers X-Git-Tag: 20201202~31 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=065acdf7a6632d05d1e41e6592696aae1f79140d;p=perltidy.git faster scanning of numbers and identifiers --- diff --git a/CHANGES.md b/CHANGES.md index cb346716..8fc7aa82 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -61,7 +61,7 @@ - Added 'state' as a keyword. - - This version is about 15% faster than previous versions due to some optimizations + - This version is about 20% faster than the previous version due to optimizations made with the help of Devel::NYTProf. - Line breaks are now automatically placed after 'use overload' to diff --git a/dev-bin/build.pl b/dev-bin/build.pl index 078bb868..84299961 100755 --- a/dev-bin/build.pl +++ b/dev-bin/build.pl @@ -628,9 +628,9 @@ sub update_VERSION { my $is_pod_file = !$is_md_file && $source_file !~ /\.pm/; while ( my $line = <$fh> ) { - # Look for and turn off any DEVEL_MODE or DEBUG_XXX constants + # Look for and turn off any DEVEL_MODE or DEBUG_XXX or VERIFY_XXX constants if ( $line =~ - /^(\s*use\s+constant\s+(?:DEBUG|DEVEL)_[A-Z]+\s*)=>\s*(-?\d*);(.*)$/ + /^(\s*use\s+constant\s+(?:DEBUG|DEVEL|VERIFY)_[A-Z]+\s*)=>\s*(-?\d*);(.*)$/ ) { if ( $2 != 0 ) { diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index 9091dd6e..a527a2e7 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -14,7 +14,7 @@ # The Tokenizer returns a reference to a data structure 'line_of_tokens' # containing one tokenized line for each call to its get_line() method. # -# WARNING: This is not a real class yet. Only one tokenizer my be used. +# WARNING: This is not a real class. Only one tokenizer my be used. # ######################################################################## @@ -1611,6 +1611,139 @@ sub prepare_for_a_new_file { return; } + use constant VERIFY_FASTSCAN => 0; + my %fast_scan_context; + + BEGIN { + %fast_scan_context = ( + '$' => SCALAR_CONTEXT, + '*' => SCALAR_CONTEXT, + '@' => LIST_CONTEXT, + '%' => LIST_CONTEXT, + '&' => UNKNOWN_CONTEXT, + ); + } + + sub scan_identifier_fast { + + # This is a wrapper for sub scan_identifier. It does a fast preliminary + # scan for certain common identifiers: + # '$var', '@var', %var, *var, &var, '@{...}', '%{...}' + # If it does not find one of these, or this is a restart, it calls the + # original scanner directly. + + # This gives the same results as the full scanner in about 1/4 the + # total runtime for a typical input stream. + + my $i_begin = $i; + my $tok_begin = $tok; + my $fast_scan_type; + + ############################### + # quick scan with leading sigil + ############################### + if ( !$id_scan_state + && $i + 1 <= $max_token_index + && $fast_scan_context{$tok} ) + { + $context = $fast_scan_context{$tok}; + + # look for $var, @var, ... + if ( $rtoken_type->[ $i + 1 ] eq 'w' ) { + my $pretype_next = ""; + my $i_next = $i + 2; + if ( $i_next <= $max_token_index ) { + if ( $rtoken_type->[$i_next] eq 'b' + && $i_next < $max_token_index ) + { + $i_next += 1; + } + $pretype_next = $rtoken_type->[$i_next]; + } + if ( $pretype_next ne ':' && $pretype_next ne "'" ) { + + # Found type 'i' like '$var', '@var', or '%var' + $identifier = $tok . $rtokens->[ $i + 1 ]; + $tok = $identifier; + $type = 'i'; + $i = $i + 1; + $fast_scan_type = $type; + } + } + + # Look for @{ or %{ . + # But we must let the full scanner handle things ${ because it may + # keep going to get a complete identifier like '${#}' . + elsif ( + $rtoken_type->[ $i + 1 ] eq '{' + && ( $tok_begin eq '@' + || $tok_begin eq '%' ) + ) + { + + $identifier = $tok; + $type = 't'; + $fast_scan_type = $type; + } + } + + ############################ + # Quick scan with leading -> + # Look for ->[ and ->{ + ############################ + elsif ( + $tok eq '->' + && $i < $max_token_index + && ( $rtokens->[ $i + 1 ] eq '{' + || $rtokens->[ $i + 1 ] eq '[' ) + ) + { + $type = $tok; + $fast_scan_type = $type; + $identifier = $tok; + $context = UNKNOWN_CONTEXT; + } + + ####################################### + # Verify correctness during development + ####################################### + if ( VERIFY_FASTSCAN && $fast_scan_type ) { + + # We will call the full method + my $identifier_simple = $identifier; + my $tok_simple = $tok; + my $fast_scan_type = $type; + my $i_simple = $i; + my $context_simple = $context; + + $tok = $tok_begin; + $i = $i_begin; + scan_identifier(); + + if ( $tok ne $tok_simple + || $type ne $fast_scan_type + || $i != $i_simple + || $identifier ne $identifier_simple + || $id_scan_state + || $context ne $context_simple ) + { + print STDERR < 0; + + sub scan_number_fast { + + # This is a wrapper for sub scan_number. It does a fast preliminary + # scan for a simple integer. It calls the original scan_number if it + # does not find one. + + my $i_begin = $i; + my $tok_begin = $tok; + my $number; + + ################################## + # Quick check for (signed) integer + ################################## + + # This will be the string of digits: + my $i_d = $i; + my $tok_d = $tok; + my $typ_d = $rtoken_type->[$i_d]; + + # check for signed integer + my $sign = ""; + if ( $typ_d ne 'd' + && ( $typ_d eq '+' || $typ_d eq '-' ) + && $i_d < $max_token_index ) + { + $sign = $tok_d; + $i_d++; + $tok_d = $rtokens->[$i_d]; + $typ_d = $rtoken_type->[$i_d]; + } + + # Handle integers + if ( + $typ_d eq 'd' + && ( + $i_d == $max_token_index + || ( $i_d < $max_token_index + && $rtoken_type->[ $i_d + 1 ] ne '.' + && $rtoken_type->[ $i_d + 1 ] ne 'w' ) + ) + ) + { + # Let let full scanner handle multi-digit integers beginning with + # '0' because there could be error messages. For example, '009' is + # not a valid number. + + if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) { + $number = $sign . $tok_d; + $type = 'n'; + $i = $i_d; + } + } + + ####################################### + # Verify correctness during development + ####################################### + if ( VERIFY_FASTNUM && defined($number) ) { + + # We will call the full method + my $type_simple = $type; + my $i_simple = $i; + my $number_simple = $number; + + $tok = $tok_begin; + $i = $i_begin; + $number = scan_number(); + + if ( $type ne $type_simple + || ( $i != $i_simple && $i <= $max_token_index ) + || $number ne $number_simple ) + { + print STDERR <[_saw_perl_dash_w_] = 1; @@ -2014,6 +2237,7 @@ sub prepare_for_a_new_file { # ATTRS: for a '{' following an attribute list, reset # things to look like we just saw the sub name + # FIXME: need to end with \b here?? if ( $statement_type =~ /^sub/ ) { $last_nonblank_token = $statement_type; $last_nonblank_type = 'i'; @@ -2171,7 +2395,7 @@ sub prepare_for_a_new_file { # For example we probably don't want & as sub call here: # Fcntl::S_IRUSR & $mode; if ( $expecting == TERM || $next_type ne 'b' ) { - scan_identifier(); + scan_identifier_fast(); } } else { @@ -2251,7 +2475,7 @@ sub prepare_for_a_new_file { '*' => sub { # typeglob, or multiply? if ( $expecting == TERM ) { - scan_identifier(); + scan_identifier_fast(); } else { @@ -2347,7 +2571,7 @@ sub prepare_for_a_new_file { '+' => sub { # what kind of plus? if ( $expecting == TERM ) { - my $number = scan_number(); + my $number = scan_number_fast(); # unary plus is safest assumption if not a number if ( !defined($number) ) { $type = 'p'; } @@ -2362,7 +2586,7 @@ sub prepare_for_a_new_file { error_if_expecting_OPERATOR("Array") if ( $expecting == OPERATOR ); - scan_identifier(); + scan_identifier_fast(); }, '%' => sub { # hash or modulo? @@ -2371,7 +2595,7 @@ sub prepare_for_a_new_file { if ( $next_type ne 'b' ) { $expecting = TERM } } if ( $expecting == TERM ) { - scan_identifier(); + scan_identifier_fast(); } }, '[' => sub { @@ -2426,7 +2650,7 @@ sub prepare_for_a_new_file { } } elsif ( $expecting == TERM ) { - my $number = scan_number(); + my $number = scan_number_fast(); # maybe part of bareword token? unary is safest if ( !defined($number) ) { $type = 'm'; } @@ -2573,7 +2797,7 @@ sub prepare_for_a_new_file { # if -> points to a bare word, we must scan for an identifier, # otherwise something like ->y would look like the y operator - scan_identifier(); + scan_identifier_fast(); }, # type = 'pp' for pre-increment, '++' for post-increment @@ -2928,7 +3152,7 @@ sub prepare_for_a_new_file { $routput_token_type->[$i] = $type; } - $tok = $quote_character if ($quote_character); + $tok = $quote_character if ($quote_character); # scan for the end of the quote or pattern ( @@ -3787,7 +4011,8 @@ EOM operator_expected( [ $prev_type, $tok, $next_type ] ); error_if_expecting_OPERATOR("Number") if ( $expecting == OPERATOR ); - my $number = scan_number(); + + my $number = scan_number_fast(); if ( !defined($number) ) { # shouldn't happen - we should always get a number @@ -4196,7 +4421,8 @@ EOM $ci_string_in_tokenizer .= ( $intervening_secondary_structure != 0 ) ? '1' : '0'; - $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; + $ci_string_sum = + ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; $continuation_string_in_tokenizer .= ( $in_statement_continuation > 0 ) ? '1' : '0'; @@ -4253,7 +4479,8 @@ EOM substr( $nesting_list_string, -1 ) eq '1'; chop $ci_string_in_tokenizer; - $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; + $ci_string_sum = + ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/; $in_statement_continuation = chop $continuation_string_in_tokenizer;