From: Steve Hancock Date: Fri, 16 Oct 2020 01:11:07 +0000 (-0700) Subject: rewrite sub scan_identifier for improved efficiency X-Git-Tag: 20201001.03~75 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b97f347c0c474399e9a7cc63d26f25e4f8a88708;p=perltidy.git rewrite sub scan_identifier for improved efficiency --- diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index e66de17a..502a70ff 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -6022,6 +6022,7 @@ sub scan_identifier_do { # scan state, id_scan_state. It updates id_scan_state based upon # current id_scan_state and token, and returns an updated # id_scan_state and the next index after the identifier. + # USES GLOBAL VARIABLES: $context, $last_nonblank_token, # $last_nonblank_type @@ -6037,18 +6038,23 @@ sub scan_identifier_do { my $identifier_begin = $identifier; my $tok = $tok_begin; my $message = ""; + my $tok_is_blank; # a flag to speed things up my $in_prototype_or_signature = $container_type =~ /^sub\b/; # these flags will be used to help figure out the type: - my $saw_alpha = ( $tok =~ /^\w/ ); + ##my $saw_alpha = ( $tok =~ /^\w/ ); # This was slow + my $saw_alpha; my $saw_type; # allow old package separator (') except in 'use' statement my $allow_tick = ( $last_nonblank_token ne 'use' ); + ######################################################### # get started by defining a type and a state if necessary - unless ($id_scan_state) { + ######################################################### + + if ( !$id_scan_state ) { $context = UNKNOWN_CONTEXT; # fixup for digraph @@ -6079,6 +6085,7 @@ sub scan_identifier_do { } elsif ( $tok =~ /^\w/ ) { $id_scan_state = ':'; + $saw_alpha = 1; } elsif ( $tok eq '->' ) { $id_scan_state = '$'; @@ -6095,22 +6102,34 @@ sub scan_identifier_do { } else { $i--; - $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); + $saw_alpha = ( $tok =~ /^\w/ ); + $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); } - # now loop to gather the identifier + ############################### + # loop to gather the identifier + ############################### + my $i_save = $i; while ( $i < $max_token_index ) { - $i_save = $i unless ( $tok =~ /^\s*$/ ); - $tok = $rtokens->[ ++$i ]; + ##$i_save = $i unless ( $tok =~ /^\s*$/ ); # This was a slow statement + if ($tok_is_blank) { $tok_is_blank = undef } + else { $i_save = $i } + $tok = $rtokens->[ ++$i ]; + + # patch to make digraph :: if necessary if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) { $tok = '::'; $i++; } - if ( $id_scan_state eq '$' ) { # starting variable name + ######################## + # Starting variable name + ######################## + + if ( $id_scan_state eq '$' ) { if ( $tok eq '$' ) { @@ -6123,16 +6142,20 @@ sub scan_identifier_do { last; } } - - # POSTDEFREF ->@ ->% ->& ->* - elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) { - $identifier .= $tok; - } elsif ( $tok =~ /^\w/ ) { # alphanumeric .. $saw_alpha = 1; $id_scan_state = ':'; # now need :: $identifier .= $tok; } + elsif ( $tok eq '::' ) { + $id_scan_state = 'A'; + $identifier .= $tok; + } + + # POSTDEFREF ->@ ->% ->& ->* + elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) { + $identifier .= $tok; + } elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. $saw_alpha = 1; $id_scan_state = ':'; # now need :: @@ -6146,10 +6169,6 @@ sub scan_identifier_do { # howdy::123::bubba(); # } - elsif ( $tok eq '::' ) { - $id_scan_state = 'A'; - $identifier .= $tok; - } # $# and POSTDEFREF ->$# elsif ( ( $tok eq '#' ) && ( $identifier =~ /\$$/ ) ) { # $#array @@ -6190,6 +6209,8 @@ sub scan_identifier_do { # space ok after leading $ % * & @ elsif ( $tok =~ /^\s*$/ ) { + $tok_is_blank = 1; + if ( $identifier =~ /^[\$\%\*\&\@]/ ) { if ( length($identifier) > 1 ) { @@ -6297,74 +6318,32 @@ sub scan_identifier_do { last; } } - elsif ( $id_scan_state eq '&' ) { # starting sub call? - if ( $tok =~ /^[\$\w]/ ) { # alphanumeric .. - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok =~ /^\s*$/ ) { # allow space - } - elsif ( $tok eq '::' ) { # leading :: - $id_scan_state = 'A'; # accept alpha next - $identifier .= $tok; - } - elsif ( $tok eq '{' ) { - if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } - $i = $i_save; - $id_scan_state = ''; - last; - } - else { + ################################### + # looking for alphanumeric after :: + ################################### - # punctuation variable? - # testfile: cunningham4.pl - # - # We have to be careful here. If we are in an unknown state, - # we will reject the punctuation variable. In the following - # example the '&' is a binary operator but we are in an unknown - # state because there is no sigil on 'Prima', so we don't - # know what it is. But it is a bad guess that - # '&~' is a function variable. - # $self->{text}->{colorMap}->[ - # Prima::PodView::COLOR_CODE_FOREGROUND - # & ~tb::COLOR_INDEX ] = - # $sec->{ColorCode} - if ( $identifier eq '&' && $expecting ) { - $identifier .= $tok; - } - else { - $identifier = ''; - $i = $i_save; - $type = '&'; - } - $id_scan_state = ''; - last; - } - } - elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::) + elsif ( $id_scan_state eq 'A' ) { + + $tok_is_blank = $tok =~ /^\s*$/; - if ( $tok =~ /^\w/ ) { # found it + if ( $tok =~ /^\w/ ) { # found it $identifier .= $tok; - $id_scan_state = ':'; # now need :: + $id_scan_state = ':'; # now need :: $saw_alpha = 1; } elsif ( $tok eq "'" && $allow_tick ) { $identifier .= $tok; - $id_scan_state = ':'; # now need :: + $id_scan_state = ':'; # now need :: $saw_alpha = 1; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + ##elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + elsif ( $tok_is_blank && $identifier =~ /^sub / ) { $id_scan_state = '('; $identifier .= $tok; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + ##elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + elsif ( $tok eq '(' && $identifier =~ /^sub / ) { $id_scan_state = ')'; $identifier .= $tok; } @@ -6374,8 +6353,15 @@ sub scan_identifier_do { last; } } + + ################################### + # looking for :: after alphanumeric + ################################### + elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha + $tok_is_blank = $tok =~ /^\s*$/; + if ( $tok eq '::' ) { # got it $identifier .= $tok; $id_scan_state = 'A'; # now require alpha @@ -6395,11 +6381,13 @@ sub scan_identifier_do { $identifier .= $tok; } } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + ##elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + elsif ( $tok_is_blank && $identifier =~ /^sub / ) { $id_scan_state = '('; $identifier .= $tok; } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + ##elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + elsif ( $tok eq '(' && $identifier =~ /^sub / ) { $id_scan_state = ')'; $identifier .= $tok; } @@ -6409,26 +6397,39 @@ sub scan_identifier_do { last; } } - elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype - if ( $tok eq '(' ) { # got it + ############################## + # looking for '(' of prototype + ############################## + + elsif ( $id_scan_state eq '(' ) { + + if ( $tok eq '(' ) { # got it $identifier .= $tok; - $id_scan_state = ')'; # now find the end of it + $id_scan_state = ')'; # now find the end of it } - elsif ( $tok =~ /^\s*$/ ) { # blank - keep going + elsif ( $tok =~ /^\s*$/ ) { # blank - keep going $identifier .= $tok; + $tok_is_blank = 1; } else { - $id_scan_state = ''; # that's all - no prototype + $id_scan_state = ''; # that's all - no prototype $i = $i_save; last; } } - elsif ( $id_scan_state eq ')' ) { # looking for ) to end - if ( $tok eq ')' ) { # got it + ############################## + # looking for ')' of prototype + ############################## + + elsif ( $id_scan_state eq ')' ) { + + $tok_is_blank = $tok =~ /^\s*$/; + + if ( $tok eq ')' ) { # got it $identifier .= $tok; - $id_scan_state = ''; # all done + $id_scan_state = ''; # all done last; } elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) { @@ -6439,12 +6440,74 @@ sub scan_identifier_do { $identifier .= $tok; } } - else { # can get here due to error in initialization + + ################### + # Starting sub call + ################### + + elsif ( $id_scan_state eq '&' ) { + + if ( $tok =~ /^[\$\w]/ ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok =~ /^\s*$/ ) { # allow space + $tok_is_blank = 1; + } + elsif ( $tok eq '::' ) { # leading :: + $id_scan_state = 'A'; # accept alpha next + $identifier .= $tok; + } + elsif ( $tok eq '{' ) { + if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } + $i = $i_save; + $id_scan_state = ''; + last; + } + else { + + # punctuation variable? + # testfile: cunningham4.pl + # + # We have to be careful here. If we are in an unknown state, + # we will reject the punctuation variable. In the following + # example the '&' is a binary operator but we are in an unknown + # state because there is no sigil on 'Prima', so we don't + # know what it is. But it is a bad guess that + # '&~' is a function variable. + # $self->{text}->{colorMap}->[ + # Prima::PodView::COLOR_CODE_FOREGROUND + # & ~tb::COLOR_INDEX ] = + # $sec->{ColorCode} + if ( $identifier eq '&' && $expecting ) { + $identifier .= $tok; + } + else { + $identifier = ''; + $i = $i_save; + $type = '&'; + } + $id_scan_state = ''; + last; + } + } + + ###################### + # unknown state - quit + ###################### + + else { # can get here due to error in initialization $id_scan_state = ''; $i = $i_save; last; } - } + } ## end of main loop if ( $id_scan_state eq ')' ) { warning("Hit end of line while seeking ) to end prototype\n"); @@ -6456,12 +6519,14 @@ sub scan_identifier_do { $id_scan_state = ''; } - # The deprecated variable $# does not combine with anything on the next line + # Patch: the deprecated variable $# does not combine with anything on the + # next line. if ( $identifier eq '$#' ) { $id_scan_state = '' } if ( $i < 0 ) { $i = 0 } - unless ($type) { + # Be sure a token type is defined + if ( !$type ) { if ($saw_type) { @@ -6499,10 +6564,13 @@ sub scan_identifier_do { } # this can happen on a restart } + # See if we formed an identifier... if ($identifier) { $tok = $identifier; if ($message) { write_logfile_entry($message) } } + + # did not find an identifier, back up else { $tok = $tok_begin; $i = $i_begin;