From 409c510f7f0254a580ab48f6c094216500f2aa8d Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Thu, 17 Aug 2023 07:42:31 -0700 Subject: [PATCH] fix issue c250 part 1, new package token type P --- lib/Perl/Tidy/Formatter.pm | 49 ++++++++++++--------- lib/Perl/Tidy/HtmlWriter.pm | 14 ++++-- lib/Perl/Tidy/Tokenizer.pm | 85 ++++++++++++++++++++----------------- 3 files changed, 85 insertions(+), 63 deletions(-) diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index f44a2200..ed0139a7 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -2801,10 +2801,11 @@ sub initialize_whitespace_hashes { # simple as adding your new letter to @spaces_both_sides, for # example. + # fix for c250: added space rules new package type 'P' my @spaces_both_sides = qw# + - * / % ? = . : x < > | & ^ .. << >> ** && .. || // => += -= .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= ~~ !~~ - &&= ||= //= <=> A k f w F n C Y U G v + &&= ||= //= <=> A k f w F n C Y U G v P #; my @spaces_left_side = qw< @@ -4048,6 +4049,8 @@ EOM # These routines and variables are involved in deciding where to break very # long lines. + # NEW_TOKENS must add bond strength rules + my %is_good_keyword_breakpoint; my %is_lt_gt_le_ge; my %is_container_token; @@ -4179,6 +4182,10 @@ EOM $left_bond_strength{'CORE::'} = NOMINAL; $right_bond_strength{'CORE::'} = NO_BREAK; + # Fix for c250: added strengths for new type 'P' + $left_bond_strength{'P'} = NOMINAL; + $right_bond_strength{'P'} = NOMINAL; + # breaking AFTER modulus operator is ok: @q = qw< % >; @left_bond_strength{@q} = (STRONG) x scalar(@q); @@ -6442,9 +6449,10 @@ sub find_selected_packages { foreach my $KK ( 0 .. $Klimit ) { my $item = $rLL->[$KK]; my $type = $item->[_TYPE_]; - if ( $type ne 'i' ) { - next; - } + + # fix for c250: package type has changed from 'i' to 'P' + next if ( $type ne 'P' ); + my $token = $item->[_TOKEN_]; if ( substr( $token, 0, 7 ) eq 'package' && $token =~ /^package\s/ || substr( $token, 0, 5 ) eq 'class' && $token =~ /^class\s/ ) @@ -8075,7 +8083,7 @@ sub dump_verbatim { my %wU; my %wiq; -my %is_wit; +my %is_witP; my %is_sigil; my %is_nonlist_keyword; my %is_nonlist_type; @@ -8092,8 +8100,8 @@ BEGIN { @q = qw(w i q Q G C Z); @{wiq}{@q} = (1) x scalar(@q); - @q = qw(w i t); - @{is_wit}{@q} = (1) x scalar(@q); + @q = qw(w i t P); # Fix for c250: added new type 'P', formerly 'i' + @{is_witP}{@q} = (1) x scalar(@q); @q = qw($ & % * @); @{is_sigil}{@q} = (1) x scalar(@q); @@ -8632,7 +8640,7 @@ sub respace_tokens_inner_loop { # The following is not yet done, but could be: # sub (x x x) # ( $type =~ /^[wit]$/ ) - elsif ( $is_wit{$type} ) { + elsif ( $is_witP{$type} ) { # index() is several times faster than a regex test with \s here ## $token =~ /\s/ @@ -8688,16 +8696,6 @@ sub respace_tokens_inner_loop { } - # clean up spaces in package identifiers, like - # "package Bob::Dog;" - elsif ( $token =~ /^(package|class)\s/ ) { - $token =~ s/\s+/ /g; - $rtoken_vars->[_TOKEN_] = $token; - - $self->[_ris_special_identifier_token_]->{$token} = - 'package'; - } - # trim identifiers of trailing blanks which can occur # under some unusual circumstances, such as if the # identifier 'witch' has trailing blanks on input here: @@ -8718,6 +8716,18 @@ sub respace_tokens_inner_loop { $rtoken_vars->[_TOKEN_] = $token; } } + + # and trim spaces in package statements (added for c250) + elsif ( $type eq 'P' ) { + + # clean up spaces in package identifiers, like + # "package Bob::Dog;" + if ( $token =~ s/\s+/ /g ) { + $rtoken_vars->[_TOKEN_] = $token; + $self->[_ris_special_identifier_token_]->{$token} = + 'package'; + } + } } } @@ -18111,7 +18121,8 @@ EOM } # blank lines before subs except declarations and one-liners - elsif ( $leading_type eq 'i' ) { + # Fix for c250: added new type 'P' + elsif ( $leading_type eq 'i' || $leading_type eq 'P' ) { my $special_identifier = $self->[_ris_special_identifier_token_]->{$leading_token}; if ($special_identifier) { diff --git a/lib/Perl/Tidy/HtmlWriter.pm b/lib/Perl/Tidy/HtmlWriter.pm index c5f0b315..7c751c62 100644 --- a/lib/Perl/Tidy/HtmlWriter.pm +++ b/lib/Perl/Tidy/HtmlWriter.pm @@ -391,12 +391,14 @@ BEGIN { '(' => 'p', ')' => 'p', 'M' => 'm', - 'P' => 'pd', + 'pd' => 'pd', 'A' => 'co', ); # These token types will all be called identifiers for now - my @identifier = qw< i t U C Y Z G :: CORE::>; + # Fix for c250: added new type 'P', formerly 'i' + # ( but package statements will eventually be split into 'k' and 'i') + my @identifier = qw< i t U C Y Z G P :: CORE::>; @token_short_names{@identifier} = ('i') x scalar(@identifier); # These token types will be called 'structure' @@ -1341,7 +1343,8 @@ sub markup_tokens { # into keyword 'package' and name; add to the toc, # and update the package stack #------------------------------------------------------- - if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) { + # Fix for c250: switch from 'i' to 'P' and allow 'class' or 'package' + if ( $type eq 'P' && $token =~ /^(\w+\s+)(\w.*)$/ ) { $token = $self->markup_html_element( $1, 'k' ); push @colored_tokens, $token; $token = $2; @@ -1459,7 +1462,10 @@ sub write_line { $self->add_toc_item( '__DATA__', '__DATA__' ); } elsif ( $line_type =~ /^POD/ ) { - $line_character = 'P'; + + # fix for c250: changed 'P' to 'pd' here and in %token_short_names + # to allow use of 'P' as new package token type + $line_character = 'pd'; if ( $rOpts->{'pod2html'} ) { my $html_pod_fh = $self->{_html_pod_fh}; if ( $line_type eq 'POD_START' ) { diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index 83c81075..24634f76 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -3100,13 +3100,18 @@ EOM $container_type = EMPTY_STRING; # ATTRS: for a '{' following an attribute list, reset - # things to look like we just saw the sub name + # things to look like we just saw a sub name # Added 'package' (can be 'class') for --use-feature=class (rt145706) - if ( $statement_type =~ /^(sub|package)\b/ ) { + if ( substr( $statement_type, 0, 3 ) eq 'sub' ) { $last_nonblank_token = $statement_type; $last_nonblank_type = 'i'; $statement_type = EMPTY_STRING; } + elsif ( substr( $statement_type, 0, 7 ) eq 'package' ) { + $last_nonblank_token = $statement_type; + $last_nonblank_type = 'P'; # c250 change + $statement_type = EMPTY_STRING; + } # patch for SWITCH/CASE: hide these keywords from an immediately # following opening brace @@ -5276,16 +5281,16 @@ EOM # this pre-token will start an output token push( @{$routput_token_list}, $i_tok ); - # The search for the full token ends in one of 5 main end NODES + # The search for the full token ends in one of 5 main END NODES - #---------------------------------- - # NODE 1: handle a whitespace token - #---------------------------------- + #----------------------- + # END NODE 1: whitespace + #----------------------- next if ( $pre_type eq 'b' ); - #------------------------- - # NODE 2: handle a comment - #------------------------- + #---------------------- + # END NODE 2: a comment + #---------------------- last if ( $pre_type eq '#' ); # continue gathering identifier if necessary @@ -5442,9 +5447,9 @@ EOM # Now we have to examine this token and decide what it is # and define its $type - #--------------------------- - # NODE 3: handle a bare word - #--------------------------- + #------------------------ + # END NODE 3: a bare word + #------------------------ if ( $pre_type eq 'w' ) { my $is_last = $self->do_BAREWORD($is_END_or_DATA); last if ($is_last); @@ -5455,17 +5460,17 @@ EOM # Added '#' to fix c038 (later moved above). $self->[_in_attribute_list_] &&= 0; - #---------------------------------- - # NODE 4: handle a string of digits - #---------------------------------- + #------------------------------- + # END NODE 4: a string of digits + #------------------------------- if ( $pre_type eq 'd' ) { $self->do_DIGITS(); next; } - #-------------------------------- - # NODE 5: handle all other tokens - #-------------------------------- + #----------------------------- + # END NODE 5: all other tokens + #----------------------------- my $code = $tokenization_code->{$tok}; if ($code) { $code->($self); @@ -5777,8 +5782,10 @@ BEGIN { # Always expecting TERM following these types: # note: this is identical to '@value_requestor_type' defined later. + # Fix for c250: add new type 'P' for package (expecting VERSION or {} + # after package NAMESPACE, so expecting TERM) my @q = qw( - ; ! + x & ? F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t + ; ! + x & ? F J - p / Y : % f U ~ A G j L P * . | ^ < = [ m { \ > t || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /= &= // >> ~. &. |. ^. ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~ @@ -5798,7 +5805,8 @@ BEGIN { # 'i' is currently excluded because it might be a package # 'q' is currently excluded because it might be a prototype # Fix for c030: removed '->' from this list: - @q = qw( -- C h R ++ ] Q <> ); ## n v q i ); + # Fix for c250: added 'i' after new type 'P' added + @q = qw( -- C h R ++ ] Q <> i ); ## n v q ); push @q, ')'; @{op_expected_table}{@q} = (OPERATOR) x scalar(@q); @@ -5891,25 +5899,13 @@ sub operator_expected { #--------------------------------------------- # Types 'k', '}' and 'Z' depend on context - # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context. + # Types 'n', 'v', 'q' also depend on context. # identifier... - if ( $last_nonblank_type eq 'i' ) { - $op_expected = OPERATOR; - - # TODO: it would be cleaner to make this a special type - # expecting VERSION or {} after package NAMESPACE; - # maybe mark these words as type 'Y'? - if ( substr( $last_nonblank_token, 0, 7 ) eq 'package' - && $statement_type =~ /^package\b/ - && $last_nonblank_token =~ /^package\b/ ) - { - $op_expected = TERM; - } - } + # Fix for c250: type 'i' and new type 'P' are in the hash table now # keyword... - elsif ( $last_nonblank_type eq 'k' ) { + if ( $last_nonblank_type eq 'k' ) { $op_expected = TERM; if ( $expecting_operator_token{$last_nonblank_token} ) { $op_expected = OPERATOR; @@ -6264,8 +6260,16 @@ sub code_block_type { } # or a sub or package BLOCK - elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' ) - && $last_nonblank_token =~ /^(sub|package)\b/ ) + # Fixed for c250 to include new package type 'P' + # FIXME: this could use optimization + elsif ( + ( + $last_nonblank_type eq 'i' + || $last_nonblank_type eq 't' + || $last_nonblank_type eq 'P' + ) + && $last_nonblank_token =~ /^(sub|package)\b/ + ) { return $last_nonblank_token; } @@ -7586,7 +7590,7 @@ sub do_scan_package { my $pos = pos($input_line); my $numc = $pos - $pos_beg; $tok = 'package ' . substr( $input_line, $pos_beg, $numc ); - $type = 'i'; + $type = 'P'; # Fix for c250, previously 'i' # Now we must convert back from character position # to pre_token index. @@ -10072,7 +10076,7 @@ The following additional token types are defined: U user-defined function taking parameters G user-defined function taking block parameter (like grep/map/eval) M (unused, but reserved for subroutine definition name) - P (unused, but -html uses it to label pod text) + P package definition t type indicater such as %,$,@,*,&,sub w bare word (perhaps a subroutine call) i identifier of some type (with leading %, $, @, *, &, sub, -> ) @@ -10138,8 +10142,9 @@ BEGIN { # make a hash of all valid token types for self-checking the tokenizer # (adding NEW_TOKENS : select a new character and add to this list) + # fix for c250: added new token type 'P' my @valid_token_types = qw# - A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v + A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ & #; push( @valid_token_types, @digraphs ); -- 2.39.5