From 409c510f7f0254a580ab48f6c094216500f2aa8d Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Thu, 17 Aug 2023 07:42:31 -0700
Subject: [PATCH] fix issue c250 part 1, new package token type P

---
 lib/Perl/Tidy/Formatter.pm  | 49 ++++++++++++---------
 lib/Perl/Tidy/HtmlWriter.pm | 14 ++++--
 lib/Perl/Tidy/Tokenizer.pm  | 85 ++++++++++++++++++++-----------------
 3 files changed, 85 insertions(+), 63 deletions(-)

diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index f44a2200..ed0139a7 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -2801,10 +2801,11 @@ sub initialize_whitespace_hashes {
     # simple as adding your new letter to @spaces_both_sides, for
     # example.
 
+    # fix for c250: added space rules new package type 'P'
     my @spaces_both_sides = qw#
       + - * / % ? = . : x < > | & ^ .. << >> ** && .. || // => += -=
       .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= ~~ !~~
-      &&= ||= //= <=> A k f w F n C Y U G v
+      &&= ||= //= <=> A k f w F n C Y U G v P
       #;
 
     my @spaces_left_side = qw<
@@ -4048,6 +4049,8 @@ EOM
     # These routines and variables are involved in deciding where to break very
     # long lines.
 
+    # NEW_TOKENS must add bond strength rules
+
     my %is_good_keyword_breakpoint;
     my %is_lt_gt_le_ge;
     my %is_container_token;
@@ -4179,6 +4182,10 @@ EOM
         $left_bond_strength{'CORE::'}  = NOMINAL;
         $right_bond_strength{'CORE::'} = NO_BREAK;
 
+        # Fix for c250: added strengths for new type 'P'
+        $left_bond_strength{'P'}  = NOMINAL;
+        $right_bond_strength{'P'} = NOMINAL;
+
         # breaking AFTER modulus operator is ok:
         @q = qw< % >;
         @left_bond_strength{@q} = (STRONG) x scalar(@q);
@@ -6442,9 +6449,10 @@ sub find_selected_packages {
     foreach my $KK ( 0 .. $Klimit ) {
         my $item = $rLL->[$KK];
         my $type = $item->[_TYPE_];
-        if ( $type ne 'i' ) {
-            next;
-        }
+
+        # fix for c250: package type has changed from 'i' to 'P'
+        next if ( $type ne 'P' );
+
         my $token = $item->[_TOKEN_];
         if (   substr( $token, 0, 7 ) eq 'package' && $token =~ /^package\s/
             || substr( $token, 0, 5 ) eq 'class' && $token =~ /^class\s/ )
@@ -8075,7 +8083,7 @@ sub dump_verbatim {
 
 my %wU;
 my %wiq;
-my %is_wit;
+my %is_witP;
 my %is_sigil;
 my %is_nonlist_keyword;
 my %is_nonlist_type;
@@ -8092,8 +8100,8 @@ BEGIN {
     @q = qw(w i q Q G C Z);
     @{wiq}{@q} = (1) x scalar(@q);
 
-    @q = qw(w i t);
-    @{is_wit}{@q} = (1) x scalar(@q);
+    @q = qw(w i t P);    # Fix for c250: added new type 'P', formerly 'i'
+    @{is_witP}{@q} = (1) x scalar(@q);
 
     @q = qw($ & % * @);
     @{is_sigil}{@q} = (1) x scalar(@q);
@@ -8632,7 +8640,7 @@ sub respace_tokens_inner_loop {
         # The following is not yet done, but could be:
         #   sub (x x x)
         #     ( $type =~ /^[wit]$/ )
-        elsif ( $is_wit{$type} ) {
+        elsif ( $is_witP{$type} ) {
 
             # index() is several times faster than a regex test with \s here
             ##   $token =~ /\s/
@@ -8688,16 +8696,6 @@ sub respace_tokens_inner_loop {
 
                     }
 
-                    # clean up spaces in package identifiers, like
-                    #   "package        Bob::Dog;"
-                    elsif ( $token =~ /^(package|class)\s/ ) {
-                        $token =~ s/\s+/ /g;
-                        $rtoken_vars->[_TOKEN_] = $token;
-
-                        $self->[_ris_special_identifier_token_]->{$token} =
-                          'package';
-                    }
-
                     # trim identifiers of trailing blanks which can occur
                     # under some unusual circumstances, such as if the
                     # identifier 'witch' has trailing blanks on input here:
@@ -8718,6 +8716,18 @@ sub respace_tokens_inner_loop {
                         $rtoken_vars->[_TOKEN_] = $token;
                     }
                 }
+
+                # and trim spaces in package statements (added for c250)
+                elsif ( $type eq 'P' ) {
+
+                    # clean up spaces in package identifiers, like
+                    #   "package        Bob::Dog;"
+                    if ( $token =~ s/\s+/ /g ) {
+                        $rtoken_vars->[_TOKEN_] = $token;
+                        $self->[_ris_special_identifier_token_]->{$token} =
+                          'package';
+                    }
+                }
             }
         }
 
@@ -18111,7 +18121,8 @@ EOM
             }
 
             # blank lines before subs except declarations and one-liners
-            elsif ( $leading_type eq 'i' ) {
+            # Fix for c250: added new type 'P'
+            elsif ( $leading_type eq 'i' || $leading_type eq 'P' ) {
                 my $special_identifier =
                   $self->[_ris_special_identifier_token_]->{$leading_token};
                 if ($special_identifier) {
diff --git a/lib/Perl/Tidy/HtmlWriter.pm b/lib/Perl/Tidy/HtmlWriter.pm
index c5f0b315..7c751c62 100644
--- a/lib/Perl/Tidy/HtmlWriter.pm
+++ b/lib/Perl/Tidy/HtmlWriter.pm
@@ -391,12 +391,14 @@ BEGIN {
         '('  => 'p',
         ')'  => 'p',
         'M'  => 'm',
-        'P'  => 'pd',
+        'pd' => 'pd',
         'A'  => 'co',
     );
 
     # These token types will all be called identifiers for now
-    my @identifier = qw< i t U C Y Z G :: CORE::>;
+    # Fix for c250: added new type 'P', formerly 'i'
+    # ( but package statements will eventually be split into 'k' and 'i')
+    my @identifier = qw< i t U C Y Z G P :: CORE::>;
     @token_short_names{@identifier} = ('i') x scalar(@identifier);
 
     # These token types will be called 'structure'
@@ -1341,7 +1343,8 @@ sub markup_tokens {
         # into keyword 'package' and name; add to the toc,
         # and update the package stack
         #-------------------------------------------------------
-        if ( $type eq 'i' && $token =~ /^(package\s+)(\w.*)$/ ) {
+        # Fix for c250: switch from 'i' to 'P' and allow 'class' or 'package'
+        if ( $type eq 'P' && $token =~ /^(\w+\s+)(\w.*)$/ ) {
             $token = $self->markup_html_element( $1, 'k' );
             push @colored_tokens, $token;
             $token = $2;
@@ -1459,7 +1462,10 @@ sub write_line {
             $self->add_toc_item( '__DATA__', '__DATA__' );
         }
         elsif ( $line_type =~ /^POD/ ) {
-            $line_character = 'P';
+
+            # fix for c250: changed 'P' to 'pd' here and in %token_short_names
+            # to allow use of 'P' as new package token type
+            $line_character = 'pd';
             if ( $rOpts->{'pod2html'} ) {
                 my $html_pod_fh = $self->{_html_pod_fh};
                 if ( $line_type eq 'POD_START' ) {
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 83c81075..24634f76 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -3100,13 +3100,18 @@ EOM
         $container_type = EMPTY_STRING;
 
         # ATTRS: for a '{' following an attribute list, reset
-        # things to look like we just saw the sub name
+        # things to look like we just saw a sub name
         # Added 'package' (can be 'class') for --use-feature=class (rt145706)
-        if ( $statement_type =~ /^(sub|package)\b/ ) {
+        if ( substr( $statement_type, 0, 3 ) eq 'sub' ) {
             $last_nonblank_token = $statement_type;
             $last_nonblank_type  = 'i';
             $statement_type      = EMPTY_STRING;
         }
+        elsif ( substr( $statement_type, 0, 7 ) eq 'package' ) {
+            $last_nonblank_token = $statement_type;
+            $last_nonblank_type  = 'P';               # c250 change
+            $statement_type      = EMPTY_STRING;
+        }
 
         # patch for SWITCH/CASE: hide these keywords from an immediately
         # following opening brace
@@ -5276,16 +5281,16 @@ EOM
             # this pre-token will start an output token
             push( @{$routput_token_list}, $i_tok );
 
-            # The search for the full token ends in one of 5 main end NODES
+            # The search for the full token ends in one of 5 main END NODES
 
-            #----------------------------------
-            # NODE 1: handle a whitespace token
-            #----------------------------------
+            #-----------------------
+            # END NODE 1: whitespace
+            #-----------------------
             next if ( $pre_type eq 'b' );
 
-            #-------------------------
-            # NODE 2: handle a comment
-            #-------------------------
+            #----------------------
+            # END NODE 2: a comment
+            #----------------------
             last if ( $pre_type eq '#' );
 
             # continue gathering identifier if necessary
@@ -5442,9 +5447,9 @@ EOM
             # Now we have to examine this token and decide what it is
             # and define its $type
 
-            #---------------------------
-            # NODE 3: handle a bare word
-            #---------------------------
+            #------------------------
+            # END NODE 3: a bare word
+            #------------------------
             if ( $pre_type eq 'w' ) {
                 my $is_last = $self->do_BAREWORD($is_END_or_DATA);
                 last if ($is_last);
@@ -5455,17 +5460,17 @@ EOM
             # Added '#' to fix c038 (later moved above).
             $self->[_in_attribute_list_] &&= 0;
 
-            #----------------------------------
-            # NODE 4: handle a string of digits
-            #----------------------------------
+            #-------------------------------
+            # END NODE 4: a string of digits
+            #-------------------------------
             if ( $pre_type eq 'd' ) {
                 $self->do_DIGITS();
                 next;
             }
 
-            #--------------------------------
-            # NODE 5: handle all other tokens
-            #--------------------------------
+            #-----------------------------
+            # END NODE 5: all other tokens
+            #-----------------------------
             my $code = $tokenization_code->{$tok};
             if ($code) {
                 $code->($self);
@@ -5777,8 +5782,10 @@ BEGIN {
 
     # Always expecting TERM following these types:
     # note: this is identical to '@value_requestor_type' defined later.
+    # Fix for c250: add new type 'P' for package (expecting VERSION or {}
+    # after package NAMESPACE, so expecting TERM)
     my @q = qw(
-      ; ! + x & ?  F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
+      ; ! + x & ?  F J - p / Y : % f U ~ A G j L P * . | ^ < = [ m { \ > t
       || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
       &= // >> ~. &. |. ^.
       ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
@@ -5798,7 +5805,8 @@ BEGIN {
     # 'i' is currently excluded because it might be a package
     # 'q' is currently excluded because it might be a prototype
     # Fix for c030: removed '->' from this list:
-    @q = qw( -- C h R ++ ] Q <> );    ## n v q i );
+    # Fix for c250: added 'i' after new type 'P' added
+    @q = qw( -- C h R ++ ] Q <> i );    ## n v q );
     push @q, ')';
     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
 
@@ -5891,25 +5899,13 @@ sub operator_expected {
     #---------------------------------------------
 
     # Types 'k', '}' and 'Z' depend on context
-    # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context.
+    # Types 'n', 'v', 'q' also depend on context.
 
     # identifier...
-    if ( $last_nonblank_type eq 'i' ) {
-        $op_expected = OPERATOR;
-
-        # TODO: it would be cleaner to make this a special type
-        # expecting VERSION or {} after package NAMESPACE;
-        # maybe mark these words as type 'Y'?
-        if (   substr( $last_nonblank_token, 0, 7 ) eq 'package'
-            && $statement_type      =~ /^package\b/
-            && $last_nonblank_token =~ /^package\b/ )
-        {
-            $op_expected = TERM;
-        }
-    }
+    # Fix for c250: type 'i' and new type 'P' are in the hash table now
 
     # keyword...
-    elsif ( $last_nonblank_type eq 'k' ) {
+    if ( $last_nonblank_type eq 'k' ) {
         $op_expected = TERM;
         if ( $expecting_operator_token{$last_nonblank_token} ) {
             $op_expected = OPERATOR;
@@ -6264,8 +6260,16 @@ sub code_block_type {
     }
 
     # or a sub or package BLOCK
-    elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
-        && $last_nonblank_token =~ /^(sub|package)\b/ )
+    # Fixed for c250 to include new package type 'P'
+    # FIXME: this could use optimization
+    elsif (
+        (
+               $last_nonblank_type eq 'i'
+            || $last_nonblank_type eq 't'
+            || $last_nonblank_type eq 'P'
+        )
+        && $last_nonblank_token =~ /^(sub|package)\b/
+      )
     {
         return $last_nonblank_token;
     }
@@ -7586,7 +7590,7 @@ sub do_scan_package {
         my $pos  = pos($input_line);
         my $numc = $pos - $pos_beg;
         $tok  = 'package ' . substr( $input_line, $pos_beg, $numc );
-        $type = 'i';
+        $type = 'P';    # Fix for c250, previously 'i'
 
         # Now we must convert back from character position
         # to pre_token index.
@@ -10072,7 +10076,7 @@ The following additional token types are defined:
     U    user-defined function taking parameters
     G    user-defined function taking block parameter (like grep/map/eval)
     M    (unused, but reserved for subroutine definition name)
-    P    (unused, but -html uses it to label pod text)
+    P    package definition
     t    type indicater such as %,$,@,*,&,sub
     w    bare word (perhaps a subroutine call)
     i    identifier of some type (with leading %, $, @, *, &, sub, -> )
@@ -10138,8 +10142,9 @@ BEGIN {
 
     # make a hash of all valid token types for self-checking the tokenizer
     # (adding NEW_TOKENS : select a new character and add to this list)
+    # fix for c250: added new token type 'P'
     my @valid_token_types = qw#
-      A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
+      A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P
       { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
       #;
     push( @valid_token_types, @digraphs );
-- 
2.39.5