From 388aa3a2b83ae1ec3c4ceb703f942ea80070f16f Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Fri, 18 Aug 2023 12:21:00 -0700
Subject: [PATCH] fix issue c250 part 2, new sub token type S

---
 lib/Perl/Tidy/Formatter.pm  | 111 +++++++++++++++++++-----------------
 lib/Perl/Tidy/HtmlWriter.pm |   8 ++-
 lib/Perl/Tidy/Tokenizer.pm  |  44 +++++++-------
 3 files changed, 86 insertions(+), 77 deletions(-)

diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index ed0139a7..7708609c 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -2801,11 +2801,11 @@ sub initialize_whitespace_hashes {
     # simple as adding your new letter to @spaces_both_sides, for
     # example.
 
-    # fix for c250: added space rules new package type 'P'
+    # fix for c250: added space rules new package type 'P' and sub type 'S'
     my @spaces_both_sides = qw#
       + - * / % ? = . : x < > | & ^ .. << >> ** && .. || // => += -=
       .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= ~~ !~~
-      &&= ||= //= <=> A k f w F n C Y U G v P
+      &&= ||= //= <=> A k f w F n C Y U G v P S
       #;
 
     my @spaces_left_side = qw<
@@ -4183,6 +4183,8 @@ EOM
         $right_bond_strength{'CORE::'} = NO_BREAK;
 
         # Fix for c250: added strengths for new type 'P'
+        # Note: these are working okay, but may eventually need to be
+        # adjusted or even removed.
         $left_bond_strength{'P'}  = NOMINAL;
         $right_bond_strength{'P'} = NOMINAL;
 
@@ -4460,6 +4462,10 @@ EOM
         $binary_bond_strength{'t'}{'L{'} = NO_BREAK;
         $binary_bond_strength{'i'}{'L{'} = NO_BREAK;
 
+        # Fix for c250: set strength for new 'S' to be same as 'i'
+        # testfile is test11/Hub.pm
+        $binary_bond_strength{'S'}{'L{'} = NO_BREAK;
+
         # As a defensive measure, do not break between a '(' and a
         # filehandle.  In some cases, this can cause an error.  For
         # example, the following program works:
@@ -8083,7 +8089,7 @@ sub dump_verbatim {
 
 my %wU;
 my %wiq;
-my %is_witP;
+my %is_witPS;
 my %is_sigil;
 my %is_nonlist_keyword;
 my %is_nonlist_type;
@@ -8100,8 +8106,8 @@ BEGIN {
     @q = qw(w i q Q G C Z);
     @{wiq}{@q} = (1) x scalar(@q);
 
-    @q = qw(w i t P);    # Fix for c250: added new type 'P', formerly 'i'
-    @{is_witP}{@q} = (1) x scalar(@q);
+    @q = qw(w i t P S);   # Fix for c250: added new types 'P', 'S', formerly 'i'
+    @{is_witPS}{@q} = (1) x scalar(@q);
 
     @q = qw($ & % * @);
     @{is_sigil}{@q} = (1) x scalar(@q);
@@ -8639,8 +8645,8 @@ sub respace_tokens_inner_loop {
         # Modify certain tokens here for whitespace
         # The following is not yet done, but could be:
         #   sub (x x x)
-        #     ( $type =~ /^[wit]$/ )
-        elsif ( $is_witP{$type} ) {
+        #     ( $type =~ /^[witPS]$/ )
+        elsif ( $is_witPS{$type} ) {
 
             # index() is several times faster than a regex test with \s here
             ##   $token =~ /\s/
@@ -8667,54 +8673,49 @@ sub respace_tokens_inner_loop {
                     }
                 }
 
-                # Trim certain spaces in identifiers
-                if ( $type eq 'i' ) {
-
-                    if ( $token =~ /$SUB_PATTERN/ ) {
-
-                        # -spp = 0 : no space before opening prototype paren
-                        # -spp = 1 : stable (follow input spacing)
-                        # -spp = 2 : always space before opening prototype paren
-                        if ( !defined($rOpts_space_prototype_paren)
-                            || $rOpts_space_prototype_paren == 1 )
-                        {
-                            ## default: stable
-                        }
-                        elsif ( $rOpts_space_prototype_paren == 0 ) {
-                            $token =~ s/\s+\(/\(/;
-                        }
-                        elsif ( $rOpts_space_prototype_paren == 2 ) {
-                            $token =~ s/\(/ (/;
-                        }
+                # trim identifiers of trailing blanks which can occur
+                # under some unusual circumstances, such as if the
+                # identifier 'witch' has trailing blanks on input here:
+                #
+                # sub
+                # witch
+                # ()   # prototype may be on new line ...
+                # ...
+                my $ord_ch = ord( substr( $token, -1, 1 ) );
+                if (
 
-                        # one space max, and no tabs
-                        $token =~ s/\s+/ /g;
-                        $rtoken_vars->[_TOKEN_] = $token;
+                    # quick check for possible ending space
+                    $ord_ch > 0 && ( $ord_ch < ORD_PRINTABLE_MIN
+                        || $ord_ch > ORD_PRINTABLE_MAX )
+                  )
+                {
+                    $token =~ s/\s+$//g;
+                    $rtoken_vars->[_TOKEN_] = $token;
+                }
 
-                        $self->[_ris_special_identifier_token_]->{$token} =
-                          'sub';
+                # Fixed for c250 to use 'S' for sub definitions
+                if ( $type eq 'S' ) {
 
+                    # -spp = 0 : no space before opening prototype paren
+                    # -spp = 1 : stable (follow input spacing)
+                    # -spp = 2 : always space before opening prototype paren
+                    if ( !defined($rOpts_space_prototype_paren)
+                        || $rOpts_space_prototype_paren == 1 )
+                    {
+                        ## default: stable
+                    }
+                    elsif ( $rOpts_space_prototype_paren == 0 ) {
+                        $token =~ s/\s+\(/\(/;
+                    }
+                    elsif ( $rOpts_space_prototype_paren == 2 ) {
+                        $token =~ s/\(/ (/;
                     }
 
-                    # trim identifiers of trailing blanks which can occur
-                    # under some unusual circumstances, such as if the
-                    # identifier 'witch' has trailing blanks on input here:
-                    #
-                    # sub
-                    # witch
-                    # ()   # prototype may be on new line ...
-                    # ...
-                    my $ord_ch = ord( substr( $token, -1, 1 ) );
-                    if (
+                    # one space max, and no tabs
+                    $token =~ s/\s+/ /g;
+                    $rtoken_vars->[_TOKEN_] = $token;
 
-                        # quick check for possible ending space
-                        $ord_ch > 0 && ( $ord_ch < ORD_PRINTABLE_MIN
-                            || $ord_ch > ORD_PRINTABLE_MAX )
-                      )
-                    {
-                        $token =~ s/\s+$//g;
-                        $rtoken_vars->[_TOKEN_] = $token;
-                    }
+                    $self->[_ris_special_identifier_token_]->{$token} = 'sub';
                 }
 
                 # and trim spaces in package statements (added for c250)
@@ -18121,8 +18122,8 @@ EOM
             }
 
             # blank lines before subs except declarations and one-liners
-            # Fix for c250: added new type 'P'
-            elsif ( $leading_type eq 'i' || $leading_type eq 'P' ) {
+            # Fix for c250: added new type 'P', changed 'i' to 'S'
+            elsif ( $leading_type eq 'S' || $leading_type eq 'P' ) {
                 my $special_identifier =
                   $self->[_ris_special_identifier_token_]->{$leading_token};
                 if ($special_identifier) {
@@ -21709,6 +21710,11 @@ sub break_lines_inner_loop {
     # Do not separate an isolated bare word from an opening paren.
     # Alternate Fix #2 for issue b1299.  This waits as long as possible
     # to make the decision.
+    # Note for fix #c250: to keep line breaks unchanged under -extrude when
+    # switching from 'i' to 'S' for subs, we would have to also check 'S', i.e.
+    # =~/^[Si]$/.  But this was never necessary at a sub signature, so we leave
+    # it alone and allow the new version to be different for --extrude. For a
+    # test file run perl527/signatures.t with --extrude.
     if ( $types_to_go[$i_begin] eq 'i'
         && substr( $tokens_to_go[$i_begin], 0, 1 ) =~ /\w/ )
     {
@@ -22400,7 +22406,8 @@ sub do_colon_breaks {
                 # always open comma lists not preceded by keywords,
                 # barewords, identifiers (that is, anything that doesn't
                 # look like a function call)
-                my $must_break_open = $last_nonblank_type[$dd] !~ /^[kwiU]$/;
+                # c250: added new sub identifier type 'S'
+                my $must_break_open = $last_nonblank_type[$dd] !~ /^[kwiUS]$/;
 
                 $self->table_maker(
                     {
diff --git a/lib/Perl/Tidy/HtmlWriter.pm b/lib/Perl/Tidy/HtmlWriter.pm
index 7c751c62..180c4aa1 100644
--- a/lib/Perl/Tidy/HtmlWriter.pm
+++ b/lib/Perl/Tidy/HtmlWriter.pm
@@ -370,6 +370,7 @@ BEGIN {
 
     # When adding NEW_TOKENS: update this hash table
     # $type => $short_name
+    # c250: changed 'M' to 'S'
     %token_short_names = (
         '#'  => 'c',
         'n'  => 'n',
@@ -390,7 +391,7 @@ BEGIN {
         'f'  => 'sc',
         '('  => 'p',
         ')'  => 'p',
-        'M'  => 'm',
+        'S'  => 'm',
         'pd' => 'pd',
         'A'  => 'co',
     );
@@ -1321,12 +1322,13 @@ sub markup_tokens {
         # Intercept a sub name here; split it
         # into keyword 'sub' and sub name; and add an
         # entry in the toc
+        # Fix for c250: switch from 'i' to 'S'
         #-------------------------------------------------------
-        if ( $type eq 'i' && $token =~ /^(sub\s+)(\w.*)$/ ) {
+        if ( $type eq 'S' && $token =~ /^(\w+\s+)(\w.*)$/ ) {
             $token = $self->markup_html_element( $1, 'k' );
             push @colored_tokens, $token;
             $token = $2;
-            $type  = 'M';
+            $type  = 'S';
 
             # but don't include sub declarations in the toc;
             # these will have leading token types 'i;'
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 24634f76..1a87fbbd 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -3104,7 +3104,7 @@ EOM
         # Added 'package' (can be 'class') for --use-feature=class (rt145706)
         if ( substr( $statement_type, 0, 3 ) eq 'sub' ) {
             $last_nonblank_token = $statement_type;
-            $last_nonblank_type  = 'i';
+            $last_nonblank_type  = 'S';               # c250 change
             $statement_type      = EMPTY_STRING;
         }
         elsif ( substr( $statement_type, 0, 7 ) eq 'package' ) {
@@ -5281,7 +5281,7 @@ EOM
             # this pre-token will start an output token
             push( @{$routput_token_list}, $i_tok );
 
-            # The search for the full token ends in one of 5 main END NODES
+            # The search for the full token ends in one of 5 main END NODES:
 
             #-----------------------
             # END NODE 1: whitespace
@@ -5468,9 +5468,9 @@ EOM
                 next;
             }
 
-            #-----------------------------
-            # END NODE 5: all other tokens
-            #-----------------------------
+            #------------------------------------------
+            # END NODE 5: everything else (punctuation)
+            #------------------------------------------
             my $code = $tokenization_code->{$tok};
             if ($code) {
                 $code->($self);
@@ -5784,8 +5784,9 @@ BEGIN {
     # note: this is identical to '@value_requestor_type' defined later.
     # Fix for c250: add new type 'P' for package (expecting VERSION or {}
     # after package NAMESPACE, so expecting TERM)
+    # Fix for c250: add new type 'S' for sub (not expecting operator)
     my @q = qw(
-      ; ! + x & ?  F J - p / Y : % f U ~ A G j L P * . | ^ < = [ m { \ > t
+      ; ! + x & ?  F J - p / Y : % f U ~ A G j L P S * . | ^ < = [ m { \ > t
       || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
       &= // >> ~. &. |. ^.
       ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
@@ -5805,7 +5806,7 @@ BEGIN {
     # 'i' is currently excluded because it might be a package
     # 'q' is currently excluded because it might be a prototype
     # Fix for c030: removed '->' from this list:
-    # Fix for c250: added 'i' after new type 'P' added
+    # Fix for c250: added 'i' because new type 'P' was added
     @q = qw( -- C h R ++ ] Q <> i );    ## n v q );
     push @q, ')';
     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
@@ -5902,7 +5903,8 @@ sub operator_expected {
     # Types 'n', 'v', 'q' also depend on context.
 
     # identifier...
-    # Fix for c250: type 'i' and new type 'P' are in the hash table now
+    # Fix for c250: removed coding for type 'i' because 'i' and new type 'P'
+    # are now done by hash table lookup
 
     # keyword...
     if ( $last_nonblank_type eq 'k' ) {
@@ -6260,21 +6262,19 @@ sub code_block_type {
     }
 
     # or a sub or package BLOCK
-    # Fixed for c250 to include new package type 'P'
-    # FIXME: this could use optimization
+    # Fixed for c250 to include new package type 'P', and change 'i' to 'S'
     elsif (
-        (
-               $last_nonblank_type eq 'i'
-            || $last_nonblank_type eq 't'
-            || $last_nonblank_type eq 'P'
-        )
-        && $last_nonblank_token =~ /^(sub|package)\b/
+           $last_nonblank_type eq 'P'
+        || $last_nonblank_type eq 'S'
+        || ( $last_nonblank_type eq 't'
+            && substr( $last_nonblank_token, 0, 3 ) eq 'sub' )
       )
     {
         return $last_nonblank_token;
     }
 
     # or a sub alias
+    # FIXME: see if this is really needed after the c250 update
     elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
         && ( $is_sub{$last_nonblank_token} ) )
     {
@@ -8670,7 +8670,7 @@ EOM
             my $pos  = pos($input_line);
             my $numc = $pos - $pos_beg;
             $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
-            $type = 'i';
+            $type = 'S';    ## Fix for c250, was 'i';
 
             # remember the sub name in case another call is needed to
             # get the prototype
@@ -8724,7 +8724,7 @@ EOM
             # Patch part #1 to fixes cases b994 and b1053:
             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
             #    'sub : lvalue { ...'
-            $type = 'i';
+            $type = 'S';    ## C250, was 'i';
             if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
         }
 
@@ -10075,8 +10075,8 @@ The following additional token types are defined:
     C    user-defined constant or constant function (with void prototype = ())
     U    user-defined function taking parameters
     G    user-defined function taking block parameter (like grep/map/eval)
-    M    (unused, but reserved for subroutine definition name)
-    P    package definition
+    S    sub definition     (reported as type 'i' in older versions)
+    P    package definition (reported as type 'i' in older versions)
     t    type indicater such as %,$,@,*,&,sub
     w    bare word (perhaps a subroutine call)
     i    identifier of some type (with leading %, $, @, *, &, sub, -> )
@@ -10142,9 +10142,9 @@ BEGIN {
 
     # make a hash of all valid token types for self-checking the tokenizer
     # (adding NEW_TOKENS : select a new character and add to this list)
-    # fix for c250: added new token type 'P'
+    # fix for c250: added new token type 'P' and 'S'
     my @valid_token_types = qw#
-      A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P
+      A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v P S
       { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
       #;
     push( @valid_token_types, @digraphs );
-- 
2.39.5