From 7ba4f3bb2bebbfc44eab6bda37126b55480c50d1 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Wed, 16 Dec 2020 11:13:41 -0800
Subject: [PATCH] rewrote sub check_match and sub match_line_pair

---
 lib/Perl/Tidy/VerticalAligner.pm | 410 ++++++++++++++++++-------------
 local-docs/BugLog.pod            |  35 ++-
 2 files changed, 274 insertions(+), 171 deletions(-)

diff --git a/lib/Perl/Tidy/VerticalAligner.pm b/lib/Perl/Tidy/VerticalAligner.pm
index 72f86a09..7205f76c 100644
--- a/lib/Perl/Tidy/VerticalAligner.pm
+++ b/lib/Perl/Tidy/VerticalAligner.pm
@@ -1114,20 +1114,21 @@ sub check_match {
 
     # See if the current line matches the current vertical alignment group.
 
-    my ( $self, $new_line, $old_line ) = @_;
+    my ( $self, $new_line, $base_line, $prev_line ) = @_;
+
+    # Given:
+    #  $new_line  = the line being considered for group inclusion
+    #  $base_line = the first line of the current group
+    #  $prev_line = the line just before $new_line
 
     # returns a flag and a value as follows:
     #    return (0, $imax_align)     if the line does not match
     #    return (1, $imax_align)     if the line matches but does not fit
     #    return (2, $imax_align)     if the line matches and fits
 
-    # Variable $imax_align will be set to indicate the maximum token index to
-    # be matched in the subsequent left-to-right sweep, in the case that this
-    # line does not exactly match the current group.
-
-    my $jmax                = $new_line->get_jmax();
-    my $maximum_field_index = $old_line->get_jmax();
-
+    # Returns '$imax_align' which is the index of the maximum matching token.
+    # It will be used in the subsequent left-to-right sweep to align as many
+    # tokens as possible for lines which partially match.
     my $imax_align = -1;
 
     # variable $GoToMsg explains reason for no match, for debugging
@@ -1139,152 +1140,38 @@ sub check_match {
     # This flag should normally be zero.
     use constant TEST_SWEEP_ONLY => 0;
 
-    my $is_hanging_side_comment = $new_line->get_is_hanging_side_comment();
-    my $rtokens                 = $new_line->get_rtokens();
-    my $rfields                 = $new_line->get_rfields();
-    my $rfield_lengths          = $new_line->get_rfield_lengths();
-    my $rpatterns               = $new_line->get_rpatterns();
-    my $list_type               = $new_line->get_list_type();
-
-    my $group_list_type = $old_line->get_list_type();
-    my $old_rpatterns   = $old_line->get_rpatterns();
-    my $old_rtokens     = $old_line->get_rtokens();
+    my $jmax                = $new_line->get_jmax();
+    my $maximum_field_index = $base_line->get_jmax();
 
     my $jlimit = $jmax - 2;
     if ( $jmax > $maximum_field_index ) {
         $jlimit = $maximum_field_index - 2;
     }
 
-    # Handle comma-separated lists ..
-    # We require all alignment tokens to match but will not be concerned if
-    # patterns differ.
-    if ( $group_list_type && ( $list_type eq $group_list_type ) ) {
-        for my $j ( 0 .. $jlimit ) {
-            my $old_tok = $old_rtokens->[$j];
-            my $new_tok = $rtokens->[$j];
-            $GoToMsg = "different tokens: $old_tok ne $new_tok";
-            goto NO_MATCH if ( $old_tok ne $new_tok );
-            $imax_align = $j;
-        }
+    if ( $new_line->get_is_hanging_side_comment() ) {
+
+        # HSC's can join the group if they fit
     }
 
-    # Handle everything else except hanging side comments ..
-    # We require all alignment tokens to match, and we also put a few
-    # restrictions on patterns.
-    elsif ( !$is_hanging_side_comment ) {
+    # Everything else
+    else {
 
         # A group with hanging side comments ends with the first non hanging
         # side comment.
-        if ( $old_line->get_is_hanging_side_comment() ) {
+        if ( $base_line->get_is_hanging_side_comment() ) {
             $GoToMsg = "end of hanging side comments";
             goto NO_MATCH;
         }
 
-        my $leading_space_count = $new_line->get_leading_space_count();
-
-        for my $j ( 0 .. $jlimit ) {
-
-            my $old_tok = $old_rtokens->[$j];
-            my $new_tok = $rtokens->[$j];
-
-            my $tokens_match = $new_tok eq $old_tok;
-
-            # No match if the alignment tokens differ...
-            if ( !$tokens_match ) {
-                $GoToMsg = "tokens differ: $new_tok ne $old_tok";
-                goto NO_MATCH;
-            }
-
-            # Calculate amount of padding required to fit this in.
-            # $pad is the number of spaces by which we must increase
-            # the current field to squeeze in this field.
-            my $pad =
-              $rfield_lengths->[$j] - $old_line->current_field_width($j);
-            if ( $j == 0 ) { $pad += $leading_space_count; }
-
-            # If patterns don't match, we have to be careful...
-            if ( $old_rpatterns->[$j] ne $rpatterns->[$j] ) {
-
-                my ( $alignment_token, $lev, $tag, $tok_count ) =
-                  decode_alignment_token($new_tok);
-
-                # We have to be very careful about aligning commas
-                # when the pattern's don't match, because it can be
-                # worse to create an alignment where none is needed
-                # than to omit one.  Here's an example where the ','s
-                # are not in named containers.  The first line below
-                # should not match the next two:
-                #   ( $a, $b ) = ( $b, $r );
-                #   ( $x1, $x2 ) = ( $x2 - $q * $x1, $x1 );
-                #   ( $y1, $y2 ) = ( $y2 - $q * $y1, $y1 );
-                if ( $alignment_token eq ',' ) {
-
-                    # do not align commas unless they are in named
-                    # containers
-                    $GoToMsg = "do not align commas in unnamed containers";
-                    goto NO_MATCH unless ( $new_tok =~ /[A-Za-z]/ );
-                }
-
-                # do not align parens unless patterns match;
-                # large ugly spaces can occur in math expressions.
-                elsif ( $alignment_token eq '(' ) {
-
-                    # But we can allow a match if the parens don't
-                    # require any padding.
-                    $GoToMsg =
-                      "do not align '(' unless patterns match or pad=0";
-                    if ( $pad != 0 ) { goto NO_MATCH }
-                }
-
-                # Handle an '=' alignment with different patterns to
-                # the left.
-                elsif ( $alignment_token eq '=' ) {
-
-                    # It is best to be a little restrictive when
-                    # aligning '=' tokens.  Here is an example of
-                    # two lines that we will not align:
-                    #       my $variable=6;
-                    #       $bb=4;
-                    # The problem is that one is a 'my' declaration,
-                    # and the other isn't, so they're not very similar.
-                    # We will filter these out by comparing the first
-                    # letter of the pattern.  This is crude, but works
-                    # well enough.
-                    if (
-                        substr( $old_rpatterns->[$j], 0, 1 ) ne
-                        substr( $rpatterns->[$j],     0, 1 ) )
-                    {
-                        $GoToMsg = "first character before equals differ";
-                        goto NO_MATCH;
-                    }
-
-                    # The introduction of sub 'prune_alignment_tree'
-                    # enabled alignment of lists left of the equals with
-                    # other scalar variables. For example:
-                    # my ( $D, $s, $e ) = @_;
-                    # my $d             = length $D;
-                    # my $c             = $e - $s - $d;
-
-                    # But this would change formatting of a lot of scripts,
-                    # so for now we prevent alignment of comma lists on the
-                    # left with scalars on the left.  We will also prevent
-                    # any partial alignments.
-                    elsif ( ( index( $old_rpatterns->[$j], ',' ) >= 0 ) ne
-                        ( index( $rpatterns->[$j], ',' ) >= 0 ) )
-                    {
-                        $imax_align = -1;
-                        $GoToMsg    = "mixed commas/no-commas before equals";
-                        goto NO_MATCH;
-                    }
-                }
-            }
-
-            # Everything matches so far, so we can update the maximum index
-            # for partial alignment.
-            $imax_align = $j;
-
-        } ## end for my $j ( 0 .. $jlimit)
+        # The number of tokens that this line shares with the previous line
+        # has been stored with the previous line.  This value was calculated
+        # and stored by sub 'match_line_pair'.
+        $imax_align = $prev_line->get_imax_pair();
 
+        if ( $imax_align != $jlimit ) {
+            $GoToMsg = "Not all tokens match: $imax_align != $jlimit\n";
+            goto NO_MATCH;
+        }
     }
 
     # The tokens match, but the lines must have identical number of
@@ -1296,7 +1183,7 @@ sub check_match {
 
     # The tokens match. Now See if there is space for this line in the
     # current group.
-    if ( $self->check_fit( $new_line, $old_line ) && !TEST_SWEEP_ONLY ) {
+    if ( $self->check_fit( $new_line, $base_line ) && !TEST_SWEEP_ONLY ) {
 
         EXPLAIN_CHECK_MATCH
           && print "match and fit, imax_align=$imax_align, jmax=$jmax\n";
@@ -1650,6 +1537,10 @@ sub _flush_group_lines {
         if ( $jend - $jbeg == 1 ) {
             my $line_0 = $rall_lines->[$jbeg];
             my $line_1 = $rall_lines->[$jend];
+
+            my $imax_pair = $line_1->get_imax_pair();
+            if ( $imax_pair > $imax_align ) { $imax_align = $imax_pair }
+
             my ( $is_marginal, $imax_align_fix ) =
               is_marginal_match( $line_0, $line_1, $grp_level, $imax_align );
             if ($is_marginal) {
@@ -1775,7 +1666,8 @@ EOM
             my $match_code;
             if ($group_line_count) {
                 ( $match_code, my $imax_align ) =
-                  $self->check_match( $new_line, $base_line );
+                  $self->check_match( $new_line, $base_line,
+                    $rall_lines->[ $jline - 1 ] );
                 if ( $match_code != 2 ) { end_rgroup($imax_align) }
             }
 
@@ -2040,9 +1932,9 @@ sub sweep_left_to_right {
         $is_good_alignment_token{'unless'} = 1;
         $is_good_alignment_token{'=>'}     = 1
 
-        # Note the hash values are set so that:
-        #         if ($is_good_alignment_token{$raw_tok}) => best
-        # if defined ($is_good_alignment_token{$raw_tok}) => good or best
+          # Note the hash values are set so that:
+          #         if ($is_good_alignment_token{$raw_tok}) => best
+          # if defined ($is_good_alignment_token{$raw_tok}) => good or best
 
     }
 
@@ -2897,7 +2789,7 @@ EOM
         prune_alignment_tree($rnew_lines) if ($max_lev_diff);
 
         # PASS 4: compare all lines for common tokens
-        match_line_pairs( $rnew_lines, $rline_hashes, \@subgroups );
+        match_line_pairs( $rlines, $rnew_lines, \@subgroups );
 
         return ( $max_lev_diff, $saw_side_comment );
     }
@@ -3086,53 +2978,239 @@ sub delete_null_alignments {
 } ## end sub delete_null_alignments
 
 sub match_line_pairs {
-    my ( $rnew_lines, $rline_hashes, $rsubgroups ) = @_;
+    my ( $rlines, $rnew_lines, $rsubgroups ) = @_;
 
-    # The subgroup line index range
-    my ( $jbeg, $jend );
+    # Compare each pair of lines and save information about common matches
+    # $rlines     = list of lines including hanging side comments
+    # $rnew_lines = list of lines without any hanging side comments
+    # $rsubgroups = list of subgroups of the new lines
+
+    # TODO:
+    # Change: imax_pair => pair_match_info = ref to array
+    #  = [$imax_align, $rMsg, ... ]
+    #  This may eventually have multi-level match info
 
     # Previous line vars
-    my ( $line_m, $rtokens_m, $imax_m );
+    my ( $line_m, $rtokens_m, $rpatterns_m, $rfield_lengths_m, $imax_m,
+        $list_type_m );
 
     # Current line vars
-    my ( $line, $rtokens, $imax );
+    my ( $line, $rtokens, $rpatterns, $rfield_lengths, $imax, $list_type );
+
+    use constant EXPLAIN_COMPARE_PATTERNS => 0;
+
+    my $compare_patterns = sub {
+
+        # helper routine to decide if patterns match well enough..
+        # return code:
+        #   0 = patterns match, continue
+        #   1 = no match
+        #   2 = no match, and lines do not match at all
+
+        my ( $tok, $tok_m, $pat, $pat_m, $pad ) = @_;
+        my $GoToMsg     = "";
+        my $return_code = 1;
+
+        my ( $alignment_token, $lev, $tag, $tok_count ) =
+          decode_alignment_token($tok);
+
+        # We have to be very careful about aligning commas
+        # when the pattern's don't match, because it can be
+        # worse to create an alignment where none is needed
+        # than to omit one.  Here's an example where the ','s
+        # are not in named containers.  The first line below
+        # should not match the next two:
+        #   ( $a, $b ) = ( $b, $r );
+        #   ( $x1, $x2 ) = ( $x2 - $q * $x1, $x1 );
+        #   ( $y1, $y2 ) = ( $y2 - $q * $y1, $y1 );
+        if ( $alignment_token eq ',' ) {
+
+            # do not align commas unless they are in named
+            # containers
+            $GoToMsg = "do not align commas in unnamed containers";
+            goto NO_MATCH unless ( $tok =~ /[A-Za-z]/ );
+        }
+
+        # do not align parens unless patterns match;
+        # large ugly spaces can occur in math expressions.
+        elsif ( $alignment_token eq '(' ) {
+
+            # But we can allow a match if the parens don't
+            # require any padding.
+            $GoToMsg = "do not align '(' unless patterns match or pad=0";
+            if ( $pad != 0 ) { goto NO_MATCH }
+        }
 
+        # Handle an '=' alignment with different patterns to
+        # the left.
+        elsif ( $alignment_token eq '=' ) {
+
+            # It is best to be a little restrictive when
+            # aligning '=' tokens.  Here is an example of
+            # two lines that we will not align:
+            #       my $variable=6;
+            #       $bb=4;
+            # The problem is that one is a 'my' declaration,
+            # and the other isn't, so they're not very similar.
+            # We will filter these out by comparing the first
+            # letter of the pattern.  This is crude, but works
+            # well enough.
+            if ( substr( $pat_m, 0, 1 ) ne substr( $pat, 0, 1 ) ) {
+                $GoToMsg = "first character before equals differ";
+                goto NO_MATCH;
+            }
+
+            # The introduction of sub 'prune_alignment_tree'
+            # enabled alignment of lists left of the equals with
+            # other scalar variables. For example:
+            # my ( $D, $s, $e ) = @_;
+            # my $d             = length $D;
+            # my $c             = $e - $s - $d;
+
+            # But this would change formatting of a lot of scripts,
+            # so for now we prevent alignment of comma lists on the
+            # left with scalars on the left.  We will also prevent
+            # any partial alignments.
+
+          # FIXME: can set return code 1 if the = is below line level, i.e.
+          #    sub new { my ( $p, $v ) = @_; bless \$v, $p }
+          #    sub iter { my ($x) = @_; return undef if $$x < 0; return $$x--; }
+          # but keep as is until verification with old routine is finished.
+
+            elsif (
+                ( index( $pat_m, ',' ) >= 0 ) ne ( index( $pat, ',' ) >= 0 ) )
+            {
+                $GoToMsg     = "mixed commas/no-commas before equals";
+                $return_code = 2;
+                goto NO_MATCH;
+            }
+        }
+
+      MATCH:
+        return ( 0, \$GoToMsg );
+
+      NO_MATCH:
+
+        EXPLAIN_COMPARE_PATTERNS
+          && print STDERR "no match because $GoToMsg";
+
+        return ( $return_code, \$GoToMsg );
+
+    };    ## end of $compare_patterns->()
+
+    # loop over subgroups
     foreach my $item ( @{$rsubgroups} ) {
-        ( $jbeg, $jend ) = @{$item};
+        my ( $jbeg, $jend ) = @{$item};
         my $nlines = $jend - $jbeg + 1;
         next unless ( $nlines > 1 );
 
+        # loop over lines in a subgroup
         for ( my $jj = $jbeg ; $jj <= $jend ; $jj++ ) {
 
-            $line_m    = $line;
-            $rtokens_m = $rtokens;
-            $imax_m    = $imax;
+            $line_m           = $line;
+            $rtokens_m        = $rtokens;
+            $rpatterns_m      = $rpatterns;
+            $rfield_lengths_m = $rfield_lengths;
+            $imax_m           = $imax;
+            $list_type_m      = $list_type;
 
-            $line    = $rnew_lines->[$jj];
-            $rtokens = $line->get_rtokens();
-            $imax    = @{$rtokens} - 2;
+            $line           = $rnew_lines->[$jj];
+            $rtokens        = $line->get_rtokens();
+            $rpatterns      = $line->get_rpatterns();
+            $rfield_lengths = $line->get_rfield_lengths();
+            $imax           = @{$rtokens} - 2;
+            $list_type      = $line->get_list_type();
 
             # nothing to do for first line
             next if ( $jj == $jbeg );
 
+            my $imax_min = $imax_m < $imax ? $imax_m : $imax;
+
+            my $imax_align = -1;
+
             # find number of leading common tokens
-            my $imax_min  = $imax_m < $imax ? $imax_m : $imax;
-            my $i_nomatch = $imax_min + 1;
-            for ( my $i = 0 ; $i <= $imax_min ; $i++ ) {
-                my $tok   = $rtokens->[$i];
-                my $tok_m = $rtokens_m->[$i];
-                if ( $tok ne $tok_m ) {
-                    $i_nomatch = $i;
-                    last;
+
+            #################################
+            # No match to hanging side comment
+            #################################
+            if ( $line->get_is_hanging_side_comment() ) {
+
+                # Should not get here; HSC's have been filtered out
+                $imax_align = -1;
+            }
+
+            ##############################
+            # Handle comma-separated lists
+            ##############################
+            elsif ( $list_type && $list_type eq $list_type_m ) {
+
+                my $i_nomatch = $imax_min + 1;
+                for ( my $i = 0 ; $i <= $imax_min ; $i++ ) {
+                    my $tok   = $rtokens->[$i];
+                    my $tok_m = $rtokens_m->[$i];
+                    if ( $tok ne $tok_m ) {
+                        $i_nomatch = $i;
+                        last;
+                    }
+                }
+                $imax_align = $i_nomatch - 1;
+            }
+
+            ##################
+            # Handle non-lists
+            ##################
+            else {
+                my $i_nomatch = $imax_min + 1;
+                for ( my $i = 0 ; $i <= $imax_min ; $i++ ) {
+                    my $tok   = $rtokens->[$i];
+                    my $tok_m = $rtokens_m->[$i];
+                    if ( $tok ne $tok_m ) {
+                        $i_nomatch = $i;
+                        last;
+                    }
+
+                    my $pat   = $rpatterns->[$i];
+                    my $pat_m = $rpatterns_m->[$i];
+
+                    # If patterns don't match, we have to be careful...
+                    if ( $pat_m ne $pat ) {
+                        my $pad =
+                          $rfield_lengths->[$i] - $rfield_lengths_m->[$i];
+                        my ( $match_code, $rmsg ) = $compare_patterns->(
+                            $tok, $tok_m, $pat, $pat_m, $pad
+                        );
+                        if ($match_code) {
+                            if    ( $match_code eq 1 ) { $i_nomatch = $i }
+                            elsif ( $match_code eq 2 ) { $i_nomatch = 0 }
+                            last;
+                        }
+                    }
                 }
+                $imax_align = $i_nomatch - 1;
+            }
 
-            } ## end loop over tokens
-            $line_m->set_imax_pair( $i_nomatch - 1 );
+            $line_m->set_imax_pair($imax_align);
 
         } ## end loop over lines
+
+        # Put fence at end of subgroup
         $line->set_imax_pair(-1);
 
     } ## end loop over subgroups
+
+    # if there are hanging side comments, propagate the pair info down to them
+    # so that lines can just look back one line for their pair info.
+    if ( @{$rlines} > @{$rnew_lines} ) {
+        my $last_pair_info = -1;
+        foreach my $line ( @{$rlines} ) {
+            if ( $line->get_is_hanging_side_comment() ) {
+                $line->set_imax_pair($last_pair_info);
+            }
+            else {
+                $last_pair_info = $line->get_imax_pair();
+            }
+        }
+    }
     return;
 }
 
diff --git a/local-docs/BugLog.pod b/local-docs/BugLog.pod
index fb1be9e7..05b75855 100644
--- a/local-docs/BugLog.pod
+++ b/local-docs/BugLog.pod
@@ -2,15 +2,40 @@
 
 =over 4
 
+=item B<Rewrote sub check_match>
+
+Moved inner part of sub check_match into sub match_line_pair in order to
+make info available earlier.  This gave some minor alignment improvements.
+
+    # OLD:
+    @tests = (
+        @common,     '$_',
+        '"\$_"',     '@_',
+        '"\@_"',     '??N',
+        '"??N"',     chr 256,
+        '"\x{100}"', chr 65536,
+        '"\x{10000}"', ord 'N' == 78 ? ( chr 11, '"\013"' ) : ()
+    );
+
+    # NEW:
+    @tests = (
+        @common,       '$_',
+        '"\$_"',       '@_',
+        '"\@_"',       '??N',
+        '"??N"',       chr 256,
+        '"\x{100}"',   chr 65536,
+        '"\x{10000}"', ord 'N' == 78 ? ( chr 11, '"\013"' ) : ()
+    );
+
 =item B<Improved vertical alignments by avoiding pruning step>
 
 There is a step in vertical alignment where the alignments are formed into a
 tree with different levels, and some deeper levels are pruned to preserve
-higher level alignments.  This usually works well, but some deeper alignments
-can be lost, which is what was happening in the example below.  It turns out
-that if the tree pruning is skipped when alignments vary monotonically across
-lines, as in the example, then better alignments is achieved when a later pass
-is made with the 'sweep' pass.
+lower level alignments.  This usually works well, but some deeper alignments
+will be lost, which is what was happening in the example below.  It turns out
+that if the tree pruning is skipped when alignment depths increase
+monotonically across lines, as in the example, then better overall alignment is
+achieved by the subsequent 'sweep' pass. 
 
     # OLD
     my $cmd = shift @ARGV;
-- 
2.39.5