From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Sun, 26 Jul 2020 14:57:36 +0000 (-0700)
Subject: speedup sub decode_alignment; fix rare issue with terminal else statements
X-Git-Tag: 20200822~36
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=6d5b8b2c9bb4fdc70f50b01dc26d87b9c5f1bd8a;p=perltidy.git

speedup sub decode_alignment; fix rare issue with terminal else statements
---

diff --git a/lib/Perl/Tidy/VerticalAligner.pm b/lib/Perl/Tidy/VerticalAligner.pm
index 7a0465fe..6cc4d399 100644
--- a/lib/Perl/Tidy/VerticalAligner.pm
+++ b/lib/Perl/Tidy/VerticalAligner.pm
@@ -181,6 +181,7 @@ sub initialize {
 
     initialize_for_new_group();
     initialize_leading_string_cache();
+    initialize_decode();
 
     $vertical_aligner_self = { length_function => $length_function, };
     bless $vertical_aligner_self, $class;
@@ -1039,8 +1040,8 @@ sub check_match {
     # do detailed check for everything else except hanging side comments
     elsif ( !$is_hanging_side_comment ) {
 
-	# A group with hanging side comments ends with the first non hanging
-	# side comment.
+        # A group with hanging side comments ends with the first non hanging
+        # side comment.
         if ( $old_line->get_is_hanging_side_comment() ) {
             $GoToMsg = "end of hanging side comments";
             goto NO_MATCH;
@@ -1489,12 +1490,11 @@ sub my_flush {
         # STEP 3: Sweep top to bottom, forming subgroups of lines with exactly
         # matching common alignments.  The indexes of these subgroups are in the
         # return variable.
-        my $rgroups =
-          sweep_top_down( \@all_lines, $group_level );
+        my $rgroups = sweep_top_down( \@all_lines, $group_level );
 
         # STEP 4: Sweep left to right through the lines, looking for leading
         # alignment tokens shared by groups.
-        sweep_left_to_right( \@all_lines, $rgroups );
+        sweep_left_to_right( \@all_lines, $rgroups, $group_level );
 
         # STEP 5: Move side comments to a common column if possible.
         adjust_side_comment_multiple_groups( \@all_lines, $rgroups )
@@ -1584,8 +1584,10 @@ sub my_flush {
         if ( $jend - $jbeg == 1 ) {
             my $line_0 = $rall_lines->[$jbeg];
             my $line_1 = $rall_lines->[$jend];
-            if ( is_marginal_match( $line_0, $line_1, $grp_level ) ) {
-                combine_fields( $line_0, $line_1, $imax_align );
+            my ( $is_marginal, $imax_align_fix ) =
+              is_marginal_match( $line_0, $line_1, $grp_level, $imax_align );
+            if ($is_marginal) {
+                combine_fields( $line_0, $line_1, $imax_align_fix );
             }
         }
 
@@ -1593,8 +1595,16 @@ sub my_flush {
         return;
     }
 
+    sub block_penultimate_match {
+
+        # emergency reset to prevent sweep_left_to_right from trying to match a
+        # failed terminal else match
+        return unless @{$rgroups} > 1;
+        $rgroups->[-2]->[2] = -1;
+    }
+
     sub sweep_top_down {
-        my ( $rlines, $group_common_level) = @_; 
+        my ( $rlines, $group_common_level ) = @_;
 
         # uses no Global symbols
 
@@ -1703,6 +1713,9 @@ EOM
                         $base_line->increase_field_width( $j_terminal_match,
                             $pad );
                     }
+
+                    # do not let sub sweep_left_to_right change this
+                    block_penultimate_match();
                 }
                 end_rgroup(-1);
             }
@@ -1720,7 +1733,7 @@ EOM
 
 sub sweep_left_to_right {
 
-    my ( $rlines, $rgroups ) = @_;
+    my ( $rlines, $rgroups, $grp_level ) = @_;
 
     # uses no Global symbols
 
@@ -1866,33 +1879,32 @@ sub sweep_left_to_right {
             my $var = pop(@todo);
             $ng_beg = $var->[1];
         }
-        my ( $raw_tok, $lev, $tag, $tok_count ) =
-              decode_alignment_token($tok);
-        push @todo, [ $i, $ng_beg, $ng_end, $tok, $lev ];
+        my ( $raw_tok, $lev, $tag, $tok_count ) = decode_alignment_token($tok);
+        push @todo, [ $i, $ng_beg, $ng_end, $raw_tok, $lev ];
     }
 
     ###############################
     # Step 3: Execute the task list
     ###############################
-    do_left_to_right_sweep( $rlines, $rgroups, \@todo, \%max_move, $short_pad );
+    do_left_to_right_sweep( $rlines, $rgroups, \@todo, \%max_move, $short_pad,
+        $grp_level );
     return;
 }
 
 sub do_left_to_right_sweep {
-    my ( $rlines, $rgroups, $rtodo, $rmax_move, $short_pad ) = @_;
+    my ( $rlines, $rgroups, $rtodo, $rmax_move, $short_pad, $grp_level ) = @_;
 
     # uses no Global symbols
 
-    # arrays to keep track of failed matches so that we can stop trying
-    # after a failure.
-    my @blocking_token;  # [$ng] token at a match failure
-    my @blocking_level;  # [$ng] level at a match failure
+    # $blocking_level[$nj is the level at a match failure between groups $ng-1
+    # and $ng
+    my @blocking_level; 
 
     my $move_to_common_column = sub {
 
         # Move the alignment column of token $itok to $col_want for a sequence
         # of groups.
-        my ( $ngb, $nge, $itok, $tok, $col_want ) = @_;
+        my ( $ngb, $nge, $itok, $col_want ) = @_;
         return unless ( defined($ngb) && $nge > $ngb );
         foreach my $ng ( $ngb .. $nge ) {
 
@@ -1907,14 +1919,15 @@ sub do_left_to_right_sweep {
                     && $move > $rmax_move->{$ng} );
                 $line->increase_field_width( $itok, $move );
             }
-            elsif ($move < 0) {
+            elsif ( $move < 0 ) {
+
                 # spot to take special action on failure to move
             }
         }
     };
 
     foreach my $task ( @{$rtodo} ) {
-        my ( $itok, $ng_beg, $ng_end, $tok, $lev ) = @{$task};
+        my ( $itok, $ng_beg, $ng_end, $raw_tok, $lev ) = @{$task};
 
         # Nothing to do for a single group
         next unless ( $ng_end > $ng_beg );
@@ -1924,17 +1937,22 @@ sub do_left_to_right_sweep {
         my $col_limit;    # maximum column before bumping into max line length
         my $line_count_ng_m = 0;
         my $jmax_m;
-        my $istop_m;
+        my $it_stop_m;
 
         # Loop over the groups
+        # 'ix_' = index in the array of lines
+        # 'ng_' = index in the array of groups
+        # 'it_' = index in the array of tokens
+        my $ix_min = $rgroups->[$ng_beg]->[0];
+        my $ix_max = $rgroups->[$ng_end]->[1];
         foreach my $ng ( $ng_beg .. $ng_end ) {
-            my ( $jbeg, $jend, $istop ) = @{ $rgroups->[$ng] };
-            my $line_count_ng = $jend - $jbeg + 1;
+            my ( $ix_beg, $ix_end, $it_stop ) = @{ $rgroups->[$ng] };
+            my $line_count_ng = $ix_end - $ix_beg + 1;
 
             # Important: note that since all lines in a group have a common
             # alignments object, we just have to work on one of the lines (the
             # first line).  All of the rest will be changed automatically.
-            my $line = $rlines->[$jbeg];
+            my $line = $rlines->[$ix_beg];
             my $jmax = $line->get_jmax();
 
             # the maximum space without exceeding the line length:
@@ -1949,16 +1967,17 @@ sub do_left_to_right_sweep {
                 $col_limit       = $col_max;
                 $line_count_ng_m = $line_count_ng;
                 $jmax_m          = $jmax;
-                $istop_m         = $istop;
+                $it_stop_m       = $it_stop;
                 next;
             }
 
-	    # RULE: Throw a blocking flag upon encountering a token level
-	    # different from the level of the first blocking token.  For
-	    # example, in the following example, the = matches get blocked
-	    # between two groups.  So we want to start blocking matches at the
-	    # commas, which are at deeper level, so that we do not get the big
-	    # gaps shown here:
+            # RULE: Throw a blocking flag upon encountering a token level
+            # different from the level of the first blocking token.  For
+            # example, in the following example, if the = matches get blocked
+            # between two groups as shown, then we want to start blocking
+            # matches at the commas, which are at deeper level, so that we do
+            # not get the big
+            # gaps shown here:
 
             #  my $unknown3 = pack( "v",          -2 );
             #  my $unknown4 = pack( "v",          0x09 );
@@ -1967,8 +1986,8 @@ sub do_left_to_right_sweep {
             #  my $root_startblock = pack( "V",   $root_start );
             #  my $unknown6        = pack( "VV",  0x00, 0x1000 );
 
-	    # On the other hand, it is okay to keep matching at the same level
-	    # such as in a simple list of commas and/or fat arrors.
+            # On the other hand, it is okay to keep matching at the same level
+            # such as in a simple list of commas and/or fat arrors.
 
             my $is_blocked =
               defined( $blocking_level[$ng] ) && $lev > $blocking_level[$ng];
@@ -1984,20 +2003,26 @@ sub do_left_to_right_sweep {
             #  $worksheet->write( "D8", "",                    $format );
             #  $worksheet->write( "D8", "",                    $format );
 
+            # Allow a larger gap group level
+            my $factor = 1;
+            if ( $lev == $grp_level && $raw_tok eq '=' || $raw_tok eq '=>' ) {
+                $factor = 2;
+            }
+
             # We should exclude from consideration two groups which are
             # effectively the same but separated because one does not
             # fit in the maximum allowed line length.
-            my $is_same_group = $jmax == $jmax_m && $istop_m == $jmax_m - 2;
+            my $is_same_group = $jmax == $jmax_m && $it_stop_m == $jmax_m - 2;
             my $is_big_gap;
             if ( !$is_same_group ) {
                 $is_big_gap ||=
                      $line_count_ng >= 4
-                  && $line_count_ng_m <= 2
-                  && $col_want > $col + $short_pad;
+                  && $ix_beg <= $ix_min + 2
+                  && $col_want > $col + $short_pad * $factor;
                 $is_big_gap ||=
                      $line_count_ng_m >= 4
-                  && $line_count_ng <= 2
-                  && $col > $col_want + $short_pad;
+                  && $ix_beg >= $ix_max - 1
+                  && $col > $col_want + $short_pad * $factor;
             }
 
             # quit and restart if it cannot join this batch
@@ -2009,19 +2034,17 @@ sub do_left_to_right_sweep {
 
                 # remember the level of the first blocking token
                 if ( !defined( $blocking_level[$ng] ) ) {
-                    $blocking_token[$ng] = $tok;
                     $blocking_level[$ng] = $lev;
                 }
 
-                $move_to_common_column->(
-                    $ng_first, $ng - 1, $itok, $tok, $col_want
-                );
+                $move_to_common_column->( $ng_first, $ng - 1, $itok,
+                    $col_want );
                 $ng_first        = $ng;
                 $col_want        = $col;
                 $col_limit       = $col_max;
                 $line_count_ng_m = $line_count_ng;
                 $jmax_m          = $jmax;
-                $istop_m         = $istop;
+                $it_stop_m       = $it_stop;
                 next;
             }
 
@@ -2034,9 +2057,7 @@ sub do_left_to_right_sweep {
         } ## end loop over groups
 
         if ( $ng_end > $ng_first ) {
-            $move_to_common_column->(
-                $ng_first, $ng_end, $itok, $tok, $col_want
-            );
+            $move_to_common_column->( $ng_first, $ng_end, $itok, $col_want );
         } ## end loop over groups for one task
     } ## end loop over tasks
 
@@ -2179,39 +2200,53 @@ EOM
     return;
 }
 
-sub decode_alignment_token {
+{
+    my %decoded_token;
 
-    # Unpack the values packed in an alignment token
-    #
-    # Usage:
-    #        my ( $raw_tok, $lev, $tag, $tok_count ) =
-    #          decode_alignment_token($token);
-
-    # Alignment tokens have a trailing decimal level and optional tag (for
-    # commas):
-    # For example, the first comma in the following line
-    #     sub banner  { crlf; report( shift, '/', shift ); crlf }
-    # is decorated as follows:
-    #    ,2+report-6  => (tok,lev,tag) =qw( ,   2   +report-6)
-
-    # An optional token count may be appended with a leading dot.
-    # Currently this is only done for '=' tokens but this could change.
-    # For example, consider the following line:
-    #   $nport   = $port = shift || $name;
-    # The first '=' may either be '=0' or '=0.1' [level 0, first equals]
-    # The second '=' will be '=0.2' [level 0, second equals]
-    my ($tok) = @_;
+    sub initialize_decode {
+        %decoded_token = ();
+    }
 
-    # uses no Global symbols
+    sub decode_alignment_token {
+
+        # Unpack the values packed in an alignment token
+        #
+        # Usage:
+        #        my ( $raw_tok, $lev, $tag, $tok_count ) =
+        #          decode_alignment_token($token);
+
+        # Alignment tokens have a trailing decimal level and optional tag (for
+        # commas):
+        # For example, the first comma in the following line
+        #     sub banner  { crlf; report( shift, '/', shift ); crlf }
+        # is decorated as follows:
+        #    ,2+report-6  => (tok,lev,tag) =qw( ,   2   +report-6)
+
+        # An optional token count may be appended with a leading dot.
+        # Currently this is only done for '=' tokens but this could change.
+        # For example, consider the following line:
+        #   $nport   = $port = shift || $name;
+        # The first '=' may either be '=0' or '=0.1' [level 0, first equals]
+        # The second '=' will be '=0.2' [level 0, second equals]
+        my ($tok) = @_;
+
+        # uses no Global symbols
 
-    my ( $raw_tok, $lev, $tag, $tok_count ) = ( $tok, 0, "", 1 );
-    if ( $tok =~ /^(\D+)(\d+)([^\.]*)(\.(\d+))?$/ ) {
-        $raw_tok   = $1;
-        $lev       = $2;
-        $tag       = $3 if ($3);
-        $tok_count = $5 if ($5);
+        if ( defined( $decoded_token{$tok} ) ) {
+            return @{ $decoded_token{$tok} };
+        }
+
+        my ( $raw_tok, $lev, $tag, $tok_count ) = ( $tok, 0, "", 1 );
+        if ( $tok =~ /^(\D+)(\d+)([^\.]*)(\.(\d+))?$/ ) {
+            $raw_tok   = $1;
+            $lev       = $2;
+            $tag       = $3 if ($3);
+            $tok_count = $5 if ($5);
+        }
+        my @vals = ( $raw_tok, $lev, $tag, $tok_count );
+        $decoded_token{$tok} = \@vals;
+        return @vals;
     }
-    return ( $raw_tok, $lev, $tag, $tok_count );
 }
 
 {    # closure for sub is_deletable_token
@@ -2277,7 +2312,7 @@ sub delete_unmatched_tokens {
     # many obviously un-needed alignment tokens as possible.  This will prevent
     # them from interfering with the final alignment.
 
-    return unless @{$rlines} > 1;  # shouldn't happen
+    return unless @{$rlines} > 1;    # shouldn't happen
 
     my $has_terminal_match = $rlines->[-1]->get_j_terminal_match();
 
@@ -2286,7 +2321,7 @@ sub delete_unmatched_tokens {
     my $rnew_lines = \@filtered;
 
     my $saw_side_comment = @filtered != @{$rlines};
-    my $max_lev_diff = 0;
+    my $max_lev_diff     = 0;
 
     # nothing to do if all lines were hanging side comments
     my $jmax = @{$rnew_lines} - 1;
@@ -2526,7 +2561,7 @@ sub delete_unmatched_tokens {
 
     }    # End loop over subgroups
 
-    return ($max_lev_diff, $saw_side_comment);
+    return ( $max_lev_diff, $saw_side_comment );
 }
 
 sub get_line_token_info {
@@ -3001,7 +3036,7 @@ sub prune_alignment_tree {
             #  $deep1               ~~ $deep1;
 
             # So we will use two thresholds.
-            my $nmin_mono     = $depth + 3;    #TODO: test with 2
+            my $nmin_mono     = $depth + 2;
             my $nmin_non_mono = $depth + 6;
             if ( $nmin_mono > $nlines_p - 1 ) {
                 $nmin_mono = $nlines_p - 1;
@@ -3101,7 +3136,7 @@ sub Dump_tree_groups {
 
     sub is_marginal_match {
 
-        my ( $line_0, $line_1, $grp_level ) = @_;
+        my ( $line_0, $line_1, $grp_level, $imax_align ) = @_;
 
         # uses no Global symbols
 
@@ -3138,6 +3173,7 @@ sub Dump_tree_groups {
         my $saw_good_alignment = 0;
         my $saw_if_or;        # if we saw an 'if' or 'or' at group level
         my $raw_tokb = "";    # first token seen at group level
+        my $jfirst_bad;
         for ( my $j = 0 ; $j < $jmax_1 - 1 ; $j++ ) {
             my ( $raw_tok, $lev, $tag, $tok_count ) =
               decode_alignment_token( $rtokens_1->[$j] );
@@ -3156,13 +3192,17 @@ sub Dump_tree_groups {
             if ( $is_good_alignment{$raw_tok} ) {
                 $saw_good_alignment = 1;
             }
+            else {
+                $jfirst_bad = $j unless defined($jfirst_bad);
+            }
             if ( $rpatterns_0->[$j] ne $rpatterns_1->[$j] ) {
 
                 # Flag this as a marginal match since patterns differ.
                 # Normally, we will not allow just two lines to match if
                 # marginal. But we can allow matching in some specific cases.
 
-                $is_marginal = 1 if ( $is_marginal == 0 );
+                $jfirst_bad  = $j if ( !defined($jfirst_bad) );
+                $is_marginal = 1  if ( $is_marginal == 0 );
                 if ( $raw_tok eq '=' ) {
 
                     # Here is an example of a marginal match:
@@ -3178,6 +3218,8 @@ sub Dump_tree_groups {
             }
         }
 
+        if ( !defined($jfirst_bad) ) { $jfirst_bad = $jmax_1 - 1; }
+
         # Turn off the "marginal match" flag in some cases...
         # A "marginal match" occurs when the alignment tokens agree
         # but there are differences in the other tokens (patterns).
@@ -3301,8 +3343,10 @@ sub Dump_tree_groups {
                 }
             }
         }
-
-        return $is_marginal;
+        if ( $is_marginal && $imax_align > $jfirst_bad - 1 ) {
+            $imax_align = $jfirst_bad - 1;
+        }
+        return ( $is_marginal, $imax_align );
     }
 }
 
@@ -3566,7 +3610,6 @@ sub adjust_side_comment_single_group {
     return;
 }
 
-
 sub valign_output_step_A {
 
     ###############################################################
@@ -3726,16 +3769,6 @@ sub combine_fields {
 
     if ( !defined($imax_align) ) { $imax_align = -1 }
 
-    # Correction: although this routine has the ability to retain some leading
-    # alignments, overall the results are much better if we always remove all
-    # of the alignments.  Here is an example of the problem if we do not
-    # do this. The first two lines are marginal but match their =~ matches
-    # the third line. But if we keep it we get a big gap:
-    #  return $path unless $path =~ /^~/;
-    #  $path                     =~ s:^~([^/]+):(getpwnam($1))[$[+7]:e;
-    #  $path =~ s:^~:$ENV{'HOME'} || (getpwuid($<))[$[+7]:e;
-    $imax_align = -1;
-
     # First delete the unwanted tokens
     my $jmax_old       = $line_0->get_jmax();
     my @old_alignments = $line_0->get_alignments();