From: Steve Hancock Date: Wed, 15 Feb 2023 15:28:00 +0000 (-0800) Subject: add two optimization modes to recombine operation X-Git-Tag: 20230309~23 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5eae806dc8505f8ddc03f3f3f12a49cb9afbe5b6;p=perltidy.git add two optimization modes to recombine operation This looks for two common patterns of monotonic variation in joint strengths that can be handled very efficiently. --- diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index e91b86b6..efc29bec 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -18074,18 +18074,30 @@ sub break_equals { sub recombine_breakpoints { - # We are given indexes to the current lines: - # $ri_beg = ref to array of BEGinning indexes of each line - # $ri_end = ref to array of ENDing indexes of each line my ( $self, $ri_beg, $ri_end, $rbond_strength_to_go ) = @_; - # sub break_long_lines is very liberal in setting line breaks + # This sub implements the 'recombine' operation on a batch. + # Its task is to combine some of these lines back together to + # improve formatting. The need for this arises because + # sub 'break_long_lines' is very liberal in setting line breaks # for long lines, always setting breaks at good breakpoints, even # when that creates small lines. Sometimes small line fragments # are produced which would look better if they were combined. - # That's the task of this routine. - # do nothing under extreme stress; use <= 2 for c171 + # Input parameters: + # $ri_beg = ref to array of BEGinning indexes of each line + # $ri_end = ref to array of ENDing indexes of each line + # $rbond_strength_to_go = array of bond strengths pulling + # tokens together, used to decide where best to recombine lines. + + #--------------------------------------------------------------- + # Do nothing under extreme stress; use <= 2 for c171. + # (Actually, c118 & c171 run okay without this now due to the + # new optimizations. So this is not strictly necessary now. But + # removing this check is not really useful because this condition + # only occurs in test runs, and another formatting pass will fix + # things anyway.) + #--------------------------------------------------------------- return if ( $high_stress_level <= 2 ); my $nmax_start = @{$ri_end} - 1; @@ -18093,7 +18105,7 @@ sub break_equals { #---------------------------------------------------------------- # Break into small sub-sections to decrease the maximum n-squared - # operations and avoid excess run time. See comments below. + # operations and avoid excess run time. #---------------------------------------------------------------- # Also make a list of all good joining tokens between the lines @@ -18157,46 +18169,13 @@ sub break_equals { my $num_sections = @{$rsections}; - # This is potentially an O(n-squared) loop, but not critical, so we can - # put a finite limit on the total number of iterations. This is - # suggested by issue c118, which pushed about 5.e5 lines through here - # and caused an excessive run time. For another test case see c167. - - # Three lines of defense have been put in place to prevent excessive - # run times: - # 1. do nothing if formatting under stress (c118 was under stress) - # 2. break into small sub-sections to decrease the maximum n-squared. - # 3. put a finite limit on the number of iterations. - - # Testing shows that most batches only require one or two iterations. - # A very large batch which is broken into sub-sections can require one - # iteration per section. This suggests the limit here, which allows - # up to 10 iterations plus one pass per sub-section. - my $it_count = 0; - my $it_count_max = - 10 + int( 1000 / ( 1 + $nmax_section ) ) + $num_sections; - - # If the current script has more lines than the original script, - # then we must allow more iterations. So we increase the max - # by one iteration per additional line. Fixes c186, c187. - my $rLL = $self->[_rLL_]; - my $K_0 = $K_to_go[0]; - my $K_x = $K_to_go[$max_index_to_go]; - my $li_0 = $rLL->[$K_0]->[_LINE_INDEX_]; - my $li_x = $rLL->[$K_x]->[_LINE_INDEX_]; - - my $nlines_gain = $nmax_start - ( $li_x - $li_0 ); - if ( $nlines_gain > 0 ) { - $it_count_max += $nlines_gain; - } - - if ( DEBUG_RECOMBINE > 0 ) { + if ( DEBUG_RECOMBINE > 1 ) { print STDERR < 1 ) { + if ( DEBUG_RECOMBINE > 0 ) { my $max = 0; print STDERR "-----\n$num_sections sections found for nmax=$nmax_start\n"; @@ -18214,27 +18193,18 @@ EOM # numbers, and the line numbers change as we go. while ( my $section = pop @{$rsections} ) { my ( $nbeg, $nend ) = @{$section}; - $it_count = $self->recombine_breakpoints_section_loop( + $self->recombine_breakpoints_section_loop( $ri_beg, $ri_end, $nbeg, $nend, - $it_count, - $it_count_max, \@joint, - $rbond_strength_to_go + $rbond_strength_to_go, ); - last if ( !defined($it_count) ); } - if (DEBUG_RECOMBINE) { - my $nmax_last = @{$ri_end} - 1; - if ( !defined($it_count) ) { $it_count = 'UNDEF' } - print STDERR -"exiting recombine with $nmax_last lines, starting lines=$nmax_start, iterations=$it_count, max_it=$it_count_max numsec=$num_sections\n"; - } return; } ## end sub recombine_breakpoints @@ -18246,8 +18216,6 @@ EOM $ri_end, $nbeg, $nend, - $it_count, - $it_count_max, $rjoint, $rbond_strength_to_go, @@ -18259,41 +18227,134 @@ EOM # $ri_beg, $ri_end = ref to arrays with token indexes of the first # and last line # $nbeg, $nend = line numbers bounding this section - # $it_count = iteration counter - # $it_count_max = maximum iteration counter (stop if reached) + # $num_compares = number of inner loop passes + # $max_compares = maximum inner loop passes (stop if reached) # $rjoint = ref to array of good joining tokens per line # $rbond_strength_to_go = ref to array of token bond strengths - # Updates: $ri_beg, $ri_end, $rjoint if lines are joied + # Updates: $ri_beg, $ri_end, $rjoint if lines are joined # Returns: - # $it_count = updated iteration count if success - # = undef if ERROR (emergency stop) + # nothing + + my $rLL = $self->[_rLL_]; my $rK_weld_right = $self->[_rK_weld_right_]; my $rK_weld_left = $self->[_rK_weld_left_]; - # number of ending lines to leave untouched in this pass + # $num_freeze = number of trailing lines to leave untouched my $nmax_sec = @{$ri_end} - 1; my $num_freeze = $nmax_sec - $nend; + my $ibeg = $ri_beg->[$nbeg]; + my $iend = $ri_end->[$nend]; + my $Kbeg = $K_to_go[$ibeg]; + my $Kend = $K_to_go[$iend]; + + # This is potentially an O(n-squared) loop, but not critical, so we can + # put a finite limit on the total number of iterations. This is + # suggested by issue c118, which pushed about 5.e5 lines through here + # and caused an excessive run time. For other test cases see: + # c167, c186, c187. + + # Several lines of defense have been put in place to prevent excessive + # run times: + # 1. do nothing if formatting under stress (c118 was under stress) + # (this has already been done by the calling routine) + # 2. break into small sub-sections to decrease the maximum n-squared. + # (this has already been done by the calling routine) + # 3. put a finite limit on the number of iterations. + # 4. optimize the inner loop on certain common patterns. + + #---------------- + # Iteration limit + #---------------- + # The iteration limit guards against excessive run time. + # Testing shows that most cases require roughly 1 comparison per line. + # The most extreme cases in my large collection of files are: + # camel1.t - 9 => fault, 10 => pass (obfuscated perl test) + # ternary.t - 10 => fault, 11 => pass + # f2html.pm - 12 => fault, 13 => pass + # Poll.pm - 16 => fault, 17 => pass + # A limiting ratio of 20 will allow essentially all code to pass. + # Most of these cases involve input code with extremely long lines. + # If the limit is ever exceeded, things will be fixed on the next + # formatting pass. + use constant MAX_COMPARE_RATIO => 20; + my $num_tot = $nend - $nbeg + 1; + my $max_compares = MAX_COMPARE_RATIO * $num_tot; + my $num_compares = 0; + + #------------- + # Optimization + #------------- + # There are four modes of operation, as follows: + # $optimization_on $reverse MODE + # ---------------- -------- ---- + # false false Normal Mode + # false true Reverse Mode + # true false Optimized Forward Search + # true true Optimized Reverse Search + + # The Normal Mode is the basic method. The main issue is + # that it is can potentially take O(N^2) compares. + + # The Reverse Mode works but is mainly for testing because it can give + # different results from the Normal Mode in a few cases involving + # joining at parens which are order dependent. + + # The Optimized Modes give the same results as Normal Mode but + # run in O(N) time when certain patterns are detected. + + # The Optimized Modes may be deactivated with these flags for + # testing. These flags should normally be true: + use constant OPTIMIZED_FORWARD_SEARCH => 1; + use constant OPTIMIZED_REVERSE_SEARCH => 1; + + # Optimization parameters: + my $optimization_on = 0; + my $reverse = 0; + my $n_best_last; + my $more_to_do = 1; # We keep looping over all of the lines of this batch # until there are no more possible recombinations my $nmax_last = $nmax_sec + 1; + # This will be the current maximum bond strength after the + # iteration gets going. When set, we can stop immediately + # on a joint with this strength, since we know it is the + # best, or equal to the best. + my $bs_previous_best; + + #--------------------- + # loop over iterations + #--------------------- while ($more_to_do) { # Emergency return on excess total iterations. The allowed - # maximum is large enough that this should never happen. - $it_count++; - if ( $it_count > $it_count_max ) { - my $KK = $K_to_go[0]; - my $lno = $self->[_rLL_]->[$KK]->[_LINE_INDEX_]; - DEVEL_MODE && Fault(< $max_compares ) { + + # If the current script has many more lines than the original + # script, then we may have trouble converging must allow more + # iterations. In that case it will be better to stop iterating + # early and let another formatting pass fix the problem. And + # there is no need to trigger a fault check during testing if + # that happens. + my $li_0 = $rLL->[$Kbeg]->[_LINE_INDEX_]; + my $li_x = $rLL->[$Kend]->[_LINE_INDEX_]; + my $nlines_gain = $num_tot - ( $li_x - $li_0 ); + + # The line check is temporarily ignored to collect info + use constant IGNORE_CHECK => 1; + if ( IGNORE_CHECK || $nlines_gain < 2 * $num_tot ) { + my $lno = $self->[_rLL_]->[$Kbeg]->[_LINE_INDEX_]; + DEVEL_MODE && Fault(< 0 ) { + + # We will accept this joint but turn off + # optimization; it can restart again if possible. + $optimization_on = 0; + $reverse = 0; + } + + last; + } } else { + my $dbs = $bs - $bs_last; + if ($reverse) { $dbs = -$dbs } + $bs_last = $bs; + + if ( $num_bs <= 1 ) { + $dbs_min = $dbs_max = $dbs; + } + else { + if ( $dbs < $dbs_min ) { $dbs_min = $dbs } + if ( $dbs > $dbs_max ) { $dbs_max = $dbs } + } - if ( $bs > $bs_best ) { - $n_best = $n; - $bs_best = $bs; + if ( $n > $nbs_max ) { $nbs_max = $n } + if ( $n < $nbs_min ) { $nbs_min = $n } + $num_bs++; + + # for forward mode - use first max bs + if ( !$reverse ) { + if ( $bs > $bs_best ) { + $n_best = $n; + $bs_best = $bs; + } + } + + # for reverse mode - use last max bs + else { + if ( $bs >= $bs_best ) { + $n_best = $n; + $bs_best = $bs; + } } } - } + } ## end loop over all line pairs # recombine the pair with the greatest bond strength if ($n_best) { + DEBUG_RECOMBINE > 1 + && print +"BEST: rev=$reverse nb=$n_best nstart=$nstart stop=$nstop nbmin=$nbs_min nbmax=$nbs_max bs=$bs_best dbs=$dbs_max dbsmin=$dbs_min\n"; splice @{$ri_beg}, $n_best, 1; splice @{$ri_end}, $n_best - 1, 1; splice @{$rjoint}, $n_best, 1; + # Note that we must subtract 1 here to get an updated best + # index because we decreased all index values by this splice. + $n_best_last = $n_best - 1; + + # Look for a pattern which allows optimization: + if ( + + # this is not already on, and + !$optimization_on + + # we have not taken a shortcut to get here, and + && !$incomplete_loop + + # we have seen multiple good breaks + && defined($num_bs) && $num_bs > 1 + + ) + { + + # Look for pattern 1: + # - we are joining at the first possible joint, and + # - the strength values do not increase with $n + if ( $n_best == $nbs_min && $dbs_max <= 0 ) { + if (OPTIMIZED_FORWARD_SEARCH) { + + DEBUG_RECOMBINE > 1 + && print STDERR +"Start FORWARD Optimiation: bs=$bs_best num=$num_bs dbs_min=$dbs_min dbs_max=$dbs_max nbest=$n_best nbs_min=$nbs_min\n"; + + $optimization_on = 1; + $reverse = 0; + } + } + + # Look for pattern 2: + # - we are joining at the last possible joint, and + # - the strength values increase montonically with $n + elsif ( $n_best == $nbs_max && $dbs_min > 0 ) { + if (OPTIMIZED_REVERSE_SEARCH) { + DEBUG_RECOMBINE > 1 + && print STDERR +"Start REVERSE Optimization: bs=$bs_best, dbs_min=$dbs_min dbs_max=$dbs_max nbest=$n_best nmax=$nbs_max\n"; + + $optimization_on = 1; + $reverse = 1; + } + } + } + + $bs_previous_best = $bs_best; + # keep going if we are still making progress $more_to_do++; } - } # end iteration loop - return $it_count; + } ## end iteration loop + + if (DEBUG_RECOMBINE) { + my $ratio = sprintf "%0.3f", $num_compares / $num_tot; + print STDERR +"exiting recombine_inner_loop with $nmax_last lines, opt=$optimization_on, rev=$reverse, starting lines=$num_tot, num_compares=$num_compares, ratio=$ratio\n"; + } + + return; } ## end sub recombine_breakpoints_section_loop sub recombine_section_0 { @@ -22814,8 +23022,7 @@ EOM # Section B: Handle a multiline list ... #--------------------------------------- - $self->break_multiline_list( $rhash_IN, $rhash_A, - $i_opening_minus ); + $self->break_multiline_list( $rhash_IN, $rhash_A, $i_opening_minus ); return; } ## end sub table_maker @@ -23045,8 +23252,8 @@ EOM || ( $first_term_length > $columns_if_unbroken ); } - my $hash_B = $self->table_layout_B( $rhash_IN, $rhash_A, - $is_lp_formatting ); + my $hash_B = + $self->table_layout_B( $rhash_IN, $rhash_A, $is_lp_formatting ); return if ( !defined($hash_B) ); # Updated variables