From: Steve Hancock Date: Sat, 4 Mar 2023 15:41:59 +0000 (-0800) Subject: improve recombine operation, c200 X-Git-Tag: 20230309~2 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=b744e29582b619035bf5b5f69b01b8d194379358;p=perltidy.git improve recombine operation, c200 This update consolidates the three possible optimization methods in the recombine operation into a single, simpler method which can be shown to run in linear time. --- diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index 94efcad9..a6dca9c9 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -18072,14 +18072,14 @@ sub break_equals { # $rbond_strength_to_go = array of bond strengths pulling # tokens together, used to decide where best to recombine lines. - #--------------------------------------------------------------- + #------------------------------------------------------------------- # Do nothing under extreme stress; use <= 2 for c171. - # (Actually, c118 & c171 run okay without this now due to the - # new optimizations. So this is not strictly necessary now. But - # removing this check is not really useful because this condition - # only occurs in test runs, and another formatting pass will fix - # things anyway.) - #--------------------------------------------------------------- + # (NOTE: New optimizations make this unnecessary. But removing this + # check is not really useful because this condition only occurs in + # test runs, and another formatting pass will fix things anyway.) + # This routine has a long history of improvements. Some past + # relevant issues are : c118, c167, c171, c186, c187, c193, c200. + #------------------------------------------------------------------- return if ( $high_stress_level <= 2 ); my $nmax_start = @{$ri_end} - 1; @@ -18092,10 +18092,9 @@ sub break_equals { my $has_terminal_semicolon = $iend_max >= 0 && $types_to_go[$iend_max] eq ';'; - #---------------------------------------------------------------- - # Break into small sub-sections to decrease the maximum n-squared - # operations and avoid excess run time. - #---------------------------------------------------------------- + #-------------------------------------------------------------------- + # Break into the smallest possible sub-sections to improve efficiency + #-------------------------------------------------------------------- # Also make a list of all good joining tokens between the lines # n-1 and n. @@ -18124,8 +18123,8 @@ sub break_equals { } } - # joint[$nn] = [ index of joint character, skip flag ] - $joint[$nn] = [ $itok, 0 ]; + # joint[$nn] = index of joint character + $joint[$nn] = $itok; # Update the section list my $excess = $self->excess_line_length( $ibeg_1, $iend_2, 1 ); @@ -18200,6 +18199,28 @@ EOM return; } ## end sub recombine_breakpoints + sub dump_pair_list { + + #-------------------------------------------- + # Debug routine, may be eventually be removed + #-------------------------------------------- + my ( $rhash, $msg ) = @_; + my $rpair_list = $rhash->{_rpair_list}; + my $opt = $rhash->{_optimization_on}; + $msg = "" unless $msg; + print STDERR <> +opt=$opt +EOM + foreach my $item ( @{$rpair_list} ) { + my ( $n, $bs ) = @{$item}; + print STDERR <{_num_freeze} = $nmax_batch - $nend; + # Setup the list of line pairs to test. This stores the following + # values for each line pair: + # [ $n=index of the second line of the pair, $bs=bond strength] + my @pair_list; + my $rbond_strength_to_go = $rhash->{_rbond_strength_to_go}; + foreach my $n ( $nbeg + 1 .. $nend ) { + my $iend_1 = $ri_end->[ $n - 1 ]; + my $ibeg_2 = $ri_beg->[$n]; + my $bs_tweak = 0; + if ( $is_amp_amp{ $types_to_go[$ibeg_2] } ) { $bs_tweak = 0.25 } + my $bs = $rbond_strength_to_go->[$iend_1] + $bs_tweak; + push @pair_list, [ $n, $bs ]; + } + + # Any order for testing is possible, but optimization is only possible + # if we sort the line pairs on decreasing joint strength. + @pair_list = + sort { $b->[1] <=> $a->[1] || $a->[0] <=> $b->[0] } @pair_list; + $rhash->{_rpair_list} = \@pair_list; + #---------------- # Iteration limit #---------------- - # This is potentially an O(n-squared) loop, but not critical, so we can - # put a finite limit on the total number of iterations. This is - # suggested by issue c118, which pushed about 5.e5 lines through here - # and caused an excessive run time. For other test cases see: - # c167, c186, c187. - - # Several lines of defense have been put in place to prevent excessive - # run times: - # 1. do nothing if formatting under stress (c118 was under stress) - # (this has already been done by the calling routine) - # 2. break into small sub-sections to decrease the maximum n-squared. - # (this has already been done by the calling routine) - # 3. put a finite limit on the number of iterations. - # 4. optimize the inner loop on certain common patterns. - - # The iteration limit guards against excessive run time. + # This was originally an O(n-squared) loop which required a check + # on the maximum number of ierations. It is now an O(n) loop, but + # to guard against programming errors we still place a finite limit + # on the total number of iterations. This should only be reached + # in the event of a programming error. + # Testing shows that most cases require roughly 1 comparison per line. + # The upper bound appears to be about 4 comparisons per line unless + # optimization is deactivated. # The most extreme cases in my large collection of files are: - # camel1.t - needs 5 compares per line (obfuscated perl test) - # ternary.t - needs 9 compares per line - # So a limiting ratio of 20 should allow essentially all code to pass. + # camel1.t - needs 3 compares per line (12 without optimization) + # ternary.t - needs 4 compares per line (12 without optimization) + # So a limiting ratio of 20 should allow all code to pass even with + # optimization turned off. The OPTIMIZE_OK flag should be true + # except for testing. + use constant OPTIMIZE_OK => 1; use constant MAX_COMPARE_RATIO => 20; - my $num_tot = $nend - $nbeg + 1; - my $max_compares = MAX_COMPARE_RATIO * $num_tot; - $rhash->{_num_compares} = 0; - - #------------- - # Optimization - #------------- - - # There optimization modes are as follows: - # $optimization_on $reverse MODE - # ----- ----- ---- - # 0 0 Normal Mode - # 0 1 Reverse Mode - # 1 0 Optimized Forward Search - # 1 1 Optimized Reverse Search - # 2 0 Optimized Tie Search - - # The Normal Mode is the basic method. The main issue is - # that it is can potentially take O(N^2) compares. - - # The Reverse Mode is similar but runs backwards. - - # The Optimized Modes give the same results as Normal Mode but - # run in O(N) time when certain common patterns are detected. - - # The Optimized Modes may be deactivated with these flags for - # testing. These flags should normally be true: - use constant OPTIMIZED_FORWARD_SEARCH => 1; - use constant OPTIMIZED_REVERSE_SEARCH => 1; - use constant OPTIMIZED_TIE_SEARCH => 1; + my $num_pairs = $nend - $nbeg + 1; + my $max_compares = MAX_COMPARE_RATIO * $num_pairs; - # Always start in Normal Mode + # Always start with optimization off + $rhash->{_num_compares} = 0; $rhash->{_optimization_on} = 0; - $rhash->{_reverse} = 0; - $rhash->{_bs_tie} = 0; + $rhash->{_ix_best_last} = 0; #-------------------------------------------- # loop until there are no more recombinations @@ -18331,6 +18338,8 @@ EOM # Iteration limit check: if ( $rhash->{_num_compares} > $max_compares ) { + + # See note above; should only get here on a programming error if (DEVEL_MODE) { my $ibeg = $ri_beg->[$nbeg]; my $Kbeg = $K_to_go[$ibeg]; @@ -18345,9 +18354,9 @@ EOM } ## end iteration loop if (DEBUG_RECOMBINE) { - my $ratio = sprintf "%0.3f", $rhash->{_num_compares} / $num_tot; + my $ratio = sprintf "%0.3f", $rhash->{_num_compares} / $num_pairs; print STDERR -"exiting recombine_inner_loop with $nmax_last lines, opt=$rhash->{_optimization_on}, rev=$rhash->{_reverse}, starting lines=$num_tot, num_compares=$rhash->{_num_compares}, ratio=$ratio\n"; +"exiting recombine_inner_loop with $nmax_last lines, opt=$rhash->{_optimization_on}, starting pairs=$num_pairs, num_compares=$rhash->{_num_compares}, ratio=$ratio\n"; } return; @@ -18371,18 +18380,16 @@ EOM my $nbeg = $rhash->{_nbeg}; my $rjoint = $rhash->{_rjoint}; my $rbond_strength_to_go = $rhash->{_rbond_strength_to_go}; + my $rpair_list = $rhash->{_rpair_list}; - # This will be the best joint: + # This will remember the best joint: my $n_best = 0; my $bs_best = 0.; - my $num_tie = 0; - - # Variables needed to check for optimization patterns: - my ( $bs_last, $num_bs, $dbs_min, $dbs_max, $nbs_min, $nbs_max ); + my $ix_best = 0; + my $num_bs = 0; - # In normal mode: loop over all remaining lines in this batch + # The range of lines in this group is $nbeg to $nstop my $nmax = @{$ri_end} - 1; - my $nstart = $nbeg + 1; my $nstop = $nmax - $rhash->{_num_freeze}; my $num_joints = $nstop - $nbeg; @@ -18390,63 +18397,36 @@ EOM # special two-line logic to be checked (c193) if ( $rhash->{_optimization_on} && $num_joints <= 2 ) { $rhash->{_optimization_on} = 0; - $rhash->{_reverse} = 0; - } - - # Set the order for visiting each line pair ... - my @nlist; - - # The 'tie' mode starts at the last best and circles around - if ( $rhash->{_optimization_on} == 2 ) { - my $nmid = max( $nstart, $rhash->{_n_best_last} ); - @nlist = ( $nmid .. $nstop, $nstart .. $nmid - 1 ); } - # All other modes run the full length from one end to the other - else { + # Start where we ended the last search + my $ix_start = $rhash->{_ix_best_last}; - if ( $rhash->{_optimization_on} == 1 ) { + # Back up one more during optimization, just to be careful + if ( $rhash->{_optimization_on} ) { $ix_start -= 1 } - # In reverse optimization mode: start loop at 1 + last best - # joint in order not to miss some small concatenations - if ( $rhash->{_reverse} ) { - $nstop = min( $nstop, $rhash->{_n_best_last} + 1 ); - } + # Keep the starting index in bounds + $ix_start = max( 0, $ix_start ); - # In forward optimization mode: backup by 1 because some - # tokens can become activated by certain changes. In - # particular, a '?' can become active when a nearby ':' - # is joined, or a level changes. And an '=' can become - # active if two lines might be made. - else { - $nstart = max( $nstart, $rhash->{_n_best_last} - 1 ); - } - } - - if ( $rhash->{_reverse} ) { - @nlist = reverse( $nstart .. $nstop ); - } - else { - @nlist = ( $nstart .. $nstop ); - } - } + # Make a search order list which cycles around to visit + # all line pairs. + my $ix_max = @{$rpair_list} - 1; + my @ix_list = ( $ix_start .. $ix_max, 0 .. $ix_start - 1 ); + my $ix_last = $ix_list[-1]; #------------------------- # loop over all line pairs #------------------------- my $incomplete_loop; - foreach my $n (@nlist) { - - # This flag will be true if we 'last' out of this loop early: - $incomplete_loop = $n != $nlist[-1]; + foreach my $ix (@ix_list) { + my $item = $rpair_list->[$ix]; + my ( $n, $bs ) = @{$item}; - my ( $itok, $skip ) = @{ $rjoint->[$n] }; + # This flag will be true if we 'last' out of this loop early. + # We cannot turn on optimization if this is true. + $incomplete_loop = $ix != $ix_last; - # The skip flag is available for future optimization - # but not yet used. - next if ($skip); - - # Count total number of times through the inner loop + # Update the count of the number of times through this inner loop $rhash->{_num_compares}++; #---------------------------------------------------------- @@ -18505,7 +18485,8 @@ EOM if ( $tokens_to_go[$iend_1] eq ')' && $tokens_to_go[$ibeg_2] eq '{' ) { - $n_best = $n; + $n_best = $n; + $ix_best = $ix; last; } } @@ -18528,6 +18509,7 @@ EOM # to the left or right of an operator. #---------------------------------------------------------- + my $itok = $rjoint->[$n]; if ($itok) { my $ok_0 = recombine_section_0( $itok, $ri_beg, $ri_end, $n ); next if ( !$ok_0 ); @@ -18546,7 +18528,8 @@ EOM && defined( $rK_weld_left->{ $K_to_go[$ibeg_2] } ) ) ) { - $n_best = $n; + $n_best = $n; + $ix_best = $ix; last; } @@ -18572,6 +18555,7 @@ EOM if ($skip_Section_3) { $forced_breakpoint_to_go[$iend_1] = 0; $n_best = $n; + $ix_best = $ix; $incomplete_loop = 1; last; } @@ -18589,7 +18573,17 @@ EOM # honor hard breakpoints next if ( $forced_breakpoint_to_go[$iend_1] ); - my $bs = $rbond_strength_to_go->[$iend_1] + $bs_tweak; + if (DEVEL_MODE) { + + # During development, be sure that the strengths are correct + my $bs_check = $rbond_strength_to_go->[$iend_1] + $bs_tweak; + if ( $bs_check != $bs ) { + Fault(<{_reverse} ) { $dbs = -$dbs } - $bs_last = $bs; - - if ( $num_bs <= 1 ) { - $dbs_min = $dbs_max = $dbs; - } - else { - if ( $dbs < $dbs_min ) { $dbs_min = $dbs } - if ( $dbs > $dbs_max ) { $dbs_max = $dbs } - } - - if ( $n > $nbs_max ) { $nbs_max = $n } - if ( $n < $nbs_min ) { $nbs_min = $n } $num_bs++; - if ( $bs == $bs_best ) { $num_tie++ } - else { $num_tie = 0 } - # save maximum strength; in case of a tie select min $n if ( $bs > $bs_best || $bs == $bs_best && $n < $n_best ) { $n_best = $n; + $ix_best = $ix; $bs_best = $bs; } - } - # in 'tie' mode, stop if we see the target strength - if ( $rhash->{_optimization_on} == 2 && $bs == $rhash->{_bs_tie} ) { - last; - } } ## end loop over all line pairs #--------------------------------------------------- @@ -18692,86 +18661,46 @@ EOM if ($n_best) { DEBUG_RECOMBINE > 1 && print -"BEST: rev=$rhash->{_reverse} nb=$n_best nstart=$nstart stop=$nstop nbmin=$nbs_min nbmax=$nbs_max bs=$bs_best dbs=$dbs_max dbsmin=$dbs_min\n"; +"BEST: rev=$rhash->{_reverse} nb=$n_best nbeg=$nbeg stop=$nstop bs=$bs_best\n"; splice @{$ri_beg}, $n_best, 1; splice @{$ri_end}, $n_best - 1, 1; splice @{$rjoint}, $n_best, 1; - # NOTE: we must subtract 1 here to get the updated index of this - # line end because the splice decreased its index value by 1. - # BUT: if we are joining the first two lines, this produces an - # invalid joint index of 0. So this index must be tested before use. - $rhash->{_n_best_last} = $n_best - 1; + splice @{$rpair_list}, $ix_best, 1; - # Turn off tie search if not found - my $in_tie_mode = $rhash->{_optimization_on} == 2; - if ( $in_tie_mode - && $bs_best != $rhash->{_bs_tie} ) - { - $rhash->{_optimization_on} = 0; - $rhash->{_reverse} = 0; - $rhash->{_bs_tie} = 0; + # Update the pair list: + # old $n values greater than the best $n decrease by 1 + foreach my $item ( @{$rpair_list} ) { + my $n_old = $item->[0]; + if ( $n_old > $n_best ) { $item->[0] -= 1 } } - # Look for a pattern which allows optimization: + # And store updated indexes of the best $n. We must subtract 1 to + # get the updated indexes because the splice decreased its index + # value by 1. BUT CAUTION: if this is the first line pair, then + # this produces an invalid index. So these indexes must be + # tested before use in the next pass through the outer loop. + $rhash->{_n_best_last} = $n_best - 1; + $rhash->{_ix_best_last} = $ix_best - 1; + + # Turn on optimization if ... if ( - # this is not already on, and + # it is not already on, and !$rhash->{_optimization_on} # we have not taken a shortcut to get here, and && !$incomplete_loop - # we have seen multiple good breaks - && defined($num_bs) && $num_bs > 1 + # we have seen a good break on strength, and + && $num_bs + + # we are allowed to optimize + && OPTIMIZE_OK ) { - - # Look for pattern 1: - # - this is the first possible joint, and - # - the strength values do not increase with $n - # - we did not just exit tie mode ($dbs_max may be wrong) - if ( $n_best == $nbs_min && $dbs_max <= 0 && !$in_tie_mode ) { - if (OPTIMIZED_FORWARD_SEARCH) { - - DEBUG_RECOMBINE > 1 - && print STDERR -"Start FORWARD Optimiation: bs=$bs_best num=$num_bs dbs_min=$dbs_min dbs_max=$dbs_max nbest=$n_best nbs_min=$nbs_min\n"; - - $rhash->{_optimization_on} = 1; - $rhash->{_reverse} = 0; - } - } - - # Look for pattern 2: - # - this is the last possible joint, and - # - the strength values increase montonically with $n - # - we did not just exit tie mode ($dbs_min may be wrong) - elsif ( $n_best == $nbs_max && $dbs_min > 0 && !$in_tie_mode ) { - if (OPTIMIZED_REVERSE_SEARCH) { - DEBUG_RECOMBINE > 1 - && print STDERR -"Start REVERSE Optimization: bs=$bs_best, dbs_min=$dbs_min dbs_max=$dbs_max nbest=$n_best nmax=$nbs_max\n"; - - $rhash->{_optimization_on} = 1; - $rhash->{_reverse} = 1; - } - } - - # Look for pattern 3: - # - there are multiple joints with this same strength (ties) - elsif ( $num_tie > 1 && $bs_best > 0 ) { - if (OPTIMIZED_TIE_SEARCH) { - DEBUG_RECOMBINE > 1 - && print STDERR -"Start TIE Optimization: bs=$bs_best, dbs_min=$dbs_min dbs_max=$dbs_max nbest=$n_best nmax=$nbs_max num_tie=$num_tie\n"; - - $rhash->{_optimization_on} = 2; - $rhash->{_reverse} = 0; - $rhash->{_bs_tie} = $bs_best; - } - } + $rhash->{_optimization_on} = 1; } } return;