From 897a9361b29df5db2b16f969b44bf28d975bb768 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Thu, 30 Sep 2021 08:07:08 -0700
Subject: [PATCH] code cleanups and minor optimizations

---
 CHANGES.md                       |   3 +
 lib/Perl/Tidy.pm                 |  14 +--
 lib/Perl/Tidy/Formatter.pm       | 159 ++++++++++++++++---------------
 lib/Perl/Tidy/Tokenizer.pm       |   8 +-
 lib/Perl/Tidy/VerticalAligner.pm |   8 +-
 5 files changed, 101 insertions(+), 91 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 5d8ecf97..8153069c 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -18,6 +18,9 @@
       comment, '#>>V', can be lost.  A workaround for the previous version
       is to include the parameter '-mbl=2'.
 
+    - This version runs about 10 percent faster on large files than the previous
+      release due to optimizations made with the help of NYTProf.
+
     - Numerous minor fixes have been made. A complete list is at:
 
            https://github.com/perltidy/perltidy/blob/master/local-docs/BugLog.pod
diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm
index 895f5e32..e33fd335 100644
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -1558,7 +1558,7 @@ EOM
                     last;
                 }
             } ## end if ( $iter < $max_iterations)
-        }    # end loop over iterations for one source file
+        } ## end loop over iterations for one source file
 
         # restore objects which have been temporarily undefined
         # for second and higher iterations
@@ -1830,7 +1830,7 @@ EOM
 
         $logger_object->finish( $infile_syntax_ok, $formatter )
           if $logger_object;
-    }    # end of main loop to process all files
+    } ## end of main loop to process all files
 
     # Fix for RT #130297: return a true value if anything was written to the
     # standard error output, even non-fatal warning messages, otherwise return
@@ -2912,7 +2912,7 @@ q(wbb=% + - * / x != == >= <= =~ !~ < > | & = **= += *= &= <<= &&= -= /= |= >>=
         \%option_category, \%option_range
     );
 
-}    # end of generate_options
+} ## end of generate_options
 
 # Memoize process_command_line. Given same @ARGV passed in, return same
 # values and same @ARGV back.
@@ -3211,7 +3211,7 @@ EOM
 
     return ( \%Opts, $config_file, \@raw_options, $roption_string,
         $rexpansion, $roption_category, $roption_range );
-}    # end of _process_command_line
+} ## end of _process_command_line
 
 sub check_options {
 
@@ -3520,7 +3520,7 @@ sub expand_command_abbreviations {
             else {
                 push( @new_argv, $word );
             }
-        }    # end of this pass
+        } ## end of this pass
 
         # update parameter list @ARGV to the new one
         @ARGV = @new_argv;
@@ -3559,8 +3559,8 @@ Program bug - circular-references in the %expansion hash, probably due to
 a recent program change.
 DIE
             }
-        }    # end of check for circular references
-    }    # end of loop over all passes
+        } ## end of check for circular references
+    } ## end of loop over all passes
     return;
 }
 
diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index 28a47d75..753688b0 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -438,10 +438,13 @@ BEGIN {
 
         _rspecial_side_comment_type_ => $i++,
 
-        _rseqno_controlling_my_ci_ => $i++,
-        _ris_seqno_controlling_ci_ => $i++,
-        _save_logfile_             => $i++,
-        _maximum_level_            => $i++,
+        _rseqno_controlling_my_ci_    => $i++,
+        _ris_seqno_controlling_ci_    => $i++,
+        _save_logfile_                => $i++,
+        _maximum_level_               => $i++,
+        _maximum_level_at_line_       => $i++,
+        _maximum_BLOCK_level_         => $i++,
+        _maximum_BLOCK_level_at_line_ => $i++,
 
         _rKrange_code_without_comments_ => $i++,
         _rbreak_before_Kfirst_          => $i++,
@@ -824,8 +827,11 @@ sub new {
     $self->[_rseqno_controlling_my_ci_] = {};
     $self->[_ris_seqno_controlling_ci_] = {};
 
-    $self->[_rspecial_side_comment_type_] = {};
-    $self->[_maximum_level_]              = 0;
+    $self->[_rspecial_side_comment_type_]  = {};
+    $self->[_maximum_level_]               = 0;
+    $self->[_maximum_level_at_line_]       = 0;
+    $self->[_maximum_BLOCK_level_]         = 0;
+    $self->[_maximum_BLOCK_level_at_line_] = 0;
 
     $self->[_rKrange_code_without_comments_] = [];
     $self->[_rbreak_before_Kfirst_]          = {};
@@ -2428,7 +2434,7 @@ sub set_whitespace_flags {
             $ws_4 = $ws_3 = $ws_2 = $ws_1 = $ws
               if DEBUG_WHITE;
 
-        }    # end setting space flag inside opening tokens
+        } ## end setting space flag inside opening tokens
 
         #---------------------------------------------------------------
         # Whitespace Rules Section 2:
@@ -2467,7 +2473,7 @@ sub set_whitespace_flags {
 
             $ws_4 = $ws_3 = $ws_2 = $ws
               if DEBUG_WHITE;
-        }    # end setting space flag inside closing tokens
+        } ## end setting space flag inside closing tokens
 
         #---------------------------------------------------------------
         # Whitespace Rules Section 3:
@@ -4965,10 +4971,12 @@ EOM
                             # For efficiency, we find the maximum level of
                             # opening tokens of any type.  The actual maximum
                             # level will be that of their contents which is 1
-                            # greater.
+                            # greater.  That will be fixed in sub
+                            # 'finish_formatting'.
                             my $level = $rlevels->[$j];
                             if ( $level > $self->[_maximum_level_] ) {
-                                $self->[_maximum_level_] = $level;
+                                $self->[_maximum_level_]         = $level;
+                                $self->[_maximum_level_at_line_] = $line_number;
                             }
                         }
                         else { $Iss_closing->[$seqno] = @{$rSS} }
@@ -5041,9 +5049,10 @@ sub finish_formatting {
     # The file has been tokenized and is ready to be formatted.
     # All of the relevant data is stored in $self, ready to go.
 
-    # Check the maximum level. If it is extremely large we will
-    # give up and output the file verbatim.  Note that the actual
-    # maximum level is 1 greater than the saved value.
+    # Check the maximum level. If it is extremely large we will give up and
+    # output the file verbatim.  Note that the actual maximum level is 1
+    # greater than the saved value, so we fix that here.
+    $self->[_maximum_level_] += 1;
     my $maximum_level       = $self->[_maximum_level_];
     my $maximum_table_index = $#maximum_line_length_at_level;
     if ( !$severe_error && $maximum_level >= $maximum_table_index ) {
@@ -5599,8 +5608,6 @@ sub respace_tokens {
     my $length_function = $self->[_length_function_];
     my $is_encoded_data = $self->[_is_encoded_data_];
 
-    my $rOpts_sub_alias_list = $rOpts->{'sub-alias-list'};
-
     my $rLL_new = [];    # This is the new array
     my $rtoken_vars;
     my $Ktoken_vars;                   # the old K value of $rtoken_vars
@@ -8466,7 +8473,7 @@ EOM
         # as here:
 
         #    $_[0]->code_handler
-        #     	( sub { $more .= $_[1] . ":" . $_[0] . "\n" } );
+        #      ( sub { $more .= $_[1] . ":" . $_[0] . "\n" } );
 
         # Here is another example where we do not want to weld:
         #  $wrapped->add_around_modifier(
@@ -9251,9 +9258,12 @@ sub whitespace_cycle_adjustment {
     my $rLL = $self->[_rLL_];
     return unless ( defined($rLL) && @{$rLL} );
     my $radjusted_levels = $self->[_radjusted_levels_];
+    my $maximum_level    = $self->[_maximum_level_];
 
-    my $rOpts_whitespace_cycle = $rOpts->{'whitespace-cycle'};
-    if ( $rOpts_whitespace_cycle && $rOpts_whitespace_cycle > 0 ) {
+    if (   $rOpts_whitespace_cycle
+        && $rOpts_whitespace_cycle > 0
+        && $rOpts_whitespace_cycle < $maximum_level )
+    {
 
         my $Kmax = @{$rLL} - 1;
 
@@ -10868,7 +10878,6 @@ EOM
     # flags needed by the store routine
     my $line_of_tokens;
     my $no_internal_newlines;
-    my $side_comment_follows;
     my $CODE_type;
 
     # range of K of tokens for the current line
@@ -10982,16 +10991,9 @@ EOM
         my ( $self, $Ktoken_vars, $rtoken_vars ) = @_;
 
         # Add one token to the next batch.
-        # $Ktoken_vars = the index K in the global token array
-        # $rtoken_vars = $rLL->[$Ktoken_vars] = the corresponding token values
-        #                unless they are temporarily being overridden
-
-        # NOTE: This routine needs to be coded efficiently because it is called
-        # once per token.  I have gotten it down from the second slowest to the
-        # eighth slowest, but that still seems rather slow for what it does.
-
-        # This closure variable has already been defined, for efficiency:
-        #     my $radjusted_levels = $self->[_radjusted_levels_];
+        #   $Ktoken_vars = the index K in the global token array
+        #   $rtoken_vars = $rLL->[$Ktoken_vars] = the corresponding token values
+        #                  unless they are temporarily being overridden
 
         my $type = $rtoken_vars->[_TYPE_];
 
@@ -11015,6 +11017,14 @@ EOM
             # happen, but it is worth checking.  Later code can then make the
             # simplifying assumption that blank tokens are not consecutive.
             elsif ( $type eq 'b' && $types_to_go[$max_index_to_go] eq 'b' ) {
+
+                if (DEVEL_MODE) {
+
+                    # if this happens, it is may be that consecutive blanks
+                    # were inserted into the token stream in 'respace_tokens'
+                    my $lno = $rLL->[$Ktoken_vars]->[_LINE_INDEX_] + 1;
+                    Fault("consecutive blanks near line $lno; please fix");
+                }
                 return;
             }
         }
@@ -11118,9 +11128,7 @@ EOM
         $parent_seqno_to_go[$max_index_to_go]  = $parent_seqno;
         $nesting_depth_to_go[$max_index_to_go] = $slevel;
         $block_type_to_go[$max_index_to_go]    = $block_type;
-
-        $nobreak_to_go[$max_index_to_go] =
-          $side_comment_follows ? 2 : $no_internal_newlines;
+        $nobreak_to_go[$max_index_to_go]       = $no_internal_newlines;
 
         my $length = $rtoken_vars->[_TOKEN_LENGTH_];
 
@@ -11345,12 +11353,13 @@ EOM
             return;
         }
 
+        # This flag will become nobreak_to_go and should be set to 2 to prevent
+        # a line break AFTER the current token.
         $no_internal_newlines = 0;
         if ( !$rOpts_add_newlines || $CODE_type eq 'NIN' ) {
             $no_internal_newlines = 2;
         }
 
-        $side_comment_follows = 0;
         my $is_comment =
           ( $K_first == $K_last && $rLL->[$K_first]->[_TYPE_] eq '#' );
         my $is_static_block_comment_without_leading_space =
@@ -11574,7 +11583,10 @@ EOM
                 $rbrace_follower = undef;
             }
 
-            my ( $block_type, $is_opening_BLOCK, $is_closing_BLOCK );
+            my (
+                $block_type,       $is_opening_BLOCK,
+                $is_closing_BLOCK, $nobreak_BEFORE_BLOCK
+            );
             if ( $rtoken_vars->[_TYPE_SEQUENCE_] ) {
 
                 my $token         = $rtoken_vars->[_TOKEN_];
@@ -11589,16 +11601,18 @@ EOM
                 {
 
                     if ( $type eq '{' ) {
-                        $is_opening_BLOCK = 1;
+                        $is_opening_BLOCK     = 1;
+                        $nobreak_BEFORE_BLOCK = $no_internal_newlines;
                     }
                     elsif ( $type eq '}' ) {
-                        $is_closing_BLOCK = 1;
+                        $is_closing_BLOCK     = 1;
+                        $nobreak_BEFORE_BLOCK = $no_internal_newlines;
                     }
                 }
             }
 
             # Find next nonblank token on this line and look for a side comment
-            my $Knnb;
+            my ( $Knnb, $side_comment_follows );
 
             # if before last token ...
             if ( $Ktoken_vars < $K_last ) {
@@ -11613,23 +11627,13 @@ EOM
                     $side_comment_follows = 1;
 
                     # Do not allow breaks which would promote a side comment to
-                    # a block comment.  In order to allow a break before an
-                    # opening or closing BLOCK, followed by a side comment,
-                    # those sections of code will handle this flag separately.
-                    if (   !$is_opening_BLOCK
-                        && !$is_closing_BLOCK )
-                    {
-                        $no_internal_newlines = 1;
-                    }
-                }
-                else {
-                    $side_comment_follows = undef;
+                    # a block comment.
+                    $no_internal_newlines = 2;
                 }
             }
 
             # if at last token ...
             else {
-                $side_comment_follows = undef;
 
                 # --------------------
                 # handle side comments
@@ -11648,6 +11652,8 @@ EOM
                 next;
             }
 
+            # Process non-blank and non-comment tokens ...
+
             # ----------------
             # handle semicolon
             # ----------------
@@ -11773,7 +11779,7 @@ EOM
                 {
 
                     # but only if allowed
-                    unless ($no_internal_newlines) {
+                    unless ($nobreak_BEFORE_BLOCK) {
 
                         # since we already stored this token, we must unstore it
                         $self->unstore_token_to_go();
@@ -11786,13 +11792,9 @@ EOM
                     }
                 }
 
-                # Now update for side comment
-                if ($side_comment_follows) { $no_internal_newlines = 1 }
-
                 # now output this line
-                unless ($no_internal_newlines) {
-                    $self->end_batch() if ( $max_index_to_go >= 0 );
-                }
+                $self->end_batch()
+                  if ( $max_index_to_go >= 0 && !$no_internal_newlines );
             }
 
             # ----------
@@ -11838,16 +11840,10 @@ EOM
                 }
 
                 # put a break before this closing curly brace if appropriate
-                unless ( $no_internal_newlines
-                    || $index_start_one_line_block != UNDEFINED_INDEX )
-                {
-
-                    # write out everything before this closing curly brace
-                    $self->end_batch() if ( $max_index_to_go >= 0 );
-                }
-
-                # Now update for side comment
-                if ($side_comment_follows) { $no_internal_newlines = 1 }
+                $self->end_batch()
+                  if ( $max_index_to_go >= 0
+                    && !$nobreak_BEFORE_BLOCK
+                    && $index_start_one_line_block == UNDEFINED_INDEX );
 
                 # store the closing curly brace
                 $self->store_token_to_go( $Ktoken_vars, $rtoken_vars );
@@ -12024,7 +12020,7 @@ EOM
                         || $max_index_to_go < 0 );
                 }
 
-            }    # end treatment of closing block token
+            } ## end treatment of closing block token
 
             # -----------------------------
             # handle here_doc target string
@@ -12616,6 +12612,12 @@ sub compare_indentation_levels {
         $structural_indentation_level = $radjusted_levels->[$K_first];
     }
 
+    # record max structural depth for log file
+    if ( $structural_indentation_level > $self->[_maximum_BLOCK_level_] ) {
+        $self->[_maximum_BLOCK_level_]         = $structural_indentation_level;
+        $self->[_maximum_BLOCK_level_at_line_] = $line_number;
+    }
+
     my $type_sequence = $rLL->[$K_first]->[_TYPE_SEQUENCE_];
     my $is_closing_block =
          $type_sequence
@@ -12633,7 +12635,6 @@ sub compare_indentation_levels {
             if ( !$self->[_first_brace_tabbing_disagreement_] ) {
                 $self->[_first_brace_tabbing_disagreement_] = $line_number;
             }
-
         }
 
         if ( !$self->[_in_tabbing_disagreement_] ) {
@@ -13430,8 +13431,7 @@ EOM
                 if ( $iend - $ibeg != $Kend - $Kbeg ) {
                     $index_error = $n unless defined($index_error);
                 }
-                push @{$rlines_K},
-                  [ $Kbeg, $Kend, $forced_breakpoint_to_go[$iend] ];
+                push @{$rlines_K}, [ $Kbeg, $Kend ];
             }
 
             # Check correctness of the mapping between the i and K token
@@ -17673,7 +17673,7 @@ sub set_continuation_breaks {
                             $self->set_forced_breakpoint($icomma);
                         }
                     }
-                }    # end logic to open up a container
+                } ## end logic to open up a container
 
                 # Break open a logical container open if it was already open
                 elsif ($is_simple_logical_expression
@@ -18604,7 +18604,7 @@ EOM
                 return;
             }
 
-        }    # end shortcut methods
+        } ## end shortcut methods
 
         # debug stuff
         DEBUG_SPARSE && do {
@@ -20311,7 +20311,7 @@ EOM
           # and limit total to 10 character widths
           && token_sequence_length( $ibeg, $iend ) <= 10;
 
-    }    # end of loop to output each line
+    } ## end of loop to output each line
 
     # remember indentation of lines containing opening containers for
     # later use by sub set_adjusted_indentation
@@ -21581,7 +21581,7 @@ sub get_seqno {
             $iendm          = $iend;
             $ibegm          = $ibeg;
             $has_leading_op = $has_leading_op_next;
-        }    # end of loop over lines
+        } ## end of loop over lines
         return;
     }
 } ## end closure set_logical_padding
@@ -21766,7 +21766,7 @@ sub pad_token {
                      # Containers beginning with { and [ are given those names
                      # for uniqueness. That way commas in different containers
                      # will not match. Here is an example of what this prevents:
-                     #	a => [ 1,       2, 3 ],
+                     #   a => [ 1,       2, 3 ],
                      #   b => { b1 => 4, b2 => 5 },
                      # Here is another example of what we avoid by labeling the
                      # commas properly:
@@ -22287,7 +22287,7 @@ sub make_paren_name {
                 )
 
                 # remove continuation indentation for any line like
-                # 	} ... {
+                #       } ... {
                 # or without ending '{' and unbalanced, such as
                 #       such as '}->{$operator}'
                 || (
@@ -23962,6 +23962,13 @@ sub wrapup {
     $file_writer_object->decrement_output_line_number()
       ;    # fix up line number since it was incremented
     we_are_at_the_last_line();
+
+    my $max_depth = $self->[_maximum_BLOCK_level_];
+    my $at_line   = $self->[_maximum_BLOCK_level_at_line_];
+    write_logfile_entry(
+"Maximum leading structural depth is $max_depth in input at line $at_line\n"
+    );
+
     my $added_semicolon_count    = $self->[_added_semicolon_count_];
     my $first_added_semicolon_at = $self->[_first_added_semicolon_at_];
     my $last_added_semicolon_at  = $self->[_last_added_semicolon_at_];
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index 65e9ad19..164d9454 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -5154,7 +5154,7 @@ EOM
 
         return;
     }
-}    # end tokenize_this_line
+} ## end tokenize_this_line
 
 #########i#############################################################
 # Tokenizer routines which assist in identifying token types
@@ -8353,10 +8353,10 @@ sub scan_number_do {
            [Pp][+-]?[0-9a-fA-F]          # REQUIRED exponent with digit
            [0-9a-fA-F_]*)                # optional Additional exponent digits
 
-	   # or hex integer
+           # or hex integer
            |([xX][0-9a-fA-F_]+)        
 
-	   # or octal fraction
+           # or octal fraction
            |([oO]?[0-7_]+          # string of octal digits
            (\.([0-7][0-7_]*)?)?    # optional decimal and fraction
            [Pp][+-]?[0-7]          # REQUIRED exponent, no underscore
@@ -8371,7 +8371,7 @@ sub scan_number_do {
            [Pp][+-]?[01]           # Required exponent indicator, no underscore
            [01_]*)                 # additional exponent bits
 
-	   # or binary integer
+           # or binary integer
            |([bB][01_]+)           # 'b' with string of binary digits 
 
            )/gx
diff --git a/lib/Perl/Tidy/VerticalAligner.pm b/lib/Perl/Tidy/VerticalAligner.pm
index dd96fc5b..24f13f32 100644
--- a/lib/Perl/Tidy/VerticalAligner.pm
+++ b/lib/Perl/Tidy/VerticalAligner.pm
@@ -2759,8 +2759,8 @@ EOM
                     # will now be incorrect. For example, this will prevent
                     # aligning commas as follows after deleting the second '=>'
                     #    $w->insert(
-                    #	ListBox => origin => [ 270, 160 ],
-                    #	size    => [ 200,           55 ],
+                    #         ListBox => origin => [ 270, 160 ],
+                    #         size    => [ 200,           55 ],
                     #    );
                     if ( defined($delete_above_level) ) {
                         if ( $lev > $delete_above_level ) {
@@ -5275,7 +5275,7 @@ sub get_output_line_number {
                 # Here is a complex example:
 
                 # Foo($Bar[0], {  # (side comment)
-                # 	baz => 1,
+                #     baz => 1,
                 # });
 
                 # The first line has sequence 6::4.  It does not begin with
@@ -5475,7 +5475,7 @@ sub valign_output_step_D {
         $leading_string_cache[$leading_whitespace_count] = $leading_string;
         return $leading_string;
     }
-}    # end get_leading_string
+} ## end get_leading_string
 
 ##########################
 # CODE SECTION 10: Summary
-- 
2.39.5