From befcfe640ddc834f99dfb7607fec9e4e9fbcf172 Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Sun, 11 Sep 2022 15:02:18 -0700 Subject: [PATCH] cleanups and optimizations --- CHANGES.md | 2 +- lib/Perl/Tidy/Formatter.pm | 349 ++++++++++++++++++++----------- lib/Perl/Tidy/Tokenizer.pm | 88 +++++--- lib/Perl/Tidy/VerticalAligner.pm | 95 +++++---- 4 files changed, 334 insertions(+), 200 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 808768e2..e444ac10 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -80,7 +80,7 @@ my ($curr) = current(); err(@_); - - This version runs 5 to 15 percent faster than the previous + - This version runs about 10 to 15 percent faster than the previous release on large files, depending on formatting parameters. ## 2022 06 13 diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index ff0da29f..6a968867 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -230,6 +230,7 @@ my ( # Static hashes initialized in a BEGIN block %is_assignment, + %is_non_list_type, %is_if_unless_and_or_last_next_redo_return, %is_if_elsif_else_unless_while_until_for_foreach, %is_if_unless_while_until_for_foreach, @@ -598,6 +599,10 @@ BEGIN { ); @is_assignment{@q} = (1) x scalar(@q); + # a hash needed by break_lists for efficiency: + push @q, qw{ ; < > ~ f }; + @is_non_list_type{@q} = (1) x scalar(@q); + @q = qw(is if unless and or err last next redo return); @is_if_unless_and_or_last_next_redo_return{@q} = (1) x scalar(@q); @@ -4044,6 +4049,11 @@ EOM my ($self) = @_; + #----------------------------------------------------------------- + # Define a 'bond strength' for each token pair in an output batch. + # See comments above for definition of bond strength. + #----------------------------------------------------------------- + my $rbond_strength_to_go = []; my $rLL = $self->[_rLL_]; @@ -5390,7 +5400,14 @@ EOM sub write_line_inner_loop { my ( $self, $line_of_tokens_old, $line_of_tokens ) = @_; - # Copy the tokens for this line to their new storage location + #--------------------------------------------------------------------- + # Copy the tokens on one line received from the tokenizer to their new + # storage locations. + #--------------------------------------------------------------------- + + # Input parameters: + # $line_of_tokens_old = line received from tokenizer + # $line_of_tokens = line of tokens being formed for formatter my $rtokens = $line_of_tokens_old->{_rtokens}; my $jmax = @{$rtokens} - 1; @@ -5752,7 +5769,7 @@ sub set_CODE_type { # A line which is entirely a quote or pattern must go out # verbatim. Note: the \n is contained in $input_line. if ( $jmax <= 0 ) { - if ( ( $input_line =~ "\t" ) ) { + if ( $self->[_save_logfile_] && $input_line =~ /\t/ ) { my $input_line_number = $line_of_tokens->{_line_number}; $self->note_embedded_tab($input_line_number); } @@ -6302,18 +6319,20 @@ sub respace_tokens { my $self = shift; - # return parameters + #-------------------------------------------------------------------------- + # This routine is called once per file to do as much formatting as possible + # before new line breaks are set. + #-------------------------------------------------------------------------- + + # Return parameters: + # Set $severe_error=true if processing must terminate immediately my ( $severe_error, $rqw_lines ); + # We change any spaces in --indent-only mode if ( $rOpts->{'indent-only'} ) { return ( $severe_error, $rqw_lines ); } - # This routine is called once per file to do as much formatting as possible - # before new line breaks are set. - - # Set $severe_error=true if processing must terminate immediately - # This routine makes all necessary and possible changes to the tokenization # after the initial tokenization of the file. This is a tedious routine, # but basically it consists of inserting and deleting whitespace between @@ -6499,7 +6518,7 @@ sub respace_tokens { # The level and ci_level of newly created spaces should be the # same as the previous token. Otherwise blinking states can # be created if the -lp mode is used. See similar coding in - # sub 'store_token_and_space'. Fixes cases b1109 b1110. + # sub 'store_space_and_token'. Fixes cases b1109 b1110. $rcopy->[_LEVEL_] = $rLL_new->[-1]->[_LEVEL_]; $rcopy->[_CI_LEVEL_] = @@ -6540,9 +6559,10 @@ sub respace_tokens_inner_loop { my ( $self, $Kfirst, $Klast, $input_line_number ) = @_; - #------------------------------------------------------- - # Loop to copy all tokens on this line, with any changes - #------------------------------------------------------- + #----------------------------------------------------------------- + # Loop to copy all tokens on one line, making any spacing changes, + # while also collecting information needed by later subs. + #----------------------------------------------------------------- my $type_sequence; my $rtoken_vars; foreach my $KK ( $Kfirst .. $Klast ) { @@ -6797,10 +6817,15 @@ EOM # this) $token =~ s/\s*$//; $rtoken_vars->[_TOKEN_] = $token; - $self->note_embedded_tab($input_line_number) - if ( $token =~ "\t" ); - $self->store_token_and_space( $rtoken_vars, - $rwhitespace_flags->[$KK] == WS_YES ); + if ( $self->[_save_logfile_] && $token =~ /\t/ ) { + $self->note_embedded_tab($input_line_number); + } + if ( $rwhitespace_flags->[$KK] == WS_YES ) { + $self->store_space_and_token($rtoken_vars); + } + else { + $self->store_token($rtoken_vars); + } next; } ## end if ( $type eq 'q' ) @@ -6823,12 +6848,13 @@ EOM # check a quote for problems elsif ( $type eq 'Q' ) { - $self->check_Q( $KK, $Kfirst, $input_line_number ); + $self->check_Q( $KK, $Kfirst, $input_line_number ) + if ( $self->[_save_logfile_] ); } # Store this token with possible previous blank if ( $rwhitespace_flags->[$KK] == WS_YES ) { - $self->store_token_and_space( $rtoken_vars, 1 ); + $self->store_space_and_token($rtoken_vars); } else { $self->store_token($rtoken_vars); @@ -7066,14 +7092,20 @@ sub set_permanently_broken { } ## end sub set_permanently_broken sub store_token { + my ( $self, $item ) = @_; + #------------------------------------------ + # Store one token during respace operations + #------------------------------------------ + + # Input parameter: + # $item = ref to a token + # This will be the index of this item in the new array my $KK_new = @{$rLL_new}; - #------------------------------------------------------------------ - # NOTE: called once per token so coding efficiency is critical here - #------------------------------------------------------------------ + # NOTE: this sub is called once per token so coding efficiency is critical. # The next multiple assignment statements are significantly faster than # doing them one-by-one. @@ -7299,14 +7331,13 @@ sub store_token { return; } ## end sub store_token -sub store_token_and_space { - my ( $self, $item, $want_space ) = @_; +sub store_space_and_token { + my ( $self, $item ) = @_; # store a token with preceding space if requested and needed # First store the space - if ( $want_space - && @{$rLL_new} + if ( @{$rLL_new} && $rLL_new->[-1]->[_TYPE_] ne 'b' && $rOpts_add_whitespace ) { @@ -7333,7 +7364,7 @@ sub store_token_and_space { # then the token $self->store_token($item); return; -} ## end sub store_token_and_space +} ## end sub store_space_and_token sub add_phantom_semicolon { @@ -7485,10 +7516,14 @@ sub add_phantom_semicolon { sub check_Q { - # Check that a quote looks okay + # Check that a quote looks okay, and report possible problems + # to the logfile. + my ( $self, $KK, $Kfirst, $line_number ) = @_; my $token = $rLL->[$KK]->[_TOKEN_]; - $self->note_embedded_tab($line_number) if ( $token =~ "\t" ); + if ( $token =~ /\t/ ) { + $self->note_embedded_tab($line_number); + } # The remainder of this routine looks for something like # '$var = s/xxx/yyy/;' @@ -9659,7 +9694,7 @@ sub weld_nested_quotes { my $next_type = $rLL->[$Kn]->[_TYPE_]; next unless ( ( $next_type eq 'q' || $next_type eq 'Q' ) - && $next_token =~ /^q/ ); + && substr( $next_token, 0, 1 ) eq 'q' ); # The token before the closing container must also be a quote my $Kouter_closing = $K_closing_container->{$outer_seqno}; @@ -10049,7 +10084,7 @@ sub clip_adjusted_levels { my ($self) = @_; my $radjusted_levels = $self->[_radjusted_levels_]; return unless defined($radjusted_levels) && @{$radjusted_levels}; - my $min = min( @{$radjusted_levels} ); # fast check for min + my $min = min( @{$radjusted_levels} ); # fast check for min if ( $min < 0 ) { # slow loop, but rarely needed @@ -12526,14 +12561,17 @@ EOM my ( $self, $Ktoken_vars, $rtoken_vars ) = @_; - # Add one token to the next batch. + #------------------------------------------------------- + # Token storage utility for sub process_line_of_CODE. + # Add one token to the next batch of '_to_go' variables. + #------------------------------------------------------- + + # Input parameters: # $Ktoken_vars = the index K in the global token array # $rtoken_vars = $rLL->[$Ktoken_vars] = the corresponding token values # unless they are temporarily being overridden - #------------------------------------------------------------------ # NOTE: called once per token so coding efficiency is critical here - #------------------------------------------------------------------ my ( @@ -12627,7 +12665,7 @@ EOM $summed_lengths_to_go[$max_index_to_go] + $length; # Initializations for first token of new batch - if ( $max_index_to_go == 0 ) { + if ( !$max_index_to_go ) { # Reset flag '$starting_in_quote' for a new batch. It must be set # to the value of '$in_continued_quote', but here for efficiency we @@ -13214,6 +13252,10 @@ EOM my ( $self, $has_side_comment ) = @_; + #-------------------------------------------------------------------- + # Loop to move all tokens from an input line to a newly forming batch + #-------------------------------------------------------------------- + # We do not want a leading blank if the previous batch just got output if ( $max_index_to_go < 0 && $rLL->[$K_first]->[_TYPE_] eq 'b' ) { @@ -14706,6 +14748,11 @@ EOM my ($self) = @_; + #----------------------------------------------------------------- + # This sub directs the formatting of one complete batch of tokens. + # The tokens of the batch are in the '_to_go' arrays. + #----------------------------------------------------------------- + my $this_batch = $self->[_this_batch_]; $batch_count++; @@ -14747,7 +14794,7 @@ EOM # Shortcut for block comments # Note that this shortcut does not work for -lp yet #-------------------------------------------------- - elsif ( $max_index_to_go == 0 && $types_to_go[0] eq '#' ) { + elsif ( !$max_index_to_go && $types_to_go[0] eq '#' ) { my $ibeg = 0; $this_batch->[_ri_first_] = [$ibeg]; $this_batch->[_ri_last_] = [$ibeg]; @@ -15081,21 +15128,22 @@ EOM my $called_pad_array_to_go; # set all forced breakpoints for good list formatting - my $is_long_line = $max_index_to_go > 0 - && $self->excess_line_length( $imin, $max_index_to_go ) > 0; - - my $old_line_count_in_batch = 1; + my $is_long_line; + my $multiple_old_lines_in_batch; if ( $max_index_to_go > 0 ) { + $is_long_line = + $self->excess_line_length( $imin, $max_index_to_go ) > 0; + my $Kbeg = $K_to_go[0]; my $Kend = $K_to_go[$max_index_to_go]; - $old_line_count_in_batch += + $multiple_old_lines_in_batch = $rLL->[$Kend]->[_LINE_INDEX_] - $rLL->[$Kbeg]->[_LINE_INDEX_]; } my $rbond_strength_bias = []; if ( $is_long_line - || $old_line_count_in_batch > 1 + || $multiple_old_lines_in_batch # must always call break_lists() with unbalanced batches because # it is maintaining some stacks @@ -15168,7 +15216,8 @@ EOM $self->break_all_chain_tokens( $ri_first, $ri_last ); - $self->break_equals( $ri_first, $ri_last ); + $self->break_equals( $ri_first, $ri_last ) + if @{$ri_first} >= 3; # now we do a correction step to clean this up a bit # (The only time we would not do this is for debugging) @@ -17894,6 +17943,14 @@ sub break_long_lines { # maximum line length. #----------------------------------------------------------- + my ( $self, $saw_good_break, $rcolon_list, $rbond_strength_bias ) = @_; + + # Input parameters: + # $saw_good_break - a flag set by break_lists + # $rcolon_list - ref to a list of all the ? and : tokens in the batch, + # in order. + # $rbond_strength_bias - small bond strength bias values set by break_lists + # Output: returns references to the arrays: # @i_first # @i_last @@ -17905,11 +17962,6 @@ sub break_long_lines { # may be updated to be =1 for any index $i after which there must be # a break. This signals later routines not to undo the breakpoint. - my ( $self, $saw_good_break, $rcolon_list, $rbond_strength_bias ) = @_; - - # @{$rcolon_list} is a list of all the ? and : tokens in the batch, in - # order. - # Method: # This routine is called if a statement is longer than the maximum line # length, or if a preliminary scanning located desirable break points. @@ -18125,6 +18177,19 @@ sub break_lines_inner_loop { # which, if possible, does not exceed the maximum line length. #----------------------------------------------------------------- + my ( + $self, # + + $i_begin, + $i_last_break, + $imax, + $last_break_strength, + $line_count, + $rbond_strength_to_go, + $saw_good_break, + + ) = @_; + # Given: # $i_begin = first index of range # $i_last_break = index of previous break @@ -18140,19 +18205,6 @@ sub break_lines_inner_loop { # $leading_alignment_type = special token type after break # $Msg = string of debug info - my ( - $self, # - - $i_begin, - $i_last_break, - $imax, - $last_break_strength, - $line_count, - $rbond_strength_to_go, - $saw_good_break, - - ) = @_; - my $Msg = EMPTY_STRING; my $strength = NO_BREAK; my $i_test = $i_begin - 1; @@ -19339,13 +19391,12 @@ EOM $last_dot_index[$depth] = $i; } - # Turn off alignment if we are sure that this is not a list + # Turn off comma alignment if we are sure that this is not a list # environment. To be safe, we will do this if we see certain - # non-list tokens, such as ';', and also the environment is - # not a list. Note that '=' could be in any of the = operators - # (lextest.t). We can't just use the reported environment - # because it can be incorrect in some cases. - elsif ( ( $type =~ /^[\;\<\>\~f]$/ || $is_assignment{$type} ) + # non-list tokens, such as ';', '=', and also the environment is + # not a list. + ## $type =~ /^[\;\<\>\~f]$/ || $is_assignment{$type} + elsif ( $is_non_list_type{$type} && !$self->is_in_list_by_i($i) ) { $dont_align[$depth] = 1; @@ -21725,13 +21776,13 @@ sub get_available_spaces_to_go { sub set_lp_indentation { + my ($self) = @_; + #------------------------------------------------------------------ # Define the leading whitespace for all tokens in the current batch # when the -lp formatting is selected. #------------------------------------------------------------------ - my ($self) = @_; - return unless ($rOpts_line_up_parentheses); return unless ( defined($max_index_to_go) && $max_index_to_go >= 0 ); @@ -22841,9 +22892,10 @@ sub convey_batch_to_vertical_aligner { # have been defined. Here we prepare the lines for passing to the vertical # aligner. We do the following tasks: # - mark certain vertical alignment tokens, such as '=', in each line - # - make minor indentation adjustments + # - make final indentation adjustments # - do logical padding: insert extra blank spaces to help display certain # logical constructions + # - send the line to the vertical aligner my $this_batch = $self->[_this_batch_]; my $ri_first = $this_batch->[_ri_first_]; @@ -23012,11 +23064,28 @@ EOM # -------------------------------------- # get the final indentation of this line # -------------------------------------- - my ( $indentation, $lev, $level_end, $i_terminal, $is_outdented_line ) - = $self->get_final_indentation( $ibeg, $iend, $rfields, - $rpatterns, $ri_first, $ri_last, - $rindentation_list, $ljump, $starting_in_quote, - $is_static_block_comment ); + my ( + + $indentation, + $lev, + $level_end, + $i_terminal, + $is_outdented_line, + + ) = $self->get_final_indentation( + + $ibeg, + $iend, + $rfields, + $rpatterns, + $ri_first, + $ri_last, + $rindentation_list, + $ljump, + $starting_in_quote, + $is_static_block_comment, + + ); # -------------------------------- # define flag 'outdent_long_lines' @@ -23399,16 +23468,21 @@ EOM sub set_vertical_alignment_markers { - # This routine takes the first step toward vertical alignment of the - # lines of output text. It looks for certain tokens which can serve as - # vertical alignment markers (such as an '='). - # + my ( $self, $ri_first, $ri_last ) = @_; + + #---------------------------------------------------------------------- + # This routine looks at output lines for certain tokens which can serve + # as vertical alignment markers (such as an '='). + #---------------------------------------------------------------------- + + # Input parameters: + # $ri_first = ref to list of starting line indexes in _to_go arrays + # $ri_last = ref to list of ending line indexes in _to_go arrays + # Method: We look at each token $i in this output batch and set # $ralignment_type_to_go->[$i] equal to those tokens at which we would # accept vertical alignment. - my ( $self, $ri_first, $ri_last ) = @_; - my $ralignment_type_to_go; my $ralignment_counts = []; my $ralignment_hash_by_line = []; @@ -24823,11 +24897,26 @@ sub xlp_tweak { sub make_alignment_patterns { - # Here we do some important preliminary work for the - # vertical aligner. We create four arrays for one - # output line. These arrays contain strings that can - # be tested by the vertical aligner to see if - # consecutive lines can be aligned vertically. + my ( $self, $ibeg, $iend, $ralignment_type_to_go, $alignment_count, + $ralignment_hash ) + = @_; + + #------------------------------------------------------------------ + # This sub creates arrays of vertical alignment info for one output + # line. + #------------------------------------------------------------------ + + # Input parameters: + # $ibeg, $iend - index range of this line in the _to_go arrays + # $ralignment_type_to_go - alignment type of tokens, like '=', if any + # $alignment_count - number of alignment tokens in the line + # $ralignment_hash - this contains all of the alignments for this + # line. It is not yet used but is available for future coding in + # case there is a need to do a preliminary scan of alignment tokens. + + # The arrays which are created contain strings that can be tested by + # the vertical aligner to see if consecutive lines can be aligned + # vertically. # # The four arrays are indexed on the vertical # alignment fields and are: @@ -24844,13 +24933,6 @@ sub xlp_tweak { # allowed, even when the alignment tokens match. # @field_lengths - the display width of each field - my ( $self, $ibeg, $iend, $ralignment_type_to_go, $alignment_count, - $ralignment_hash ) - = @_; - - # The var $ralignment_hash contains all of the alignments for this - # line. It is not yet used but is available for future coding in case - # there is a need to do a preliminary scan of the alignment tokens. if (DEVEL_MODE) { my $new_count = 0; if ( defined($ralignment_hash) ) { @@ -25374,9 +25456,26 @@ sub make_paren_name { sub get_final_indentation { - #-------------------------------------------------------------------- - # This routine sets the final indentation of a line in the Formatter. - #-------------------------------------------------------------------- + my ( + $self, # + + $ibeg, + $iend, + $rfields, + $rpatterns, + $ri_first, + $ri_last, + $rindentation_list, + $level_jump, + $starting_in_quote, + $is_static_block_comment + + ) = @_; + + #-------------------------------------------------------------- + # This routine makes any necessary adjustments to get the final + # indentation of a line in the Formatter. + #-------------------------------------------------------------- # It starts with the basic indentation which has been defined for the # leading token, and then takes into account any options that the user @@ -25399,15 +25498,6 @@ sub make_paren_name { # undo_ci, which was processed earlier, so care has to be taken to # keep them coordinated. - my ( - $self, $ibeg, - $iend, $rfields, - $rpatterns, $ri_first, - $ri_last, $rindentation_list, - $level_jump, $starting_in_quote, - $is_static_block_comment - ) = @_; - # Find the last code token of this line my $i_terminal = $iend; my $terminal_type = $types_to_go[$iend]; @@ -25498,11 +25588,14 @@ sub make_paren_name { # This can be tedious so we let a sub do it ( - $adjust_indentation, $default_adjust_indentation, - $opening_indentation, $opening_offset, - $is_leading, $opening_exists - ) - = $self->get_closing_token_indentation( + $adjust_indentation, + $default_adjust_indentation, + $opening_indentation, + $opening_offset, + $is_leading, + $opening_exists + + ) = $self->get_closing_token_indentation( $ibeg, $iend, @@ -25514,7 +25607,7 @@ sub make_paren_name { $is_semicolon_terminated, $seqno_qw_closing, - ); + ); } #-------------------------------------------------------- @@ -25842,8 +25935,15 @@ sub make_paren_name { } } - return ( $indentation, $lev, $level_end, $i_terminal, - $is_outdented_line ); + return ( + + $indentation, + $lev, + $level_end, + $i_terminal, + $is_outdented_line, + + ); } ## end sub get_final_indentation sub get_closing_token_indentation { @@ -25852,7 +25952,7 @@ sub make_paren_name { # token - i.e. one of these: ) ] } : my ( - $self, + $self, # $ibeg, $iend, @@ -25951,8 +26051,6 @@ sub make_paren_name { = $self->get_opening_indentation( $ibeg_weld_fix, $ri_first, $ri_last, $rindentation_list, $seqno_qw_closing ); - my $terminal_is_in_list = $self->is_in_list_by_i($i_terminal); - # First set the default behavior: if ( @@ -26017,7 +26115,7 @@ sub make_paren_name { # require LIST environment; otherwise, we may outdent too much - # this can happen in calls without parentheses (overload.t); - && $terminal_is_in_list + && $self->is_in_list_by_i($i_terminal) ) { $adjust_indentation = 1; @@ -26075,10 +26173,10 @@ sub make_paren_name { # but right now we do not have that information. For now # we see if we are in a list, and this works well. # See test files 'sub*.t' for good test cases. - if ( $terminal_is_in_list - && !$rOpts_indent_closing_brace + if ( !$rOpts_indent_closing_brace && $block_type_beg - && $block_type_beg =~ /$ASUB_PATTERN/ ) + && $self->[_ris_asub_block_]->{$seqno_beg} + && $self->is_in_list_by_i($i_terminal) ) { ( $opening_indentation, $opening_offset, @@ -26230,9 +26328,16 @@ sub make_paren_name { if ($is_leading) { $adjust_indentation = 2; } } - return ( $adjust_indentation, $default_adjust_indentation, - $opening_indentation, $opening_offset, - $is_leading, $opening_exists ); + return ( + + $adjust_indentation, + $default_adjust_indentation, + $opening_indentation, + $opening_offset, + $is_leading, + $opening_exists, + + ); } } ## end closure get_final_indentation diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm index 142bda03..e97a06fd 100644 --- a/lib/Perl/Tidy/Tokenizer.pm +++ b/lib/Perl/Tidy/Tokenizer.pm @@ -781,6 +781,15 @@ sub get_input_line_number { return $tokenizer_self->[_last_line_number_]; } +sub write_logfile_numbered_msg { + my ($msg) = @_; + + # write input line number + message to logfile + my $input_line_number = get_input_line_number(); + write_logfile_entry("Line $input_line_number: $msg"); + return; +} + # returns the next tokenized line sub get_line { @@ -796,12 +805,6 @@ sub get_line { my $input_line_number = ++$tokenizer_self->[_last_line_number_]; - my $write_logfile_entry = sub { - my ($msg) = @_; - write_logfile_entry("Line $input_line_number: $msg"); - return; - }; - # Find and remove what characters terminate this line, including any # control r my $input_line_separator = EMPTY_STRING; @@ -820,7 +823,7 @@ sub get_line { # for backwards compatibility we keep the line text terminated with # a newline character $input_line .= "\n"; - $tokenizer_self->[_line_of_text_] = $input_line; # update + $tokenizer_self->[_line_of_text_] = $input_line; # create a data structure describing this line which will be # returned to the caller. @@ -860,6 +863,7 @@ sub get_line { _square_bracket_depth => $square_bracket_depth, _paren_depth => $paren_depth, _quote_character => EMPTY_STRING, +## Skip these needless initializations for efficiency: ## _rtoken_type => undef, ## _rtokens => undef, ## _rlevels => undef, @@ -887,7 +891,8 @@ sub get_line { if ( $candidate_target eq $here_doc_target ) { $tokenizer_self->[_nearly_matched_here_target_at_] = undef; $line_of_tokens->{_line_type} = 'HERE_END'; - $write_logfile_entry->("Exiting HERE document $here_doc_target\n"); + write_logfile_numbered_msg( + "Exiting HERE document $here_doc_target\n"); my $rhere_target_list = $tokenizer_self->[_rhere_target_list_]; if ( @{$rhere_target_list} ) { # there can be multiple here targets @@ -896,7 +901,7 @@ sub get_line { $tokenizer_self->[_here_doc_target_] = $here_doc_target; $tokenizer_self->[_here_quote_character_] = $here_quote_character; - $write_logfile_entry->( + write_logfile_numbered_msg( "Entering HERE document $here_doc_target\n"); $tokenizer_self->[_nearly_matched_here_target_at_] = undef; $tokenizer_self->[_started_looking_for_here_target_at_] = @@ -932,7 +937,7 @@ sub get_line { # This is the end when count reaches 0 if ( !$tokenizer_self->[_in_format_] ) { - $write_logfile_entry->("Exiting format section\n"); + write_logfile_numbered_msg("Exiting format section\n"); $line_of_tokens->{_line_type} = 'FORMAT_END'; } } @@ -954,7 +959,7 @@ sub get_line { $line_of_tokens->{_line_type} = 'POD'; if ( $input_line =~ /^=cut/ ) { $line_of_tokens->{_line_type} = 'POD_END'; - $write_logfile_entry->("Exiting POD section\n"); + write_logfile_numbered_msg("Exiting POD section\n"); $tokenizer_self->[_in_pod_] = 0; } if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) { @@ -972,7 +977,7 @@ sub get_line { $line_of_tokens->{_line_type} = 'SKIP'; if ( $input_line =~ /$code_skipping_pattern_end/ ) { $line_of_tokens->{_line_type} = 'SKIP_END'; - $write_logfile_entry->("Exiting code-skipping section\n"); + write_logfile_numbered_msg("Exiting code-skipping section\n"); $tokenizer_self->[_in_skipped_] = 0; } return $line_of_tokens; @@ -996,7 +1001,7 @@ sub get_line { # end of a pod section if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); + write_logfile_numbered_msg("Entering POD section\n"); $tokenizer_self->[_in_pod_] = 1; return $line_of_tokens; } @@ -1015,7 +1020,7 @@ sub get_line { # end of a pod section if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); + write_logfile_numbered_msg("Entering POD section\n"); $tokenizer_self->[_in_pod_] = 1; return $line_of_tokens; } @@ -1147,13 +1152,13 @@ sub get_line { warning( "=cut starts a pod section .. this can fool pod utilities.\n" ) unless (DEVEL_MODE); - $write_logfile_entry->("Entering POD section\n"); + write_logfile_numbered_msg("Entering POD section\n"); } } else { $line_of_tokens->{_line_type} = 'POD_START'; - $write_logfile_entry->("Entering POD section\n"); + write_logfile_numbered_msg("Entering POD section\n"); } return $line_of_tokens; @@ -1163,7 +1168,7 @@ sub get_line { if ( $tokenizer_self->[_in_skipped_] ) { $line_of_tokens->{_line_type} = 'SKIP'; - $write_logfile_entry->("Entering code-skipping section\n"); + write_logfile_numbered_msg("Entering code-skipping section\n"); return $line_of_tokens; } @@ -1176,7 +1181,7 @@ sub get_line { $tokenizer_self->[_in_here_doc_] = 1; $tokenizer_self->[_here_doc_target_] = $here_doc_target; $tokenizer_self->[_here_quote_character_] = $here_quote_character; - $write_logfile_entry->("Entering HERE document $here_doc_target\n"); + write_logfile_numbered_msg("Entering HERE document $here_doc_target\n"); $tokenizer_self->[_started_looking_for_here_target_at_] = $input_line_number; } @@ -1186,13 +1191,13 @@ sub get_line { # which are not tokenized (and cannot be read with either!). if ( $tokenizer_self->[_in_data_] ) { $line_of_tokens->{_line_type} = 'DATA_START'; - $write_logfile_entry->("Starting __DATA__ section\n"); + write_logfile_numbered_msg("Starting __DATA__ section\n"); $tokenizer_self->[_saw_data_] = 1; # keep parsing after __DATA__ if use SelfLoader was seen if ( $tokenizer_self->[_saw_selfloader_] ) { $tokenizer_self->[_in_data_] = 0; - $write_logfile_entry->( + write_logfile_numbered_msg( "SelfLoader seen, continuing; -nlsl deactivates\n"); } @@ -1201,13 +1206,13 @@ sub get_line { elsif ( $tokenizer_self->[_in_end_] ) { $line_of_tokens->{_line_type} = 'END_START'; - $write_logfile_entry->("Starting __END__ section\n"); + write_logfile_numbered_msg("Starting __END__ section\n"); $tokenizer_self->[_saw_end_] = 1; # keep parsing after __END__ if use AutoLoader was seen if ( $tokenizer_self->[_saw_autoloader_] ) { $tokenizer_self->[_in_end_] = 0; - $write_logfile_entry->( + write_logfile_numbered_msg( "AutoLoader seen, continuing; -nlal deactivates\n"); } return $line_of_tokens; @@ -1232,7 +1237,7 @@ sub get_line { # Note: if keyword 'format' occurs in this line code, it is still CODE # (keyword 'format' need not start a line) if ( $tokenizer_self->[_in_format_] ) { - $write_logfile_entry->("Entering format section\n"); + write_logfile_numbered_msg("Entering format section\n"); } if ( $tokenizer_self->[_in_quote_] @@ -1244,7 +1249,7 @@ sub get_line { /^\s*$/ ) { $tokenizer_self->[_line_start_quote_] = $input_line_number; - $write_logfile_entry->( + write_logfile_numbered_msg( "Start multi-line quote or pattern ending in $quote_target\n"); } } @@ -1252,7 +1257,7 @@ sub get_line { && !$tokenizer_self->[_in_quote_] ) { $tokenizer_self->[_line_start_quote_] = -1; - $write_logfile_entry->("End of multi-line quote or pattern\n"); + write_logfile_numbered_msg("End of multi-line quote or pattern\n"); } # we are returning a line of CODE @@ -4534,8 +4539,16 @@ EOM } ## end sub tokenize_this_line sub tokenizer_main_loop { + my ($is_END_or_DATA) = @_; + #--------------------------------- + # Break one input line into tokens + #--------------------------------- + + # Input parameter: + # $is_END_or_DATA is true for a __END__ or __DATA__ line + # start by breaking the line into pre-tokens my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens ( $rtokens, $rtoken_map, $rtoken_type ) = @@ -4561,9 +4574,9 @@ EOM $i = -1; $i_tok = -1; - # ------------------------------------------------------------ + #----------------------------- # begin main tokenization loop - # ------------------------------------------------------------ + #----------------------------- # we are looking at each pre-token of one line and combining them # into tokens @@ -4897,10 +4910,14 @@ EOM sub tokenizer_wrapup_line { my ($line_of_tokens) = @_; - # We have broken the current line into tokens. Now we have to wrap up - # the result for shipping. Most of the remaining work involves - # defining the two indentation parameters that the formatter needs - # (structural indentation level and continuation indentation). + #--------------------------------------------------------- + # Package a line of tokens for shipping back to the caller + #--------------------------------------------------------- + + # Most of the remaining work involves defining the two indentation + # parameters that the formatter needs for each token: + # - $level = structural indentation level and + # - $ci_level = continuation indentation level # The method for setting the indentation level is straightforward. # But the method used to define the continuation indentation is @@ -9516,6 +9533,12 @@ sub write_on_underline { sub pre_tokenize { + my ( $str, $max_tokens_wanted ) = @_; + + # Input parameter: + # $max_tokens_wanted > 0 to stop on reaching this many tokens. + # = 0 means get all tokens + # Break a string, $str, into a sequence of preliminary tokens. We # are interested in these types of tokens: # words (type='w'), example: 'max_tokens_wanted' @@ -9529,9 +9552,8 @@ sub pre_tokenize { # An advantage of doing this pre-tokenization step is that it keeps almost # all of the regex work highly localized. A disadvantage is that in some # very rare instances we will have to go back and split a pre-token. - my ( $str, $max_tokens_wanted ) = @_; - # we return references to these 3 arrays: + # Return parameters: my @tokens = (); # array of the tokens themselves my @token_map = (0); # string position of start of each token my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct diff --git a/lib/Perl/Tidy/VerticalAligner.pm b/lib/Perl/Tidy/VerticalAligner.pm index a56fa115..b15fd84e 100644 --- a/lib/Perl/Tidy/VerticalAligner.pm +++ b/lib/Perl/Tidy/VerticalAligner.pm @@ -530,7 +530,13 @@ BEGIN { sub valign_input { - # Place one line in the current vertical group. + #--------------------------------------------------------------------- + # This is the front door of the vertical aligner. On each call + # we receive one line of specially marked text for vertical alignment. + # We compare the line with the current group, and either: + # - the line joins the current group if alignments match, or + # - the current group is flushed and a new group is started otherwise + #--------------------------------------------------------------------- # # The key input parameters describing each line are: # $level = indentation level of this line @@ -1473,9 +1479,9 @@ EOM # Revert to the starting state if does not fit if ( $pad > $padding_available ) { - ################################################ + #---------------------------------------------- # Line does not fit -- revert to starting state - ################################################ + #---------------------------------------------- foreach my $alignment (@alignments) { $alignment->restore_column(); } @@ -1487,9 +1493,9 @@ EOM $padding_available -= $pad; } - ###################################### + #------------------------------------- # The line fits, the match is accepted - ###################################### + #------------------------------------- return 1; } @@ -1645,18 +1651,18 @@ sub _flush_group_lines { "APPEND0: _flush_group_lines called from $a $b $c lines=$nlines, type=$group_type \n"; }; - ############################################ + #------------------------------------------- # Section 1: Handle a group of COMMENT lines - ############################################ + #------------------------------------------- if ( $group_type eq 'COMMENT' ) { $self->_flush_comment_lines(); return; } - ######################################################################### + #------------------------------------------------------------------------ # Section 2: Handle line(s) of CODE. Most of the actual work of vertical # aligning happens here in the following steps: - ######################################################################### + #------------------------------------------------------------------------ # STEP 1: Remove most unmatched tokens. They block good alignments. my ( $max_lev_diff, $saw_side_comment ) = @@ -2075,9 +2081,9 @@ sub sweep_left_to_right { my $ng_max = @{$rgroups} - 1; return unless ( $ng_max > 0 ); - ############################################################################ + #--------------------------------------------------------------------- # Step 1: Loop over groups to find all common leading alignment tokens - ############################################################################ + #--------------------------------------------------------------------- my $line; my $rtokens; @@ -2192,9 +2198,9 @@ sub sweep_left_to_right { } return unless @icommon; - ########################################################### + #---------------------------------------------------------- # Step 2: Reorder and consolidate the list into a task list - ########################################################### + #---------------------------------------------------------- # We have to work first from lowest token index to highest, then by group, # sort our list first on token index then group number @@ -2220,9 +2226,9 @@ sub sweep_left_to_right { push @todo, [ $i, $ng_beg, $ng_end, $raw_tok, $lev ]; } - ############################### + #------------------------------ # Step 3: Execute the task list - ############################### + #------------------------------ do_left_to_right_sweep( $rlines, $rgroups, \@todo, \%max_move, $short_pad, $group_level ); return; @@ -3275,18 +3281,18 @@ sub match_line_pairs { # find number of leading common tokens - ################################# + #--------------------------------- # No match to hanging side comment - ################################# + #--------------------------------- if ( $line->{'is_hanging_side_comment'} ) { # Should not get here; HSC's have been filtered out $imax_align = -1; } - ############################## + #----------------------------- # Handle comma-separated lists - ############################## + #----------------------------- elsif ( $list_type && $list_type eq $list_type_m ) { # do not align lists across a ci jump with new list method @@ -3305,9 +3311,9 @@ sub match_line_pairs { $imax_align = $i_nomatch - 1; } - ################## + #----------------- # Handle non-lists - ################## + #----------------- else { my $i_nomatch = $imax_min + 1; foreach my $i ( 0 .. $imax_min ) { @@ -3583,9 +3589,9 @@ sub prune_alignment_tree { use constant EXPLAIN_PRUNE => 0; - #################################################################### + #------------------------------------------------------------------- # Prune Tree Step 1. Start by scanning the lines and collecting info - #################################################################### + #------------------------------------------------------------------- # Note that the caller had this info but we have to redo this now because # alignment tokens may have been deleted. @@ -3631,9 +3637,9 @@ sub prune_alignment_tree { # the patterns and levels of the next line being tested at each depth my ( @token_patterns_next, @levels_next, @token_indexes_next ); - ######################################################### + #----------------------------------------------------------- # define a recursive worker subroutine for tree construction - ######################################################### + #----------------------------------------------------------- # This is a recursive routine which is called if a match condition changes # at any depth when a new line is encountered. It ends the match node @@ -3692,9 +3698,9 @@ sub prune_alignment_tree { return; }; ## end sub end_node - ###################################################### + #----------------------------------------------------- # Prune Tree Step 2. Loop to form the tree of matches. - ###################################################### + #----------------------------------------------------- foreach my $jp ( 0 .. $jmax ) { # working with two adjacent line indexes, 'm'=minus, 'p'=plus @@ -3764,9 +3770,9 @@ sub prune_alignment_tree { } } ## end loop to form tree of matches - ########################################################## + #--------------------------------------------------------- # Prune Tree Step 3. Make links from parent to child nodes - ########################################################## + #--------------------------------------------------------- # It seemed cleaner to do this as a separate step rather than during tree # construction. The children nodes have links up to the parent node which @@ -3801,9 +3807,9 @@ sub prune_alignment_tree { } }; - ####################################################### + #------------------------------------------------------ # Prune Tree Step 4. Make a list of nodes to be deleted - ####################################################### + #------------------------------------------------------ # list of lines with tokens to be deleted: # [$jbeg, $jend, $level_keep] @@ -3882,9 +3888,9 @@ sub prune_alignment_tree { @todo_list = @todo_next; } ## end loop to mark nodes to delete - ############################################################# + #------------------------------------------------------------ # Prune Tree Step 5. Loop to delete selected alignment tokens - ############################################################# + #------------------------------------------------------------ foreach my $item (@delete_list) { my ( $jbeg, $jend, $level_keep ) = @{$item}; foreach my $jj ( $jbeg .. $jend ) { @@ -4325,12 +4331,12 @@ sub get_extra_leading_spaces { ? $extra_indentation_spaces_wanted : $avail; - ######################################################### + #-------------------------------------------------------- # Note: min spaces can be negative; for example with -gnu # f( # do { 1; !!(my $x = bless []); } # ); - ######################################################### + #-------------------------------------------------------- # The following rule is needed to match older formatting: # For multiple groups, we will keep spaces non-negative. # For a single group, we will allow a negative space. @@ -4626,11 +4632,11 @@ sub align_side_comments { sub valign_output_step_A { - ############################################################### + #------------------------------------------------------------ # This is Step A in writing vertically aligned lines. # The line is prepared according to the alignments which have # been found. Then it is shipped to the next step. - ############################################################### + #------------------------------------------------------------ my ( $self, $rinput_hash ) = @_; @@ -5139,12 +5145,12 @@ sub get_output_line_number { sub valign_output_step_B { - ############################################################### + #--------------------------------------------------------- # This is Step B in writing vertically aligned lines. # Vertical tightness is applied according to preset flags. # In particular this routine handles stacking of opening # and closing tokens. - ############################################################### + #--------------------------------------------------------- my ( $self, $rinput ) = @_; @@ -5367,12 +5373,12 @@ sub get_output_line_number { sub valign_output_step_C { - ############################################################### + #----------------------------------------------------------------------- # This is Step C in writing vertically aligned lines. # Lines are either stored in a buffer or passed along to the next step. # The reason for storing lines is that we may later want to reduce their # indentation when -sot and -sct are both used. - ############################################################### + #----------------------------------------------------------------------- my ( $self, $seqno_string, @@ -5402,7 +5408,8 @@ sub get_output_line_number { # Start storing lines when we see a line with multiple stacked # opening tokens. # patch for RT #94354, requested by Colin Williams - if ( $seqno_string =~ /^\d+(\:+\d+)+$/ + if ( index( $seqno_string, ':' ) >= 0 + && $seqno_string =~ /^\d+(\:+\d+)+$/ && $args_to_D[0] !~ /^[\}\)\]\:\?]/ ) { @@ -5446,11 +5453,11 @@ sub get_output_line_number { sub valign_output_step_D { - ############################################################### + #---------------------------------------------------------------- # This is Step D in writing vertically aligned lines. # It is the end of the vertical alignment pipeline. # Write one vertically aligned line of code to the output object. - ############################################################### + #---------------------------------------------------------------- my ( $self, $line, $leading_space_count, $level, $Kend ) = @_; -- 2.39.5