X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=lib%2FPerl%2FTidy.pm;h=86764c33f65c68e6f00cbd9561e07785d272628f;hb=8aa69fbac36a21cad0a1c0d5b3452a546d427d7f;hp=e69780c75b3de53dccf003943ac5c5cfa5ad9be7;hpb=8650cadc9aaa12d0fb1d7025ad8fefda9c3df636;p=perltidy.git diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm index e69780c..86764c3 100644 --- a/lib/Perl/Tidy.pm +++ b/lib/Perl/Tidy.pm @@ -2,7 +2,7 @@ # # perltidy - a perl script indenter and formatter # -# Copyright (c) 2000-2006 by Steve Hancock +# Copyright (c) 2000-2007 by Steve Hancock # Distributed under the GPL license agreement; see file COPYING # # This program is free software; you can redistribute it and/or modify @@ -35,6 +35,7 @@ # Yves Orton supplied coding to help detect Windows versions. # Axel Rose supplied a patch for MacPerl. # Sebastien Aperghis-Tramoni supplied a patch for the defined or operator. +# Dan Tyrell sent a patch for binary I/O. # Many others have supplied key ideas, suggestions, and bug reports; # see the CHANGES file. # @@ -63,7 +64,7 @@ use IO::File; use File::Basename; BEGIN { - ( $VERSION = q($Id: Tidy.pm,v 1.49 2006/06/14 01:56:24 perltidy Exp $) ) =~ s/^.*\s+(\d+)\/(\d+)\/(\d+).*$/$1$2$3/; # all one line for MakeMaker + ( $VERSION = q($Id: Tidy.pm,v 1.61 2007/04/24 13:31:15 perltidy Exp $) ) =~ s/^.*\s+(\d+)\/(\d+)\/(\d+).*$/$1$2$3/; # all one line for MakeMaker } sub streamhandle { @@ -308,8 +309,8 @@ sub make_temporary_filename { } if ($input_file) { - if ( ref $input_file ) { print STDERR " of reference to:" } - else { print STDERR " of file:" } + if ( ref $input_file ) { print STDERR " of reference to:" } + else { print STDERR " of file:" } print STDERR " $input_file"; } print STDERR "\n"; @@ -358,7 +359,7 @@ EOM my $hash_ref = $input_hash{$key}; if ( defined($hash_ref) ) { unless ( ref($hash_ref) eq 'HASH' ) { - my $what = ref($hash_ref); + my $what = ref($hash_ref); my $but_is = $what ? "but is ref to $what" : "but is not a reference"; croak <{'preserve-line-endings'} ) { $line_separator = find_input_line_ending($input_file); } - $line_separator = "\n" unless defined($line_separator); + + # Eventually all I/O may be done with binmode, but for now it is + # only done when a user requests a particular line separator + # through the -ple or -ole flags + my $binmode = 0; + if ( defined($line_separator) ) { $binmode = 1 } + else { $line_separator = "\n" } my $sink_object = Perl::Tidy::LineSink->new( $output_file, $tee_file, - $line_separator, $rOpts, $rpending_logfile_message ); + $line_separator, $rOpts, $rpending_logfile_message, $binmode ); #--------------------------------------------------------------- # initialize the error logger @@ -973,6 +986,7 @@ EOM my $fout = IO::File->new("> $input_file") or die "problem opening $input_file for write for -b option; check directory permissions: $!\n"; + binmode $fout; my $line; while ( $line = $output_file->getline() ) { $fout->print($line); @@ -1102,6 +1116,7 @@ sub generate_options { # chk --> check-multiline-quotes # check for old bug; to be deleted # scl --> short-concatenation-item-length # helps break at '.' # recombine # for debugging line breaks + # valign # for debugging vertical alignment # I --> DIAGNOSTICS # for debugging ###################################################################### @@ -1157,6 +1172,7 @@ sub generate_options { no-profile npro recombine! + valign! ); my $category = 13; # Debugging @@ -1216,6 +1232,13 @@ sub generate_options { $add_option->( 'standard-output', 'st', '!' ); $add_option->( 'warning-output', 'w', '!' ); + # options which are both toggle switches and values moved here + # to hide from tidyview (which does not show category 0 flags): + # -ole moved here from category 1 + # -sil moved here from category 2 + $add_option->( 'output-line-ending', 'ole', '=s' ); + $add_option->( 'starting-indentation-level', 'sil', '=i' ); + ######################################## $category = 1; # Basic formatting options ######################################## @@ -1223,7 +1246,6 @@ sub generate_options { $add_option->( 'entab-leading-whitespace', 'et', '=i' ); $add_option->( 'indent-columns', 'i', '=i' ); $add_option->( 'maximum-line-length', 'l', '=i' ); - $add_option->( 'output-line-ending', 'ole', '=s' ); $add_option->( 'perl-syntax-check-flags', 'pscf', '=s' ); $add_option->( 'preserve-line-endings', 'ple', '!' ); $add_option->( 'tabs', 't', '!' ); @@ -1232,7 +1254,6 @@ sub generate_options { $category = 2; # Code indentation control ######################################## $add_option->( 'continuation-indentation', 'ci', '=i' ); - $add_option->( 'starting-indentation-level', 'sil', '=i' ); $add_option->( 'line-up-parentheses', 'lp', '!' ); $add_option->( 'outdent-keyword-list', 'okwl', '=s' ); $add_option->( 'outdent-keywords', 'okw', '!' ); @@ -1305,7 +1326,7 @@ sub generate_options { $add_option->( 'brace-vertical-tightness-closing', 'bvtc', '=i' ); $add_option->( 'cuddled-else', 'ce', '!' ); $add_option->( 'delete-old-newlines', 'dnl', '!' ); - $add_option->( 'opening-brace-always-on-right', 'bar', '' ); + $add_option->( 'opening-brace-always-on-right', 'bar', '!' ); $add_option->( 'opening-brace-on-new-line', 'bl', '!' ); $add_option->( 'opening-hash-brace-right', 'ohbr', '!' ); $add_option->( 'opening-paren-right', 'opr', '!' ); @@ -1336,7 +1357,7 @@ sub generate_options { ######################################## $add_option->( 'break-at-old-keyword-breakpoints', 'bok', '!' ); $add_option->( 'break-at-old-logical-breakpoints', 'bol', '!' ); - $add_option->( 'break-at-old-trinary-breakpoints', 'bot', '!' ); + $add_option->( 'break-at-old-ternary-breakpoints', 'bot', '!' ); $add_option->( 'ignore-old-breakpoints', 'iob', '!' ); ######################################## @@ -1422,34 +1443,36 @@ sub generate_options { # if min is undefined, there is no lower limit # if max is undefined, there is no upper limit # Parameters not listed here have defaults - $option_range{'format'} = [qw(tidy html user)]; - $option_range{'output-line-ending'} = [qw(dos win mac unix)]; - - $option_range{'block-brace-tightness'} = [ 0, 2 ]; - $option_range{'brace-tightness'} = [ 0, 2 ]; - $option_range{'paren-tightness'} = [ 0, 2 ]; - $option_range{'square-bracket-tightness'} = [ 0, 2 ]; - - $option_range{'block-brace-vertical-tightness'} = [ 0, 2 ]; - $option_range{'brace-vertical-tightness'} = [ 0, 2 ]; - $option_range{'brace-vertical-tightness-closing'} = [ 0, 2 ]; - $option_range{'paren-vertical-tightness'} = [ 0, 2 ]; - $option_range{'paren-vertical-tightness-closing'} = [ 0, 2 ]; - $option_range{'square-bracket-vertical-tightness'} = [ 0, 2 ]; - $option_range{'square-bracket-vertical-tightness-closing'} = [ 0, 2 ]; - $option_range{'vertical-tightness'} = [ 0, 2 ]; - $option_range{'vertical-tightness-closing'} = [ 0, 2 ]; - - $option_range{'closing-brace-indentation'} = [ 0, 3 ]; - $option_range{'closing-paren-indentation'} = [ 0, 3 ]; - $option_range{'closing-square-bracket-indentation'} = [ 0, 3 ]; - $option_range{'closing-token-indentation'} = [ 0, 3 ]; - - $option_range{'closing-side-comment-else-flag'} = [ 0, 2 ]; - $option_range{'comma-arrow-breakpoints'} = [ 0, 3 ]; - -# Note: we could actually allow negative ci if someone really wants it: -# $option_range{'continuation-indentation'} = [ undef, undef ]; + %option_range = ( + 'format' => [ 'tidy', 'html', 'user' ], + 'output-line-ending' => [ 'dos', 'win', 'mac', 'unix' ], + + 'block-brace-tightness' => [ 0, 2 ], + 'brace-tightness' => [ 0, 2 ], + 'paren-tightness' => [ 0, 2 ], + 'square-bracket-tightness' => [ 0, 2 ], + + 'block-brace-vertical-tightness' => [ 0, 2 ], + 'brace-vertical-tightness' => [ 0, 2 ], + 'brace-vertical-tightness-closing' => [ 0, 2 ], + 'paren-vertical-tightness' => [ 0, 2 ], + 'paren-vertical-tightness-closing' => [ 0, 2 ], + 'square-bracket-vertical-tightness' => [ 0, 2 ], + 'square-bracket-vertical-tightness-closing' => [ 0, 2 ], + 'vertical-tightness' => [ 0, 2 ], + 'vertical-tightness-closing' => [ 0, 2 ], + + 'closing-brace-indentation' => [ 0, 3 ], + 'closing-paren-indentation' => [ 0, 3 ], + 'closing-square-bracket-indentation' => [ 0, 3 ], + 'closing-token-indentation' => [ 0, 3 ], + + 'closing-side-comment-else-flag' => [ 0, 2 ], + 'comma-arrow-breakpoints' => [ 0, 3 ], + ); + + # Note: we could actually allow negative ci if someone really wants it: + # $option_range{'continuation-indentation'} = [ undef, undef ]; #--------------------------------------------------------------- # Assign default values to the above options here, except @@ -1469,7 +1492,7 @@ sub generate_options { brace-vertical-tightness-closing=0 brace-vertical-tightness=0 break-at-old-logical-breakpoints - break-at-old-trinary-breakpoints + break-at-old-ternary-breakpoints break-at-old-keyword-breakpoints comma-arrow-breakpoints=1 nocheck-syntax @@ -1512,6 +1535,7 @@ sub generate_options { paren-vertical-tightness=0 pass-version-line recombine + valign short-concatenation-item-length=8 space-for-semicolon square-bracket-tightness=1 @@ -1544,15 +1568,16 @@ sub generate_options { 'outdent-long-lines' => [qw(outdent-long-quotes outdent-long-comments)], 'nooutdent-long-lines' => [qw(nooutdent-long-quotes nooutdent-long-comments)], - 'noll' => [qw(nooutdent-long-lines)], - 'io' => [qw(indent-only)], + 'noll' => [qw(nooutdent-long-lines)], + 'io' => [qw(indent-only)], 'delete-all-comments' => [qw(delete-block-comments delete-side-comments delete-pod)], 'nodelete-all-comments' => [qw(nodelete-block-comments nodelete-side-comments nodelete-pod)], - 'dac' => [qw(delete-all-comments)], - 'ndac' => [qw(nodelete-all-comments)], - 'gnu' => [qw(gnu-style)], + 'dac' => [qw(delete-all-comments)], + 'ndac' => [qw(nodelete-all-comments)], + 'gnu' => [qw(gnu-style)], + 'pbp' => [qw(perl-best-practices)], 'tee-all-comments' => [qw(tee-block-comments tee-side-comments tee-pod)], 'notee-all-comments' => @@ -1568,6 +1593,8 @@ sub generate_options { 'baa' => [qw(cab=0)], 'nbaa' => [qw(cab=1)], + 'break-at-old-trinary-breakpoints' => [qw(bot)], + 'cti=0' => [qw(cpi=0 cbi=0 csbi=0)], 'cti=1' => [qw(cpi=1 cbi=1 csbi=1)], 'cti=2' => [qw(cpi=2 cbi=2 csbi=2)], @@ -1663,6 +1690,7 @@ sub generate_options { noblanks-before-subs nofuzzy-line-length notabs + norecombine ) ], @@ -1675,6 +1703,12 @@ sub generate_options { ) ], + # Style suggested in Damian Conway's Perl Best Practices + 'perl-best-practices' => [ + qw(l=78 i=4 ci=4 st se vt=2 cti=0 pt=1 bt=1 sbt=1 bbt=1 nsfs nolq), +q(wbb=% + - * / x != == >= <= =~ !~ < > | & = **= += *= &= <<= &&= -= /= |= >>= ||= //= .= %= ^= x=) + ], + # Additional styles can be added here ); @@ -1831,7 +1865,7 @@ EOM # look for a config file if we don't have one yet my $rconfig_file_chatter; $$rconfig_file_chatter = ""; - $config_file = + $config_file = find_config_file( $is_Windows, $Windows_type, $rconfig_file_chatter, $rpending_complaint ) unless $config_file; @@ -1917,6 +1951,7 @@ EOM } ) { + if ( defined( $Opts{$_} ) ) { delete $Opts{$_}; warn "ignoring --$_ in config file: $config_file\n"; @@ -2079,20 +2114,6 @@ EOM # entab leading whitespace has priority over the older 'tabs' option if ( $rOpts->{'tabs'} ) { $rOpts->{'tabs'} = 0; } } - - if ( $rOpts->{'output-line-ending'} ) { - unless ( is_unix() ) { - warn "ignoring -ole; only works under unix\n"; - $rOpts->{'output-line-ending'} = undef; - } - } - if ( $rOpts->{'preserve-line-endings'} ) { - unless ( is_unix() ) { - warn "ignoring -ple; only works under unix\n"; - $rOpts->{'preserve-line-endings'} = undef; - } - } - } sub expand_command_abbreviations { @@ -2432,7 +2453,7 @@ sub Win_Config_Locs { # 9x/Me box. Contributed by: Yves Orton. my $rpending_complaint = shift; - my $os = (@_) ? shift: Win_OS_Type(); + my $os = (@_) ? shift : Win_OS_Type(); return unless $os; my $system = ""; @@ -2466,7 +2487,7 @@ sub dump_config_file { print STDOUT "$$rconfig_file_chatter"; if ($fh) { print STDOUT "# Dump of file: '$config_file'\n"; - while ( $_ = $fh->getline() ) { print STDOUT } + while ( my $line = $fh->getline() ) { print STDOUT $line } eval { $fh->close() }; } else { @@ -2484,21 +2505,22 @@ sub read_config_file { my $name = undef; my $line_no; - while ( $_ = $fh->getline() ) { + while ( my $line = $fh->getline() ) { $line_no++; - chomp; - next if /^\s*#/; # skip full-line comment - ( $_, $death_message ) = strip_comment( $_, $config_file, $line_no ); + chomp $line; + next if $line =~ /^\s*#/; # skip full-line comment + ( $line, $death_message ) = + strip_comment( $line, $config_file, $line_no ); last if ($death_message); - s/^\s*(.*?)\s*$/$1/; # trim both ends - next unless $_; + $line =~ s/^\s*(.*?)\s*$/$1/; # trim both ends + next unless $line; # look for something of the general form # newname { body } # or just # body - if ( $_ =~ /^((\w+)\s*\{)?([^}]*)(\})?$/ ) { + if ( $line =~ /^((\w+)\s*\{)?([^}]*)(\})?$/ ) { my ( $newname, $body, $curly ) = ( $2, $3, $4 ); # handle a new alias definition @@ -2753,7 +2775,7 @@ sub show_version { print <<"EOM"; This is perltidy, v$VERSION -Copyright 2000-2006, Steve Hancock +Copyright 2000-2007, Steve Hancock Perltidy is free software and may be copied under the terms of the GNU General Public License, which is included in the distribution files. @@ -2867,7 +2889,7 @@ Following Old Breakpoints -boc break at old comma breaks: turns off all automatic list formatting -bol break at old logical breakpoints: or, and, ||, && (default) -bok break at old list keyword breakpoints such as map, sort (default) - -bot break at old conditional (trinary ?:) operator breakpoints (default) + -bot break at old conditional (ternary ?:) operator breakpoints (default) -cab=n break at commas after a comma-arrow (=>): n=0 break at all commas after => n=1 stable: break unless this breaks an existing one-line container @@ -3342,7 +3364,7 @@ package Perl::Tidy::LineSink; sub new { my ( $class, $output_file, $tee_file, $line_separator, $rOpts, - $rpending_logfile_message ) + $rpending_logfile_message, $binmode ) = @_; my $fh = undef; my $fh_copy = undef; @@ -3354,6 +3376,12 @@ sub new { ( $fh, $output_file ) = Perl::Tidy::streamhandle( $output_file, 'w' ); unless ($fh) { die "Cannot write to output stream\n"; } $output_file_open = 1; + if ($binmode) { + if ( ref($fh) eq 'IO::File' ) { + binmode $fh; + } + if ( $output_file eq '-' ) { binmode STDOUT } + } } # in order to check output syntax when standard output is used, @@ -3384,6 +3412,7 @@ EOM _tee_file => $tee_file, _tee_file_opened => 0, _line_separator => $line_separator, + _binmode => $binmode, }, $class; } @@ -3432,6 +3461,7 @@ sub really_open_tee_file { my $fh_tee; $fh_tee = IO::File->new(">$tee_file") or die("couldn't open TEE file $tee_file: $!\n"); + binmode $fh_tee if $self->{_binmode}; $self->{_tee_file_opened} = 1; $self->{_fh_tee} = $fh_tee; } @@ -3664,10 +3694,10 @@ sub make_line_information_string { my $line_information_string = ""; if ($input_line_number) { - my $output_line_number = $self->{_output_line_number}; - my $brace_depth = $line_of_tokens->{_curly_brace_depth}; - my $paren_depth = $line_of_tokens->{_paren_depth}; - my $square_bracket_depth = $line_of_tokens->{_square_bracket_depth}; + my $output_line_number = $self->{_output_line_number}; + my $brace_depth = $line_of_tokens->{_curly_brace_depth}; + my $paren_depth = $line_of_tokens->{_paren_depth}; + my $square_bracket_depth = $line_of_tokens->{_square_bracket_depth}; my $python_indentation_level = $line_of_tokens->{_python_indentation_level}; my $rlevels = $line_of_tokens->{_rlevels}; @@ -3683,13 +3713,13 @@ sub make_line_information_string { # for longer scripts it doesn't really matter my $extra_space = ""; $extra_space .= - ( $input_line_number < 10 ) ? " " + ( $input_line_number < 10 ) ? " " : ( $input_line_number < 100 ) ? " " - : ""; + : ""; $extra_space .= - ( $output_line_number < 10 ) ? " " + ( $output_line_number < 10 ) ? " " : ( $output_line_number < 100 ) ? " " - : ""; + : ""; # there are 2 possible nesting strings: # the original which looks like this: (0 [1 {2 @@ -3811,11 +3841,11 @@ sub warning { if ( $self->get_use_prefix() > 0 ) { my $input_line_number = Perl::Tidy::Tokenizer::get_input_line_number(); - print $fh_warnings "$input_line_number:\t@_"; + $fh_warnings->print("$input_line_number:\t@_"); $self->write_logfile_entry("WARNING: @_"); } else { - print $fh_warnings @_; + $fh_warnings->print(@_); $self->write_logfile_entry(@_); } } @@ -3823,7 +3853,7 @@ sub warning { $self->{_warning_count} = $warning_count; if ( $warning_count == WARNING_LIMIT ) { - print $fh_warnings "No further warnings will be given"; + $fh_warnings->print("No further warnings will be given\n"); } } } @@ -3863,14 +3893,14 @@ EOM elsif ( $saw_code_bug == 1 ) { if ( $self->{_saw_extrude} ) { $self->warning(<add_toc_item( '__END__', '__END__' ); } @@ -5317,10 +5347,10 @@ EOM # add the line number if requested if ( $rOpts->{'html-line-numbers'} ) { my $extra_space .= - ( $line_number < 10 ) ? " " + ( $line_number < 10 ) ? " " : ( $line_number < 100 ) ? " " : ( $line_number < 1000 ) ? " " - : ""; + : ""; $html_line = $extra_space . $line_number . " " . $html_line; } @@ -5429,6 +5459,7 @@ use vars qw{ $last_last_nonblank_token_to_go @nonblank_lines_at_depth $starting_in_quote + $ending_in_quote $in_format_skipping_section $format_skipping_pattern_begin @@ -5449,7 +5480,6 @@ use vars qw{ $added_semicolon_count $first_added_semicolon_at $last_added_semicolon_at - $saw_negative_indentation $first_tabbing_disagreement $last_tabbing_disagreement $in_tabbing_disagreement @@ -5499,6 +5529,7 @@ use vars qw{ %is_assignment %is_chain_operator %is_if_unless_and_or_last_next_redo_return + %is_until_while_for_if_elsif_else @has_broken_sublist @dont_align @@ -5539,7 +5570,7 @@ use vars qw{ $rOpts_break_at_old_keyword_breakpoints $rOpts_break_at_old_comma_breakpoints $rOpts_break_at_old_logical_breakpoints - $rOpts_break_at_old_trinary_breakpoints + $rOpts_break_at_old_ternary_breakpoints $rOpts_closing_side_comment_else_flag $rOpts_closing_side_comment_maximum_text $rOpts_continuation_indentation @@ -5615,6 +5646,10 @@ BEGIN { @_ = qw(is if unless and or err last next redo return); @is_if_unless_and_or_last_next_redo_return{@_} = (1) x scalar(@_); + # always break after a closing curly of these block types: + @_ = qw(until while for if elsif else); + @is_until_while_for_if_elsif_else{@_} = (1) x scalar(@_); + @_ = qw(last next redo return); @is_last_next_redo_return{@_} = (1) x scalar(@_); @@ -5633,8 +5668,13 @@ BEGIN { @_ = qw(and or err); @is_and_or{@_} = (1) x scalar(@_); - # Identify certain operators which often occur in chains - @_ = qw(&& || and or : ? .); + # Identify certain operators which often occur in chains. + # Note: the minus (-) causes a side effect of padding of the first line in + # something like this (by sub set_logical_padding): + # Checkbutton => 'Transmission checked', + # -variable => \$TRANS + # This usually improves appearance so it seems ok. + @_ = qw(&& || and or : ? . + - * /); @is_chain_operator{@_} = (1) x scalar(@_); # We can remove semicolons after blocks preceded by these keywords @@ -5689,6 +5729,25 @@ use constant TYPE_SEQUENCE_INCREMENT => 4; sub _decrement_count { --$_count } } +sub trim { + + # trim leading and trailing whitespace from a string + $_[0] =~ s/\s+$//; + $_[0] =~ s/^\s+//; + return $_[0]; +} + +sub split_words { + + # given a string containing words separated by whitespace, + # return the list of words + my ($str) = @_; + return unless $str; + $str =~ s/\s+$//; + $str =~ s/^\s+//; + return split( /\s+/, $str ); +} + # interface to Perl::Tidy::Logger routines sub warning { if ($logger_object) { @@ -5808,7 +5867,6 @@ sub new { @want_comma_break = (); @ci_stack = (""); - $saw_negative_indentation = 0; $first_tabbing_disagreement = 0; $last_tabbing_disagreement = 0; $tabbing_disagreement_count = 0; @@ -6096,8 +6154,9 @@ sub set_leading_whitespace { # handle the standard indentation scheme #------------------------------------------- unless ($rOpts_line_up_parentheses) { - my $space_count = $ci_level * $rOpts_continuation_indentation + $level * - $rOpts_indent_columns; + my $space_count = + $ci_level * $rOpts_continuation_indentation + + $level * $rOpts_indent_columns; my $ci_spaces = ( $ci_level == 0 ) ? 0 : $rOpts_continuation_indentation; @@ -6123,7 +6182,7 @@ sub set_leading_whitespace { my $space_count = 0; my $available_space = 0; $level = -1; # flag to prevent storing in item_list - $leading_spaces_to_go[$max_index_to_go] = + $leading_spaces_to_go[$max_index_to_go] = $reduced_spaces_to_go[$max_index_to_go] = new_lp_indentation_item( $space_count, $level, $ci_level, $available_space, 0 ); @@ -6152,17 +6211,33 @@ sub set_leading_whitespace { # find the position if we break at the '=' my $i_test = $last_equals; if ( $types_to_go[ $i_test + 1 ] eq 'b' ) { $i_test++ } + + # TESTING + ##my $too_close = ($i_test==$max_index_to_go-1); + my $test_position = total_line_length( $i_test, $max_index_to_go ); if ( + # the equals is not just before an open paren (testing) + ##!$too_close && + # if we are beyond the midpoint $gnu_position_predictor > $half_maximum_line_length - # or if we can save some space by breaking at the '=' - # without obscuring the second line by the first - || ( $test_position > 1 + - total_line_length( $line_start_index_to_go, $last_equals ) ) + # or we are beyont the 1/4 point and there was an old + # break at the equals + || ( + $gnu_position_predictor > $half_maximum_line_length / 2 + && ( + $old_breakpoint_to_go[$last_equals] + || ( $last_equals > 0 + && $old_breakpoint_to_go[ $last_equals - 1 ] ) + || ( $last_equals > 1 + && $types_to_go[ $last_equals - 1 ] eq 'b' + && $old_breakpoint_to_go[ $last_equals - 2 ] ) + ) + ) ) { @@ -6865,15 +6940,8 @@ EOM # implement outdenting preferences for keywords %outdent_keyword = (); - - # load defaults - @_ = qw(next last redo goto return); - - # override defaults if requested - if ( $_ = $rOpts->{'outdent-keyword-list'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + unless ( @_ = split_words( $rOpts->{'outdent-keyword-okl'} ) ) { + @_ = qw(next last redo goto return); # defaults } # FUTURE: if not a keyword, assume that it is an identifier @@ -6887,30 +6955,19 @@ EOM } # implement user whitespace preferences - if ( $_ = $rOpts->{'want-left-space'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + if ( @_ = split_words( $rOpts->{'want-left-space'} ) ) { @want_left_space{@_} = (1) x scalar(@_); } - if ( $_ = $rOpts->{'want-right-space'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + if ( @_ = split_words( $rOpts->{'want-right-space'} ) ) { @want_right_space{@_} = (1) x scalar(@_); } - if ( $_ = $rOpts->{'nowant-left-space'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + + if ( @_ = split_words( $rOpts->{'nowant-left-space'} ) ) { @want_left_space{@_} = (-1) x scalar(@_); } - if ( $_ = $rOpts->{'nowant-right-space'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + if ( @_ = split_words( $rOpts->{'nowant-right-space'} ) ) { @want_right_space{@_} = (-1) x scalar(@_); } if ( $rOpts->{'dump-want-left-space'} ) { @@ -6930,53 +6987,40 @@ EOM @space_after_keyword{@_} = (1) x scalar(@_); # allow user to modify these defaults - if ( $_ = $rOpts->{'space-after-keyword'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + if ( @_ = split_words( $rOpts->{'space-after-keyword'} ) ) { @space_after_keyword{@_} = (1) x scalar(@_); } - if ( $_ = $rOpts->{'nospace-after-keyword'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; + if ( @_ = split_words( $rOpts->{'nospace-after-keyword'} ) ) { @space_after_keyword{@_} = (0) x scalar(@_); } # implement user break preferences - if ( $_ = $rOpts->{'want-break-after'} ) { - @_ = split /\s+/; - foreach my $tok (@_) { - if ( $tok eq '?' ) { $tok = ':' } # patch to coordinate ?/: - my $lbs = $left_bond_strength{$tok}; - my $rbs = $right_bond_strength{$tok}; - if ( defined($lbs) && defined($rbs) && $lbs < $rbs ) { - ( $right_bond_strength{$tok}, $left_bond_strength{$tok} ) = - ( $lbs, $rbs ); - } + foreach my $tok ( split_words( $rOpts->{'want-break-after'} ) ) { + if ( $tok eq '?' ) { $tok = ':' } # patch to coordinate ?/: + my $lbs = $left_bond_strength{$tok}; + my $rbs = $right_bond_strength{$tok}; + if ( defined($lbs) && defined($rbs) && $lbs < $rbs ) { + ( $right_bond_strength{$tok}, $left_bond_strength{$tok} ) = + ( $lbs, $rbs ); } } - if ( $_ = $rOpts->{'want-break-before'} ) { - s/^\s+//; - s/\s+$//; - @_ = split /\s+/; - foreach my $tok (@_) { - my $lbs = $left_bond_strength{$tok}; - my $rbs = $right_bond_strength{$tok}; - if ( defined($lbs) && defined($rbs) && $rbs < $lbs ) { - ( $right_bond_strength{$tok}, $left_bond_strength{$tok} ) = - ( $lbs, $rbs ); - } + foreach my $tok ( split_words( $rOpts->{'want-break-before'} ) ) { + my $lbs = $left_bond_strength{$tok}; + my $rbs = $right_bond_strength{$tok}; + if ( defined($lbs) && defined($rbs) && $rbs < $lbs ) { + ( $right_bond_strength{$tok}, $left_bond_strength{$tok} ) = + ( $lbs, $rbs ); } } # make note if breaks are before certain key types %want_break_before = (); - - foreach - my $tok ( '.', ',', ':', '?', '&&', '||', 'and', 'or', 'err', 'xor' ) + foreach my $tok ( + '=', '.', ',', ':', '?', '&&', '||', 'and', + 'or', 'err', 'xor', '+', '-', '*', '/', + ) { $want_break_before{$tok} = $left_bond_strength{$tok} < $right_bond_strength{$tok}; @@ -6992,7 +7036,7 @@ EOM # Define here tokens which may follow the closing brace of a do statement # on the same line, as in: # } while ( $something); - @_ = qw(until while unless if ; ); + @_ = qw(until while unless if ; : ); push @_, ','; @is_do_follower{@_} = (1) x scalar(@_); @@ -7012,14 +7056,14 @@ EOM %is_else_brace_follower = (); # what can follow a multi-line anonymous sub definition closing curly: - @_ = qw# ; : => or and && || ) #; + @_ = qw# ; : => or and && || ~~ !~~ ) #; push @_, ','; @is_anon_sub_brace_follower{@_} = (1) x scalar(@_); # what can follow a one-line anonynomous sub closing curly: # one-line anonumous subs also have ']' here... # see tk3.t and PP.pm - @_ = qw# ; : => or and && || ) ] #; + @_ = qw# ; : => or and && || ) ] ~~ !~~ #; push @_, ','; @is_anon_sub_1_brace_follower{@_} = (1) x scalar(@_); @@ -7085,15 +7129,15 @@ EOM ); # frequently used parameters - $rOpts_add_newlines = $rOpts->{'add-newlines'}; - $rOpts_add_whitespace = $rOpts->{'add-whitespace'}; - $rOpts_block_brace_tightness = $rOpts->{'block-brace-tightness'}; + $rOpts_add_newlines = $rOpts->{'add-newlines'}; + $rOpts_add_whitespace = $rOpts->{'add-whitespace'}; + $rOpts_block_brace_tightness = $rOpts->{'block-brace-tightness'}; $rOpts_block_brace_vertical_tightness = $rOpts->{'block-brace-vertical-tightness'}; $rOpts_brace_left_and_indent = $rOpts->{'brace-left-and-indent'}; $rOpts_comma_arrow_breakpoints = $rOpts->{'comma-arrow-breakpoints'}; - $rOpts_break_at_old_trinary_breakpoints = - $rOpts->{'break-at-old-trinary-breakpoints'}; + $rOpts_break_at_old_ternary_breakpoints = + $rOpts->{'break-at-old-ternary-breakpoints'}; $rOpts_break_at_old_comma_breakpoints = $rOpts->{'break-at-old-comma-breakpoints'}; $rOpts_break_at_old_keyword_breakpoints = @@ -7265,9 +7309,7 @@ sub make_block_pattern { # pattern: '^((if|else|elsif|unless|while|for|foreach|do|\w+:)$|sub)'; my ( $abbrev, $string ) = @_; - $string =~ s/^\s+//; - $string =~ s/\s+$//; - my @list = split /\s+/, $string; + my @list = split_words($string); my @words = (); my %seen; for my $i (@list) { @@ -7425,8 +7467,18 @@ EOM # for avoiding syntax problems rather than for formatting. my ( $tokenll, $typell, $tokenl, $typel, $tokenr, $typer ) = @_; - # never combine two bare words or numbers - my $result = ( ( $tokenr =~ /^[\'\w]/ ) && ( $tokenl =~ /[\'\w]$/ ) ) + my $result = + + # never combine two bare words or numbers + # examples: and ::ok(1) + # return ::spw(...) + # for bla::bla:: abc + # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl + # $input eq"quit" to make $inputeq"quit" + # my $size=-s::SINK if $file; <==OK but we won't do it + # don't join something like: for bla::bla:: abc + # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl + ( ( $tokenl =~ /([\'\w]|\:\:)$/ ) && ( $tokenr =~ /^([\'\w]|\:\:)/ ) ) # do not combine a number with a concatination dot # example: pom.caputo: @@ -7479,7 +7531,11 @@ EOM # retain any space after possible filehandle # (testfiles prnterr1.t with --extrude and mangle.t with --mangle) - || ( $typel eq 'Z' || $typell eq 'Z' ) + || ( $typel eq 'Z' ) + + # Perl is sensitive to whitespace after the + here: + # $b = xvals $a + 0.1 * yvals $a; + || ( $typell eq 'Z' && $typel =~ /^[\/\?\+\-\*]$/ ) # keep paren separate in 'use Foo::Bar ()' || ( $tokenr eq '(' @@ -7516,7 +7572,8 @@ EOM $tokenl eq 'my' # /^(for|foreach)$/ - && $is_for_foreach{$tokenll} && $tokenr =~ /^\$/ + && $is_for_foreach{$tokenll} + && $tokenr =~ /^\$/ ) # must have space between grep and left paren; "grep(" will fail @@ -7526,9 +7583,6 @@ EOM #use Mail::Internet 1.28 (); (see Entity.pm, Head.pm, Test.pm) || ( ( $typel eq 'n' ) && ( $tokenr eq '(' ) ) - # don't join something like: for bla::bla:: abc - # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl - || ( $tokenl =~ /\:\:$/ && ( $tokenr =~ /^[\'\w]/ ) ) ; # the value of this long logic sequence is the result we want return $result; } @@ -7587,7 +7641,7 @@ sub set_white_space_flag { my @spaces_both_sides = qw" + - * / % ? = . : x < > | & ^ .. << >> ** && .. || // => += -= - .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= + .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= ~~ !~~ &&= ||= //= <=> A k f w F n C Y U G v "; @@ -7653,8 +7707,11 @@ sub set_white_space_flag { $binary_ws_rules{'R'}{'++'} = WS_NO; $binary_ws_rules{'R'}{'--'} = WS_NO; - $binary_ws_rules{'k'}{':'} = WS_NO; # keep colon with label - $binary_ws_rules{'w'}{':'} = WS_NO; + ######################################################## + # should no longer be necessary (see niek.pl) + ##$binary_ws_rules{'k'}{':'} = WS_NO; # keep colon with label + ##$binary_ws_rules{'w'}{':'} = WS_NO; + ######################################################## $binary_ws_rules{'i'}{'Q'} = WS_YES; $binary_ws_rules{'n'}{'('} = WS_YES; # occurs in 'use package n ()' @@ -7883,7 +7940,7 @@ sub set_white_space_flag { # patch for SWITCH/CASE: make space at ']{' optional # since the '{' might begin a case or when block - elsif ( $token eq '{' && $last_token eq ']' ) { + elsif ( ( $token eq '{' && $type ne 'L' ) && $last_token eq ']' ) { $ws = WS_OPTIONAL; } @@ -7932,8 +7989,13 @@ sub set_white_space_flag { elsif ( $type eq '#' ) { $ws = WS_YES if $j > 0 } # always preserver whatever space was used after a possible - # filehandle or here doc operator - if ( $type ne '#' && ( $last_type eq 'Z' || $last_type eq 'h' ) ) { + # filehandle (except _) or here doc operator + if ( + $type ne '#' + && ( ( $last_type eq 'Z' && $last_token ne '_' ) + || $last_type eq 'h' ) + ) + { $ws = WS_OPTIONAL; } @@ -8110,7 +8172,7 @@ sub set_white_space_flag { # If this becomes too much of a problem, we might give up and just clip # them at zero. ## $levels_to_go[$max_index_to_go] = ( $level > 0 ) ? $level : 0; - $levels_to_go[$max_index_to_go] = $level; + $levels_to_go[$max_index_to_go] = $level; $nesting_depth_to_go[$max_index_to_go] = ( $slevel >= 0 ) ? $slevel : 0; $lengths_to_go[ $max_index_to_go + 1 ] = $lengths_to_go[$max_index_to_go] + length($token); @@ -8165,16 +8227,6 @@ sub set_white_space_flag { return; } - my %is_until_while_for_if_elsif_else; - - BEGIN { - - # always break after a closing curly of these block types: - @_ = qw(until while for if elsif else); - @is_until_while_for_if_elsif_else{@_} = (1) x scalar(@_); - - } - sub print_line_of_tokens { my $line_of_tokens = shift; @@ -8214,7 +8266,8 @@ sub set_white_space_flag { $in_continued_quote = $starting_in_quote = $line_of_tokens->{_starting_in_quote}; - $in_quote = $line_of_tokens->{_ending_in_quote}; + $in_quote = $line_of_tokens->{_ending_in_quote}; + $ending_in_quote = $in_quote; $python_indentation_level = $line_of_tokens->{_python_indentation_level}; @@ -8316,7 +8369,7 @@ sub set_white_space_flag { && $rOpts->{'static-block-comments'} && $input_line =~ /$static_block_comment_pattern/o ) { - $is_static_block_comment = 1; + $is_static_block_comment = 1; $is_static_block_comment_without_leading_space = substr( $input_line, 0, 1 ) eq '#'; } @@ -8423,7 +8476,7 @@ sub set_white_space_flag { # /([\$*])(([\w\:\']*)\bVERSION)\b.*\=/ # Examples: # *VERSION = \'1.01'; - # ( $VERSION ) = '$Revision: 1.49 $ ' =~ /\$Revision:\s+([^\s]+)/; + # ( $VERSION ) = '$Revision: 1.61 $ ' =~ /\$Revision:\s+([^\s]+)/; # We will pass such a line straight through without breaking # it unless -npvl is used @@ -8442,13 +8495,13 @@ sub set_white_space_flag { } # take care of indentation-only - # also write a line which is entirely a 'qw' list - if ( $rOpts->{'indent-only'} - || ( ( $jmax == 0 ) && ( $$rtoken_type[0] eq 'q' ) ) ) - { + # NOTE: In previous versions we sent all qw lines out immediately here. + # No longer doing this: also write a line which is entirely a 'qw' list + # to allow stacking of opening and closing tokens. Note that interior + # qw lines will still go out at the end of this routine. + if ( $rOpts->{'indent-only'} ) { flush(); - $input_line =~ s/^\s*//; # trim left end - $input_line =~ s/\s*$//; # trim right end + trim($input_line); extract_token(0); $token = $input_line; @@ -8875,7 +8928,13 @@ sub set_white_space_flag { # # But make a line break if the curly ends a # significant block: - if ( $is_until_while_for_if_elsif_else{$block_type} ) { + if ( + $is_block_without_semicolon{$block_type} + + # if needless semicolon follows we handle it later + && $next_nonblank_token ne ';' + ) + { output_line_to_go() unless ($no_internal_newlines); } } @@ -8911,11 +8970,6 @@ sub set_white_space_flag { } } - # TESTING ONLY for SWITCH/CASE - this is where to start - # recoding to retain else's on the same line as a case, - # but there is a lot more that would need to be done. - ##elsif ($block_type eq 'case') {$rbrace_follower = {else=>1};} - # None of the above: specify what can follow a closing # brace of a block which is not an # if/elsif/else/do/sort/map/grep/eval @@ -9087,7 +9141,9 @@ sub set_white_space_flag { # if there is a side comment ( ( $type eq '#' ) && !$rOpts->{'delete-side-comments'} ) - # if this line which ends in a quote + # if this line ends in a quote + # NOTE: This is critically important for insuring that quoted lines + # do not get processed by things like -sot and -sct || $in_quote # if this is a VERSION statement @@ -9108,141 +9164,385 @@ sub set_white_space_flag { if ( $max_index_to_go >= 0 && !$rOpts_ignore_old_breakpoints ) { $old_breakpoint_to_go[$max_index_to_go] = 1; } - } + } # end sub print_line_of_tokens } # end print_line_of_tokens -sub note_added_semicolon { - $last_added_semicolon_at = $input_line_number; - if ( $added_semicolon_count == 0 ) { - $first_added_semicolon_at = $last_added_semicolon_at; - } - $added_semicolon_count++; - write_logfile_entry("Added ';' here\n"); -} +# sub output_line_to_go sends one logical line of tokens on down the +# pipeline to the VerticalAligner package, breaking the line into continuation +# lines as necessary. The line of tokens is ready to go in the "to_go" +# arrays. +sub output_line_to_go { -sub note_deleted_semicolon { - $last_deleted_semicolon_at = $input_line_number; - if ( $deleted_semicolon_count == 0 ) { - $first_deleted_semicolon_at = $last_deleted_semicolon_at; - } - $deleted_semicolon_count++; - write_logfile_entry("Deleted unnecessary ';'\n"); # i hope ;) -} + # debug stuff; this routine can be called from many points + FORMATTER_DEBUG_FLAG_OUTPUT && do { + my ( $a, $b, $c ) = caller; + write_diagnostics( +"OUTPUT: output_line_to_go called: $a $c $last_nonblank_type $last_nonblank_token, one_line=$index_start_one_line_block, tokens to write=$max_index_to_go\n" + ); + my $output_str = join "", @tokens_to_go[ 0 .. $max_index_to_go ]; + write_diagnostics("$output_str\n"); + }; -sub note_embedded_tab { - $embedded_tab_count++; - $last_embedded_tab_at = $input_line_number; - if ( !$first_embedded_tab_at ) { - $first_embedded_tab_at = $last_embedded_tab_at; + # just set a tentative breakpoint if we might be in a one-line block + if ( $index_start_one_line_block != UNDEFINED_INDEX ) { + set_forced_breakpoint($max_index_to_go); + return; } - if ( $embedded_tab_count <= MAX_NAG_MESSAGES ) { - write_logfile_entry("Embedded tabs in quote or pattern\n"); - } -} + my $cscw_block_comment; + $cscw_block_comment = add_closing_side_comment() + if ( $rOpts->{'closing-side-comments'} && $max_index_to_go >= 0 ); -sub starting_one_line_block { + match_opening_and_closing_tokens(); - # after seeing an opening curly brace, look for the closing brace - # and see if the entire block will fit on a line. This routine is - # not always right because it uses the old whitespace, so a check - # is made later (at the closing brace) to make sure we really - # have a one-line block. We have to do this preliminary check, - # though, because otherwise we would always break at a semicolon - # within a one-line block if the block contains multiple statements. + # tell the -lp option we are outputting a batch so it can close + # any unfinished items in its stack + finish_lp_batch(); - my ( $j, $jmax, $level, $slevel, $ci_level, $rtokens, $rtoken_type, - $rblock_type ) - = @_; + # If this line ends in a code block brace, set breaks at any + # previous closing code block braces to breakup a chain of code + # blocks on one line. This is very rare but can happen for + # user-defined subs. For example we might be looking at this: + # BOOL { $server_data{uptime} > 0; } NUM { $server_data{load}; } STR { + my $saw_good_break = 0; # flag to force breaks even if short line + if ( - # kill any current block - we can only go 1 deep - destroy_one_line_block(); + # looking for opening or closing block brace + $block_type_to_go[$max_index_to_go] - # return value: - # 1=distance from start of block to opening brace exceeds line length - # 0=otherwise + # but not one of these which are never duplicated on a line: + # until|while|for|if|elsif|else + && !$is_block_without_semicolon{ $block_type_to_go[$max_index_to_go] } + ) + { + my $lev = $nesting_depth_to_go[$max_index_to_go]; - my $i_start = 0; + # Walk backwards from the end and + # set break at any closing block braces at the same level. + # But quit if we are not in a chain of blocks. + for ( my $i = $max_index_to_go - 1 ; $i >= 0 ; $i-- ) { + last if ( $levels_to_go[$i] < $lev ); # stop at a lower level + next if ( $levels_to_go[$i] > $lev ); # skip past higher level - # shouldn't happen: there must have been a prior call to - # store_token_to_go to put the opening brace in the output stream - if ( $max_index_to_go < 0 ) { - warning("program bug: store_token_to_go called incorrectly\n"); - report_definite_bug(); - } - else { + if ( $block_type_to_go[$i] ) { + if ( $tokens_to_go[$i] eq '}' ) { + set_forced_breakpoint($i); + $saw_good_break = 1; + } + } - # cannot use one-line blocks with cuddled else else/elsif lines - if ( ( $tokens_to_go[0] eq '}' ) && $rOpts_cuddled_else ) { - return 0; + # quit if we see anything besides words, function, blanks + # at this level + elsif ( $types_to_go[$i] !~ /^[\(\)Gwib]$/ ) { last } } } - my $block_type = $$rblock_type[$j]; - - # find the starting keyword for this block (such as 'if', 'else', ...) + my $imin = 0; + my $imax = $max_index_to_go; - if ( $block_type =~ /^[\{\}\;\:]$/ ) { - $i_start = $max_index_to_go; + # trim any blank tokens + if ( $max_index_to_go >= 0 ) { + if ( $types_to_go[$imin] eq 'b' ) { $imin++ } + if ( $types_to_go[$imax] eq 'b' ) { $imax-- } } - elsif ( $last_last_nonblank_token_to_go eq ')' ) { - - # For something like "if (xxx) {", the keyword "if" will be - # just after the most recent break. This will be 0 unless - # we have just killed a one-line block and are starting another. - # (doif.t) - $i_start = $index_max_forced_break + 1; - if ( $types_to_go[$i_start] eq 'b' ) { - $i_start++; - } - - unless ( $tokens_to_go[$i_start] eq $block_type ) { - return 0; - } - } + # anything left to write? + if ( $imin <= $imax ) { - # the previous nonblank token should start these block types - elsif ( - ( $last_last_nonblank_token_to_go eq $block_type ) - || ( $block_type =~ /^sub/ - && $last_last_nonblank_token_to_go =~ /^sub/ ) - ) - { - $i_start = $last_last_nonblank_index_to_go; - } + # add a blank line before certain key types + if ( $last_line_leading_type !~ /^[#b]/ ) { + my $want_blank = 0; + my $leading_token = $tokens_to_go[$imin]; + my $leading_type = $types_to_go[$imin]; - # patch for SWITCH/CASE to retain one-line case/when blocks - elsif ( $block_type eq 'case' || $block_type eq 'when' ) { - $i_start = $index_max_forced_break + 1; - if ( $types_to_go[$i_start] eq 'b' ) { - $i_start++; - } - unless ( $tokens_to_go[$i_start] eq $block_type ) { - return 0; - } - } + # blank lines before subs except declarations and one-liners + # MCONVERSION LOCATION - for sub tokenization change + if ( $leading_token =~ /^(sub\s)/ && $leading_type eq 'i' ) { + $want_blank = ( $rOpts->{'blanks-before-subs'} ) + && ( + terminal_type( \@types_to_go, \@block_type_to_go, $imin, + $imax ) !~ /^[\;\}]$/ + ); + } - else { - return 1; - } + # break before all package declarations + # MCONVERSION LOCATION - for tokenizaton change + elsif ($leading_token =~ /^(package\s)/ + && $leading_type eq 'i' ) + { + $want_blank = ( $rOpts->{'blanks-before-subs'} ); + } - my $pos = total_line_length( $i_start, $max_index_to_go ) - 1; + # break before certain key blocks except one-liners + if ( $leading_token =~ /^(BEGIN|END)$/ && $leading_type eq 'k' ) { + $want_blank = ( $rOpts->{'blanks-before-subs'} ) + && ( + terminal_type( \@types_to_go, \@block_type_to_go, $imin, + $imax ) ne '}' + ); + } - my $i; + # Break before certain block types if we haven't had a + # break at this level for a while. This is the + # difficult decision.. + elsif ($leading_token =~ /^(unless|if|while|until|for|foreach)$/ + && $leading_type eq 'k' ) + { + my $lc = $nonblank_lines_at_depth[$last_line_leading_level]; + if ( !defined($lc) ) { $lc = 0 } - # see if length is too long to even start - if ( $pos > $rOpts_maximum_line_length ) { - return 1; - } + $want_blank = $rOpts->{'blanks-before-blocks'} + && $lc >= $rOpts->{'long-block-line-count'} + && $file_writer_object->get_consecutive_nonblank_lines() >= + $rOpts->{'long-block-line-count'} + && ( + terminal_type( \@types_to_go, \@block_type_to_go, $imin, + $imax ) ne '}' + ); + } - for ( $i = $j + 1 ; $i <= $jmax ; $i++ ) { + if ($want_blank) { - # old whitespace could be arbitrarily large, so don't use it - if ( $$rtoken_type[$i] eq 'b' ) { $pos += 1 } - else { $pos += length( $$rtokens[$i] ) } + # future: send blank line down normal path to VerticalAligner + Perl::Tidy::VerticalAligner::flush(); + $file_writer_object->write_blank_code_line(); + } + } - # Return false result if we exceed the maximum line length, + # update blank line variables and count number of consecutive + # non-blank, non-comment lines at this level + $last_last_line_leading_level = $last_line_leading_level; + $last_line_leading_level = $levels_to_go[$imin]; + if ( $last_line_leading_level < 0 ) { $last_line_leading_level = 0 } + $last_line_leading_type = $types_to_go[$imin]; + if ( $last_line_leading_level == $last_last_line_leading_level + && $last_line_leading_type ne 'b' + && $last_line_leading_type ne '#' + && defined( $nonblank_lines_at_depth[$last_line_leading_level] ) ) + { + $nonblank_lines_at_depth[$last_line_leading_level]++; + } + else { + $nonblank_lines_at_depth[$last_line_leading_level] = 1; + } + + FORMATTER_DEBUG_FLAG_FLUSH && do { + my ( $package, $file, $line ) = caller; + print +"FLUSH: flushing from $package $file $line, types= $types_to_go[$imin] to $types_to_go[$imax]\n"; + }; + + # add a couple of extra terminal blank tokens + pad_array_to_go(); + + # set all forced breakpoints for good list formatting + my $is_long_line = excess_line_length( $imin, $max_index_to_go ) > 0; + + if ( + $max_index_to_go > 0 + && ( + $is_long_line + || $old_line_count_in_batch > 1 + || is_unbalanced_batch() + || ( + $comma_count_in_batch + && ( $rOpts_maximum_fields_per_table > 0 + || $rOpts_comma_arrow_breakpoints == 0 ) + ) + ) + ) + { + $saw_good_break ||= scan_list(); + } + + # let $ri_first and $ri_last be references to lists of + # first and last tokens of line fragments to output.. + my ( $ri_first, $ri_last ); + + # write a single line if.. + if ( + + # we aren't allowed to add any newlines + !$rOpts_add_newlines + + # or, we don't already have an interior breakpoint + # and we didn't see a good breakpoint + || ( + !$forced_breakpoint_count + && !$saw_good_break + + # and this line is 'short' + && !$is_long_line + ) + ) + { + @$ri_first = ($imin); + @$ri_last = ($imax); + } + + # otherwise use multiple lines + else { + + ( $ri_first, $ri_last ) = set_continuation_breaks($saw_good_break); + + break_all_chain_tokens( $ri_first, $ri_last ); + + # now we do a correction step to clean this up a bit + # (The only time we would not do this is for debugging) + if ( $rOpts->{'recombine'} ) { + ( $ri_first, $ri_last ) = + recombine_breakpoints( $ri_first, $ri_last ); + } + } + + # do corrector step if -lp option is used + my $do_not_pad = 0; + if ($rOpts_line_up_parentheses) { + $do_not_pad = correct_lp_indentation( $ri_first, $ri_last ); + } + send_lines_to_vertical_aligner( $ri_first, $ri_last, $do_not_pad ); + } + prepare_for_new_input_lines(); + + # output any new -cscw block comment + if ($cscw_block_comment) { + flush(); + $file_writer_object->write_code_line( $cscw_block_comment . "\n" ); + } +} + +sub note_added_semicolon { + $last_added_semicolon_at = $input_line_number; + if ( $added_semicolon_count == 0 ) { + $first_added_semicolon_at = $last_added_semicolon_at; + } + $added_semicolon_count++; + write_logfile_entry("Added ';' here\n"); +} + +sub note_deleted_semicolon { + $last_deleted_semicolon_at = $input_line_number; + if ( $deleted_semicolon_count == 0 ) { + $first_deleted_semicolon_at = $last_deleted_semicolon_at; + } + $deleted_semicolon_count++; + write_logfile_entry("Deleted unnecessary ';'\n"); # i hope ;) +} + +sub note_embedded_tab { + $embedded_tab_count++; + $last_embedded_tab_at = $input_line_number; + if ( !$first_embedded_tab_at ) { + $first_embedded_tab_at = $last_embedded_tab_at; + } + + if ( $embedded_tab_count <= MAX_NAG_MESSAGES ) { + write_logfile_entry("Embedded tabs in quote or pattern\n"); + } +} + +sub starting_one_line_block { + + # after seeing an opening curly brace, look for the closing brace + # and see if the entire block will fit on a line. This routine is + # not always right because it uses the old whitespace, so a check + # is made later (at the closing brace) to make sure we really + # have a one-line block. We have to do this preliminary check, + # though, because otherwise we would always break at a semicolon + # within a one-line block if the block contains multiple statements. + + my ( $j, $jmax, $level, $slevel, $ci_level, $rtokens, $rtoken_type, + $rblock_type ) + = @_; + + # kill any current block - we can only go 1 deep + destroy_one_line_block(); + + # return value: + # 1=distance from start of block to opening brace exceeds line length + # 0=otherwise + + my $i_start = 0; + + # shouldn't happen: there must have been a prior call to + # store_token_to_go to put the opening brace in the output stream + if ( $max_index_to_go < 0 ) { + warning("program bug: store_token_to_go called incorrectly\n"); + report_definite_bug(); + } + else { + + # cannot use one-line blocks with cuddled else else/elsif lines + if ( ( $tokens_to_go[0] eq '}' ) && $rOpts_cuddled_else ) { + return 0; + } + } + + my $block_type = $$rblock_type[$j]; + + # find the starting keyword for this block (such as 'if', 'else', ...) + + if ( $block_type =~ /^[\{\}\;\:]$/ ) { + $i_start = $max_index_to_go; + } + + elsif ( $last_last_nonblank_token_to_go eq ')' ) { + + # For something like "if (xxx) {", the keyword "if" will be + # just after the most recent break. This will be 0 unless + # we have just killed a one-line block and are starting another. + # (doif.t) + $i_start = $index_max_forced_break + 1; + if ( $types_to_go[$i_start] eq 'b' ) { + $i_start++; + } + + unless ( $tokens_to_go[$i_start] eq $block_type ) { + return 0; + } + } + + # the previous nonblank token should start these block types + elsif ( + ( $last_last_nonblank_token_to_go eq $block_type ) + || ( $block_type =~ /^sub/ + && $last_last_nonblank_token_to_go =~ /^sub/ ) + ) + { + $i_start = $last_last_nonblank_index_to_go; + } + + # patch for SWITCH/CASE to retain one-line case/when blocks + elsif ( $block_type eq 'case' || $block_type eq 'when' ) { + $i_start = $index_max_forced_break + 1; + if ( $types_to_go[$i_start] eq 'b' ) { + $i_start++; + } + unless ( $tokens_to_go[$i_start] eq $block_type ) { + return 0; + } + } + + else { + return 1; + } + + my $pos = total_line_length( $i_start, $max_index_to_go ) - 1; + + my $i; + + # see if length is too long to even start + if ( $pos > $rOpts_maximum_line_length ) { + return 1; + } + + for ( $i = $j + 1 ; $i <= $jmax ; $i++ ) { + + # old whitespace could be arbitrarily large, so don't use it + if ( $$rtoken_type[$i] eq 'b' ) { $pos += 1 } + else { $pos += length( $$rtokens[$i] ) } + + # Return false result if we exceed the maximum line length, if ( $pos > $rOpts_maximum_line_length ) { return 0; } @@ -9463,14 +9763,29 @@ sub set_logical_padding { # we might pad token $ibeg, so be sure that it # is at the same depth as the next line. next - if ( $nesting_depth_to_go[ $ibeg + 1 ] != + if ( $nesting_depth_to_go[$ibeg] != $nesting_depth_to_go[$ibeg_next] ); # We can pad on line 1 of a statement if at least 3 # lines will be aligned. Otherwise, it # can look very confusing. - if ( $max_line > 2 ) { + + # We have to be careful not to pad if there are too few + # lines. The current rule is: + # (1) in general we require at least 3 consecutive lines + # with the same leading chain operator token, + # (2) but an exception is that we only require two lines + # with leading colons if there are no more lines. For example, + # the first $i in the following snippet would get padding + # by the second rule: + # + # $i == 1 ? ( "First", "Color" ) + # : $i == 2 ? ( "Then", "Rarity" ) + # : ( "Then", "Name" ); + + if ( $max_line > 1 ) { my $leading_token = $tokens_to_go[$ibeg_next]; + my $tokens_differ; # never indent line 1 of a '.' series because # previous line is most likely at same level. @@ -9481,13 +9796,18 @@ sub set_logical_padding { my $count = 1; foreach my $l ( 2 .. 3 ) { + last if ( $line + $l > $max_line ); my $ibeg_next_next = $$ri_first[ $line + $l ]; - next - unless $tokens_to_go[$ibeg_next_next] eq - $leading_token; + if ( $tokens_to_go[$ibeg_next_next] ne + $leading_token ) + { + $tokens_differ = 1; + last; + } $count++; } - next unless $count == 3; + next if ($tokens_differ); + next if ( $count < 3 && $leading_token ne ':' ); $ipad = $ibeg; } else { @@ -9563,9 +9883,10 @@ sub set_logical_padding { ) { - #----------------------begin special check--------------- + #----------------------begin special checks-------------- # - # One more check is needed before we can make the pad. + # SPECIAL CHECK 1: + # A check is needed before we can make the pad. # If we are in a list with some long items, we want each # item to stand out. So in the following example, the # first line begining with '$casefold->' would look good @@ -9624,6 +9945,28 @@ sub set_logical_padding { ); } } + + # SPECIAL CHECK 2: + # a minus may introduce a quoted variable, and we will + # add the pad only if this line begins with a bare word, + # such as for the word 'Button' here: + # [ + # Button => "Print letter \"~$_\"", + # -command => [ sub { print "$_[0]\n" }, $_ ], + # -accelerator => "Meta+$_" + # ]; + # + # On the other hand, if 'Button' is quoted, it looks best + # not to pad: + # [ + # 'Button' => "Print letter \"~$_\"", + # -command => [ sub { print "$_[0]\n" }, $_ ], + # -accelerator => "Meta+$_" + # ]; + if ( $types_to_go[$ibeg_next] eq 'm' ) { + $ok_to_pad = 0 if $types_to_go[$ibeg] eq 'Q'; + } + next unless $ok_to_pad; #----------------------end special check--------------- @@ -9854,7 +10197,7 @@ sub correct_lp_indentation { # then we are probably vertically aligned. We could set # an exact flag in sub scan_list, but this is good # enough. - my $indentation_count = keys %saw_indentation; + my $indentation_count = keys %saw_indentation; my $is_vertically_aligned = ( $i == $ibeg && $first_line_comma_count > 1 @@ -9904,213 +10247,10 @@ sub flush { Perl::Tidy::VerticalAligner::flush(); } -# output_line_to_go sends one logical line of tokens on down the -# pipeline to the VerticalAligner package, breaking the line into continuation -# lines as necessary. The line of tokens is ready to go in the "to_go" -# arrays. - -sub output_line_to_go { +sub reset_block_text_accumulator { - # debug stuff; this routine can be called from many points - FORMATTER_DEBUG_FLAG_OUTPUT && do { - my ( $a, $b, $c ) = caller; - write_diagnostics( -"OUTPUT: output_line_to_go called: $a $c $last_nonblank_type $last_nonblank_token, one_line=$index_start_one_line_block, tokens to write=$max_index_to_go\n" - ); - my $output_str = join "", @tokens_to_go[ 0 .. $max_index_to_go ]; - write_diagnostics("$output_str\n"); - }; - - # just set a tentative breakpoint if we might be in a one-line block - if ( $index_start_one_line_block != UNDEFINED_INDEX ) { - set_forced_breakpoint($max_index_to_go); - return; - } - - my $cscw_block_comment; - $cscw_block_comment = add_closing_side_comment() - if ( $rOpts->{'closing-side-comments'} && $max_index_to_go >= 0 ); - - match_opening_and_closing_tokens(); - - # tell the -lp option we are outputting a batch so it can close - # any unfinished items in its stack - finish_lp_batch(); - - my $imin = 0; - my $imax = $max_index_to_go; - - # trim any blank tokens - if ( $max_index_to_go >= 0 ) { - if ( $types_to_go[$imin] eq 'b' ) { $imin++ } - if ( $types_to_go[$imax] eq 'b' ) { $imax-- } - } - - # anything left to write? - if ( $imin <= $imax ) { - - # add a blank line before certain key types - if ( $last_line_leading_type !~ /^[#b]/ ) { - my $want_blank = 0; - my $leading_token = $tokens_to_go[$imin]; - my $leading_type = $types_to_go[$imin]; - - # blank lines before subs except declarations and one-liners - # MCONVERSION LOCATION - for sub tokenization change - if ( $leading_token =~ /^(sub\s)/ && $leading_type eq 'i' ) { - $want_blank = ( $rOpts->{'blanks-before-subs'} ) - && ( - terminal_type( \@types_to_go, \@block_type_to_go, $imin, - $imax ) !~ /^[\;\}]$/ - ); - } - - # break before all package declarations - # MCONVERSION LOCATION - for tokenizaton change - elsif ( $leading_token =~ /^(package\s)/ && $leading_type eq 'i' ) { - $want_blank = ( $rOpts->{'blanks-before-subs'} ); - } - - # break before certain key blocks except one-liners - if ( $leading_token =~ /^(BEGIN|END)$/ && $leading_type eq 'k' ) { - $want_blank = ( $rOpts->{'blanks-before-subs'} ) - && ( - terminal_type( \@types_to_go, \@block_type_to_go, $imin, - $imax ) ne '}' - ); - } - - # Break before certain block types if we haven't had a break at this - # level for a while. This is the difficult decision.. - elsif ($leading_token =~ /^(unless|if|while|until|for|foreach)$/ - && $leading_type eq 'k' ) - { - my $lc = $nonblank_lines_at_depth[$last_line_leading_level]; - if ( !defined($lc) ) { $lc = 0 } - - $want_blank = $rOpts->{'blanks-before-blocks'} - && $lc >= $rOpts->{'long-block-line-count'} - && $file_writer_object->get_consecutive_nonblank_lines() >= - $rOpts->{'long-block-line-count'} - && ( - terminal_type( \@types_to_go, \@block_type_to_go, $imin, - $imax ) ne '}' - ); - } - - if ($want_blank) { - - # future: send blank line down normal path to VerticalAligner - Perl::Tidy::VerticalAligner::flush(); - $file_writer_object->write_blank_code_line(); - } - } - - # update blank line variables and count number of consecutive - # non-blank, non-comment lines at this level - $last_last_line_leading_level = $last_line_leading_level; - $last_line_leading_level = $levels_to_go[$imin]; - if ( $last_line_leading_level < 0 ) { $last_line_leading_level = 0 } - $last_line_leading_type = $types_to_go[$imin]; - if ( $last_line_leading_level == $last_last_line_leading_level - && $last_line_leading_type ne 'b' - && $last_line_leading_type ne '#' - && defined( $nonblank_lines_at_depth[$last_line_leading_level] ) ) - { - $nonblank_lines_at_depth[$last_line_leading_level]++; - } - else { - $nonblank_lines_at_depth[$last_line_leading_level] = 1; - } - - FORMATTER_DEBUG_FLAG_FLUSH && do { - my ( $package, $file, $line ) = caller; - print -"FLUSH: flushing from $package $file $line, types= $types_to_go[$imin] to $types_to_go[$imax]\n"; - }; - - # add a couple of extra terminal blank tokens - pad_array_to_go(); - - # set all forced breakpoints for good list formatting - my $saw_good_break = 0; - my $is_long_line = excess_line_length( $imin, $max_index_to_go ) > 0; - - if ( - $max_index_to_go > 0 - && ( - $is_long_line - || $old_line_count_in_batch > 1 - || is_unbalanced_batch() - || ( - $comma_count_in_batch - && ( $rOpts_maximum_fields_per_table > 0 - || $rOpts_comma_arrow_breakpoints == 0 ) - ) - ) - ) - { - $saw_good_break = scan_list(); - } - - # let $ri_first and $ri_last be references to lists of - # first and last tokens of line fragments to output.. - my ( $ri_first, $ri_last ); - - # write a single line if.. - if ( - - # we aren't allowed to add any newlines - !$rOpts_add_newlines - - # or, we don't already have an interior breakpoint - # and we didn't see a good breakpoint - || ( - !$forced_breakpoint_count - && !$saw_good_break - - # and this line is 'short' - && !$is_long_line - ) - ) - { - @$ri_first = ($imin); - @$ri_last = ($imax); - } - - # otherwise use multiple lines - else { - - ( $ri_first, $ri_last ) = set_continuation_breaks($saw_good_break); - - # now we do a correction step to clean this up a bit - # (The only time we would not do this is for debugging) - if ( $rOpts->{'recombine'} ) { - ( $ri_first, $ri_last ) = - recombine_breakpoints( $ri_first, $ri_last ); - } - } - - # do corrector step if -lp option is used - my $do_not_pad = 0; - if ($rOpts_line_up_parentheses) { - $do_not_pad = correct_lp_indentation( $ri_first, $ri_last ); - } - send_lines_to_vertical_aligner( $ri_first, $ri_last, $do_not_pad ); - } - prepare_for_new_input_lines(); - - # output any new -cscw block comment - if ($cscw_block_comment) { - flush(); - $file_writer_object->write_code_line( $cscw_block_comment . "\n" ); - } -} - -sub reset_block_text_accumulator { - - # save text after 'if' and 'elsif' to append after 'else' - if ($accumulating_text_for_block) { + # save text after 'if' and 'elsif' to append after 'else' + if ($accumulating_text_for_block) { if ( $accumulating_text_for_block =~ /^(if|elsif)$/ ) { push @{$rleading_block_if_elsif_text}, $leading_block_text; @@ -10130,8 +10270,8 @@ sub set_block_text_accumulator { if ( $accumulating_text_for_block !~ /^els/ ) { $rleading_block_if_elsif_text = []; } - $leading_block_text = ""; - $leading_block_text_level = $levels_to_go[$i]; + $leading_block_text = ""; + $leading_block_text_level = $levels_to_go[$i]; $leading_block_text_line_number = $vertical_aligner_object->get_output_line_number(); $leading_block_text_length_exceeded = 0; @@ -10280,7 +10420,8 @@ sub accumulate_block_text { { my $output_line_number = $vertical_aligner_object->get_output_line_number(); - $block_line_count = $output_line_number - + $block_line_count = + $output_line_number - $block_opening_line_number{$type_sequence} + 1; delete $block_opening_line_number{$type_sequence}; } @@ -10425,7 +10566,8 @@ sub make_else_csc_text { # undo it if line length exceeded my $length = - length($csc_text) + length($block_type) + + length($csc_text) + + length($block_type) + length( $rOpts->{'closing-side-comment-prefix'} ) + $levels_to_go[$i_terminal] * $rOpts_indent_columns + 3; if ( $length > $rOpts_maximum_line_length ) { @@ -10473,6 +10615,13 @@ sub add_closing_side_comment { && $block_type_to_go[$i_terminal] =~ /$closing_side_comment_list_pattern/o + # .. but not an anonymous sub + # These are not normally of interest, and their closing braces are + # often followed by commas or semicolons anyway. This also avoids + # possible erratic output due to line numbering inconsistencies + # in the cases where their closing braces terminate a line. + && $block_type_to_go[$i_terminal] ne 'sub' + # ..and the corresponding opening brace must is not in this batch # (because we do not need to tag one-line blocks, although this # should also be caught with a positive -csci value) @@ -10589,9 +10738,9 @@ sub add_closing_side_comment { else { # insert the new side comment into the output token stream - my $type = '#'; - my $block_type = ''; - my $type_sequence = ''; + my $type = '#'; + my $block_type = ''; + my $type_sequence = ''; my $container_environment = $container_environment_to_go[$max_index_to_go]; my $level = $levels_to_go[$max_index_to_go]; @@ -10635,6 +10784,9 @@ sub send_lines_to_vertical_aligner { my $rindentation_list = [0]; # ref to indentations for each line + # define the array @matching_token_to_go for the output tokens + # which will be non-blank for each special token (such as =>) + # for which alignment is required. set_vertical_alignment_markers( $ri_first, $ri_last ); # flush if necessary to avoid unwanted alignment @@ -10751,7 +10903,7 @@ sub send_lines_to_vertical_aligner { # Mark most things before arrows as a quote to # get them to line up. Testfile: mixed.pl. if ( ( $i < $iend - 1 ) && ( $type =~ /^[wnC]$/ ) ) { - my $next_type = $types_to_go[ $i + 1 ]; + my $next_type = $types_to_go[ $i + 1 ]; my $i_next_nonblank = ( ( $next_type eq 'b' ) ? $i + 2 : $i + 1 ); @@ -10782,8 +10934,8 @@ sub send_lines_to_vertical_aligner { # done with this line .. join text of tokens to make the last field push( @fields, join( '', @tokens_to_go[ $i_start .. $iend ] ) ); - my ( $indentation, $lev, $level_end, $is_semicolon_terminated, - $is_outdented_line ) + my ( $indentation, $lev, $level_end, $terminal_type, + $is_semicolon_terminated, $is_outdented_line ) = set_adjusted_indentation( $ibeg, $iend, \@fields, \@patterns, $ri_first, $ri_last, $rindentation_list ); @@ -10813,6 +10965,17 @@ sub send_lines_to_vertical_aligner { # flush an outdented line to avoid any unwanted vertical alignment Perl::Tidy::VerticalAligner::flush() if ($is_outdented_line); + my $is_terminal_ternary = 0; + if ( $tokens_to_go[$ibeg] eq ':' + || $n > 0 && $tokens_to_go[ $$ri_last[ $n - 1 ] ] eq ':' ) + { + if ( ( $terminal_type eq ';' && $level_end <= $lev ) + || ( $level_end < $lev ) ) + { + $is_terminal_ternary = 1; + } + } + # send this new line down the pipe my $forced_breakpoint = $forced_breakpoint_to_go[$iend]; Perl::Tidy::VerticalAligner::append_line( @@ -10824,6 +10987,7 @@ sub send_lines_to_vertical_aligner { \@patterns, $forced_breakpoint_to_go[$iend] || $in_comma_list, $outdent_long_lines, + $is_terminal_ternary, $is_semicolon_terminated, $do_not_pad, $rvertical_tightness_flags, @@ -10980,6 +11144,13 @@ sub get_opening_indentation { if ( $saved_opening_indentation{$seqno} ) { ( $indent, $offset ) = @{ $saved_opening_indentation{$seqno} }; } + + # some kind of serious error + # (example is badfile.t) + else { + $indent = 0; + $offset = 0; + } } # if no sequence number it must be an unbalanced container @@ -11140,9 +11311,9 @@ sub lookup_opening_indentation { if ( $types_to_go[$i_terminal] =~ /^[\}\]\)R]$/ && $i_terminal == $ibeg ) { - my $ci = $ci_levels_to_go[$ibeg]; - my $lev = $levels_to_go[$ibeg]; - my $next_type = $types_to_go[ $ibeg + 1 ]; + my $ci = $ci_levels_to_go[$ibeg]; + my $lev = $levels_to_go[$ibeg]; + my $next_type = $types_to_go[ $ibeg + 1 ]; my $i_next_nonblank = ( ( $next_type eq 'b' ) ? $ibeg + 2 : $ibeg + 1 ); if ( $i_next_nonblank <= $max_index_to_go @@ -11429,8 +11600,8 @@ sub lookup_opening_indentation { } } - return ( $indentation, $lev, $level_end, $is_semicolon_terminated, - $is_outdented_line ); + return ( $indentation, $lev, $level_end, $terminal_type, + $is_semicolon_terminated, $is_outdented_line ); } } @@ -11454,7 +11625,7 @@ sub set_vertical_tightness_flags { # These flags are used by sub set_leading_whitespace in # the vertical aligner - my $rvertical_tightness_flags; + my $rvertical_tightness_flags = [ 0, 0, 0, 0, 0, 0 ]; # For non-BLOCK tokens, we will need to examine the next line # too, so we won't consider the last line. @@ -11579,7 +11750,7 @@ sub set_vertical_tightness_flags { # this is a line with just an opening token && ( $iend_next == $ibeg_next - || $iend_next == $ibeg_next + 1 + || $iend_next == $ibeg_next + 2 && $types_to_go[$iend_next] eq '#' ) # looks bad if we align vertically with the wrong container @@ -11599,7 +11770,7 @@ sub set_vertical_tightness_flags { # patch to make something like 'qw(' behave like an opening paren # (aran.t) if ( $types_to_go[$ibeg_next] eq 'q' ) { - if ( $token_beg_next =~ /^q.([\[\(\{])$/ ) { + if ( $token_beg_next =~ /^qw\s*([\[\(\{])$/ ) { $token_beg_next = $1; } } @@ -11637,7 +11808,7 @@ sub set_vertical_tightness_flags { if ( $is_semicolon_terminated || ( $iend_next == $ibeg_next - || $iend_next == $ibeg_next + 1 + || $iend_next == $ibeg_next + 2 && $types_to_go[$iend_next] eq '#' ) ) { @@ -11661,9 +11832,34 @@ sub set_vertical_tightness_flags { ( 3, $rOpts_block_brace_vertical_tightness, 0, 1 ); } + # pack in the sequence numbers of the ends of this line + $rvertical_tightness_flags->[4] = get_seqno($ibeg); + $rvertical_tightness_flags->[5] = get_seqno($iend); return $rvertical_tightness_flags; } +sub get_seqno { + + # get opening and closing sequence numbers of a token for the vertical + # aligner. Assign qw quotes a value to allow qw opening and closing tokens + # to be treated somewhat like opening and closing tokens for stacking + # tokens by the vertical aligner. + my ($ii) = @_; + my $seqno = $type_sequence_to_go[$ii]; + if ( $types_to_go[$ii] eq 'q' ) { + my $SEQ_QW = -1; + if ( $ii > 0 ) { + $seqno = $SEQ_QW if ( $tokens_to_go[$ii] =~ /^qw\s*[\(\{\[]/ ); + } + else { + if ( !$ending_in_quote ) { + $seqno = $SEQ_QW if ( $tokens_to_go[$ii] =~ /[\)\}\]]$/ ); + } + } + } + return ($seqno); +} + { my %is_vertical_alignment_type; my %is_vertical_alignment_keyword; @@ -11672,7 +11868,7 @@ sub set_vertical_tightness_flags { @_ = qw# = **= += *= &= <<= &&= -= /= |= >>= ||= //= .= %= ^= x= - { ? : => =~ && || // + { ? : => =~ && || // ~~ !~~ #; @is_vertical_alignment_type{@_} = (1) x scalar(@_); @@ -11682,8 +11878,12 @@ sub set_vertical_tightness_flags { sub set_vertical_alignment_markers { - # Look at the tokens in this output batch and define the array - # 'matching_token_to_go' which marks tokens at which we would + # This routine takes the first step toward vertical alignment of the + # lines of output text. It looks for certain tokens which can serve as + # vertical alignment markers (such as an '='). + # + # Method: We look at each token $i in this output batch and set + # $matching_token_to_go[$i] equal to those tokens at which we would # accept vertical alignment. # nothing to do if we aren't allowed to change whitespace @@ -11696,6 +11896,14 @@ sub set_vertical_tightness_flags { my ( $ri_first, $ri_last ) = @_; + # remember the index of last nonblank token before any sidecomment + my $i_terminal = $max_index_to_go; + if ( $types_to_go[$i_terminal] eq '#' ) { + if ( $i_terminal > 0 && $types_to_go[ --$i_terminal ] eq 'b' ) { + if ( $i_terminal > 0 ) { --$i_terminal } + } + } + # look at each line of this batch.. my $last_vertical_alignment_before_index; my $vert_last_nonblank_type; @@ -11704,6 +11912,7 @@ sub set_vertical_tightness_flags { my $max_line = @$ri_first - 1; my ( $i, $type, $token, $block_type, $alignment_type ); my ( $ibeg, $iend, $line ); + foreach $line ( 0 .. $max_line ) { $ibeg = $$ri_first[$line]; $iend = $$ri_last[$line]; @@ -11735,12 +11944,10 @@ sub set_vertical_tightness_flags { # align before the first token and 2) the second # token must be a blank if we are to align before # the third - if ( $i < $ibeg + 2 ) { - } + if ( $i < $ibeg + 2 ) { } # must follow a blank token - elsif ( $types_to_go[ $i - 1 ] ne 'b' ) { - } + elsif ( $types_to_go[ $i - 1 ] ne 'b' ) { } # align a side comment -- elsif ( $type eq '#' ) { @@ -11765,8 +11972,7 @@ sub set_vertical_tightness_flags { # otherwise, do not align two in a row to create a # blank field - elsif ( $last_vertical_alignment_before_index == $i - 2 ) { - } + elsif ( $last_vertical_alignment_before_index == $i - 2 ) { } # align before one of these keywords # (within a line, since $i>1) @@ -11783,6 +11989,34 @@ sub set_vertical_tightness_flags { elsif ( $is_vertical_alignment_type{$type} ) { $alignment_type = $token; + # Do not align a terminal token. Although it might + # occasionally look ok to do this, it has been found to be + # a good general rule. The main problems are: + # (1) that the terminal token (such as an = or :) might get + # moved far to the right where it is hard to see because + # nothing follows it, and + # (2) doing so may prevent other good alignments. + if ( $i == $iend || $i >= $i_terminal ) { + $alignment_type = ""; + } + + # Do not align leading ': (' or '. ('. This would prevent + # alignment in something like the following: + # $extra_space .= + # ( $input_line_number < 10 ) ? " " + # : ( $input_line_number < 100 ) ? " " + # : ""; + # or + # $code = + # ( $case_matters ? $accessor : " lc($accessor) " ) + # . ( $yesno ? " eq " : " ne " ) + if ( $i == $ibeg + 2 + && $types_to_go[$ibeg] =~ /^[\.\:]$/ + && $types_to_go[ $i - 1 ] eq 'b' ) + { + $alignment_type = ""; + } + # For a paren after keyword, only align something like this: # if ( $a ) { &a } # elsif ( $b ) { &b } @@ -11797,12 +12031,10 @@ sub set_vertical_tightness_flags { # if ($token ne $type) {$alignment_type .= $type} } - # NOTE: This is deactivated until the new vertical aligner - # is finished because it causes the previous if/elsif alignment - # to fail - #elsif ( $type eq '}' && $token eq '}' && $block_type_to_go[$i]) { - # $alignment_type = $type; - #} + # NOTE: This is deactivated because it causes the previous + # if/elsif alignment to fail + #elsif ( $type eq '}' && $token eq '}' && $block_type_to_go[$i]) + #{ $alignment_type = $type; } if ($alignment_type) { $last_vertical_alignment_before_index = $i; @@ -11954,20 +12186,31 @@ sub terminal_type { $left_bond_strength{'->'} = STRONG; $right_bond_strength{'->'} = VERY_STRONG; - # breaking AFTER these is just ok: - @_ = qw" % + - * / x "; + # breaking AFTER modulus operator is ok: + @_ = qw" % "; + @left_bond_strength{@_} = (STRONG) x scalar(@_); + @right_bond_strength{@_} = + ( 0.1 * NOMINAL + 0.9 * STRONG ) x scalar(@_); + + # Break AFTER math operators * and / + @_ = qw" * / x "; @left_bond_strength{@_} = (STRONG) x scalar(@_); @right_bond_strength{@_} = (NOMINAL) x scalar(@_); + # Break AFTER weakest math operators + and - + # Make them weaker than * but a bit stronger than '.' + @_ = qw" + - "; + @left_bond_strength{@_} = (STRONG) x scalar(@_); + @right_bond_strength{@_} = + ( 0.91 * NOMINAL + 0.09 * WEAK ) x scalar(@_); + # breaking BEFORE these is just ok: @_ = qw" >> << "; @right_bond_strength{@_} = (STRONG) x scalar(@_); @left_bond_strength{@_} = (NOMINAL) x scalar(@_); - # I prefer breaking before the string concatenation operator + # breaking before the string concatenation operator seems best # because it can be hard to see at the end of a line - # swap these to break after a '.' - # this could be a future option $right_bond_strength{'.'} = STRONG; $left_bond_strength{'.'} = 0.9 * NOMINAL + 0.1 * WEAK; @@ -11977,14 +12220,14 @@ sub terminal_type { # make these a little weaker than nominal so that they get # favored for end-of-line characters - @_ = qw"!= == =~ !~"; - @left_bond_strength{@_} = (STRONG) x scalar(@_); + @_ = qw"!= == =~ !~ ~~ !~~"; + @left_bond_strength{@_} = (STRONG) x scalar(@_); @right_bond_strength{@_} = ( 0.9 * NOMINAL + 0.1 * WEAK ) x scalar(@_); # break AFTER these - @_ = qw" < > | & >= <="; - @left_bond_strength{@_} = (VERY_STRONG) x scalar(@_); + @_ = qw" < > | & >= <="; + @left_bond_strength{@_} = (VERY_STRONG) x scalar(@_); @right_bond_strength{@_} = ( 0.8 * NOMINAL + 0.2 * WEAK ) x scalar(@_); @@ -12005,14 +12248,14 @@ sub terminal_type { $left_bond_strength{'G'} = NOMINAL; $right_bond_strength{'G'} = STRONG; - # it is very good to break AFTER various assignment operators + # it is good to break AFTER various assignment operators @_ = qw( = **= += *= &= <<= &&= -= /= |= >>= ||= //= .= %= ^= x= ); - @left_bond_strength{@_} = (STRONG) x scalar(@_); + @left_bond_strength{@_} = (STRONG) x scalar(@_); @right_bond_strength{@_} = ( 0.4 * WEAK + 0.6 * VERY_WEAK ) x scalar(@_); @@ -12654,10 +12897,10 @@ sub pad_array_to_go { # to simplify coding in scan_list and set_bond_strengths, it helps # to create some extra blank tokens at the end of the arrays - $tokens_to_go[ $max_index_to_go + 1 ] = ''; - $tokens_to_go[ $max_index_to_go + 2 ] = ''; - $types_to_go[ $max_index_to_go + 1 ] = 'b'; - $types_to_go[ $max_index_to_go + 2 ] = 'b'; + $tokens_to_go[ $max_index_to_go + 1 ] = ''; + $tokens_to_go[ $max_index_to_go + 2 ] = ''; + $types_to_go[ $max_index_to_go + 1 ] = 'b'; + $types_to_go[ $max_index_to_go + 2 ] = 'b'; $nesting_depth_to_go[ $max_index_to_go + 1 ] = $nesting_depth_to_go[$max_index_to_go]; @@ -12855,6 +13098,7 @@ sub pad_array_to_go { $last_colon_sequence_number = -1; $last_nonblank_token = ';'; $last_nonblank_type = ';'; + $last_nonblank_block_type = ' '; $last_old_breakpoint_count = 0; $minimum_depth = $current_depth + 1; # forces update in check below $old_breakpoint_count = 0; @@ -12875,9 +13119,10 @@ sub pad_array_to_go { # loop over all tokens in this batch while ( ++$i <= $max_index_to_go ) { if ( $type ne 'b' ) { - $i_last_nonblank_token = $i - 1; - $last_nonblank_type = $type; - $last_nonblank_token = $token; + $i_last_nonblank_token = $i - 1; + $last_nonblank_type = $type; + $last_nonblank_token = $token; + $last_nonblank_block_type = $block_type; } $type = $types_to_go[$i]; $block_type = $block_type_to_go[$i]; @@ -12947,9 +13192,20 @@ sub pad_array_to_go { # Note that such breakpoints will be undone later if these tokens # are fully contained within parens on a line. if ( - $type eq 'k' + + # break before a keyword within a line + $type eq 'k' && $i > 0 - && $token =~ /^(if|unless)$/ + + # if one of these keywords: + && $token =~ /^(if|unless|while|until|for)$/ + + # but do not break at something like '1 while' + && ( $last_nonblank_type ne 'n' || $i > 2 ) + + # and let keywords follow a closing 'do' brace + && $last_nonblank_block_type ne 'do' + && ( $is_long_line @@ -13028,7 +13284,7 @@ sub pad_array_to_go { # TESTING: retain break at a ':' line break if ( ( $i == $i_line_start || $i == $i_line_end ) - && $rOpts_break_at_old_trinary_breakpoints ) + && $rOpts_break_at_old_ternary_breakpoints ) { # TESTING: @@ -13105,7 +13361,7 @@ sub pad_array_to_go { $rfor_semicolon_list[$depth] = []; $i_equals[$depth] = -1; $want_comma_break[$depth] = 0; - $container_type[$depth] = + $container_type[$depth] = ( $last_nonblank_type =~ /^(k|=>|&&|\|\||\?|\:|\.)$/ ) ? $last_nonblank_token : ""; @@ -13412,6 +13668,11 @@ sub pad_array_to_go { if ( $rOpts_line_up_parentheses && $saw_opening_structure ) { my $item = $leading_spaces_to_go[ $i_opening + 1 ]; + if ( $i_opening + 1 < $max_index_to_go + && $types_to_go[ $i_opening + 1 ] eq 'b' ) + { + $item = $leading_spaces_to_go[ $i_opening + 2 ]; + } if ( defined($item) ) { my $i_start_2 = $item->get_STARTING_INDEX(); if ( @@ -13576,13 +13837,26 @@ sub pad_array_to_go { # break before the previous token if it looks safe # Example of something that we will not try to break before: # DBI::SQL_SMALLINT() => $ado_consts->{adSmallInt}, + # Also we don't want to break at a binary operator (like +): + # $c->createOval( + # $x + $R, $y + + # $R => $x - $R, + # $y - $R, -fill => 'black', + # ); my $ibreak = $index_before_arrow[$depth] - 1; if ( $ibreak > 0 && $tokens_to_go[ $ibreak + 1 ] !~ /^[\)\}\]]$/ ) { if ( $tokens_to_go[$ibreak] eq '-' ) { $ibreak-- } - if ( $types_to_go[$ibreak] =~ /^[,b\(\{\[]$/ ) { - set_forced_breakpoint($ibreak); + if ( $types_to_go[$ibreak] eq 'b' ) { $ibreak-- } + if ( $types_to_go[$ibreak] =~ /^[,wiZCUG\(\{\[]$/ ) { + + # don't break pointer calls, such as the following: + # File::Spec->curdir => 1, + # (This is tokenized as adjacent 'w' tokens) + if ( $tokens_to_go[ $ibreak + 1 ] !~ /^->/ ) { + set_forced_breakpoint($ibreak); + } } } @@ -13904,7 +14178,7 @@ sub find_token_starting_list { # Looks like a list of items. We have to look at it and size it up. #--------------------------------------------------------------- - my $opening_token = $tokens_to_go[$i_opening_paren]; + my $opening_token = $tokens_to_go[$i_opening_paren]; my $opening_environment = $container_environment_to_go[$i_opening_paren]; @@ -14014,7 +14288,7 @@ sub find_token_starting_list { # Field width parameters my $pair_width = ( $max_length[0] + $max_length[1] ); - my $max_width = + my $max_width = ( $max_length[0] > $max_length[1] ) ? $max_length[0] : $max_length[1]; # Number of free columns across the page width for laying out tables @@ -14157,8 +14431,8 @@ sub find_token_starting_list { # ) # if $style eq 'all'; - my $i_last_comma = $$rcomma_index[ $comma_count - 1 ]; - my $long_last_term = excess_line_length( 0, $i_last_comma ) <= 0; + my $i_last_comma = $$rcomma_index[ $comma_count - 1 ]; + my $long_last_term = excess_line_length( 0, $i_last_comma ) <= 0; my $long_first_term = excess_line_length( $i_first_comma + 1, $max_index_to_go ) <= 0; @@ -14222,8 +14496,8 @@ sub find_token_starting_list { if ( $number_of_fields > 1 ) { $formatted_columns = - ( $pair_width * ( int( $item_count / 2 ) ) + ( $item_count % 2 ) * - $max_width ); + ( $pair_width * ( int( $item_count / 2 ) ) + + ( $item_count % 2 ) * $max_width ); } else { $formatted_columns = $max_width * $item_count; @@ -14238,10 +14512,10 @@ sub find_token_starting_list { # align; high sparsity does not look good, especially with few lines my $sparsity = ($unused_columns) / ($formatted_columns); my $max_allowed_sparsity = - ( $item_count < 3 ) ? 0.1 + ( $item_count < 3 ) ? 0.1 : ( $packed_lines == 1 ) ? 0.15 : ( $packed_lines == 2 ) ? 0.4 - : 0.7; + : 0.7; # Begin check for shortcut methods, which avoid treating a list # as a table for relatively small parenthesized lists. These @@ -14281,8 +14555,7 @@ sub find_token_starting_list { ) { - my $break_count = - set_ragged_breakpoints( \@i_term_comma, + my $break_count = set_ragged_breakpoints( \@i_term_comma, $ri_ragged_break_list ); ++$break_count if ($use_separate_first_term); @@ -14333,8 +14606,7 @@ sub find_token_starting_list { # imprecise, but not too bad. (steve.t) if ( !$too_long && $i_opening_paren > 0 && $opening_token eq '(' ) { - $too_long = - excess_line_length( $i_opening_minus, + $too_long = excess_line_length( $i_opening_minus, $i_effective_last_comma + 1 ) > 0; } @@ -14344,8 +14616,7 @@ sub find_token_starting_list { if ( !$too_long && $i_opening_paren > 0 && $list_type eq '=>' ) { my $i_opening_minus = $i_opening_paren - 4; if ( $i_opening_minus >= 0 ) { - $too_long = - excess_line_length( $i_opening_minus, + $too_long = excess_line_length( $i_opening_minus, $i_effective_last_comma + 1 ) > 0; } } @@ -14388,8 +14659,7 @@ sub find_token_starting_list { # let the continuation logic handle it if 2 lines else { - my $break_count = - set_ragged_breakpoints( \@i_term_comma, + my $break_count = set_ragged_breakpoints( \@i_term_comma, $ri_ragged_break_list ); ++$break_count if ($use_separate_first_term); @@ -14617,7 +14887,7 @@ sub get_maximum_fields_wanted { sub table_columns_available { my $i_first_comma = shift; - my $columns = + my $columns = $rOpts_maximum_line_length - leading_spaces_to_go($i_first_comma); # Patch: the vertical formatter does not line up lines whose lengths @@ -14732,7 +15002,7 @@ sub set_forced_breakpoint { # if we break before or after it my $token = $tokens_to_go[$i]; - if ( $token =~ /^([\.\,\:\?]|and|or|xor|&&|\|\|)$/ ) { + if ( $token =~ /^([\=\.\,\:\?]|and|or|xor|&&|\|\|)$/ ) { if ( $want_break_before{$token} && $i >= 0 ) { $i-- } } @@ -15003,37 +15273,88 @@ sub recombine_breakpoints { # if '=' at end of line ... elsif ( $is_assignment{ $types_to_go[$imid] } ) { - # otherwise always ok to join isolated '=' - unless ( $if == $imid ) { - - my $is_math = ( - ( $types_to_go[$il] =~ /^[+-\/\*\)]$/ ) - - # note no '$' in pattern because -> can - # start long identifier - && !grep { $_ =~ /^(->|=>|[\,])/ } - @types_to_go[ $imidr .. $il ] - ); - - # retain the break after the '=' unless ... + my $is_short_quote = + ( $types_to_go[$imidr] eq 'Q' + && $imidr == $il + && length( $tokens_to_go[$imidr] ) < + $rOpts_short_concatenation_item_length ); + my $ifnmax = $$ri_first[$nmax]; + my $ifnp = ( $nmax > $n ) ? $$ri_first[ $n + 1 ] : $ifnmax; + my $is_qk = + ( $types_to_go[$if] eq '?' && $types_to_go[$ifnp] eq ':' ); + + # always join an isolated '=', a short quote, or if this + # will put ?/: at start of adjacent lines + if ( $if != $imid + && !$is_short_quote + && !$is_qk ) + { next unless ( + ( - # '=' is followed by a number and looks like math - ( $types_to_go[$imidr] eq 'n' && $is_math ) + # unless we can reduce this to two lines + $nmax < $n + 2 - # or followed by a scalar and looks like math - || ( ( $types_to_go[$imidr] eq 'i' ) - && ( $tokens_to_go[$imidr] =~ /^\$/ ) - && $is_math ) + # or three lines, the last with a leading semicolon + || ( $nmax == $n + 2 + && $types_to_go[$ifnmax] eq ';' ) - # or followed by a single "short" token - # ('12' is arbitrary) - || ( $il == $imidr - && token_sequence_length( $imidr, $imidr ) < 12 ) + # or the next line ends with a here doc + || $types_to_go[$il] eq 'h' + ) + # do not recombine if the two lines might align well + # this is a very approximate test for this + && $types_to_go[$imidr] ne $types_to_go[$ifnp] ); + + # -lp users often prefer this: + # my $title = function($env, $env, $sysarea, + # "bubba Borrower Entry"); + # so we will recombine if -lp is used we have ending comma + if ( !$rOpts_line_up_parentheses + || $types_to_go[$il] ne ',' ) + { + + # otherwise, scan the rhs line up to last token for + # complexity. Note that we are not counting the last + # token in case it is an opening paren. + my $tv = 0; + my $depth = $nesting_depth_to_go[$imidr]; + for ( my $i = $imidr + 1 ; $i < $il ; $i++ ) { + if ( $nesting_depth_to_go[$i] != $depth ) { + $tv++; + last if ( $tv > 1 ); + } + $depth = $nesting_depth_to_go[$i]; + } + + # ok to recombine if no level changes before last token + if ( $tv > 0 ) { + + # otherwise, do not recombine if more than two + # level changes. + next if ( $tv > 1 ); + + # check total complexity of the two adjacent lines + # that will occur if we do this join + my $istop = + ( $n < $nmax ) ? $$ri_last[ $n + 1 ] : $il; + for ( my $i = $il ; $i <= $istop ; $i++ ) { + if ( $nesting_depth_to_go[$i] != $depth ) { + $tv++; + last if ( $tv > 2 ); + } + $depth = $nesting_depth_to_go[$i]; + } + + # do not recombine if total is more than 2 level changes + next if ( $tv > 2 ); + } + } } + unless ( $tokens_to_go[$imidr] =~ /^[\{\(\[]$/ ) { $forced_breakpoint_to_go[$imid] = 0; } @@ -15049,6 +15370,9 @@ sub recombine_breakpoints { #/^(last|next|redo|return)$/ $is_last_next_redo_return{ $tokens_to_go[$imid] } + + # but only if followed by multiple lines + && $n < $nmax ); if ( $is_and_or{ $tokens_to_go[$imid] } ) { @@ -15056,13 +15380,32 @@ sub recombine_breakpoints { } } - #---------------------------------------------------------- - # Section 2: Now examine token at $imidr (left end of second - # line of pair) - #---------------------------------------------------------- + # handle trailing + - * / + elsif ( $types_to_go[$imid] =~ /^[\+\-\*\/]$/ ) { + my $i_next_nonblank = $imidr; + my $i_next_next = $i_next_nonblank + 1; + $i_next_next++ if ( $types_to_go[$i_next_next] eq 'b' ); - # join lines identified above as capable of - # causing an outdented line with leading closing paren + # do not strand numbers + next + unless ( + $types_to_go[$i_next_nonblank] eq 'n' + && ( + $i_next_nonblank == $il + || ( $i_next_next == $il + && $types_to_go[$i_next_next] =~ /^[\+\-\*\/]$/ ) + || $types_to_go[$i_next_next] eq ';' + ) + ); + } + + #---------------------------------------------------------- + # Section 2: Now examine token at $imidr (left end of second + # line of pair) + #---------------------------------------------------------- + + # join lines identified above as capable of + # causing an outdented line with leading closing paren if ($previous_outdentable_closing_paren) { $forced_breakpoint_to_go[$imid] = 0; } @@ -15086,7 +15429,7 @@ sub recombine_breakpoints { my $ifff = $n + 2 <= $nmax ? $$ri_first[ $n + 2 ] : -1; my $imm = $n > 1 ? $$ri_first[ $n - 2 ] : -1; my $seqno = $type_sequence_to_go[$imidr]; - my $f_ok = + my $f_ok = ( $types_to_go[$if] eq ':' && $type_sequence_to_go[$if] == $seqno - TYPE_SEQUENCE_INCREMENT ); @@ -15190,9 +15533,6 @@ sub recombine_breakpoints { ) ); - - # override breakpoint - ##$forced_breakpoint_to_go[$imid] = 0; } # handle leading "if" and "unless" @@ -15208,10 +15548,6 @@ sub recombine_breakpoints { && $is_and_or{ $tokens_to_go[$if] } ); - - # override breakpoint - ##$forced_breakpoint_to_go[$imid] = 0; - } # handle all other leading keywords @@ -15244,9 +15580,54 @@ sub recombine_breakpoints { && $is_if_unless{ $tokens_to_go[$if] } ); + } + + # handle leading + - * / + elsif ( $types_to_go[$imidr] =~ /^[\+\-\*\/]$/ ) { + my $i_next_nonblank = $imidr + 1; + if ( $types_to_go[$i_next_nonblank] eq 'b' ) { + $i_next_nonblank++; + } + + my $i_next_next = $i_next_nonblank + 1; + $i_next_next++ if ( $types_to_go[$i_next_next] eq 'b' ); + + next + unless ( + + # unless there is just one and we can reduce + # this to two lines if we do. For example, this + ( + $n == 2 + && $n == $nmax + && $types_to_go[$if] ne $types_to_go[$imidr] + ) + + # do not strand numbers + || ( + $types_to_go[$i_next_nonblank] eq 'n' + && ( $i_next_nonblank >= $il - 1 + || $types_to_go[$i_next_next] eq ';' ) + ) + ); + } - # override breakpoint - ##$forced_breakpoint_to_go[$imid] = 0; + # handle line with leading = or similar + elsif ( $is_assignment{ $types_to_go[$imidr] } ) { + next unless $n == 1; + my $ifnmax = $$ri_first[$nmax]; + next + unless ( + + # unless we can reduce this to two lines + $nmax == 2 + + # or three lines, the last with a leading semicolon + || ( $nmax == 3 && $types_to_go[$ifnmax] eq ';' ) + + # or the next line ends with a here doc + || $types_to_go[$il] eq 'h' + ); } #---------------------------------------------------------- @@ -15311,12 +15692,169 @@ sub recombine_breakpoints { return ( $ri_first, $ri_last ); } +sub break_all_chain_tokens { + + # scan the current breakpoints looking for breaks at certain "chain + # operators" (. : && || + etc) which often occur repeatedly in a long + # statement. If we see a break at any one, break at all similar tokens + # within the same container. + # + # TODO: + # does not handle nested ?: operators correctly + # coordinate better with ?: logic in set_continuation_breaks + my ( $ri_left, $ri_right ) = @_; + + my %saw_chain_type; + my %left_chain_type; + my %right_chain_type; + my %interior_chain_type; + my $nmax = @$ri_right - 1; + + # scan the left and right end tokens of all lines + my $count = 0; + for my $n ( 0 .. $nmax ) { + my $il = $$ri_left[$n]; + my $ir = $$ri_right[$n]; + my $typel = $types_to_go[$il]; + my $typer = $types_to_go[$ir]; + $typel = '+' if ( $typel eq '-' ); # treat + and - the same + $typer = '+' if ( $typer eq '-' ); + $typel = '*' if ( $typel eq '/' ); # treat * and / the same + $typer = '*' if ( $typer eq '/' ); + my $tokenl = $tokens_to_go[$il]; + my $tokenr = $tokens_to_go[$ir]; + + if ( $is_chain_operator{$tokenl} && $want_break_before{$typel} ) { + next if ( $typel eq '?' ); + push @{ $left_chain_type{$typel} }, $il; + $saw_chain_type{$typel} = 1; + $count++; + } + if ( $is_chain_operator{$tokenr} && !$want_break_before{$typer} ) { + next if ( $typer eq '?' ); + push @{ $right_chain_type{$typer} }, $ir; + $saw_chain_type{$typer} = 1; + $count++; + } + } + return unless $count; + + # now look for any interior tokens of the same types + $count = 0; + for my $n ( 0 .. $nmax ) { + my $il = $$ri_left[$n]; + my $ir = $$ri_right[$n]; + for ( my $i = $il + 1 ; $i < $ir ; $i++ ) { + my $type = $types_to_go[$i]; + $type = '+' if ( $type eq '-' ); + $type = '*' if ( $type eq '/' ); + if ( $saw_chain_type{$type} ) { + push @{ $interior_chain_type{$type} }, $i; + $count++; + } + } + } + return unless $count; + + # now make a list of all new break points + my @insert_list; + + # loop over all chain types + foreach my $type ( keys %saw_chain_type ) { + + # quit if just ONE continuation line with leading . For example-- + # print LATEXFILE '\framebox{\parbox[c][' . $h . '][t]{' . $w . '}{' + # . $contents; + last if ( $nmax == 1 && $type =~ /^[\.\+]$/ ); + + # loop over all interior chain tokens + foreach my $itest ( @{ $interior_chain_type{$type} } ) { + + # loop over all left end tokens of same type + if ( $left_chain_type{$type} ) { + next if $nobreak_to_go[ $itest - 1 ]; + foreach my $i ( @{ $left_chain_type{$type} } ) { + next unless in_same_container( $i, $itest ); + push @insert_list, $itest - 1; + last; + } + } + + # loop over all right end tokens of same type + if ( $right_chain_type{$type} ) { + next if $nobreak_to_go[$itest]; + foreach my $i ( @{ $right_chain_type{$type} } ) { + next unless in_same_container( $i, $itest ); + push @insert_list, $itest; + last; + } + } + } + } + + # insert any new break points + if (@insert_list) { + insert_additional_breaks( \@insert_list, $ri_left, $ri_right ); + } +} + +sub in_same_container { + + # check to see if tokens at i1 and i2 are in the + # same container, and not separated by a comma, ? or : + my ( $i1, $i2 ) = @_; + my $type = $types_to_go[$i1]; + my $depth = $nesting_depth_to_go[$i1]; + return unless ( $nesting_depth_to_go[$i2] == $depth ); + if ( $i2 < $i1 ) { ( $i1, $i2 ) = ( $i2, $i1 ) } + for ( my $i = $i1 + 1 ; $i < $i2 ; $i++ ) { + next if ( $nesting_depth_to_go[$i] > $depth ); + return if ( $nesting_depth_to_go[$i] < $depth ); + + my $tok = $tokens_to_go[$i]; + $tok = ',' if $tok eq '=>'; # treat => same as , + + # Example: we would not want to break at any of these .'s + # : "$str" + if ( $type ne ':' ) { + return if ( $tok =~ /^[\,\:\?]$/ ) || $tok eq '||' || $tok eq 'or'; + } + else { + return if ( $tok =~ /^[\,]$/ ); + } + } + return 1; +} + sub set_continuation_breaks { # Define an array of indexes for inserting newline characters to # keep the line lengths below the maximum desired length. There is # an implied break after the last token, so it need not be included. - # We'll break at points where the bond strength is lowest. + + # Method: + # This routine is part of series of routines which adjust line + # lengths. It is only called if a statement is longer than the + # maximum line length, or if a preliminary scanning located + # desirable break points. Sub scan_list has already looked at + # these tokens and set breakpoints (in array + # $forced_breakpoint_to_go[$i]) where it wants breaks (for example + # after commas, after opening parens, and before closing parens). + # This routine will honor these breakpoints and also add additional + # breakpoints as necessary to keep the line length below the maximum + # requested. It bases its decision on where the 'bond strength' is + # lowest. + + # Output: returns references to the arrays: + # @i_first + # @i_last + # which contain the indexes $i of the first and last tokens on each + # line. + + # In addition, the array: + # $forced_breakpoint_to_go[$i] + # may be updated to be =1 for any index $i after which there must be + # a break. This signals later routines not to undo the breakpoint. my $saw_good_break = shift; my @i_first = (); # the first index to output @@ -15330,7 +15868,7 @@ sub set_continuation_breaks { my $imax = $max_index_to_go; if ( $types_to_go[$imin] eq 'b' ) { $imin++ } if ( $types_to_go[$imax] eq 'b' ) { $imax-- } - my $i_begin = $imin; + my $i_begin = $imin; # index for starting next iteration my $leading_spaces = leading_spaces_to_go($imin); my $line_count = 0; @@ -15353,6 +15891,10 @@ sub set_continuation_breaks { # This is a sufficient but not necessary condition for colon chain my $is_colon_chain = ( $colons_in_order && @colon_list > 2 ); + #------------------------------------------------------- + # BEGINNING of main loop to set continuation breakpoints + # Keep iterating until we reach the end + #------------------------------------------------------- while ( $i_begin <= $imax ) { my $lowest_strength = NO_BREAK; my $starting_sum = $lengths_to_go[$i_begin]; @@ -15362,12 +15904,14 @@ sub set_continuation_breaks { my $lowest_next_type = 'b'; my $i_lowest_next_nonblank = -1; - # loop to find next break point + #------------------------------------------------------- + # BEGINNING of inner loop to find the best next breakpoint + #------------------------------------------------------- for ( $i_test = $i_begin ; $i_test <= $imax ; $i_test++ ) { - my $type = $types_to_go[$i_test]; - my $token = $tokens_to_go[$i_test]; - my $next_type = $types_to_go[ $i_test + 1 ]; - my $next_token = $tokens_to_go[ $i_test + 1 ]; + my $type = $types_to_go[$i_test]; + my $token = $tokens_to_go[$i_test]; + my $next_type = $types_to_go[ $i_test + 1 ]; + my $next_token = $tokens_to_go[ $i_test + 1 ]; my $i_next_nonblank = ( ( $next_type eq 'b' ) ? $i_test + 2 : $i_test + 1 ); my $next_nonblank_type = $types_to_go[$i_next_nonblank]; @@ -15409,7 +15953,6 @@ sub set_continuation_breaks { # There is an implied forced break at a terminal opening brace || ( ( $type eq '{' ) && ( $i_test == $imax ) ) - ) { @@ -15429,8 +15972,9 @@ sub set_continuation_breaks { && ( $next_nonblank_type =~ /^[\;\,]$/ ) && ( ( - $leading_spaces + $lengths_to_go[ $i_next_nonblank + 1 ] - - $starting_sum + $leading_spaces + + $lengths_to_go[ $i_next_nonblank + 1 ] - + $starting_sum ) > $rOpts_maximum_line_length ) ) @@ -15448,7 +15992,8 @@ sub set_continuation_breaks { && ( $token eq $type ) && ( ( - $leading_spaces + $lengths_to_go[ $i_test + 1 ] - + $leading_spaces + + $lengths_to_go[ $i_test + 1 ] - $starting_sum ) <= $rOpts_maximum_line_length ) @@ -15538,7 +16083,8 @@ sub set_continuation_breaks { ? 1 : ( ( - $leading_spaces + $lengths_to_go[ $i_test + 2 ] - + $leading_spaces + + $lengths_to_go[ $i_test + 2 ] - $starting_sum ) > $rOpts_maximum_line_length ); @@ -15568,6 +16114,11 @@ sub set_continuation_breaks { ); } + #------------------------------------------------------- + # END of inner loop to find the best next breakpoint + # Now decide exactly where to put the breakpoint + #------------------------------------------------------- + # it's always ok to break at imax if no other break was found if ( $i_lowest < 0 ) { $i_lowest = $imax } @@ -15609,6 +16160,11 @@ sub set_continuation_breaks { last; } + #------------------------------------------------------- + # END of inner loop to find the best next breakpoint: + # Break the line after the token with index i=$i_lowest + #------------------------------------------------------- + # final index calculation $i_next_nonblank = ( ( $types_to_go[ $i_lowest + 1 ] eq 'b' ) @@ -15685,6 +16241,11 @@ sub set_continuation_breaks { } } + #------------------------------------------------------- + # END of main loop to set continuation breakpoints + # Now go back and make any necessary corrections + #------------------------------------------------------- + #------------------------------------------------------- # ?/: rule 4 -- if we broke at a ':', then break at # corresponding '?' unless this is a chain of ?: expressions @@ -15732,7 +16293,7 @@ sub insert_additional_breaks { my $i_l; my $line_number = 0; my $i_break_left; - foreach $i_break_left ( sort @$ri_break_list ) { + foreach $i_break_left ( sort { $a <=> $b } @$ri_break_list ) { $i_f = $$ri_first[$line_number]; $i_l = $$ri_last[$line_number]; @@ -15903,7 +16464,7 @@ sub permanently_decrease_AVAILABLE_SPACES { my ( $item, $spaces_needed ) = @_; my $available_spaces = $item->get_AVAILABLE_SPACES(); - my $deleted_spaces = + my $deleted_spaces = ( $available_spaces > $spaces_needed ) ? $spaces_needed : $available_spaces; @@ -15922,7 +16483,7 @@ sub tentatively_decrease_AVAILABLE_SPACES { # caller. my ( $item, $spaces_needed ) = @_; my $available_spaces = $item->get_AVAILABLE_SPACES(); - my $deleted_spaces = + my $deleted_spaces = ( $available_spaces > $spaces_needed ) ? $spaces_needed : $available_spaces; @@ -16373,6 +16934,7 @@ BEGIN { use constant VALIGN_DEBUG_FLAG_APPEND => 0; use constant VALIGN_DEBUG_FLAG_APPEND0 => 0; + use constant VALIGN_DEBUG_FLAG_TERNARY => 0; my $debug_warning = sub { print "VALIGN_DEBUGGING with key $_[0]\n"; @@ -16414,6 +16976,7 @@ use vars qw( $file_writer_object @side_comment_history $comment_leading_space_count + $is_matching_terminal_line $cached_line_text $cached_line_type @@ -16421,6 +16984,10 @@ use vars qw( $cached_seqno $cached_line_valid $cached_line_leading_space_count + $cached_seqno_string + + $seqno_string + $last_nonblank_seqno_string $rOpts @@ -16429,6 +16996,7 @@ use vars qw( $rOpts_indent_columns $rOpts_tabs $rOpts_entab_leading_whitespace + $rOpts_valign $rOpts_minimum_space_to_comment @@ -16442,7 +17010,6 @@ sub initialize { = @_; # variables describing the entire space group: - $ralignment_list = []; $group_level = 0; $last_group_level_written = -1; @@ -16461,6 +17028,7 @@ sub initialize { $last_outdented_line_at = 0; $last_side_comment_line_number = 0; $last_side_comment_level = -1; + $is_matching_terminal_line = 0; # most recent 3 side comments; [ line number, column ] $side_comment_history[0] = [ -300, 0 ]; @@ -16474,6 +17042,11 @@ sub initialize { $cached_seqno = 0; $cached_line_valid = 0; $cached_line_leading_space_count = 0; + $cached_seqno_string = ""; + + # string of sequence numbers joined together + $seqno_string = ""; + $last_nonblank_seqno_string = ""; # frequently used parameters $rOpts_indent_columns = $rOpts->{'indent-columns'}; @@ -16481,6 +17054,7 @@ sub initialize { $rOpts_entab_leading_whitespace = $rOpts->{'entab-leading-whitespace'}; $rOpts_minimum_space_to_comment = $rOpts->{'minimum-space-to-comment'}; $rOpts_maximum_line_length = $rOpts->{'maximum-line-length'}; + $rOpts_valign = $rOpts->{'valign'}; forget_side_comment(); @@ -16654,19 +17228,18 @@ sub append_line { # The log file warns the user if there are any such tabs. my ( - $level, $level_end, - $indentation, $rfields, - $rtokens, $rpatterns, - $is_forced_break, $outdent_long_lines, - $is_terminal_statement, $do_not_pad, - $rvertical_tightness_flags, $level_jump, + $level, $level_end, + $indentation, $rfields, + $rtokens, $rpatterns, + $is_forced_break, $outdent_long_lines, + $is_terminal_ternary, $is_terminal_statement, + $do_not_pad, $rvertical_tightness_flags, + $level_jump, ) = @_; # number of fields is $jmax # number of tokens between fields is $jmax-1 my $jmax = $#{$rfields}; - $previous_minimum_jmax_seen = $minimum_jmax_seen; - $previous_maximum_jmax_seen = $maximum_jmax_seen; my $leading_space_count = get_SPACES($indentation); @@ -16692,6 +17265,8 @@ sub append_line { if ($rvertical_tightness_flags) { if ( $maximum_line_index <= 0 && $cached_line_type + && $cached_seqno + && $rvertical_tightness_flags->[2] && $rvertical_tightness_flags->[2] == $cached_seqno ) { $rvertical_tightness_flags->[3] ||= 1; @@ -16716,7 +17291,8 @@ sub append_line { if ( $level < 0 ) { $level = 0 } # do not align code across indentation level changes - if ( $level != $group_level || $is_outdented ) { + # or if vertical alignment is turned off for debugging + if ( $level != $group_level || $is_outdented || !$rOpts_valign ) { # we are allowed to shift a group of lines to the right if its # level is greater than the previous and next group @@ -16763,6 +17339,27 @@ sub append_line { } } + # -------------------------------------------------------------------- + # add dummy fields for terminal ternary + # -------------------------------------------------------------------- + my $j_terminal_match; + if ( $is_terminal_ternary && $current_line ) { + $j_terminal_match = + fix_terminal_ternary( $rfields, $rtokens, $rpatterns ); + $jmax = @{$rfields} - 1; + } + + # -------------------------------------------------------------------- + # add dummy fields for else statement + # -------------------------------------------------------------------- + if ( $rfields->[0] =~ /^else\s*$/ + && $current_line + && $level_jump == 0 ) + { + $j_terminal_match = fix_terminal_else( $rfields, $rtokens, $rpatterns ); + $jmax = @{$rfields} - 1; + } + # -------------------------------------------------------------------- # Step 1. Handle simple line of code with no fields to match. # -------------------------------------------------------------------- @@ -16842,6 +17439,19 @@ sub append_line { rvertical_tightness_flags => $rvertical_tightness_flags, ); + # Initialize a global flag saying if the last line of the group should + # match end of group and also terminate the group. There should be no + # returns between here and where the flag is handled at the bottom. + my $col_matching_terminal = 0; + if ( defined($j_terminal_match) ) { + + # remember the column of the terminal ? or { to match with + $col_matching_terminal = $current_line->get_column($j_terminal_match); + + # set global flag for sub decide_if_aligned + $is_matching_terminal_line = 1; + } + # -------------------------------------------------------------------- # It simplifies things to create a zero length side comment # if none exists. @@ -16911,6 +17521,26 @@ sub append_line { # Future update to allow this to vary: $current_line = $new_line if ( $maximum_line_index == 0 ); + # output this group if it ends in a terminal else or ternary line + if ( defined($j_terminal_match) ) { + + # if there is only one line in the group (maybe due to failure to match + # perfectly with previous lines), then align the ? or { of this + # terminal line with the previous one unless that would make the line + # too long + if ( $maximum_line_index == 0 ) { + my $col_now = $current_line->get_column($j_terminal_match); + my $pad = $col_matching_terminal - $col_now; + my $padding_available = + $current_line->get_available_space_on_right(); + if ( $pad > 0 && $pad <= $padding_available ) { + $current_line->increase_field_width( $j_terminal_match, $pad ); + } + } + my_flush(); + $is_matching_terminal_line = 0; + } + # -------------------------------------------------------------------- # Step 8. Some old debugging stuff # -------------------------------------------------------------------- @@ -16923,6 +17553,8 @@ sub append_line { dump_array(@$rpatterns); dump_alignments(); }; + + return; } sub join_hanging_comment { @@ -17135,12 +17767,11 @@ sub decide_if_list { sub eliminate_new_fields { return unless ( $maximum_line_index >= 0 ); - my $new_line = shift; - my $old_line = shift; - my $jmax = $new_line->get_jmax(); + my ( $new_line, $old_line ) = @_; + my $jmax = $new_line->get_jmax(); - my $old_rtokens = $old_line->get_rtokens(); - my $rtokens = $new_line->get_rtokens(); + my $old_rtokens = $old_line->get_rtokens(); + my $rtokens = $new_line->get_rtokens(); my $is_assignment = ( $rtokens->[0] =~ /^=\d*$/ && ( $old_rtokens->[0] eq $rtokens->[0] ) ); @@ -17166,7 +17797,7 @@ sub eliminate_new_fields { my $rpatterns = $new_line->get_rpatterns(); my $old_rpatterns = $old_line->get_rpatterns(); - # loop over all old tokens except comment + # loop over all OLD tokens except comment and check match my $match = 1; my $k; for ( $k = 0 ; $k < $maximum_field_index - 1 ; $k++ ) { @@ -17178,7 +17809,7 @@ sub eliminate_new_fields { } } - # first tokens agree, so combine new tokens + # first tokens agree, so combine extra new tokens if ($match) { for $k ( $maximum_field_index .. $jmax - 1 ) { @@ -17196,11 +17827,233 @@ sub eliminate_new_fields { $new_line->set_jmax($jmax); } +sub fix_terminal_ternary { + + # Add empty fields as necessary to align a ternary term + # like this: + # + # my $leapyear = + # $year % 4 ? 0 + # : $year % 100 ? 1 + # : $year % 400 ? 0 + # : 1; + # + # returns 1 if the terminal item should be indented + + my ( $rfields, $rtokens, $rpatterns ) = @_; + + my $jmax = @{$rfields} - 1; + my $old_line = $group_lines[$maximum_line_index]; + my $rfields_old = $old_line->get_rfields(); + + my $rpatterns_old = $old_line->get_rpatterns(); + my $rtokens_old = $old_line->get_rtokens(); + my $maximum_field_index = $old_line->get_jmax(); + + # look for the question mark after the : + my ($jquestion); + my $depth_question; + my $pad = ""; + for ( my $j = 0 ; $j < $maximum_field_index ; $j++ ) { + my $tok = $rtokens_old->[$j]; + if ( $tok =~ /^\?(\d+)$/ ) { + $depth_question = $1; + + # depth must be correct + next unless ( $depth_question eq $group_level ); + + $jquestion = $j; + if ( $rfields_old->[ $j + 1 ] =~ /^(\?\s*)/ ) { + $pad = " " x length($1); + } + else { + return; # shouldn't happen + } + last; + } + } + return unless ( defined($jquestion) ); # shouldn't happen + + # Now splice the tokens and patterns of the previous line + # into the else line to insure a match. Add empty fields + # as necessary. + my $jadd = $jquestion; + + # Work on copies of the actual arrays in case we have + # to return due to an error + my @fields = @{$rfields}; + my @patterns = @{$rpatterns}; + my @tokens = @{$rtokens}; + + VALIGN_DEBUG_FLAG_TERNARY && do { + local $" = '><'; + print "CURRENT FIELDS=<@{$rfields_old}>\n"; + print "CURRENT TOKENS=<@{$rtokens_old}>\n"; + print "CURRENT PATTERNS=<@{$rpatterns_old}>\n"; + print "UNMODIFIED FIELDS=<@{$rfields}>\n"; + print "UNMODIFIED TOKENS=<@{$rtokens}>\n"; + print "UNMODIFIED PATTERNS=<@{$rpatterns}>\n"; + }; + + # handle cases of leading colon on this line + if ( $fields[0] =~ /^(:\s*)(.*)$/ ) { + + my ( $colon, $therest ) = ( $1, $2 ); + + # Handle sub-case of first field with leading colon plus additional code + # This is the usual situation as at the '1' below: + # ... + # : $year % 400 ? 0 + # : 1; + if ($therest) { + + # Split the first field after the leading colon and insert padding. + # Note that this padding will remain even if the terminal value goes + # out on a separate line. This does not seem to look to bad, so no + # mechanism has been included to undo it. + my $field1 = shift @fields; + unshift @fields, ( $colon, $pad . $therest ); + + # change the leading pattern from : to ? + return unless ( $patterns[0] =~ s/^\:/?/ ); + + # install leading tokens and patterns of existing line + unshift( @tokens, @{$rtokens_old}[ 0 .. $jquestion ] ); + unshift( @patterns, @{$rpatterns_old}[ 0 .. $jquestion ] ); + + # insert appropriate number of empty fields + splice( @fields, 1, 0, ('') x $jadd ) if $jadd; + } + + # handle sub-case of first field just equal to leading colon. + # This can happen for example in the example below where + # the leading '(' would create a new alignment token + # : ( $name =~ /[]}]$/ ) ? ( $mname = $name ) + # : ( $mname = $name . '->' ); + else { + + return unless ( $jmax > 0 && $tokens[0] ne '#' ); # shouldn't happen + + # prepend a leading ? onto the second pattern + $patterns[1] = "?b" . $patterns[1]; + + # pad the second field + $fields[1] = $pad . $fields[1]; + + # install leading tokens and patterns of existing line, replacing + # leading token and inserting appropriate number of empty fields + splice( @tokens, 0, 1, @{$rtokens_old}[ 0 .. $jquestion ] ); + splice( @patterns, 1, 0, @{$rpatterns_old}[ 1 .. $jquestion ] ); + splice( @fields, 1, 0, ('') x $jadd ) if $jadd; + } + } + + # Handle case of no leading colon on this line. This will + # be the case when -wba=':' is used. For example, + # $year % 400 ? 0 : + # 1; + else { + + # install leading tokens and patterns of existing line + $patterns[0] = '?' . 'b' . $patterns[0]; + unshift( @tokens, @{$rtokens_old}[ 0 .. $jquestion ] ); + unshift( @patterns, @{$rpatterns_old}[ 0 .. $jquestion ] ); + + # insert appropriate number of empty fields + $jadd = $jquestion + 1; + $fields[0] = $pad . $fields[0]; + splice( @fields, 0, 0, ('') x $jadd ) if $jadd; + } + + VALIGN_DEBUG_FLAG_TERNARY && do { + local $" = '><'; + print "MODIFIED TOKENS=<@tokens>\n"; + print "MODIFIED PATTERNS=<@patterns>\n"; + print "MODIFIED FIELDS=<@fields>\n"; + }; + + # all ok .. update the arrays + @{$rfields} = @fields; + @{$rtokens} = @tokens; + @{$rpatterns} = @patterns; + + # force a flush after this line + return $jquestion; +} + +sub fix_terminal_else { + + # Add empty fields as necessary to align a balanced terminal + # else block to a previous if/elsif/unless block, + # like this: + # + # if ( 1 || $x ) { print "ok 13\n"; } + # else { print "not ok 13\n"; } + # + # returns 1 if the else block should be indented + # + my ( $rfields, $rtokens, $rpatterns ) = @_; + my $jmax = @{$rfields} - 1; + return unless ( $jmax > 0 ); + + # check for balanced else block following if/elsif/unless + my $rfields_old = $current_line->get_rfields(); + + # TBD: add handling for 'case' + return unless ( $rfields_old->[0] =~ /^(if|elsif|unless)\s*$/ ); + + # look for the opening brace after the else, and extrace the depth + my $tok_brace = $rtokens->[0]; + my $depth_brace; + if ( $tok_brace =~ /^\{(\d+)$/ ) { $depth_brace = $1; } + + # probably: "else # side_comment" + else { return } + + my $rpatterns_old = $current_line->get_rpatterns(); + my $rtokens_old = $current_line->get_rtokens(); + my $maximum_field_index = $current_line->get_jmax(); + + # be sure the previous if/elsif is followed by an opening paren + my $jparen = 0; + my $tok_paren = '(' . $depth_brace; + my $tok_test = $rtokens_old->[$jparen]; + return unless ( $tok_test eq $tok_paren ); # shouldn't happen + + # Now find the opening block brace + my ($jbrace); + for ( my $j = 1 ; $j < $maximum_field_index ; $j++ ) { + my $tok = $rtokens_old->[$j]; + if ( $tok eq $tok_brace ) { + $jbrace = $j; + last; + } + } + return unless ( defined($jbrace) ); # shouldn't happen + + # Now splice the tokens and patterns of the previous line + # into the else line to insure a match. Add empty fields + # as necessary. + my $jadd = $jbrace - $jparen; + splice( @{$rtokens}, 0, 0, @{$rtokens_old}[ $jparen .. $jbrace - 1 ] ); + splice( @{$rpatterns}, 1, 0, @{$rpatterns_old}[ $jparen + 1 .. $jbrace ] ); + splice( @{$rfields}, 1, 0, ('') x $jadd ); + + # force a flush after this line if it does not follow a case + return $jbrace + unless ( $rfields_old->[0] =~ /^case\s*$/ ); +} + sub check_match { my $new_line = shift; my $old_line = shift; + # uses global variables: + # $previous_minimum_jmax_seen + # $maximum_jmax_seen + # $maximum_line_index + # $marginal_match my $jmax = $new_line->get_jmax(); my $maximum_field_index = $old_line->get_jmax(); @@ -17253,12 +18106,16 @@ sub check_match { my $old_tok = $$old_rtokens[$j]; my $new_tok = $$rtokens[$j]; - # dumb down the match after an equals + # Dumb down the match AFTER an equals and + # also dumb down after seeing a ? ternary operator ... + # Everything after a + is the token which preceded the previous + # opening paren (container name). We won't require them to match. if ( $saw_equals && $new_tok =~ /(.*)\+/ ) { $new_tok = $1; $old_tok =~ s/\+.*$//; } - if ( $new_tok =~ /^=\d*$/ ) { $saw_equals = 1 } + + if ( $new_tok =~ /^[\?=]\d*$/ ) { $saw_equals = 1 } # we never match if the matching tokens differ if ( $j < $jlimit @@ -17405,14 +18262,6 @@ sub check_fit { my $maximum_field_index = $old_line->get_jmax(); for $j ( 0 .. $jmax ) { - ## testing patch to avoid excessive gaps in previous lines, - # due to a line of fewer fields. - # return join( ".", - # $self->{"dfi"}, $self->{"aa"}, $self->rsvd, $self->{"rd"}, - # $self->{"area"}, $self->{"id"}, $self->{"sel"} ); - ## MOVED BELOW AS A TEST - ##next if ($jmax < $maximum_field_index && $j==$jmax-1); - $pad = length( $$rfields[$j] ) - $old_line->current_field_width($j); if ( $j == 0 ) { @@ -17455,7 +18304,11 @@ sub check_fit { last; } - # TESTING PATCH moved from above to be sure we fit + # patch to avoid excessive gaps in previous lines, + # due to a line of fewer fields. + # return join( ".", + # $self->{"dfi"}, $self->{"aa"}, $self->rsvd, $self->{"rd"}, + # $self->{"area"}, $self->{"id"}, $self->{"sel"} ); next if ( $jmax < $maximum_field_index && $j == $jmax - 1 ); # looks ok, squeeze this field in @@ -17471,6 +18324,8 @@ sub check_fit { sub accept_line { + # The current line either starts a new alignment group or is + # accepted into the current alignment group. my $new_line = shift; $group_lines[ ++$maximum_line_index ] = $new_line; @@ -17503,6 +18358,10 @@ sub accept_line { $group_lines[ $maximum_line_index - 1 ]->get_alignments(); $new_line->set_alignments(@new_alignments); } + + # remember group jmax extremes for next call to append_line + $previous_minimum_jmax_seen = $minimum_jmax_seen; + $previous_maximum_jmax_seen = $maximum_jmax_seen; } sub dump_array { @@ -17520,11 +18379,13 @@ sub flush { if ( $maximum_line_index < 0 ) { if ($cached_line_type) { + $seqno_string = $cached_seqno_string; entab_and_output( $cached_line_text, $cached_line_leading_space_count, $last_group_level_written ); - $cached_line_type = 0; - $cached_line_text = ""; + $cached_line_type = 0; + $cached_line_text = ""; + $cached_seqno_string = ""; } } else { @@ -17552,7 +18413,7 @@ sub my_flush { # zero leading space count if any lines are too long my $max_excess = 0; for my $i ( 0 .. $maximum_line_index ) { - my $str = $group_lines[$i]; + my $str = $group_lines[$i]; my $excess = length($str) + $leading_space_count - $rOpts_maximum_line_length; if ( $excess > $max_excess ) { @@ -17604,8 +18465,7 @@ sub my_flush { my $group_leader_length = $group_lines[0]->get_leading_space_count(); # add extra leading spaces if helpful - my $min_ci_gap = - improve_continuation_indentation( $do_not_align, + my $min_ci_gap = improve_continuation_indentation( $do_not_align, $group_leader_length ); # loop to output all lines @@ -17622,6 +18482,7 @@ sub decide_if_aligned { # Do not try to align two lines which are not really similar return unless $maximum_line_index == 1; + return if ($is_matching_terminal_line); my $group_list_type = $group_lines[0]->get_list_type(); @@ -17639,6 +18500,8 @@ sub decide_if_aligned { || $group_maximum_gap > 12 # or lines with differing number of alignment tokens + # TODO: this could be improved. It occasionally rejects + # good matches. || $previous_maximum_jmax_seen != $previous_minimum_jmax_seen ) ); @@ -17801,7 +18664,9 @@ sub improve_continuation_indentation { my $leading_space_count = $line->get_leading_space_count(); my $rfields = $line->get_rfields(); - my $gap = $line->get_column(0) - $leading_space_count - + my $gap = + $line->get_column(0) - + $leading_space_count - length( $$rfields[0] ); if ( $leading_space_count > $group_leader_length ) { @@ -17874,6 +18739,9 @@ sub write_vertically_aligned_line { $total_pad_count = 0; $str .= $$rfields[$j]; } + else { + $total_pad_count = 0; + } # update side comment history buffer if ( $j == $maximum_field_index ) { @@ -17942,6 +18810,9 @@ sub get_extra_leading_spaces { sub combine_fields { # combine all fields except for the comment field ( sidecmt.t ) + # Uses global variables: + # @group_lines + # $maximum_line_index my ( $j, $k ); my $maximum_field_index = $group_lines[0]->get_jmax(); for ( $j = 0 ; $j <= $maximum_line_index ; $j++ ) { @@ -17991,10 +18862,12 @@ sub write_leader_and_string { # handle outdenting of long lines: if ($outdent_long_lines) { my $excess = - length($str) - $side_comment_length + $leading_space_count - + length($str) - + $side_comment_length + + $leading_space_count - $rOpts_maximum_line_length; if ( $excess > 0 ) { - $leading_space_count = 0; + $leading_space_count = 0; $last_outdented_line_at = $file_writer_object->get_output_line_number(); @@ -18020,12 +18893,17 @@ sub write_leader_and_string { # [2] sequence number of container # [3] valid flag: do not append if this flag is false # - my ( $open_or_close, $tightness_flag, $seqno, $valid ); + my ( $open_or_close, $tightness_flag, $seqno, $valid, $seqno_beg, + $seqno_end ); if ($rvertical_tightness_flags) { - ( $open_or_close, $tightness_flag, $seqno, $valid ) = - @{$rvertical_tightness_flags}; + ( + $open_or_close, $tightness_flag, $seqno, $valid, $seqno_beg, + $seqno_end + ) = @{$rvertical_tightness_flags}; } + $seqno_string = $seqno_end; + # handle any cached line .. # either append this line to it or write it out if ( length($cached_line_text) ) { @@ -18051,6 +18929,7 @@ sub write_leader_and_string { if ( $gap >= 0 ) { $leading_string = $cached_line_text . ' ' x $gap; $leading_space_count = $cached_line_leading_space_count; + $seqno_string = $cached_seqno_string . ':' . $seqno_beg; } else { entab_and_output( $cached_line_text, @@ -18064,6 +18943,87 @@ sub write_leader_and_string { my $test_line = $cached_line_text . ' ' x $cached_line_flag . $str; if ( length($test_line) <= $rOpts_maximum_line_length ) { + + $seqno_string = $cached_seqno_string . ':' . $seqno_beg; + + # Patch to outdent closing tokens ending # in ');' + # If we are joining a line like ');' to a previous stacked + # set of closing tokens, then decide if we may outdent the + # combined stack to the indentation of the ');'. Since we + # should not normally outdent any of the other tokens more than + # the indentation of the lines that contained them, we will + # only do this if all of the corresponding opening + # tokens were on the same line. This can happen with + # -sot and -sct. For example, it is ok here: + # __PACKAGE__->load_components( qw( + # PK::Auto + # Core + # )); + # + # But, for example, we do not outdent in this example because + # that would put the closing sub brace out farther than the + # opening sub brace: + # + # perltidy -sot -sct + # $c->Tk::bind( + # '' => sub { + # my ($c) = @_; + # my $e = $c->XEvent; + # itemsUnderArea $c; + # } ); + # + if ( $str =~ /^\);/ && $cached_line_text =~ /^[\)\}\]\s]*$/ ) { + + # The way to tell this is if the stacked sequence numbers + # of this output line are the reverse of the stacked + # sequence numbers of the previous non-blank line of + # sequence numbers. So we can join if the previous + # nonblank string of tokens is the mirror image. For + # example if stack )}] is 13:8:6 then we are looking for a + # leading stack like [{( which is 6:8:13 We only need to + # check the two ends, because the intermediate tokens must + # fall in order. Note on speed: having to split on colons + # and eliminate multiple colons might appear to be slow, + # but it's not an issue because we almost never come + # through here. In a typical file we don't. + $seqno_string =~ s/^:+//; + $last_nonblank_seqno_string =~ s/^:+//; + $seqno_string =~ s/:+/:/g; + $last_nonblank_seqno_string =~ s/:+/:/g; + + # how many spaces can we outdent? + my $diff = + $cached_line_leading_space_count - $leading_space_count; + if ( $diff > 0 + && length($seqno_string) + && length($last_nonblank_seqno_string) == + length($seqno_string) ) + { + my @seqno_last = + ( split ':', $last_nonblank_seqno_string ); + my @seqno_now = ( split ':', $seqno_string ); + if ( $seqno_now[-1] == $seqno_last[0] + && $seqno_now[0] == $seqno_last[-1] ) + { + + # OK to outdent .. + # for absolute safety, be sure we only remove + # whitespace + my $ws = substr( $test_line, 0, $diff ); + if ( ( length($ws) == $diff ) && $ws =~ /^\s+$/ ) { + + $test_line = substr( $test_line, $diff ); + $cached_line_leading_space_count -= $diff; + } + + # shouldn't happen, but not critical: + ##else { + ## ERROR transferring indentation here + ##} + } + } + } + $str = $test_line; $leading_string = ""; $leading_space_count = $cached_line_leading_space_count; @@ -18082,7 +19042,7 @@ sub write_leader_and_string { my $line = $leading_string . $str; # write or cache this line - if ( !$rvertical_tightness_flags || $side_comment_length > 0 ) { + if ( !$open_or_close || $side_comment_length > 0 ) { entab_and_output( $line, $leading_space_count, $group_level ); } else { @@ -18092,6 +19052,7 @@ sub write_leader_and_string { $cached_seqno = $seqno; $cached_line_valid = $valid; $cached_line_leading_space_count = $leading_space_count; + $cached_seqno_string = $seqno_string; } $last_group_level_written = $group_level; @@ -18138,7 +19099,7 @@ sub entab_and_output { # Handle option of one tab per level else { my $leading_string = ( "\t" x $level ); - my $space_count = + my $space_count = $leading_space_count - $level * $rOpts_indent_columns; # shouldn't happen: @@ -18166,6 +19127,9 @@ sub entab_and_output { } } $file_writer_object->write_code_line( $line . "\n" ); + if ($seqno_string) { + $last_nonblank_seqno_string = $seqno_string; + } } { # begin get_leading_string @@ -18202,8 +19166,7 @@ sub entab_and_output { elsif ($rOpts_entab_leading_whitespace) { my $space_count = $leading_whitespace_count % $rOpts_entab_leading_whitespace; - my $tab_count = - int( + my $tab_count = int( $leading_whitespace_count / $rOpts_entab_leading_whitespace ); $leading_string = "\t" x $tab_count . ' ' x $space_count; } @@ -18663,88 +19626,55 @@ BEGIN { } use Carp; + +# PACKAGE VARIABLES for for processing an entire FILE. use vars qw{ $tokenizer_self - $level_in_tokenizer - $slevel_in_tokenizer - $nesting_token_string - $nesting_type_string - $nesting_block_string - $nesting_block_flag - $nesting_list_string - $nesting_list_flag - $saw_negative_indentation - $id_scan_state + $last_nonblank_token $last_nonblank_type $last_nonblank_block_type - $last_nonblank_container_type - $last_nonblank_type_sequence - $last_last_nonblank_token - $last_last_nonblank_type - $last_last_nonblank_block_type - $last_last_nonblank_container_type - $last_last_nonblank_type_sequence - $last_nonblank_prototype $statement_type - $identifier $in_attribute_list - $in_quote - $quote_type - $quote_character - $quote_pos - $quote_depth - $allowed_quote_modifiers + $current_package + $context + + %is_constant + %is_user_function + %user_function_prototype + %is_block_function + %is_block_list_function + %saw_function_definition + + $brace_depth $paren_depth + $square_bracket_depth + + @current_depth + @nesting_sequence_number + @current_sequence_number @paren_type @paren_semicolon_count @paren_structural_type - $brace_depth @brace_type @brace_structural_type @brace_statement_type @brace_context @brace_package - $square_bracket_depth @square_bracket_type @square_bracket_structural_type @depth_array @starting_line_of_current_depth - @current_depth - @current_sequence_number - @nesting_sequence_number - @lower_case_labels_at - $saw_v_string - %is_constant - %is_user_function - %user_function_prototype - %saw_function_definition - $max_token_index - $peeked_ahead - $current_package - $unexpected_error_count - $input_line - $input_line_number - $rpretokens - $rpretoken_map - $rpretoken_type - $want_paren - $context - @slevel_stack - $ci_string_in_tokenizer - $continuation_string_in_tokenizer - $in_statement_continuation - $started_looking_for_here_target_at - $nearly_matched_here_target_at +}; +# GLOBAL CONSTANTS for routines in this package +use vars qw{ %is_indirect_object_taker %is_block_operator %expecting_operator_token %expecting_operator_types %expecting_term_types %expecting_term_token - %is_block_function - %is_block_list_function %is_digraph %is_file_test_operator %is_trigraph @@ -18791,17 +19721,18 @@ sub new { # Note: 'tabs' and 'indent_columns' are temporary and should be # removed asap my %defaults = ( - source_object => undef, - debugger_object => undef, - diagnostics_object => undef, - logger_object => undef, - starting_level => undef, - indent_columns => 4, - tabs => 0, - look_for_hash_bang => 0, - trim_qw => 1, - look_for_autoloader => 1, - look_for_selfloader => 1, + source_object => undef, + debugger_object => undef, + diagnostics_object => undef, + logger_object => undef, + starting_level => undef, + indent_columns => 4, + tabs => 0, + look_for_hash_bang => 0, + trim_qw => 1, + look_for_autoloader => 1, + look_for_selfloader => 1, + starting_line_number => 1, ); my %args = ( %defaults, @_ ); @@ -18831,45 +19762,53 @@ sub new { # _know_input_tabstr flag indicating if we know _input_tabstr # _line_buffer_object object with get_line() method to supply source code # _diagnostics_object place to write debugging information + # _unexpected_error_count error count used to limit output + # _lower_case_labels_at line numbers where lower case labels seen $tokenizer_self = { - _rhere_target_list => undef, - _in_here_doc => 0, - _here_doc_target => "", - _here_quote_character => "", - _in_data => 0, - _in_end => 0, - _in_format => 0, - _in_error => 0, - _in_pod => 0, - _in_attribute_list => 0, - _in_quote => 0, - _quote_target => "", - _line_start_quote => -1, - _starting_level => $args{starting_level}, - _know_starting_level => defined( $args{starting_level} ), - _tabs => $args{tabs}, - _indent_columns => $args{indent_columns}, - _look_for_hash_bang => $args{look_for_hash_bang}, - _trim_qw => $args{trim_qw}, - _input_tabstr => "", - _know_input_tabstr => -1, - _last_line_number => 0, - _saw_perl_dash_P => 0, - _saw_perl_dash_w => 0, - _saw_use_strict => 0, - _look_for_autoloader => $args{look_for_autoloader}, - _look_for_selfloader => $args{look_for_selfloader}, - _saw_autoloader => 0, - _saw_selfloader => 0, - _saw_hash_bang => 0, - _saw_end => 0, - _saw_data => 0, - _saw_lc_filehandle => 0, - _started_tokenizing => 0, - _line_buffer_object => $line_buffer_object, - _debugger_object => $args{debugger_object}, - _diagnostics_object => $args{diagnostics_object}, - _logger_object => $args{logger_object}, + _rhere_target_list => [], + _in_here_doc => 0, + _here_doc_target => "", + _here_quote_character => "", + _in_data => 0, + _in_end => 0, + _in_format => 0, + _in_error => 0, + _in_pod => 0, + _in_attribute_list => 0, + _in_quote => 0, + _quote_target => "", + _line_start_quote => -1, + _starting_level => $args{starting_level}, + _know_starting_level => defined( $args{starting_level} ), + _tabs => $args{tabs}, + _indent_columns => $args{indent_columns}, + _look_for_hash_bang => $args{look_for_hash_bang}, + _trim_qw => $args{trim_qw}, + _input_tabstr => "", + _know_input_tabstr => -1, + _last_line_number => $args{starting_line_number} - 1, + _saw_perl_dash_P => 0, + _saw_perl_dash_w => 0, + _saw_use_strict => 0, + _saw_v_string => 0, + _look_for_autoloader => $args{look_for_autoloader}, + _look_for_selfloader => $args{look_for_selfloader}, + _saw_autoloader => 0, + _saw_selfloader => 0, + _saw_hash_bang => 0, + _saw_end => 0, + _saw_data => 0, + _saw_negative_indentation => 0, + _started_tokenizing => 0, + _line_buffer_object => $line_buffer_object, + _debugger_object => $args{debugger_object}, + _diagnostics_object => $args{diagnostics_object}, + _logger_object => $args{logger_object}, + _unexpected_error_count => 0, + _started_looking_for_here_target_at => 0, + _nearly_matched_here_target_at => undef, + _line_text => "", + _rlower_case_labels_at => undef, }; prepare_for_a_new_file(); @@ -18985,38 +19924,6 @@ sub report_tokenization_errors { warning("hit EOF while in format description\n"); } - # this check may be removed after a year or so - if ( $tokenizer_self->{_saw_lc_filehandle} ) { - - warning( <<'EOM' ); ------------------------------------------------------------------------- -PLEASE NOTE: If you get this message, it is because perltidy noticed -possible ambiguous syntax at one or more places in your script, as -noted above. The problem is with statements accepting indirect objects, -such as print and printf statements of the form - - print bareword ( $etc - -Perltidy needs your help in deciding if 'bareword' is a filehandle or a -function call. The problem is the space between 'bareword' and '('. If -'bareword' is a function call, you should remove the trailing space. If -'bareword' is a filehandle, you should avoid the opening paren or else -globally capitalize 'bareword' to be BAREWORD. So the above line -would be: - - print bareword( $etc # function -or - print bareword @list # filehandle -or - print BAREWORD ( $etc # filehandle - -If you want to keep the line as it is, and are sure it is correct, -you can use -w=0 to prevent this message. ------------------------------------------------------------------------- -EOM - - } - if ( $tokenizer_self->{_in_pod} ) { # Just write log entry if this is after __END__ or __DATA__ @@ -19038,6 +19945,8 @@ EOM if ( $tokenizer_self->{_in_here_doc} ) { my $here_doc_target = $tokenizer_self->{_here_doc_target}; + my $started_looking_for_here_target_at = + $tokenizer_self->{_started_looking_for_here_target_at}; if ($here_doc_target) { warning( "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n" @@ -19048,6 +19957,8 @@ EOM "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n" ); } + my $nearly_matched_here_target_at = + $tokenizer_self->{_nearly_matched_here_target_at}; if ($nearly_matched_here_target_at) { warning( "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n" @@ -19058,7 +19969,7 @@ EOM if ( $tokenizer_self->{_in_quote} ) { my $line_start_quote = $tokenizer_self->{_line_start_quote}; my $quote_target = $tokenizer_self->{_quote_target}; - my $what = + my $what = ( $tokenizer_self->{_in_attribute_list} ) ? "attribute list" : "quote/pattern"; @@ -19086,8 +19997,9 @@ EOM # it is suggested that lables have at least one upper case character # for legibility and to avoid code breakage as new keywords are introduced - if (@lower_case_labels_at) { - my $num = @lower_case_labels_at; + if ( $tokenizer_self->{_rlower_case_labels_at} ) { + my @lower_case_labels_at = + @{ $tokenizer_self->{_rlower_case_labels_at} }; write_logfile_entry( "Suggest using upper case characters in label(s)\n"); local $" = ')('; @@ -19099,7 +20011,9 @@ sub report_v_string { # warn if this version can't handle v-strings my $tok = shift; - $saw_v_string = $input_line_number; + unless ( $tokenizer_self->{_saw_v_string} ) { + $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number}; + } if ( $] < 5.006 ) { warning( "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n" @@ -19116,11 +20030,15 @@ sub get_line { my $self = shift; + # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth, + # $square_bracket_depth, $paren_depth + my $input_line = $tokenizer_self->{_line_buffer_object}->get_line(); + $tokenizer_self->{_line_text} = $input_line; return undef unless ($input_line); - $tokenizer_self->{_last_line_number}++; + my $input_line_number = ++$tokenizer_self->{_last_line_number}; # Find and remove what characters terminate this line, including any # control r @@ -19135,8 +20053,7 @@ sub get_line { # for backwards compatability we keep the line text terminated with # a newline character $input_line .= "\n"; - - my $input_line_number = $tokenizer_self->{_last_line_number}; + $tokenizer_self->{_line_text} = $input_line; # update # create a data structure describing this line which will be # returned to the caller. @@ -19181,8 +20098,7 @@ sub get_line { _rci_levels => undef, _rnesting_blocks => undef, _python_indentation_level => -1, ## 0, - _starting_in_quote => - ( $tokenizer_self->{_in_quote} && ( $quote_type eq 'Q' ) ), + _starting_in_quote => 0, # to be set by subroutine _ending_in_quote => 0, _curly_brace_depth => $brace_depth, _square_bracket_depth => $square_bracket_depth, @@ -19199,21 +20115,22 @@ sub get_line { my $candidate_target = $input_line; chomp $candidate_target; if ( $candidate_target eq $here_doc_target ) { - $nearly_matched_here_target_at = undef; - $line_of_tokens->{_line_type} = 'HERE_END'; + $tokenizer_self->{_nearly_matched_here_target_at} = undef; + $line_of_tokens->{_line_type} = 'HERE_END'; write_logfile_entry("Exiting HERE document $here_doc_target\n"); my $rhere_target_list = $tokenizer_self->{_rhere_target_list}; if (@$rhere_target_list) { # there can be multiple here targets ( $here_doc_target, $here_quote_character ) = @{ shift @$rhere_target_list }; - $tokenizer_self->{_here_doc_target} = $here_doc_target; + $tokenizer_self->{_here_doc_target} = $here_doc_target; $tokenizer_self->{_here_quote_character} = $here_quote_character; write_logfile_entry( "Entering HERE document $here_doc_target\n"); - $nearly_matched_here_target_at = undef; - $started_looking_for_here_target_at = $input_line_number; + $tokenizer_self->{_nearly_matched_here_target_at} = undef; + $tokenizer_self->{_started_looking_for_here_target_at} = + $input_line_number; } else { $tokenizer_self->{_in_here_doc} = 0; @@ -19228,7 +20145,8 @@ sub get_line { $candidate_target =~ s/\s*$//; $candidate_target =~ s/^\s*//; if ( $candidate_target eq $here_doc_target ) { - $nearly_matched_here_target_at = $input_line_number; + $tokenizer_self->{_nearly_matched_here_target_at} = + $input_line_number; } } return $line_of_tokens; @@ -19438,14 +20356,14 @@ sub get_line { my $rhere_target_list = $tokenizer_self->{_rhere_target_list}; if (@$rhere_target_list) { - #my $here_doc_target = shift @$rhere_target_list; my ( $here_doc_target, $here_quote_character ) = @{ shift @$rhere_target_list }; $tokenizer_self->{_in_here_doc} = 1; $tokenizer_self->{_here_doc_target} = $here_doc_target; $tokenizer_self->{_here_quote_character} = $here_quote_character; write_logfile_entry("Entering HERE document $here_doc_target\n"); - $started_looking_for_here_target_at = $input_line_number; + $tokenizer_self->{_started_looking_for_here_target_at} = + $input_line_number; } # NOTE: __END__ and __DATA__ statements are written unformatted @@ -19505,9 +20423,11 @@ sub get_line { and ( $tokenizer_self->{_line_start_quote} < 0 ) ) { - if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) { + #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) { + if ( + ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ ) + { $tokenizer_self->{_line_start_quote} = $input_line_number; - $tokenizer_self->{_quote_target} = $quote_target; write_logfile_entry( "Start multi-line quote or pattern ending in $quote_target\n"); } @@ -19525,6 +20445,7 @@ sub get_line { sub find_starting_indentation_level { + # USES GLOBAL VARIABLES: $tokenizer_self my $starting_level = 0; my $know_input_tabstr = -1; # flag for find_indentation_level @@ -19595,6 +20516,8 @@ sub find_starting_indentation_level { sub find_indentation_level { my ( $line, $structural_indentation_level ) = @_; + + # USES GLOBAL VARIABLES: $tokenizer_self my $level = 0; my $msg = ""; @@ -19668,7 +20591,7 @@ sub find_indentation_level { } else { $columns = int $columns; - $msg = + $msg = "old indentation is unclear, using $columns $entabbed spaces\n"; } $input_tabstr = " " x $columns; @@ -19711,81 +20634,6 @@ sub find_indentation_level { return ( $level, $msg ); } -sub dump_token_types { - my $class = shift; - my $fh = shift; - - # This should be the latest list of token types in use - # adding NEW_TOKENS: add a comment here - print $fh <<'END_OF_LIST'; - -Here is a list of the token types currently used for lines of type 'CODE'. -For the following tokens, the "type" of a token is just the token itself. - -.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <> -( ) <= >= == =~ !~ != ++ -- /= x= -... **= <<= >>= &&= ||= //= <=> -, + - / * | % ! x ~ = \ ? : . < > ^ & - -The following additional token types are defined: - - type meaning - b blank (white space) - { indent: opening structural curly brace or square bracket or paren - (code block, anonymous hash reference, or anonymous array reference) - } outdent: right structural curly brace or square bracket or paren - [ left non-structural square bracket (enclosing an array index) - ] right non-structural square bracket - ( left non-structural paren (all but a list right of an =) - ) right non-structural parena - L left non-structural curly brace (enclosing a key) - R right non-structural curly brace - ; terminal semicolon - f indicates a semicolon in a "for" statement - h here_doc operator << - # a comment - Q indicates a quote or pattern - q indicates a qw quote block - k a perl keyword - C user-defined constant or constant function (with void prototype = ()) - U user-defined function taking parameters - G user-defined function taking block parameter (like grep/map/eval) - M (unused, but reserved for subroutine definition name) - P (unused, but -html uses it to label pod text) - t type indicater such as %,$,@,*,&,sub - w bare word (perhaps a subroutine call) - i identifier of some type (with leading %, $, @, *, &, sub, -> ) - n a number - v a v-string - F a file test operator (like -e) - Y File handle - Z identifier in indirect object slot: may be file handle, object - J LABEL: code block label - j LABEL after next, last, redo, goto - p unary + - m unary - - pp pre-increment operator ++ - mm pre-decrement operator -- - A : used as attribute separator - - Here are the '_line_type' codes used internally: - SYSTEM - system-specific code before hash-bang line - CODE - line of perl code (including comments) - POD_START - line starting pod, such as '=head' - POD - pod documentation text - POD_END - last line of pod section, '=cut' - HERE - text of here-document - HERE_END - last line of here-doc (target word) - FORMAT - format section - FORMAT_END - last line of format section, '.' - DATA_START - __DATA__ line - DATA - unidentified text following __DATA__ - END_START - __END__ line - END - unidentified text following __END__ - ERROR - we are in big trouble, probably not a perl script -END_OF_LIST -} - # This is a currently unused debug routine sub dump_functions { @@ -19816,143 +20664,400 @@ sub dump_functions { } } +sub ones_count { + + # count number of 1's in a string of 1's and 0's + # example: ones_count("010101010101") gives 6 + return ( my $cis = $_[0] ) =~ tr/1/0/; +} + sub prepare_for_a_new_file { - $saw_negative_indentation = 0; - $id_scan_state = ''; - $statement_type = ''; # '' or 'use' or 'sub..' or 'case..' + + # previous tokens needed to determine what to expect next $last_nonblank_token = ';'; # the only possible starting state which $last_nonblank_type = ';'; # will make a leading brace a code block $last_nonblank_block_type = ''; - $last_nonblank_container_type = ''; - $last_nonblank_type_sequence = ''; - $last_last_nonblank_token = ';'; - $last_last_nonblank_type = ';'; - $last_last_nonblank_block_type = ''; - $last_last_nonblank_container_type = ''; - $last_last_nonblank_type_sequence = ''; - $last_nonblank_prototype = ""; - $identifier = ''; - $in_attribute_list = 0; # ATTRS - $in_quote = 0; # flag telling if we are chasing a quote, and what kind - $quote_type = 'Q'; - $quote_character = ""; # character we seek if chasing a quote - $quote_pos = 0; # next character index to check for case of alphanum char - $quote_depth = 0; - $allowed_quote_modifiers = ""; - $paren_depth = 0; - $brace_depth = 0; - $square_bracket_depth = 0; - $current_package = "main"; + + # scalars for remembering statement types across multiple lines + $statement_type = ''; # '' or 'use' or 'sub..' or 'case..' + $in_attribute_list = 0; + + # scalars for remembering where we are in the file + $current_package = "main"; + $context = UNKNOWN_CONTEXT; + + # hashes used to remember function information + %is_constant = (); # user-defined constants + %is_user_function = (); # user-defined functions + %user_function_prototype = (); # their prototypes + %is_block_function = (); + %is_block_list_function = (); + %saw_function_definition = (); + + # variables used to track depths of various containers + # and report nesting errors + $paren_depth = 0; + $brace_depth = 0; + $square_bracket_depth = 0; @current_depth[ 0 .. $#closing_brace_names ] = (0) x scalar @closing_brace_names; @nesting_sequence_number[ 0 .. $#closing_brace_names ] = ( 0 .. $#closing_brace_names ); - @current_sequence_number = (); - + @current_sequence_number = (); $paren_type[$paren_depth] = ''; $paren_semicolon_count[$paren_depth] = 0; + $paren_structural_type[$brace_depth] = ''; $brace_type[$brace_depth] = ';'; # identify opening brace as code block $brace_structural_type[$brace_depth] = ''; $brace_statement_type[$brace_depth] = ""; $brace_context[$brace_depth] = UNKNOWN_CONTEXT; - $paren_structural_type[$brace_depth] = ''; + $brace_package[$paren_depth] = $current_package; $square_bracket_type[$square_bracket_depth] = ''; $square_bracket_structural_type[$square_bracket_depth] = ''; - $brace_package[$paren_depth] = $current_package; - %is_constant = (); # user-defined constants - %is_user_function = (); # user-defined functions - %user_function_prototype = (); # their prototypes - %is_block_function = (); - %is_block_list_function = (); - %saw_function_definition = (); - $unexpected_error_count = 0; - $want_paren = ""; - $context = UNKNOWN_CONTEXT; - @slevel_stack = (); - $ci_string_in_tokenizer = ""; - $continuation_string_in_tokenizer = "0"; - $in_statement_continuation = 0; - @lower_case_labels_at = (); - $saw_v_string = 0; # for warning of v-strings on older perl - $nesting_token_string = ""; - $nesting_type_string = ""; - $nesting_block_string = '1'; # initially in a block - $nesting_block_flag = 1; - $nesting_list_string = '0'; # initially not in a list - $nesting_list_flag = 0; # initially not in a list - $nearly_matched_here_target_at = undef; -} - -sub get_quote_target { - return matching_end_token($quote_character); -} - -sub get_indentation_level { - return $level_in_tokenizer; -} - -sub reset_indentation_level { - $level_in_tokenizer = $_[0]; - $slevel_in_tokenizer = $_[0]; - push @slevel_stack, $slevel_in_tokenizer; -} - -{ # begin tokenize_this_line + + initialize_tokenizer_state(); +} + +{ # begin tokenize_this_line use constant BRACE => 0; use constant SQUARE_BRACKET => 1; use constant PAREN => 2; use constant QUESTION_COLON => 3; + # TV1: scalars for processing one LINE. + # Re-initialized on each entry to sub tokenize_this_line. + my ( + $block_type, $container_type, $expecting, + $i, $i_tok, $input_line, + $input_line_number, $last_nonblank_i, $max_token_index, + $next_tok, $next_type, $peeked_ahead, + $prototype, $rhere_target_list, $rtoken_map, + $rtoken_type, $rtokens, $tok, + $type, $type_sequence, + ); + + # TV2: refs to ARRAYS for processing one LINE + # Re-initialized on each call. + my $routput_token_list = []; # stack of output token indexes + my $routput_token_type = []; # token types + my $routput_block_type = []; # types of code block + my $routput_container_type = []; # paren types, such as if, elsif, .. + my $routput_type_sequence = []; # nesting sequential number + + # TV3: SCALARS for quote variables. These are initialized with a + # subroutine call and continually updated as lines are processed. + my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth, + $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, ); + + # TV4: SCALARS for multi-line identifiers and + # statements. These are initialized with a subroutine call + # and continually updated as lines are processed. + my ( $id_scan_state, $identifier, $want_paren, ); + + # TV5: SCALARS for tracking indentation level. + # Initialized once and continually updated as lines are + # processed. my ( - $block_type, $container_type, $expecting, - $here_doc_target, $here_quote_character, $i, - $i_tok, $last_nonblank_i, $next_tok, - $next_type, $prototype, $rtoken_map, - $rtoken_type, $rtokens, $tok, - $type, $type_sequence, + $nesting_token_string, $nesting_type_string, + $nesting_block_string, $nesting_block_flag, + $nesting_list_string, $nesting_list_flag, + $ci_string_in_tokenizer, $continuation_string_in_tokenizer, + $in_statement_continuation, $level_in_tokenizer, + $slevel_in_tokenizer, $rslevel_stack, ); - my @output_token_list = (); # stack of output token indexes - my @output_token_type = (); # token types - my @output_block_type = (); # types of code block - my @output_container_type = (); # paren types, such as if, elsif, .. - my @output_type_sequence = (); # nesting sequential number + # TV6: SCALARS for remembering several previous + # tokens. Initialized once and continually updated as + # lines are processed. + my ( + $last_nonblank_container_type, $last_nonblank_type_sequence, + $last_last_nonblank_token, $last_last_nonblank_type, + $last_last_nonblank_block_type, $last_last_nonblank_container_type, + $last_last_nonblank_type_sequence, $last_nonblank_prototype, + ); + + # ---------------------------------------------------------------- + # beginning of tokenizer variable access and manipulation routines + # ---------------------------------------------------------------- + + sub initialize_tokenizer_state { + + # TV1: initialized on each call + # TV2: initialized on each call + # TV3: + $in_quote = 0; + $quote_type = 'Q'; + $quote_character = ""; + $quote_pos = 0; + $quote_depth = 0; + $quoted_string_1 = ""; + $quoted_string_2 = ""; + $allowed_quote_modifiers = ""; + + # TV4: + $id_scan_state = ''; + $identifier = ''; + $want_paren = ""; + + # TV5: + $nesting_token_string = ""; + $nesting_type_string = ""; + $nesting_block_string = '1'; # initially in a block + $nesting_block_flag = 1; + $nesting_list_string = '0'; # initially not in a list + $nesting_list_flag = 0; # initially not in a list + $ci_string_in_tokenizer = ""; + $continuation_string_in_tokenizer = "0"; + $in_statement_continuation = 0; + $level_in_tokenizer = 0; + $slevel_in_tokenizer = 0; + $rslevel_stack = []; + + # TV6: + $last_nonblank_container_type = ''; + $last_nonblank_type_sequence = ''; + $last_last_nonblank_token = ';'; + $last_last_nonblank_type = ';'; + $last_last_nonblank_block_type = ''; + $last_last_nonblank_container_type = ''; + $last_last_nonblank_type_sequence = ''; + $last_nonblank_prototype = ""; + } + + sub save_tokenizer_state { + + my $rTV1 = [ + $block_type, $container_type, $expecting, + $i, $i_tok, $input_line, + $input_line_number, $last_nonblank_i, $max_token_index, + $next_tok, $next_type, $peeked_ahead, + $prototype, $rhere_target_list, $rtoken_map, + $rtoken_type, $rtokens, $tok, + $type, $type_sequence, + ]; + + my $rTV2 = [ + $routput_token_list, $routput_token_type, + $routput_block_type, $routput_container_type, + $routput_type_sequence, + ]; + + my $rTV3 = [ + $in_quote, $quote_type, + $quote_character, $quote_pos, + $quote_depth, $quoted_string_1, + $quoted_string_2, $allowed_quote_modifiers, + ]; + + my $rTV4 = [ $id_scan_state, $identifier, $want_paren, ]; + + my $rTV5 = [ + $nesting_token_string, $nesting_type_string, + $nesting_block_string, $nesting_block_flag, + $nesting_list_string, $nesting_list_flag, + $ci_string_in_tokenizer, $continuation_string_in_tokenizer, + $in_statement_continuation, $level_in_tokenizer, + $slevel_in_tokenizer, $rslevel_stack, + ]; + + my $rTV6 = [ + $last_nonblank_container_type, + $last_nonblank_type_sequence, + $last_last_nonblank_token, + $last_last_nonblank_type, + $last_last_nonblank_block_type, + $last_last_nonblank_container_type, + $last_last_nonblank_type_sequence, + $last_nonblank_prototype, + ]; + return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ]; + } + + sub restore_tokenizer_state { + my ($rstate) = @_; + my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate}; + ( + $block_type, $container_type, $expecting, + $i, $i_tok, $input_line, + $input_line_number, $last_nonblank_i, $max_token_index, + $next_tok, $next_type, $peeked_ahead, + $prototype, $rhere_target_list, $rtoken_map, + $rtoken_type, $rtokens, $tok, + $type, $type_sequence, + ) = @{$rTV1}; + + ( + $routput_token_list, $routput_token_type, + $routput_block_type, $routput_container_type, + $routput_type_sequence, + ) = @{$rTV2}; + + ( + $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth, + $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, + ) = @{$rTV3}; + + ( $id_scan_state, $identifier, $want_paren, ) = @{$rTV4}; + + ( + $nesting_token_string, $nesting_type_string, + $nesting_block_string, $nesting_block_flag, + $nesting_list_string, $nesting_list_flag, + $ci_string_in_tokenizer, $continuation_string_in_tokenizer, + $in_statement_continuation, $level_in_tokenizer, + $slevel_in_tokenizer, $rslevel_stack, + ) = @{$rTV5}; + + ( + $last_nonblank_container_type, + $last_nonblank_type_sequence, + $last_last_nonblank_token, + $last_last_nonblank_type, + $last_last_nonblank_block_type, + $last_last_nonblank_container_type, + $last_last_nonblank_type_sequence, + $last_nonblank_prototype, + ) = @{$rTV6}; + } + + sub get_indentation_level { + return $level_in_tokenizer; + } + + sub reset_indentation_level { + $level_in_tokenizer = $_[0]; + $slevel_in_tokenizer = $_[0]; + push @{$rslevel_stack}, $slevel_in_tokenizer; + } + + sub peeked_ahead { + $peeked_ahead = defined( $_[0] ) ? $_[0] : $peeked_ahead; + } - my @here_target_list = (); # list of here-doc target strings + # ------------------------------------------------------------ + # end of tokenizer variable access and manipulation routines + # ------------------------------------------------------------ # ------------------------------------------------------------ - # beginning of various scanner interfaces to simplify coding + # beginning of various scanner interface routines # ------------------------------------------------------------ + sub scan_replacement_text { + + # check for here-docs in replacement text invoked by + # a substitution operator with executable modifier 'e'. + # + # given: + # $replacement_text + # return: + # $rht = reference to any here-doc targets + my ($replacement_text) = @_; + + # quick check + return undef unless ( $replacement_text =~ /<{_logger_object}; + + # localize all package variables + local ( + $tokenizer_self, $last_nonblank_token, + $last_nonblank_type, $last_nonblank_block_type, + $statement_type, $in_attribute_list, + $current_package, $context, + %is_constant, %is_user_function, + %user_function_prototype, %is_block_function, + %is_block_list_function, %saw_function_definition, + $brace_depth, $paren_depth, + $square_bracket_depth, @current_depth, + @nesting_sequence_number, @current_sequence_number, + @paren_type, @paren_semicolon_count, + @paren_structural_type, @brace_type, + @brace_structural_type, @brace_statement_type, + @brace_context, @brace_package, + @square_bracket_type, @square_bracket_structural_type, + @depth_array, @starting_line_of_current_depth, + ); + + # save all lexical variables + my $rstate = save_tokenizer_state(); + _decrement_count(); # avoid error check for multiple tokenizers + + # make a new tokenizer + my $rOpts = {}; + my $rpending_logfile_message; + my $source_object = + Perl::Tidy::LineSource->new( \$replacement_text, $rOpts, + $rpending_logfile_message ); + my $tokenizer = Perl::Tidy::Tokenizer->new( + source_object => $source_object, + logger_object => $logger_object, + starting_line_number => $input_line_number, + ); + + # scan the replacement text + 1 while ( $tokenizer->get_line() ); + + # remove any here doc targets + my $rht = undef; + if ( $tokenizer_self->{_in_here_doc} ) { + $rht = []; + push @{$rht}, + [ + $tokenizer_self->{_here_doc_target}, + $tokenizer_self->{_here_quote_character} + ]; + if ( $tokenizer_self->{_rhere_target_list} ) { + push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} }; + $tokenizer_self->{_rhere_target_list} = undef; + } + $tokenizer_self->{_in_here_doc} = undef; + } + + # now its safe to report errors + $tokenizer->report_tokenization_errors(); + + # restore all tokenizer lexical variables + restore_tokenizer_state($rstate); + + # return the here doc targets + return $rht; + } + sub scan_bare_identifier { ( $i, $tok, $type, $prototype ) = scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype, - $rtoken_map ); + $rtoken_map, $max_token_index ); } sub scan_identifier { ( $i, $tok, $type, $id_scan_state, $identifier ) = - scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens ); + scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens, + $max_token_index ); } sub scan_id { ( $i, $tok, $type, $id_scan_state ) = scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map, - $id_scan_state ); + $id_scan_state, $max_token_index ); } - my $number; - sub scan_number { + my $number; ( $i, $type, $number ) = - scan_number_do( $input_line, $i, $rtoken_map, $type ); + scan_number_do( $input_line, $i, $rtoken_map, $type, + $max_token_index ); + return $number; } # a sub to warn if token found where term expected sub error_if_expecting_TERM { if ( $expecting == TERM ) { if ( $really_want_term{$last_nonblank_type} ) { - unexpected( $tok, "term", $i_tok, $last_nonblank_i ); + unexpected( $tok, "term", $i_tok, $last_nonblank_i, $rtoken_map, + $rtoken_type, $input_line ); 1; } } @@ -19962,7 +21067,8 @@ sub reset_indentation_level { sub error_if_expecting_OPERATOR { if ( $expecting == OPERATOR ) { my $thing = defined $_[0] ? $_[0] : $tok; - unexpected( $thing, "operator", $i_tok, $last_nonblank_i ); + unexpected( $thing, "operator", $i_tok, $last_nonblank_i, + $rtoken_map, $rtoken_type, $input_line ); if ( $i_tok == 0 ) { interrupt_logfile(); warning("Missing ';' above?\n"); @@ -20028,6 +21134,8 @@ sub reset_indentation_level { ## '||=' => undef, ## '//=' => undef, ## '~' => undef, +## '~~' => undef, +## '!~~' => undef, '>' => sub { error_if_expecting_TERM() @@ -20100,7 +21208,8 @@ sub reset_indentation_level { # error; for example, we might have a constant pi and # invoke it with pi() or just pi; my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, + $max_token_index ); if ( $next_nonblank_token ne ')' ) { my $hint; error_if_expecting_OPERATOR('('); @@ -20127,7 +21236,8 @@ sub reset_indentation_level { } ## end if ( $expecting == OPERATOR... } $paren_type[$paren_depth] = $container_type; - $type_sequence = increase_nesting_depth( PAREN, $i_tok ); + $type_sequence = + increase_nesting_depth( PAREN, $$rtoken_map[$i_tok] ); # propagate types down through nested parens # for example: the second paren in 'if ((' would be structural @@ -20175,7 +21285,8 @@ sub reset_indentation_level { }, ')' => sub { - $type_sequence = decrease_nesting_depth( PAREN, $i_tok ); + $type_sequence = + decrease_nesting_depth( PAREN, $$rtoken_map[$i_tok] ); if ( $paren_structural_type[$paren_depth] eq '{' ) { $type = '}'; @@ -20256,7 +21367,8 @@ sub reset_indentation_level { if ( $expecting == UNKNOWN ) { # indeterminte, must guess.. my $msg; ( $is_pattern, $msg ) = - guess_if_pattern_or_division( $i, $rtokens, $rtoken_map ); + guess_if_pattern_or_division( $i, $rtokens, $rtoken_map, + $max_token_index ); if ($msg) { write_diagnostics("DIVIDE:$msg\n"); @@ -20278,11 +21390,11 @@ sub reset_indentation_level { $type = $tok; } - #DEBUG - collecting info on what tokens follow a divide - # for development of guessing algorithm - #if ( numerator_expected( $i, $rtokens ) < 0 ) { - # #write_diagnostics( "DIVIDE? $input_line\n" ); - #} + #DEBUG - collecting info on what tokens follow a divide + # for development of guessing algorithm + #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) { + # #write_diagnostics( "DIVIDE? $input_line\n" ); + #} } }, '{' => sub { @@ -20371,15 +21483,16 @@ sub reset_indentation_level { # which will be blank for an anonymous hash else { - $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type ); + $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type, + $max_token_index ); # patch to promote bareword type to function taking block if ( $block_type && $last_nonblank_type eq 'w' && $last_nonblank_i >= 0 ) { - if ( $output_token_type[$last_nonblank_i] eq 'w' ) { - $output_token_type[$last_nonblank_i] = 'G'; + if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) { + $routput_token_type->[$last_nonblank_i] = 'G'; } } @@ -20395,7 +21508,8 @@ sub reset_indentation_level { } $brace_type[ ++$brace_depth ] = $block_type; $brace_package[$brace_depth] = $current_package; - $type_sequence = increase_nesting_depth( BRACE, $i_tok ); + $type_sequence = + increase_nesting_depth( BRACE, $$rtoken_map[$i_tok] ); $brace_structural_type[$brace_depth] = $type; $brace_context[$brace_depth] = $context; $brace_statement_type[$brace_depth] = $statement_type; @@ -20410,7 +21524,8 @@ sub reset_indentation_level { # can happen on brace error (caught elsewhere) else { } - $type_sequence = decrease_nesting_depth( BRACE, $i_tok ); + $type_sequence = + decrease_nesting_depth( BRACE, $$rtoken_map[$i_tok] ); if ( $brace_structural_type[$brace_depth] eq 'L' ) { $type = 'R'; @@ -20444,7 +21559,7 @@ sub reset_indentation_level { if ( $expecting != OPERATOR ) { ( $i, $type ) = find_angle_operator_termination( $input_line, $i, $rtoken_map, - $expecting ); + $expecting, $max_token_index ); } else { @@ -20458,7 +21573,8 @@ sub reset_indentation_level { my $msg; ( $is_pattern, $msg ) = - guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map ); + guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map, + $max_token_index ); if ($msg) { write_logfile_entry($msg) } } @@ -20470,9 +21586,9 @@ sub reset_indentation_level { $allowed_quote_modifiers = '[cgimosx]'; # TBD:check this } else { - $type_sequence = - increase_nesting_depth( QUESTION_COLON, $i_tok ); + increase_nesting_depth( QUESTION_COLON, + $$rtoken_map[$i_tok] ); } }, '*' => sub { # typeglob, or multiply? @@ -20538,7 +21654,8 @@ sub reset_indentation_level { # otherwise, it should be part of a ?/: operator else { $type_sequence = - decrease_nesting_depth( QUESTION_COLON, $i_tok ); + decrease_nesting_depth( QUESTION_COLON, + $$rtoken_map[$i_tok] ); if ( $last_nonblank_token eq '?' ) { warning("Syntax error near ? :\n"); } @@ -20547,7 +21664,7 @@ sub reset_indentation_level { '+' => sub { # what kind of plus? if ( $expecting == TERM ) { - scan_number(); + my $number = scan_number(); # unary plus is safest assumption if not a number if ( !defined($number) ) { $type = 'p'; } @@ -20577,7 +21694,8 @@ sub reset_indentation_level { '[' => sub { $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token; - $type_sequence = increase_nesting_depth( SQUARE_BRACKET, $i_tok ); + $type_sequence = + increase_nesting_depth( SQUARE_BRACKET, $$rtoken_map[$i_tok] ); # It may seem odd, but structural square brackets have # type '{' and '}'. This simplifies the indentation logic. @@ -20587,7 +21705,8 @@ sub reset_indentation_level { $square_bracket_structural_type[$square_bracket_depth] = $type; }, ']' => sub { - $type_sequence = decrease_nesting_depth( SQUARE_BRACKET, $i_tok ); + $type_sequence = + decrease_nesting_depth( SQUARE_BRACKET, $$rtoken_map[$i_tok] ); if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) { @@ -20605,7 +21724,7 @@ sub reset_indentation_level { $type = 'F'; } elsif ( $expecting == TERM ) { - scan_number(); + my $number = scan_number(); # maybe part of bareword token? unary is safest if ( !defined($number) ) { $type = 'm'; } @@ -20661,12 +21780,17 @@ sub reset_indentation_level { ; # here-doc not possible if end of line if ( $expecting != OPERATOR ) { - my ($found_target); - ( $found_target, $here_doc_target, $here_quote_character, $i ) = - find_here_doc( $expecting, $i, $rtokens, $rtoken_map ); + my ( $found_target, $here_doc_target, $here_quote_character, + $saw_error ); + ( + $found_target, $here_doc_target, $here_quote_character, $i, + $saw_error + ) + = find_here_doc( $expecting, $i, $rtokens, $rtoken_map, + $max_token_index ); if ($found_target) { - push @here_target_list, + push @{$rhere_target_list}, [ $here_doc_target, $here_quote_character ]; $type = 'h'; if ( length($here_doc_target) > 80 ) { @@ -20680,10 +21804,12 @@ sub reset_indentation_level { } } elsif ( $expecting == TERM ) { + unless ($saw_error) { - # shouldn't happen.. - warning("Program bug; didn't find here doc target\n"); - report_definite_bug(); + # shouldn't happen.. + warning("Program bug; didn't find here doc target\n"); + report_definite_bug(); + } } } else { @@ -20701,7 +21827,7 @@ sub reset_indentation_level { if ( $expecting == TERM ) { $type = 'pp' } elsif ( $expecting == UNKNOWN ) { my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, $max_token_index ); if ( $next_nonblank_token eq '$' ) { $type = 'pp' } } }, @@ -20722,7 +21848,7 @@ sub reset_indentation_level { if ( $expecting == TERM ) { $type = 'mm' } elsif ( $expecting == UNKNOWN ) { my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, $max_token_index ); if ( $next_nonblank_token eq '$' ) { $type = 'mm' } } }, @@ -20921,6 +22047,9 @@ sub reset_indentation_level { # *, then run diff between the output of the previous version and the # current version. # + # *. For another example, search for the smartmatch operator '~~' + # with your editor to see where updates were made for it. + # # ----------------------------------------------------------------------- my $line_of_tokens = shift; @@ -20933,6 +22062,9 @@ sub reset_indentation_level { # extract line number for use in error messages $input_line_number = $line_of_tokens->{_line_number}; + # reinitialize for multi-line quote + $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q'; + # check for pod documentation if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) { @@ -20956,12 +22088,18 @@ sub reset_indentation_level { $input_line =~ s/^\s*//; # trim left end } + # update the copy of the line for use in error messages + # This must be exactly what we give the pre_tokenizer + $tokenizer_self->{_line_text} = $input_line; + # re-initialize for the main loop - @output_token_list = (); # stack of output token indexes - @output_token_type = (); # token types - @output_block_type = (); # types of code block - @output_container_type = (); # paren types, such as if, elsif, .. - @output_type_sequence = (); # nesting sequential number + $routput_token_list = []; # stack of output token indexes + $routput_token_type = []; # token types + $routput_block_type = []; # types of code block + $routput_container_type = []; # paren types, such as if, elsif, .. + $routput_type_sequence = []; # nesting sequential number + + $rhere_target_list = []; $tok = $last_nonblank_token; $type = $last_nonblank_type; @@ -20970,9 +22108,7 @@ sub reset_indentation_level { $block_type = $last_nonblank_block_type; $container_type = $last_nonblank_container_type; $type_sequence = $last_nonblank_type_sequence; - @here_target_list = (); # list of here-doc target strings - - $peeked_ahead = 0; + $peeked_ahead = 0; # tokenization is done in two stages.. # stage 1 is a very simple pre-tokenization @@ -20984,24 +22120,20 @@ sub reset_indentation_level { } # start by breaking the line into pre-tokens - ( $rpretokens, $rpretoken_map, $rpretoken_type ) = + ( $rtokens, $rtoken_map, $rtoken_type ) = pre_tokenize( $input_line, $max_tokens_wanted ); - $max_token_index = scalar(@$rpretokens) - 1; - push( @$rpretokens, ' ', ' ', ' ' ); # extra whitespace simplifies logic - push( @$rpretoken_map, 0, 0, 0 ); # shouldn't be referenced - push( @$rpretoken_type, 'b', 'b', 'b' ); - - # temporary copies while coding change is underway - ( $rtokens, $rtoken_map, $rtoken_type ) = - ( $rpretokens, $rpretoken_map, $rpretoken_type ); + $max_token_index = scalar(@$rtokens) - 1; + push( @$rtokens, ' ', ' ', ' ' ); # extra whitespace simplifies logic + push( @$rtoken_map, 0, 0, 0 ); # shouldn't be referenced + push( @$rtoken_type, 'b', 'b', 'b' ); # initialize for main loop for $i ( 0 .. $max_token_index + 3 ) { - $output_token_type[$i] = ""; - $output_block_type[$i] = ""; - $output_container_type[$i] = ""; - $output_type_sequence[$i] = ""; + $routput_token_type->[$i] = ""; + $routput_block_type->[$i] = ""; + $routput_container_type->[$i] = ""; + $routput_type_sequence->[$i] = ""; } $i = -1; $i_tok = -1; @@ -21017,25 +22149,39 @@ sub reset_indentation_level { if ($in_quote) { # continue looking for end of a quote $type = $quote_type; - unless (@output_token_list) { # initialize if continuation line - push( @output_token_list, $i ); - $output_token_type[$i] = $type; + unless ( @{$routput_token_list} ) + { # initialize if continuation line + push( @{$routput_token_list}, $i ); + $routput_token_type->[$i] = $type; } $tok = $quote_character unless ( $quote_character =~ /^\s*$/ ); # scan for the end of the quote or pattern - ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) = - do_quote( $i, $in_quote, $quote_character, $quote_pos, - $quote_depth, $rtokens, $rtoken_map ); + ( + $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string_1, $quoted_string_2 + ) + = do_quote( + $i, $in_quote, $quote_character, + $quote_pos, $quote_depth, $quoted_string_1, + $quoted_string_2, $rtokens, $rtoken_map, + $max_token_index + ); # all done if we didn't find it last if ($in_quote); + # save pattern and replacement text for rescanning + my $qs1 = $quoted_string_1; + my $qs2 = $quoted_string_2; + # re-initialize for next search $quote_character = ''; $quote_pos = 0; $quote_type = 'Q'; + $quoted_string_1 = ""; + $quoted_string_2 = ""; last if ( ++$i > $max_token_index ); # look for any modifiers @@ -21044,7 +22190,32 @@ sub reset_indentation_level { # check for exact quote modifiers if ( $$rtokens[$i] =~ /^[A-Za-z_]/ ) { my $str = $$rtokens[$i]; - while ( $str =~ /\G$allowed_quote_modifiers/gc ) { } + my $saw_modifier_e; + while ( $str =~ /\G$allowed_quote_modifiers/gc ) { + my $pos = pos($str); + my $char = substr( $str, $pos - 1, 1 ); + $saw_modifier_e ||= ( $char eq 'e' ); + } + + # For an 'e' quote modifier we must scan the replacement + # text for here-doc targets. + if ($saw_modifier_e) { + + my $rht = scan_replacement_text($qs1); + + # Change type from 'Q' to 'h' for quotes with + # here-doc targets so that the formatter (see sub + # print_line_of_tokens) will not make any line + # breaks after this point. + if ($rht) { + push @{$rhere_target_list}, @{$rht}; + $type = 'h'; + if ( $i_tok < 0 ) { + my $ilast = $routput_token_list->[-1]; + $routput_token_type->[$ilast] = $type; + } + } + } if ( defined( pos($str) ) ) { @@ -21108,9 +22279,9 @@ EOM } } - $last_last_nonblank_token = $last_nonblank_token; - $last_last_nonblank_type = $last_nonblank_type; - $last_last_nonblank_block_type = $last_nonblank_block_type; + $last_last_nonblank_token = $last_nonblank_token; + $last_last_nonblank_type = $last_nonblank_type; + $last_last_nonblank_block_type = $last_nonblank_block_type; $last_last_nonblank_container_type = $last_nonblank_container_type; $last_last_nonblank_type_sequence = @@ -21126,10 +22297,10 @@ EOM # store previous token type if ( $i_tok >= 0 ) { - $output_token_type[$i_tok] = $type; - $output_block_type[$i_tok] = $block_type; - $output_container_type[$i_tok] = $container_type; - $output_type_sequence[$i_tok] = $type_sequence; + $routput_token_type->[$i_tok] = $type; + $routput_block_type->[$i_tok] = $block_type; + $routput_container_type->[$i_tok] = $container_type; + $routput_type_sequence->[$i_tok] = $type_sequence; } my $pre_tok = $$rtokens[$i]; # get the next pre-token my $pre_type = $$rtoken_type[$i]; # and type @@ -21142,7 +22313,7 @@ EOM $i_tok = $i; # this pre-token will start an output token - push( @output_token_list, $i_tok ); + push( @{$routput_token_list}, $i_tok ); # continue gathering identifier if necessary # but do not start on blanks and comments @@ -21246,15 +22417,15 @@ EOM if ( $pre_type eq 'w' ) { $expecting = operator_expected( $prev_type, $tok, $next_type ); my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, $max_token_index ); # ATTRS: handle sub and variable attributes if ($in_attribute_list) { # treat bare word followed by open paren like qw( if ( $next_nonblank_token eq '(' ) { - $in_quote = $quote_items{q}; - $allowed_quote_modifiers = $quote_modifiers{q}; + $in_quote = $quote_items{'q'}; + $allowed_quote_modifiers = $quote_modifiers{'q'}; $type = 'q'; $quote_type = 'q'; next; @@ -21275,13 +22446,13 @@ EOM $type = 'C'; } elsif ( $is_user_function{$current_package}{$tok} ) { - $type = 'U'; + $type = 'U'; $prototype = $user_function_prototype{$current_package}{$tok}; } elsif ( $tok =~ /^v\d+$/ ) { $type = 'v'; - unless ($saw_v_string) { report_v_string($tok) } + report_v_string($tok); } else { $type = 'w' } @@ -21385,7 +22556,8 @@ EOM { scan_bare_identifier(); my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, + $max_token_index ); if ($next_nonblank_token) { @@ -21442,7 +22614,8 @@ EOM ) { if ( $tok !~ /A-Z/ ) { - push @lower_case_labels_at, $input_line_number; + push @{ $tokenizer_self->{_rlower_case_labels_at} }, + $input_line_number; } $type = 'J'; $tok .= ':'; @@ -21583,12 +22756,9 @@ EOM $type = 'U'; } - # mark bare words following a file test operator as - # something that will expect an operator next. - # patch 072901: unless followed immediately by a paren, - # in which case it must be a function call (pid.t) - if ( $last_nonblank_type eq 'F' && $next_tok ne '(' ) { - $type = 'C'; + # underscore after file test operator is file handle + if ( $tok eq '_' && $last_nonblank_type eq 'F' ) { + $type = 'Z'; } # patch for SWITCH/CASE if 'case' and 'when are @@ -21627,7 +22797,7 @@ EOM $expecting = operator_expected( $prev_type, $tok, $next_type ); error_if_expecting_OPERATOR("Number") if ( $expecting == OPERATOR ); - scan_number(); + my $number = scan_number(); if ( !defined($number) ) { # shouldn't happen - we should always get a number @@ -21657,10 +22827,10 @@ EOM # ----------------------------- if ( $i_tok >= 0 ) { - $output_token_type[$i_tok] = $type; - $output_block_type[$i_tok] = $block_type; - $output_container_type[$i_tok] = $container_type; - $output_type_sequence[$i_tok] = $type_sequence; + $routput_token_type->[$i_tok] = $type; + $routput_block_type->[$i_tok] = $block_type; + $routput_container_type->[$i_tok] = $container_type; + $routput_type_sequence->[$i_tok] = $type_sequence; } unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) { @@ -21705,9 +22875,9 @@ EOM my $container_environment = ''; my $im = -1; # previous $i value my $num; - my $ci_string_sum = ( $_ = $ci_string_in_tokenizer ) =~ tr/1/0/; + my $ci_string_sum = ones_count($ci_string_in_tokenizer); -# =head1 Computing Token Indentation +# Computing Token Indentation # # The final section of the tokenizer forms tokens and also computes # parameters needed to find indentation. It is much easier to do it @@ -21763,7 +22933,7 @@ EOM # indentation level, if it is is appropriate for list formatting. # If so, continuation indentation is used to indent long list items. # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string -# @slevel_stack = a stack of total nesting depths at each +# @{$rslevel_stack} = a stack of total nesting depths at each # structural indentation level, where "total nesting depth" means # the nesting depth that would occur if every nesting token -- '{', '[', # and '(' -- , regardless of context, is used to compute a nesting @@ -21776,10 +22946,11 @@ EOM $nesting_list_string_i, $nesting_token_string_i, $nesting_type_string_i, ); - foreach $i (@output_token_list) { # scan the list of pre-tokens indexes + foreach $i ( @{$routput_token_list} ) + { # scan the list of pre-tokens indexes # self-checking for valid token types - my $type = $output_token_type[$i]; + my $type = $routput_token_type->[$i]; my $tok = $$rtokens[$i]; # the token, but ONLY if same as pretoken $level_i = $level_in_tokenizer; @@ -21820,18 +22991,18 @@ EOM $container_environment = $nesting_block_flag ? 'BLOCK' : $nesting_list_flag ? 'LIST' - : ""; + : ""; # if the difference between total nesting levels is not 1, # there are intervening non-structural nesting types between # this '{' and the previous unclosed '{' my $intervening_secondary_structure = 0; - if (@slevel_stack) { + if ( @{$rslevel_stack} ) { $intervening_secondary_structure = - $slevel_in_tokenizer - $slevel_stack[-1]; + $slevel_in_tokenizer - $rslevel_stack->[-1]; } - # =head1 Continuation Indentation + # Continuation Indentation # # Having tried setting continuation indentation both in the formatter and # in the tokenizer, I can say that setting it in the tokenizer is much, @@ -21878,10 +23049,10 @@ EOM # variable. # save the current states - push( @slevel_stack, 1 + $slevel_in_tokenizer ); + push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer ); $level_in_tokenizer++; - if ( $output_block_type[$i] ) { + if ( $routput_block_type->[$i] ) { $nesting_block_flag = 1; $nesting_block_string .= '1'; } @@ -21893,10 +23064,10 @@ EOM # we will use continuation indentation within containers # which are not blocks and not logical expressions my $bit = 0; - if ( !$output_block_type[$i] ) { + if ( !$routput_block_type->[$i] ) { # propagate flag down at nested open parens - if ( $output_container_type[$i] eq '(' ) { + if ( $routput_container_type->[$i] eq '(' ) { $bit = 1 if $nesting_list_flag; } @@ -21905,7 +23076,8 @@ EOM else { $bit = 1 unless - $is_logical_container{ $output_container_type[$i] }; + $is_logical_container{ $routput_container_type->[$i] + }; } } $nesting_list_string .= $bit; @@ -21913,7 +23085,7 @@ EOM $ci_string_in_tokenizer .= ( $intervening_secondary_structure != 0 ) ? '1' : '0'; - $ci_string_sum = ( $_ = $ci_string_in_tokenizer ) =~ tr/1/0/; + $ci_string_sum = ones_count($ci_string_in_tokenizer); $continuation_string_in_tokenizer .= ( $in_statement_continuation > 0 ) ? '1' : '0'; @@ -21936,7 +23108,7 @@ EOM my $total_ci = $ci_string_sum; if ( - !$output_block_type[$i] # patch: skip for BLOCK + !$routput_block_type->[$i] # patch: skip for BLOCK && ($in_statement_continuation) ) { @@ -21951,7 +23123,7 @@ EOM elsif ( $type eq '}' || $type eq 'R' ) { # only a nesting error in the script would prevent popping here - if ( @slevel_stack > 1 ) { pop(@slevel_stack); } + if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); } $level_i = --$level_in_tokenizer; @@ -21964,23 +23136,23 @@ EOM $nesting_list_flag = ( $nesting_list_string =~ /1$/ ); chop $ci_string_in_tokenizer; - $ci_string_sum = - ( $_ = $ci_string_in_tokenizer ) =~ tr/1/0/; + $ci_string_sum = ones_count($ci_string_in_tokenizer); $in_statement_continuation = chop $continuation_string_in_tokenizer; # zero continuation flag at terminal BLOCK '}' which # ends a statement. - if ( $output_block_type[$i] ) { + if ( $routput_block_type->[$i] ) { # ...These include non-anonymous subs # note: could be sub ::abc { or sub 'abc - if ( $output_block_type[$i] =~ m/^sub\s*/gc ) { + if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) { # note: older versions of perl require the /gc modifier # here or else the \G does not work. - if ( $output_block_type[$i] =~ /\G('|::|\w)/gc ) { + if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc ) + { $in_statement_continuation = 0; } } @@ -21989,8 +23161,8 @@ EOM # block prototypes and these: (sort|grep|map|do|eval) # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|continue|;|if|elsif|else|unless|while|until|for|foreach)$/ elsif ( - $is_zero_continuation_block_type{ $output_block_type - [$i] } ) + $is_zero_continuation_block_type{ + $routput_block_type->[$i] } ) { $in_statement_continuation = 0; } @@ -21999,18 +23171,19 @@ EOM # /^(sort|grep|map|do|eval)$/ ) elsif ( $is_not_zero_continuation_block_type{ - $output_block_type[$i] } ) + $routput_block_type->[$i] } ) { } # ..and a block introduced by a label # /^\w+\s*:$/gc ) { - elsif ( $output_block_type[$i] =~ /:$/ ) { + elsif ( $routput_block_type->[$i] =~ /:$/ ) { $in_statement_continuation = 0; } - # ..nor user function with block prototype + # user function with block prototype else { + $in_statement_continuation = 0; } } @@ -22026,7 +23199,7 @@ EOM # ); elsif ( $tok eq ')' ) { $in_statement_continuation = 1 - if $output_container_type[$i] =~ /^[;,\{\}]$/; + if $routput_container_type->[$i] =~ /^[;,\{\}]$/; } } @@ -22034,7 +23207,7 @@ EOM $container_environment = $nesting_block_flag ? 'BLOCK' : $nesting_list_flag ? 'LIST' - : ""; + : ""; $ci_string_i = $ci_string_sum + $in_statement_continuation; $nesting_block_string_i = $nesting_block_string; $nesting_list_string_i = $nesting_list_string; @@ -22046,7 +23219,7 @@ EOM $container_environment = $nesting_block_flag ? 'BLOCK' : $nesting_list_flag ? 'LIST' - : ""; + : ""; # zero the continuation indentation at certain tokens so # that they will be at the same level as its container. For @@ -22113,8 +23286,8 @@ EOM } if ( $level_in_tokenizer < 0 ) { - unless ($saw_negative_indentation) { - $saw_negative_indentation = 1; + unless ( $tokenizer_self->{_saw_negative_indentation} ) { + $tokenizer_self->{_saw_negative_indentation} = 1; warning("Starting negative indentation\n"); } } @@ -22146,16 +23319,16 @@ EOM } } - push( @block_type, $output_block_type[$i] ); + push( @block_type, $routput_block_type->[$i] ); push( @ci_string, $ci_string_i ); push( @container_environment, $container_environment ); - push( @container_type, $output_container_type[$i] ); + push( @container_type, $routput_container_type->[$i] ); push( @levels, $level_i ); push( @nesting_tokens, $nesting_token_string_i ); push( @nesting_types, $nesting_type_string_i ); push( @slevels, $slevel_i ); push( @token_type, $fix_type ); - push( @type_sequence, $output_type_sequence[$i] ); + push( @type_sequence, $routput_type_sequence->[$i] ); push( @nesting_blocks, $nesting_block_string ); push( @nesting_lists, $nesting_list_string ); @@ -22179,7 +23352,9 @@ EOM $tokenizer_self->{_in_attribute_list} = $in_attribute_list; $tokenizer_self->{_in_quote} = $in_quote; - $tokenizer_self->{_rhere_target_list} = \@here_target_list; + $tokenizer_self->{_quote_target} = + $in_quote ? matching_end_token($quote_character) : ""; + $tokenizer_self->{_rhere_target_list} = $rhere_target_list; $line_of_tokens->{_rtoken_type} = \@token_type; $line_of_tokens->{_rtokens} = \@tokens; @@ -22197,37 +23372,247 @@ EOM } } # end tokenize_this_line -sub new_statement_ok { +#########i############################################################# +# Tokenizer routines which assist in identifying token types +####################################################################### - # return true if the current token can start a new statement +sub operator_expected { - return label_ok() # a label would be ok here + # Many perl symbols have two or more meanings. For example, '<<' + # can be a shift operator or a here-doc operator. The + # interpretation of these symbols depends on the current state of + # the tokenizer, which may either be expecting a term or an + # operator. For this example, a << would be a shift if an operator + # is expected, and a here-doc if a term is expected. This routine + # is called to make this decision for any current token. It returns + # one of three possible values: + # + # OPERATOR - operator expected (or at least, not a term) + # UNKNOWN - can't tell + # TERM - a term is expected (or at least, not an operator) + # + # The decision is based on what has been seen so far. This + # information is stored in the "$last_nonblank_type" and + # "$last_nonblank_token" variables. For example, if the + # $last_nonblank_type is '=~', then we are expecting a TERM, whereas + # if $last_nonblank_type is 'n' (numeric), we are expecting an + # OPERATOR. + # + # If a UNKNOWN is returned, the calling routine must guess. A major + # goal of this tokenizer is to minimize the possiblity of returning + # UNKNOWN, because a wrong guess can spoil the formatting of a + # script. + # + # adding NEW_TOKENS: it is critically important that this routine be + # updated to allow it to determine if an operator or term is to be + # expected after the new token. Doing this simply involves adding + # the new token character to one of the regexes in this routine or + # to one of the hash lists + # that it uses, which are initialized in the BEGIN section. + # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token, + # $statement_type - || $last_nonblank_type eq 'J'; # or we follow a label + my ( $prev_type, $tok, $next_type ) = @_; -} + my $op_expected = UNKNOWN; -sub label_ok { +#print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n"; - # Decide if a bare word followed by a colon here is a label +# Note: function prototype is available for token type 'U' for future +# program development. It contains the leading and trailing parens, +# and no blanks. It might be used to eliminate token type 'C', for +# example (prototype = '()'). Thus: +# if ($last_nonblank_type eq 'U') { +# print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n"; +# } - # if it follows an opening or closing code block curly brace.. - if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' ) - && $last_nonblank_type eq $last_nonblank_token ) - { + # A possible filehandle (or object) requires some care... + if ( $last_nonblank_type eq 'Z' ) { - # it is a label if and only if the curly encloses a code block - return $brace_type[$brace_depth]; - } + # angle.t + if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) { + $op_expected = UNKNOWN; + } - # otherwise, it is a label if and only if it follows a ';' - # (real or fake) - else { - return ( $last_nonblank_type eq ';' ); + # For possible file handle like "$a", Perl uses weird parsing rules. + # For example: + # print $a/2,"/hi"; - division + # print $a / 2,"/hi"; - division + # print $a/ 2,"/hi"; - division + # print $a /2,"/hi"; - pattern (and error)! + elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) { + $op_expected = TERM; + } + + # Note when an operation is being done where a + # filehandle might be expected, since a change in whitespace + # could change the interpretation of the statement. + else { + if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) { + complain("operator in print statement not recommended\n"); + $op_expected = OPERATOR; + } + } } -} -sub code_block_type { + # handle something after 'do' and 'eval' + elsif ( $is_block_operator{$last_nonblank_token} ) { + + # something like $a = eval "expression"; + # ^ + if ( $last_nonblank_type eq 'k' ) { + $op_expected = TERM; # expression or list mode following keyword + } + + # something like $a = do { BLOCK } / 2; + # ^ + else { + $op_expected = OPERATOR; # block mode following } + } + } + + # handle bare word.. + elsif ( $last_nonblank_type eq 'w' ) { + + # unfortunately, we can't tell what type of token to expect next + # after most bare words + $op_expected = UNKNOWN; + } + + # operator, but not term possible after these types + # Note: moved ')' from type to token because parens in list context + # get marked as '{' '}' now. This is a minor glitch in the following: + # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : (); + # + elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ ) + || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) ) + { + $op_expected = OPERATOR; + + # in a 'use' statement, numbers and v-strings are not true + # numbers, so to avoid incorrect error messages, we will + # mark them as unknown for now (use.t) + # TODO: it would be much nicer to create a new token V for VERSION + # number in a use statement. Then this could be a check on type V + # and related patches which change $statement_type for '=>' + # and ',' could be removed. Further, it would clean things up to + # scan the 'use' statement with a separate subroutine. + if ( ( $statement_type eq 'use' ) + && ( $last_nonblank_type =~ /^[nv]$/ ) ) + { + $op_expected = UNKNOWN; + } + } + + # no operator after many keywords, such as "die", "warn", etc + elsif ( $expecting_term_token{$last_nonblank_token} ) { + + # patch for dor.t (defined or). + # perl functions which may be unary operators + # TODO: This list is incomplete, and these should be put + # into a hash. + if ( $tok eq '/' + && $next_type eq '/' + && $last_nonblank_type eq 'k' + && $last_nonblank_token =~ /^eof|undef|shift|pop$/ ) + { + $op_expected = OPERATOR; + } + else { + $op_expected = TERM; + } + } + + # no operator after things like + - ** (i.e., other operators) + elsif ( $expecting_term_types{$last_nonblank_type} ) { + $op_expected = TERM; + } + + # a few operators, like "time", have an empty prototype () and so + # take no parameters but produce a value to operate on + elsif ( $expecting_operator_token{$last_nonblank_token} ) { + $op_expected = OPERATOR; + } + + # post-increment and decrement produce values to be operated on + elsif ( $expecting_operator_types{$last_nonblank_type} ) { + $op_expected = OPERATOR; + } + + # no value to operate on after sub block + elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; } + + # a right brace here indicates the end of a simple block. + # all non-structural right braces have type 'R' + # all braces associated with block operator keywords have been given those + # keywords as "last_nonblank_token" and caught above. + # (This statement is order dependent, and must come after checking + # $last_nonblank_token). + elsif ( $last_nonblank_type eq '}' ) { + + # patch for dor.t (defined or). + if ( $tok eq '/' + && $next_type eq '/' + && $last_nonblank_token eq ']' ) + { + $op_expected = OPERATOR; + } + else { + $op_expected = TERM; + } + } + + # something else..what did I forget? + else { + + # collecting diagnostics on unknown operator types..see what was missed + $op_expected = UNKNOWN; + write_diagnostics( +"OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n" + ); + } + + TOKENIZER_DEBUG_FLAG_EXPECT && do { + print +"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; + }; + return $op_expected; +} + +sub new_statement_ok { + + # return true if the current token can start a new statement + # USES GLOBAL VARIABLES: $last_nonblank_type + + return label_ok() # a label would be ok here + + || $last_nonblank_type eq 'J'; # or we follow a label + +} + +sub label_ok { + + # Decide if a bare word followed by a colon here is a label + # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type, + # $brace_depth, @brace_type + + # if it follows an opening or closing code block curly brace.. + if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' ) + && $last_nonblank_type eq $last_nonblank_token ) + { + + # it is a label if and only if the curly encloses a code block + return $brace_type[$brace_depth]; + } + + # otherwise, it is a label if and only if it follows a ';' + # (real or fake) + else { + return ( $last_nonblank_type eq ';' ); + } +} + +sub code_block_type { # Decide if this is a block of code, and its type. # Must be called only when $type = $token = '{' @@ -22236,12 +23621,14 @@ sub code_block_type { # Returns "" if not code block, otherwise returns 'last_nonblank_token' # to indicate the type of code block. (For example, 'last_nonblank_token' # might be 'if' for an if block, 'else' for an else block, etc). + # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type, + # $last_nonblank_block_type, $brace_depth, @brace_type # handle case of multiple '{'s # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n"; - my ( $i, $rtokens, $rtoken_type ) = @_; + my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_; if ( $last_nonblank_token eq '{' && $last_nonblank_type eq $last_nonblank_token ) { @@ -22249,7 +23636,8 @@ sub code_block_type { # opening brace where a statement may appear is probably # a code block but might be and anonymous hash reference if ( $brace_type[$brace_depth] ) { - return decide_if_code_block( $i, $rtokens, $rtoken_type ); + return decide_if_code_block( $i, $rtokens, $rtoken_type, + $max_token_index ); } # cannot start a code block within an anonymous hash @@ -22262,7 +23650,8 @@ sub code_block_type { # an opening brace where a statement may appear is probably # a code block but might be and anonymous hash reference - return decide_if_code_block( $i, $rtokens, $rtoken_type ); + return decide_if_code_block( $i, $rtokens, $rtoken_type, + $max_token_index ); } # handle case of '}{' @@ -22273,7 +23662,8 @@ sub code_block_type { # a } { situation ... # could be hash reference after code block..(blktype1.t) if ($last_nonblank_block_type) { - return decide_if_code_block( $i, $rtokens, $rtoken_type ); + return decide_if_code_block( $i, $rtokens, $rtoken_type, + $max_token_index ); } # must be a block if it follows a closing hash reference @@ -22315,7 +23705,8 @@ sub code_block_type { # check bareword elsif ( $last_nonblank_type eq 'w' ) { - return decide_if_code_block( $i, $rtokens, $rtoken_type ); + return decide_if_code_block( $i, $rtokens, $rtoken_type, + $max_token_index ); } # anything else must be anonymous hash reference @@ -22326,9 +23717,10 @@ sub code_block_type { sub decide_if_code_block { - my ( $i, $rtokens, $rtoken_type ) = @_; + # USES GLOBAL VARIABLES: $last_nonblank_token + my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_; my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + find_next_nonblank_token( $i, $rtokens, $max_token_index ); # we are at a '{' where a statement may appear. # We must decide if this brace starts an anonymous hash or a code @@ -22430,12 +23822,16 @@ sub decide_if_code_block { sub unexpected { # report unexpected token type and show where it is - my ( $found, $expecting, $i_tok, $last_nonblank_i ) = @_; - $unexpected_error_count++; - if ( $unexpected_error_count <= MAX_NAG_MESSAGES ) { + # USES GLOBAL VARIABLES: $tokenizer_self + my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map, + $rpretoken_type, $input_line ) + = @_; + + if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) { my $msg = "found $found where $expecting expected"; my $pos = $$rpretoken_map[$i_tok]; interrupt_logfile(); + my $input_line_number = $tokenizer_self->{_last_line_number}; my ( $offset, $numbered_line, $underline ) = make_numbered_line( $input_line_number, $input_line, $pos ); $underline = write_on_underline( $underline, $pos - $offset, '^' ); @@ -22463,1606 +23859,1368 @@ sub unexpected { } } -sub indicate_error { - my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_; - interrupt_logfile(); - warning($msg); - write_error_indicator_pair( $line_number, $input_line, $pos, $carrat ); - resume_logfile(); -} - -sub write_error_indicator_pair { - my ( $line_number, $input_line, $pos, $carrat ) = @_; - my ( $offset, $numbered_line, $underline ) = - make_numbered_line( $line_number, $input_line, $pos ); - $underline = write_on_underline( $underline, $pos - $offset, $carrat ); - warning( $numbered_line . "\n" ); - $underline =~ s/\s*$//; - warning( $underline . "\n" ); -} - -sub make_numbered_line { +sub is_non_structural_brace { - # Given an input line, its line number, and a character position of - # interest, create a string not longer than 80 characters of the form - # $lineno: sub_string - # such that the sub_string of $str contains the position of interest - # - # Here is an example of what we want, in this case we add trailing - # '...' because the line is long. - # - # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ... - # - # Here is another example, this time in which we used leading '...' - # because of excessive length: - # - # 2: ... er of the World Wide Web Consortium's - # - # input parameters are: - # $lineno = line number - # $str = the text of the line - # $pos = position of interest (the error) : 0 = first character - # - # We return : - # - $offset = an offset which corrects the position in case we only - # display part of a line, such that $pos-$offset is the effective - # position from the start of the displayed line. - # - $numbered_line = the numbered line as above, - # - $underline = a blank 'underline' which is all spaces with the same - # number of characters as the numbered line. + # Decide if a brace or bracket is structural or non-structural + # by looking at the previous token and type + # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token - my ( $lineno, $str, $pos ) = @_; - my $offset = ( $pos < 60 ) ? 0 : $pos - 40; - my $excess = length($str) - $offset - 68; - my $numc = ( $excess > 0 ) ? 68 : undef; + # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting. + # Tentatively deactivated because it caused the wrong operator expectation + # for this code: + # $user = @vars[1] / 100; + # Must update sub operator_expected before re-implementing. + # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) { + # return 0; + # } - if ( defined($numc) ) { - if ( $offset == 0 ) { - $str = substr( $str, $offset, $numc - 4 ) . " ..."; - } - else { - $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ..."; - } - } - else { + # NOTE: braces after type characters start code blocks, but for + # simplicity these are not identified as such. See also + # sub code_block_type + # if ($last_nonblank_type eq 't') {return 0} - if ( $offset == 0 ) { - } - else { - $str = "... " . substr( $str, $offset + 4 ); - } - } + # otherwise, it is non-structural if it is decorated + # by type information. + # For example, the '{' here is non-structural: ${xxx} + ( + $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/ - my $numbered_line = sprintf( "%d: ", $lineno ); - $offset -= length($numbered_line); - $numbered_line .= $str; - my $underline = " " x length($numbered_line); - return ( $offset, $numbered_line, $underline ); + # or if we follow a hash or array closing curly brace or bracket + # For example, the second '{' in this is non-structural: $a{'x'}{'y'} + # because the first '}' would have been given type 'R' + || $last_nonblank_type =~ /^([R\]])$/ + ); } -sub write_on_underline { +#########i############################################################# +# Tokenizer routines for tracking container nesting depths +####################################################################### - # The "underline" is a string that shows where an error is; it starts - # out as a string of blanks with the same length as the numbered line of - # code above it, and we have to add marking to show where an error is. - # In the example below, we want to write the string '--^' just below - # the line of bad code: - # - # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ... - # ---^ - # We are given the current underline string, plus a position and a - # string to write on it. - # - # In the above example, there will be 2 calls to do this: - # First call: $pos=19, pos_chr=^ - # Second call: $pos=16, pos_chr=--- - # - # This is a trivial thing to do with substr, but there is some - # checking to do. - - my ( $underline, $pos, $pos_chr ) = @_; +# The following routines keep track of nesting depths of the nesting +# types, ( [ { and ?. This is necessary for determining the indentation +# level, and also for debugging programs. Not only do they keep track of +# nesting depths of the individual brace types, but they check that each +# of the other brace types is balanced within matching pairs. For +# example, if the program sees this sequence: +# +# { ( ( ) } +# +# then it can determine that there is an extra left paren somewhere +# between the { and the }. And so on with every other possible +# combination of outer and inner brace types. For another +# example: +# +# ( [ ..... ] ] ) +# +# which has an extra ] within the parens. +# +# The brace types have indexes 0 .. 3 which are indexes into +# the matrices. +# +# The pair ? : are treated as just another nesting type, with ? acting +# as the opening brace and : acting as the closing brace. +# +# The matrix +# +# $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b]; +# +# saves the nesting depth of brace type $b (where $b is either of the other +# nesting types) when brace type $a enters a new depth. When this depth +# decreases, a check is made that the current depth of brace types $b is +# unchanged, or otherwise there must have been an error. This can +# be very useful for localizing errors, particularly when perl runs to +# the end of a large file (such as this one) and announces that there +# is a problem somewhere. +# +# A numerical sequence number is maintained for every nesting type, +# so that each matching pair can be uniquely identified in a simple +# way. - # check for error..shouldn't happen - unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) { - return $underline; - } - my $excess = length($pos_chr) + $pos - length($underline); - if ( $excess > 0 ) { - $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess ); - } - substr( $underline, $pos, length($pos_chr) ) = $pos_chr; - return ($underline); -} +sub increase_nesting_depth { + my ( $a, $pos ) = @_; -sub is_non_structural_brace { + # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth, + # @current_sequence_number, @depth_array, @starting_line_of_current_depth + my $b; + $current_depth[$a]++; + my $input_line_number = $tokenizer_self->{_last_line_number}; + my $input_line = $tokenizer_self->{_line_text}; - # Decide if a brace or bracket is structural or non-structural - # by looking at the previous token and type + # Sequence numbers increment by number of items. This keeps + # a unique set of numbers but still allows the relative location + # of any type to be determined. + $nesting_sequence_number[$a] += scalar(@closing_brace_names); + my $seqno = $nesting_sequence_number[$a]; + $current_sequence_number[$a][ $current_depth[$a] ] = $seqno; - # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting. - # Tentatively deactivated because it caused the wrong operator expectation - # for this code: - # $user = @vars[1] / 100; - # Must update sub operator_expected before re-implementing. - # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) { - # return 0; - # } + $starting_line_of_current_depth[$a][ $current_depth[$a] ] = + [ $input_line_number, $input_line, $pos ]; - # NOTE: braces after type characters start code blocks, but for - # simplicity these are not identified as such. See also - # sub code_block_type - # if ($last_nonblank_type eq 't') {return 0} + for $b ( 0 .. $#closing_brace_names ) { + next if ( $b == $a ); + $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b]; + } + return $seqno; +} - # otherwise, it is non-structural if it is decorated - # by type information. - # For example, the '{' here is non-structural: ${xxx} - ( - $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/ +sub decrease_nesting_depth { - # or if we follow a hash or array closing curly brace or bracket - # For example, the second '{' in this is non-structural: $a{'x'}{'y'} - # because the first '}' would have been given type 'R' - || $last_nonblank_type =~ /^([R\]])$/ - ); -} + my ( $a, $pos ) = @_; -sub operator_expected { + # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth, + # @current_sequence_number, @depth_array, @starting_line_of_current_depth + my $b; + my $seqno = 0; + my $input_line_number = $tokenizer_self->{_last_line_number}; + my $input_line = $tokenizer_self->{_line_text}; - # Many perl symbols have two or more meanings. For example, '<<' - # can be a shift operator or a here-doc operator. The - # interpretation of these symbols depends on the current state of - # the tokenizer, which may either be expecting a term or an - # operator. For this example, a << would be a shift if an operator - # is expected, and a here-doc if a term is expected. This routine - # is called to make this decision for any current token. It returns - # one of three possible values: - # - # OPERATOR - operator expected (or at least, not a term) - # UNKNOWN - can't tell - # TERM - a term is expected (or at least, not an operator) - # - # The decision is based on what has been seen so far. This - # information is stored in the "$last_nonblank_type" and - # "$last_nonblank_token" variables. For example, if the - # $last_nonblank_type is '=~', then we are expecting a TERM, whereas - # if $last_nonblank_type is 'n' (numeric), we are expecting an - # OPERATOR. - # - # If a UNKNOWN is returned, the calling routine must guess. A major - # goal of this tokenizer is to minimize the possiblity of returning - # UNKNOWN, because a wrong guess can spoil the formatting of a - # script. - # - # adding NEW_TOKENS: it is critically important that this routine be - # updated to allow it to determine if an operator or term is to be - # expected after the new token. Doing this simply involves adding - # the new token character to one of the regexes in this routine or - # to one of the hash lists - # that it uses, which are initialized in the BEGIN section. + if ( $current_depth[$a] > 0 ) { - my ( $prev_type, $tok, $next_type ) = @_; - my $op_expected = UNKNOWN; + $seqno = $current_sequence_number[$a][ $current_depth[$a] ]; -#print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n"; + # check that any brace types $b contained within are balanced + for $b ( 0 .. $#closing_brace_names ) { + next if ( $b == $a ); -# Note: function prototype is available for token type 'U' for future -# program development. It contains the leading and trailing parens, -# and no blanks. It might be used to eliminate token type 'C', for -# example (prototype = '()'). Thus: -# if ($last_nonblank_type eq 'U') { -# print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n"; -# } + unless ( $depth_array[$a][$b][ $current_depth[$a] ] == + $current_depth[$b] ) + { + my $diff = + $current_depth[$b] - + $depth_array[$a][$b][ $current_depth[$a] ]; - # A possible filehandle (or object) requires some care... - if ( $last_nonblank_type eq 'Z' ) { + # don't whine too many times + my $saw_brace_error = get_saw_brace_error(); + if ( + $saw_brace_error <= MAX_NAG_MESSAGES - # angle.t - if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) { - $op_expected = UNKNOWN; - } + # if too many closing types have occured, we probably + # already caught this error + && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) ) + ) + { + interrupt_logfile(); + my $rsl = + $starting_line_of_current_depth[$a][ $current_depth[$a] ]; + my $sl = $$rsl[0]; + my $rel = [ $input_line_number, $input_line, $pos ]; + my $el = $$rel[0]; + my ($ess); - # For possible file handle like "$a", Perl uses weird parsing rules. - # For example: - # print $a/2,"/hi"; - division - # print $a / 2,"/hi"; - division - # print $a/ 2,"/hi"; - division - # print $a /2,"/hi"; - pattern (and error)! - elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) { - $op_expected = TERM; - } + if ( $diff == 1 || $diff == -1 ) { + $ess = ''; + } + else { + $ess = 's'; + } + my $bname = + ( $diff > 0 ) + ? $opening_brace_names[$b] + : $closing_brace_names[$b]; + write_error_indicator_pair( @$rsl, '^' ); + my $msg = <<"EOM"; +Found $diff extra $bname$ess between $opening_brace_names[$a] on line $sl and $closing_brace_names[$a] on line $el +EOM - # Note when an operation is being done where a - # filehandle might be expected, since a change in whitespace - # could change the interpretation of the statement. - else { - if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) { - complain("operator in print statement not recommended\n"); - $op_expected = OPERATOR; + if ( $diff > 0 ) { + my $rml = + $starting_line_of_current_depth[$b] + [ $current_depth[$b] ]; + my $ml = $$rml[0]; + $msg .= +" The most recent un-matched $bname is on line $ml\n"; + write_error_indicator_pair( @$rml, '^' ); + } + write_error_indicator_pair( @$rel, '^' ); + warning($msg); + resume_logfile(); + } + increment_brace_error(); } } + $current_depth[$a]--; } + else { - # handle something after 'do' and 'eval' - elsif ( $is_block_operator{$last_nonblank_token} ) { - - # something like $a = eval "expression"; - # ^ - if ( $last_nonblank_type eq 'k' ) { - $op_expected = TERM; # expression or list mode following keyword - } - - # something like $a = do { BLOCK } / 2; - # ^ - else { - $op_expected = OPERATOR; # block mode following } + my $saw_brace_error = get_saw_brace_error(); + if ( $saw_brace_error <= MAX_NAG_MESSAGES ) { + my $msg = <<"EOM"; +There is no previous $opening_brace_names[$a] to match a $closing_brace_names[$a] on line $input_line_number +EOM + indicate_error( $msg, $input_line_number, $input_line, $pos, '^' ); } + increment_brace_error(); } + return $seqno; +} - # handle bare word.. - elsif ( $last_nonblank_type eq 'w' ) { +sub check_final_nesting_depths { + my ($a); - # unfortunately, we can't tell what type of token to expect next - # after most bare words - $op_expected = UNKNOWN; - } + # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth - # operator, but not term possible after these types - # Note: moved ')' from type to token because parens in list context - # get marked as '{' '}' now. This is a minor glitch in the following: - # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : (); - # - elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ ) - || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) ) - { - $op_expected = OPERATOR; + for $a ( 0 .. $#closing_brace_names ) { - # in a 'use' statement, numbers and v-strings are not true - # numbers, so to avoid incorrect error messages, we will - # mark them as unknown for now (use.t) - # TODO: it would be much nicer to create a new token V for VERSION - # number in a use statement. Then this could be a check on type V - # and related patches which change $statement_type for '=>' - # and ',' could be removed. Further, it would clean things up to - # scan the 'use' statement with a separate subroutine. - if ( ( $statement_type eq 'use' ) - && ( $last_nonblank_type =~ /^[nv]$/ ) ) - { - $op_expected = UNKNOWN; + if ( $current_depth[$a] ) { + my $rsl = $starting_line_of_current_depth[$a][ $current_depth[$a] ]; + my $sl = $$rsl[0]; + my $msg = <<"EOM"; +Final nesting depth of $opening_brace_names[$a]s is $current_depth[$a] +The most recent un-matched $opening_brace_names[$a] is on line $sl +EOM + indicate_error( $msg, @$rsl, '^' ); + increment_brace_error(); } } +} - # no operator after many keywords, such as "die", "warn", etc - elsif ( $expecting_term_token{$last_nonblank_token} ) { +#########i############################################################# +# Tokenizer routines for looking ahead in input stream +####################################################################### - # patch for dor.t (defined or). - # perl functions which may be unary operators - # TODO: This list is incomplete, and these should be put - # into a hash. - if ( $tok eq '/' - && $next_type eq '/' - && $last_nonblank_type eq 'k' - && $last_nonblank_token =~ /^eof|undef|shift|pop$/ ) - { - $op_expected = OPERATOR; - } - else { - $op_expected = TERM; - } - } +sub peek_ahead_for_n_nonblank_pre_tokens { - # no operator after things like + - ** (i.e., other operators) - elsif ( $expecting_term_types{$last_nonblank_type} ) { - $op_expected = TERM; - } + # returns next n pretokens if they exist + # returns undef's if hits eof without seeing any pretokens + # USES GLOBAL VARIABLES: $tokenizer_self + my $max_pretokens = shift; + my $line; + my $i = 0; + my ( $rpre_tokens, $rmap, $rpre_types ); - # a few operators, like "time", have an empty prototype () and so - # take no parameters but produce a value to operate on - elsif ( $expecting_operator_token{$last_nonblank_token} ) { - $op_expected = OPERATOR; + while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) + { + $line =~ s/^\s*//; # trim leading blanks + next if ( length($line) <= 0 ); # skip blank + next if ( $line =~ /^#/ ); # skip comment + ( $rpre_tokens, $rmap, $rpre_types ) = + pre_tokenize( $line, $max_pretokens ); + last; } + return ( $rpre_tokens, $rpre_types ); +} - # post-increment and decrement produce values to be operated on - elsif ( $expecting_operator_types{$last_nonblank_type} ) { - $op_expected = OPERATOR; - } +# look ahead for next non-blank, non-comment line of code +sub peek_ahead_for_nonblank_token { - # no value to operate on after sub block - elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; } + # USES GLOBAL VARIABLES: $tokenizer_self + my ( $rtokens, $max_token_index ) = @_; + my $line; + my $i = 0; - # a right brace here indicates the end of a simple block. - # all non-structural right braces have type 'R' - # all braces associated with block operator keywords have been given those - # keywords as "last_nonblank_token" and caught above. - # (This statement is order dependent, and must come after checking - # $last_nonblank_token). - elsif ( $last_nonblank_type eq '}' ) { + while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) + { + $line =~ s/^\s*//; # trim leading blanks + next if ( length($line) <= 0 ); # skip blank + next if ( $line =~ /^#/ ); # skip comment + my ( $rtok, $rmap, $rtype ) = + pre_tokenize( $line, 2 ); # only need 2 pre-tokens + my $j = $max_token_index + 1; + my $tok; - # patch for dor.t (defined or). - if ( $tok eq '/' - && $next_type eq '/' - && $last_nonblank_token eq ']' ) - { - $op_expected = OPERATOR; - } - else { - $op_expected = TERM; + foreach $tok (@$rtok) { + last if ( $tok =~ "\n" ); + $$rtokens[ ++$j ] = $tok; } + last; } + return $rtokens; +} - # something else..what did I forget? - else { +#########i############################################################# +# Tokenizer guessing routines for ambiguous situations +####################################################################### - # collecting diagnostics on unknown operator types..see what was missed - $op_expected = UNKNOWN; - write_diagnostics( -"OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n" - ); - } +sub guess_if_pattern_or_conditional { - TOKENIZER_DEBUG_FLAG_EXPECT && do { - print -"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n"; - }; - return $op_expected; -} + # this routine is called when we have encountered a ? following an + # unknown bareword, and we must decide if it starts a pattern or not + # input parameters: + # $i - token index of the ? starting possible pattern + # output parameters: + # $is_pattern = 0 if probably not pattern, =1 if probably a pattern + # msg = a warning or diagnostic message + # USES GLOBAL VARIABLES: $last_nonblank_token + my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_; + my $is_pattern = 0; + my $msg = "guessing that ? after $last_nonblank_token starts a "; -# The following routines keep track of nesting depths of the nesting -# types, ( [ { and ?. This is necessary for determining the indentation -# level, and also for debugging programs. Not only do they keep track of -# nesting depths of the individual brace types, but they check that each -# of the other brace types is balanced within matching pairs. For -# example, if the program sees this sequence: -# -# { ( ( ) } -# -# then it can determine that there is an extra left paren somewhere -# between the { and the }. And so on with every other possible -# combination of outer and inner brace types. For another -# example: -# -# ( [ ..... ] ] ) -# -# which has an extra ] within the parens. -# -# The brace types have indexes 0 .. 3 which are indexes into -# the matrices. -# -# The pair ? : are treated as just another nesting type, with ? acting -# as the opening brace and : acting as the closing brace. -# -# The matrix -# -# $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b]; -# -# saves the nesting depth of brace type $b (where $b is either of the other -# nesting types) when brace type $a enters a new depth. When this depth -# decreases, a check is made that the current depth of brace types $b is -# unchanged, or otherwise there must have been an error. This can -# be very useful for localizing errors, particularly when perl runs to -# the end of a large file (such as this one) and announces that there -# is a problem somewhere. -# -# A numerical sequence number is maintained for every nesting type, -# so that each matching pair can be uniquely identified in a simple -# way. + if ( $i >= $max_token_index ) { + $msg .= "conditional (no end to pattern found on the line)\n"; + } + else { + my $ibeg = $i; + $i = $ibeg + 1; + my $next_token = $$rtokens[$i]; # first token after ? -sub increase_nesting_depth { - my ( $a, $i_tok ) = @_; - my $b; - $current_depth[$a]++; + # look for a possible ending ? on this line.. + my $in_quote = 1; + my $quote_depth = 0; + my $quote_character = ''; + my $quote_pos = 0; + my $quoted_string; + ( + $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string + ) + = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, + $quote_pos, $quote_depth, $max_token_index ); - # Sequence numbers increment by number of items. This keeps - # a unique set of numbers but still allows the relative location - # of any type to be determined. - $nesting_sequence_number[$a] += scalar(@closing_brace_names); - my $seqno = $nesting_sequence_number[$a]; - $current_sequence_number[$a][ $current_depth[$a] ] = $seqno; + if ($in_quote) { - my $pos = $$rpretoken_map[$i_tok]; - $starting_line_of_current_depth[$a][ $current_depth[$a] ] = - [ $input_line_number, $input_line, $pos ]; + # we didn't find an ending ? on this line, + # so we bias towards conditional + $is_pattern = 0; + $msg .= "conditional (no ending ? on this line)\n"; - for $b ( 0 .. $#closing_brace_names ) { - next if ( $b == $a ); - $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b]; + # we found an ending ?, so we bias towards a pattern + } + else { + + if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) { + $is_pattern = 1; + $msg .= "pattern (found ending ? and pattern expected)\n"; + } + else { + $msg .= "pattern (uncertain, but found ending ?)\n"; + } + } } - return $seqno; + return ( $is_pattern, $msg ); } -sub decrease_nesting_depth { - - my ( $a, $i_tok ) = @_; - my $pos = $$rpretoken_map[$i_tok]; - my $b; - my $seqno = 0; +sub guess_if_pattern_or_division { - if ( $current_depth[$a] > 0 ) { + # this routine is called when we have encountered a / following an + # unknown bareword, and we must decide if it starts a pattern or is a + # division + # input parameters: + # $i - token index of the / starting possible pattern + # output parameters: + # $is_pattern = 0 if probably division, =1 if probably a pattern + # msg = a warning or diagnostic message + # USES GLOBAL VARIABLES: $last_nonblank_token + my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_; + my $is_pattern = 0; + my $msg = "guessing that / after $last_nonblank_token starts a "; - $seqno = $current_sequence_number[$a][ $current_depth[$a] ]; + if ( $i >= $max_token_index ) { + "division (no end to pattern found on the line)\n"; + } + else { + my $ibeg = $i; + my $divide_expected = + numerator_expected( $i, $rtokens, $max_token_index ); + $i = $ibeg + 1; + my $next_token = $$rtokens[$i]; # first token after slash - # check that any brace types $b contained within are balanced - for $b ( 0 .. $#closing_brace_names ) { - next if ( $b == $a ); + # look for a possible ending / on this line.. + my $in_quote = 1; + my $quote_depth = 0; + my $quote_character = ''; + my $quote_pos = 0; + my $quoted_string; + ( + $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string + ) + = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, + $quote_pos, $quote_depth, $max_token_index ); - unless ( $depth_array[$a][$b][ $current_depth[$a] ] == - $current_depth[$b] ) - { - my $diff = $current_depth[$b] - - $depth_array[$a][$b][ $current_depth[$a] ]; + if ($in_quote) { - # don't whine too many times - my $saw_brace_error = get_saw_brace_error(); - if ( - $saw_brace_error <= MAX_NAG_MESSAGES + # we didn't find an ending / on this line, + # so we bias towards division + if ( $divide_expected >= 0 ) { + $is_pattern = 0; + $msg .= "division (no ending / on this line)\n"; + } + else { + $msg = "multi-line pattern (division not possible)\n"; + $is_pattern = 1; + } - # if too many closing types have occured, we probably - # already caught this error - && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) ) - ) - { - interrupt_logfile(); - my $rsl = - $starting_line_of_current_depth[$a][ $current_depth[$a] ]; - my $sl = $$rsl[0]; - my $rel = [ $input_line_number, $input_line, $pos ]; - my $el = $$rel[0]; - my ($ess); + } - if ( $diff == 1 || $diff == -1 ) { - $ess = ''; + # we found an ending /, so we bias towards a pattern + else { + + if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) { + + if ( $divide_expected >= 0 ) { + + if ( $i - $ibeg > 60 ) { + $msg .= "division (matching / too distant)\n"; + $is_pattern = 0; } else { - $ess = 's'; - } - my $bname = - ( $diff > 0 ) - ? $opening_brace_names[$b] - : $closing_brace_names[$b]; - write_error_indicator_pair( @$rsl, '^' ); - my $msg = <<"EOM"; -Found $diff extra $bname$ess between $opening_brace_names[$a] on line $sl and $closing_brace_names[$a] on line $el -EOM - - if ( $diff > 0 ) { - my $rml = - $starting_line_of_current_depth[$b] - [ $current_depth[$b] ]; - my $ml = $$rml[0]; - $msg .= -" The most recent un-matched $bname is on line $ml\n"; - write_error_indicator_pair( @$rml, '^' ); + $msg .= "pattern (but division possible too)\n"; + $is_pattern = 1; } - write_error_indicator_pair( @$rel, '^' ); - warning($msg); - resume_logfile(); } - increment_brace_error(); + else { + $is_pattern = 1; + $msg .= "pattern (division not possible)\n"; + } } - } - $current_depth[$a]--; - } - else { + else { - my $saw_brace_error = get_saw_brace_error(); - if ( $saw_brace_error <= MAX_NAG_MESSAGES ) { - my $msg = <<"EOM"; -There is no previous $opening_brace_names[$a] to match a $closing_brace_names[$a] on line $input_line_number -EOM - indicate_error( $msg, $input_line_number, $input_line, $pos, '^' ); + if ( $divide_expected >= 0 ) { + $is_pattern = 0; + $msg .= "division (pattern not possible)\n"; + } + else { + $is_pattern = 1; + $msg .= + "pattern (uncertain, but division would not work here)\n"; + } + } } - increment_brace_error(); } - return $seqno; + return ( $is_pattern, $msg ); } -sub check_final_nesting_depths { - my ($a); - - for $a ( 0 .. $#closing_brace_names ) { - - if ( $current_depth[$a] ) { - my $rsl = $starting_line_of_current_depth[$a][ $current_depth[$a] ]; - my $sl = $$rsl[0]; - my $msg = <<"EOM"; -Final nesting depth of $opening_brace_names[$a]s is $current_depth[$a] -The most recent un-matched $opening_brace_names[$a] is on line $sl -EOM - indicate_error( $msg, @$rsl, '^' ); - increment_brace_error(); - } - } -} +# try to resolve here-doc vs. shift by looking ahead for +# non-code or the end token (currently only looks for end token) +# returns 1 if it is probably a here doc, 0 if not +sub guess_if_here_doc { -sub numerator_expected { + # This is how many lines we will search for a target as part of the + # guessing strategy. It is a constant because there is probably + # little reason to change it. + # USES GLOBAL VARIABLES: $tokenizer_self, $current_package + # %is_constant, + use constant HERE_DOC_WINDOW => 40; - # this is a filter for a possible numerator, in support of guessing - # for the / pattern delimiter token. - # returns - - # 1 - yes - # 0 - can't tell - # -1 - no - # Note: I am using the convention that variables ending in - # _expected have these 3 possible values. - my ( $i, $rtokens ) = @_; - my $next_token = $$rtokens[ $i + 1 ]; - if ( $next_token eq '=' ) { $i++; } # handle /= - my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + my $next_token = shift; + my $here_doc_expected = 0; + my $line; + my $k = 0; + my $msg = "checking <<"; - if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) { - 1; - } - else { + while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) ) + { + chomp $line; - if ( $next_nonblank_token =~ /^\s*$/ ) { - 0; - } - else { - -1; + if ( $line =~ /^$next_token$/ ) { + $msg .= " -- found target $next_token ahead $k lines\n"; + $here_doc_expected = 1; # got it + last; } + last if ( $k >= HERE_DOC_WINDOW ); } -} - -sub pattern_expected { - # This is the start of a filter for a possible pattern. - # It looks at the token after a possbible pattern and tries to - # determine if that token could end a pattern. - # returns - - # 1 - yes - # 0 - can't tell - # -1 - no - my ( $i, $rtokens ) = @_; - my $next_token = $$rtokens[ $i + 1 ]; - if ( $next_token =~ /^[cgimosx]/ ) { $i++; } # skip possible modifier - my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); + unless ($here_doc_expected) { - # list of tokens which may follow a pattern - # (can probably be expanded) - if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ ) - { - 1; - } - else { + if ( !defined($line) ) { + $here_doc_expected = -1; # hit eof without seeing target + $msg .= " -- must be shift; target $next_token not in file\n"; - if ( $next_nonblank_token =~ /^\s*$/ ) { - 0; } - else { - -1; + else { # still unsure..taking a wild guess + + if ( !$is_constant{$current_package}{$next_token} ) { + $here_doc_expected = 1; + $msg .= + " -- guessing it's a here-doc ($next_token not a constant)\n"; + } + else { + $msg .= + " -- guessing it's a shift ($next_token is a constant)\n"; + } } } + write_logfile_entry($msg); + return $here_doc_expected; } -sub find_next_nonblank_token_on_this_line { - my ( $i, $rtokens ) = @_; - my $next_nonblank_token; +#########i############################################################# +# Tokenizer Routines for scanning identifiers and related items +####################################################################### - if ( $i < $max_token_index ) { - $next_nonblank_token = $$rtokens[ ++$i ]; +sub scan_bare_identifier_do { - if ( $next_nonblank_token =~ /^\s*$/ ) { + # this routine is called to scan a token starting with an alphanumeric + # variable or package separator, :: or '. + # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, + # $last_nonblank_type,@paren_type, $paren_depth - if ( $i < $max_token_index ) { - $next_nonblank_token = $$rtokens[ ++$i ]; - } - } - } - else { - $next_nonblank_token = ""; - } - return ( $next_nonblank_token, $i ); -} + my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map, + $max_token_index ) + = @_; + my $i_begin = $i; + my $package = undef; -sub find_next_nonblank_token { - my ( $i, $rtokens ) = @_; + my $i_beg = $i; - if ( $i >= $max_token_index ) { + # we have to back up one pretoken at a :: since each : is one pretoken + if ( $tok eq '::' ) { $i_beg-- } + if ( $tok eq '->' ) { $i_beg-- } + my $pos_beg = $$rtoken_map[$i_beg]; + pos($input_line) = $pos_beg; - if ( !$peeked_ahead ) { - $peeked_ahead = 1; - $rtokens = peek_ahead_for_nonblank_token($rtokens); - } - } - my $next_nonblank_token = $$rtokens[ ++$i ]; + # Examples: + # A::B::C + # A:: + # ::A + # A'B + if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) { - if ( $next_nonblank_token =~ /^\s*$/ ) { - $next_nonblank_token = $$rtokens[ ++$i ]; - } - return ( $next_nonblank_token, $i ); -} + my $pos = pos($input_line); + my $numc = $pos - $pos_beg; + $tok = substr( $input_line, $pos_beg, $numc ); -sub peek_ahead_for_n_nonblank_pre_tokens { + # type 'w' includes anything without leading type info + # ($,%,@,*) including something like abc::def::ghi + $type = 'w'; - # returns next n pretokens if they exist - # returns undef's if hits eof without seeing any pretokens - my $max_pretokens = shift; - my $line; - my $i = 0; - my ( $rpre_tokens, $rmap, $rpre_types ); - - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) - { - $line =~ s/^\s*//; # trim leading blanks - next if ( length($line) <= 0 ); # skip blank - next if ( $line =~ /^#/ ); # skip comment - ( $rpre_tokens, $rmap, $rpre_types ) = - pre_tokenize( $line, $max_pretokens ); - last; - } - return ( $rpre_tokens, $rpre_types ); -} + my $sub_name = ""; + if ( defined($2) ) { $sub_name = $2; } + if ( defined($1) ) { + $package = $1; -# look ahead for next non-blank, non-comment line of code -sub peek_ahead_for_nonblank_token { - my $rtokens = shift; - my $line; - my $i = 0; + # patch: don't allow isolated package name which just ends + # in the old style package separator (single quote). Example: + # use CGI':all'; + if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) { + $pos--; + } - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) ) - { - $line =~ s/^\s*//; # trim leading blanks - next if ( length($line) <= 0 ); # skip blank - next if ( $line =~ /^#/ ); # skip comment - my ( $rtok, $rmap, $rtype ) = - pre_tokenize( $line, 2 ); # only need 2 pre-tokens - my $j = $max_token_index + 1; - my $tok; + $package =~ s/\'/::/g; + if ( $package =~ /^\:/ ) { $package = 'main' . $package } + $package =~ s/::$//; + } + else { + $package = $current_package; - foreach $tok (@$rtok) { - last if ( $tok =~ "\n" ); - $$rtokens[ ++$j ] = $tok; + if ( $is_keyword{$tok} ) { + $type = 'k'; + } } - last; - } - return $rtokens; -} -sub pre_tokenize { + # if it is a bareword.. + if ( $type eq 'w' ) { - # Break a string, $str, into a sequence of preliminary tokens. We - # are interested in these types of tokens: - # words (type='w'), example: 'max_tokens_wanted' - # digits (type = 'd'), example: '0755' - # whitespace (type = 'b'), example: ' ' - # any other single character (i.e. punct; type = the character itself). - # We cannot do better than this yet because we might be in a quoted - # string or pattern. Caller sets $max_tokens_wanted to 0 to get all - # tokens. - my ( $str, $max_tokens_wanted ) = @_; + # check for v-string with leading 'v' type character + # (This seems to have presidence over filehandle, type 'Y') + if ( $tok =~ /^v\d[_\d]*$/ ) { - # we return references to these 3 arrays: - my @tokens = (); # array of the tokens themselves - my @token_map = (0); # string position of start of each token - my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct + # we only have the first part - something like 'v101' - + # look for more + if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) { + $pos = pos($input_line); + $numc = $pos - $pos_beg; + $tok = substr( $input_line, $pos_beg, $numc ); + } + $type = 'v'; - do { + # warn if this version can't handle v-strings + report_v_string($tok); + } - # whitespace - if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; } + elsif ( $is_constant{$package}{$sub_name} ) { + $type = 'C'; + } - # numbers - # note that this must come before words! - elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; } + # bareword after sort has implied empty prototype; for example: + # @sorted = sort numerically ( 53, 29, 11, 32, 7 ); + # This has priority over whatever the user has specified. + elsif ($last_nonblank_token eq 'sort' + && $last_nonblank_type eq 'k' ) + { + $type = 'Z'; + } - # words - elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; } + # Note: strangely, perl does not seem to really let you create + # functions which act like eval and do, in the sense that eval + # and do may have operators following the final }, but any operators + # that you create with prototype (&) apparently do not allow + # trailing operators, only terms. This seems strange. + # If this ever changes, here is the update + # to make perltidy behave accordingly: - # single-character punctuation - elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; } + # elsif ( $is_block_function{$package}{$tok} ) { + # $tok='eval'; # patch to do braces like eval - doesn't work + # $type = 'k'; + #} + # FIXME: This could become a separate type to allow for different + # future behavior: + elsif ( $is_block_function{$package}{$sub_name} ) { + $type = 'G'; + } - # that's all.. - else { - return ( \@tokens, \@token_map, \@type ); - } + elsif ( $is_block_list_function{$package}{$sub_name} ) { + $type = 'G'; + } + elsif ( $is_user_function{$package}{$sub_name} ) { + $type = 'U'; + $prototype = $user_function_prototype{$package}{$sub_name}; + } - push @tokens, $1; - push @token_map, pos($str); + # check for indirect object + elsif ( - } while ( --$max_tokens_wanted != 0 ); + # added 2001-03-27: must not be followed immediately by '(' + # see fhandle.t + ( $input_line !~ m/\G\(/gc ) - return ( \@tokens, \@token_map, \@type ); -} + # and + && ( -sub show_tokens { + # preceded by keyword like 'print', 'printf' and friends + $is_indirect_object_taker{$last_nonblank_token} - # this is an old debug routine - my ( $rtokens, $rtoken_map ) = @_; - my $num = scalar(@$rtokens); - my $i; + # or preceded by something like 'print(' or 'printf(' + || ( + ( $last_nonblank_token eq '(' ) + && $is_indirect_object_taker{ $paren_type[$paren_depth] + } - for ( $i = 0 ; $i < $num ; $i++ ) { - my $len = length( $$rtokens[$i] ); - print "$i:$len:$$rtoken_map[$i]:$$rtokens[$i]:\n"; + ) + ) + ) + { + + # may not be indirect object unless followed by a space + if ( $input_line =~ m/\G\s+/gc ) { + $type = 'Y'; + + # Abandon Hope ... + # Perl's indirect object notation is a very bad + # thing and can cause subtle bugs, especially for + # beginning programmers. And I haven't even been + # able to figure out a sane warning scheme which + # doesn't get in the way of good scripts. + + # Complain if a filehandle has any lower case + # letters. This is suggested good practice, but the + # main reason for this warning is that prior to + # release 20010328, perltidy incorrectly parsed a + # function call after a print/printf, with the + # result that a space got added before the opening + # paren, thereby converting the function name to a + # filehandle according to perl's weird rules. This + # will not usually generate a syntax error, so this + # is a potentially serious bug. By warning + # of filehandles with any lower case letters, + # followed by opening parens, we will help the user + # find almost all of these older errors. + # use 'sub_name' because something like + # main::MYHANDLE is ok for filehandle + if ( $sub_name =~ /[a-z]/ ) { + + # could be bug caused by older perltidy if + # followed by '(' + if ( $input_line =~ m/\G\s*\(/gc ) { + complain( +"Caution: unknown word '$tok' in indirect object slot\n" + ); + } + } + } + + # bareword not followed by a space -- may not be filehandle + # (may be function call defined in a 'use' statement) + else { + $type = 'Z'; + } + } + } + + # Now we must convert back from character position + # to pre_token index. + # I don't think an error flag can occur here ..but who knows + my $error; + ( $i, $error ) = + inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index ); + if ($error) { + warning("scan_bare_identifier: Possibly invalid tokenization\n"); + } } -} -sub find_angle_operator_termination { + # no match but line not blank - could be syntax error + # perl will take '::' alone without complaint + else { + $type = 'w'; - # We are looking at a '<' and want to know if it is an angle operator. - # We are to return: - # $i = pretoken index of ending '>' if found, current $i otherwise - # $type = 'Q' if found, '>' otherwise - my ( $input_line, $i_beg, $rtoken_map, $expecting ) = @_; - my $i = $i_beg; - my $type = '<'; - pos($input_line) = 1 + $$rtoken_map[$i]; + # change this warning to log message if it becomes annoying + warning("didn't find identifier after leading ::\n"); + } + return ( $i, $tok, $type, $prototype ); +} - my $filter; +sub scan_id_do { - # we just have to find the next '>' if a term is expected - if ( $expecting == TERM ) { $filter = '[\>]' } +# This is the new scanner and will eventually replace scan_identifier. +# Only type 'sub' and 'package' are implemented. +# Token types $ * % @ & -> are not yet implemented. +# +# Scan identifier following a type token. +# The type of call depends on $id_scan_state: $id_scan_state = '' +# for starting call, in which case $tok must be the token defining +# the type. +# +# If the type token is the last nonblank token on the line, a value +# of $id_scan_state = $tok is returned, indicating that further +# calls must be made to get the identifier. If the type token is +# not the last nonblank token on the line, the identifier is +# scanned and handled and a value of '' is returned. +# USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list, +# $statement_type, $tokenizer_self + + my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state, + $max_token_index ) + = @_; + my $type = ''; + my ( $i_beg, $pos_beg ); - # we have to guess if we don't know what is expected - elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' } + #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n"; + #my ($a,$b,$c) = caller; + #print "NSCAN: scan_id called with tok=$tok $a $b $c\n"; - # shouldn't happen - we shouldn't be here if operator is expected - else { warning("Program Bug in find_angle_operator_termination\n") } + # on re-entry, start scanning at first token on the line + if ($id_scan_state) { + $i_beg = $i; + $type = ''; + } - # To illustrate what we might be looking at, in case we are - # guessing, here are some examples of valid angle operators - # (or file globs): - # - # - # <$fh> - # <*.c *.h> - # <_> - # ( glob.t) - # <${PREFIX}*img*.$IMAGE_TYPE> - # - # - # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl> - # - # Here are some examples of lines which do not have angle operators: - # return undef unless $self->[2]++ < $#{$self->[1]}; - # < 2 || @$t > - # - # the following line from dlister.pl caused trouble: - # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n"; - # - # If the '<' starts an angle operator, it must end on this line and - # it must not have certain characters like ';' and '=' in it. I use - # this to limit the testing. This filter should be improved if - # possible. + # on initial entry, start scanning just after type token + else { + $i_beg = $i + 1; + $id_scan_state = $tok; + $type = 't'; + } - if ( $input_line =~ /($filter)/g ) { + # find $i_beg = index of next nonblank token, + # and handle empty lines + my $blank_line = 0; + my $next_nonblank_token = $$rtokens[$i_beg]; + if ( $i_beg > $max_token_index ) { + $blank_line = 1; + } + else { - if ( $1 eq '>' ) { + # only a '#' immediately after a '$' is not a comment + if ( $next_nonblank_token eq '#' ) { + unless ( $tok eq '$' ) { + $blank_line = 1; + } + } - # We MAY have found an angle operator termination if we get - # here, but we need to do more to be sure we haven't been - # fooled. - my $pos = pos($input_line); + if ( $next_nonblank_token =~ /^\s/ ) { + ( $next_nonblank_token, $i_beg ) = + find_next_nonblank_token_on_this_line( $i_beg, $rtokens, + $max_token_index ); + if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) { + $blank_line = 1; + } + } + } - my $pos_beg = $$rtoken_map[$i]; - my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) ); + # handle non-blank line; identifier, if any, must follow + unless ($blank_line) { - # Reject if the closing '>' follows a '-' as in: - # if ( VERSION < 5.009 && $op-> name eq 'aassign' ) { } - if ( $expecting eq UNKNOWN ) { - my $check = substr( $input_line, $pos - 2, 1 ); - if ( $check eq '-' ) { - return ( $i, $type ); - } - } + if ( $id_scan_state eq 'sub' ) { + ( $i, $tok, $type, $id_scan_state ) = do_scan_sub( + $input_line, $i, $i_beg, + $tok, $type, $rtokens, + $rtoken_map, $id_scan_state, $max_token_index + ); + } - ######################################debug##### - #write_diagnostics( "ANGLE? :$str\n"); - #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n"; - ######################################debug##### - $type = 'Q'; - my $error; - ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map ); + elsif ( $id_scan_state eq 'package' ) { + ( $i, $tok, $type ) = + do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens, + $rtoken_map, $max_token_index ); + $id_scan_state = ''; + } - # It may be possible that a quote ends midway in a pretoken. - # If this happens, it may be necessary to split the pretoken. - if ($error) { - warning( - "Possible tokinization error..please check this line\n"); - report_possible_bug(); - } + else { + warning("invalid token in scan_id: $tok\n"); + $id_scan_state = ''; + } + } - # Now let's see where we stand.... - # OK if math op not possible - if ( $expecting == TERM ) { - } + if ( $id_scan_state && ( !defined($type) || !$type ) ) { - # OK if there are no more than 2 pre-tokens inside - # (not possible to write 2 token math between < and >) - # This catches most common cases - elsif ( $i <= $i_beg + 3 ) { - write_diagnostics("ANGLE(1 or 2 tokens): $str\n"); - } + # shouldn't happen: + warning( +"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n" + ); + report_definite_bug(); + } - # Not sure.. - else { + TOKENIZER_DEBUG_FLAG_NSCAN && do { + print + "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n"; + }; + return ( $i, $tok, $type, $id_scan_state ); +} - # Let's try a Brace Test: any braces inside must balance - my $br = 0; - while ( $str =~ /\{/g ) { $br++ } - while ( $str =~ /\}/g ) { $br-- } - my $sb = 0; - while ( $str =~ /\[/g ) { $sb++ } - while ( $str =~ /\]/g ) { $sb-- } - my $pr = 0; - while ( $str =~ /\(/g ) { $pr++ } - while ( $str =~ /\)/g ) { $pr-- } +sub check_prototype { + my ( $proto, $package, $subname ) = @_; + return unless ( defined($package) && defined($subname) ); + if ( defined($proto) ) { + $proto =~ s/^\s*\(\s*//; + $proto =~ s/\s*\)$//; + if ($proto) { + $is_user_function{$package}{$subname} = 1; + $user_function_prototype{$package}{$subname} = "($proto)"; - # if braces do not balance - not angle operator - if ( $br || $sb || $pr ) { - $i = $i_beg; - $type = '<'; - write_diagnostics( - "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n"); + # prototypes containing '&' must be treated specially.. + if ( $proto =~ /\&/ ) { + + # right curly braces of prototypes ending in + # '&' may be followed by an operator + if ( $proto =~ /\&$/ ) { + $is_block_function{$package}{$subname} = 1; } - # we should keep doing more checks here...to be continued - # Tentatively accepting this as a valid angle operator. - # There are lots more things that can be checked. - else { - write_diagnostics( - "ANGLE-Guessing yes: $str expecting=$expecting\n"); - write_logfile_entry("Guessing angle operator here: $str\n"); + # right curly braces of prototypes NOT ending in + # '&' may NOT be followed by an operator + elsif ( $proto !~ /\&$/ ) { + $is_block_list_function{$package}{$subname} = 1; } } } - - # didn't find ending > else { - if ( $expecting == TERM ) { - warning("No ending > for angle operator\n"); - } + $is_constant{$package}{$subname} = 1; } } - return ( $i, $type ); + else { + $is_user_function{$package}{$subname} = 1; + } } -sub inverse_pretoken_map { +sub do_scan_package { - # Starting with the current pre_token index $i, scan forward until - # finding the index of the next pre_token whose position is $pos. - my ( $i, $pos, $rtoken_map ) = @_; - my $error = 0; + # do_scan_package parses a package name + # it is called with $i_beg equal to the index of the first nonblank + # token following a 'package' token. + # USES GLOBAL VARIABLES: $current_package, - while ( ++$i <= $max_token_index ) { + my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map, + $max_token_index ) + = @_; + my $package = undef; + my $pos_beg = $$rtoken_map[$i_beg]; + pos($input_line) = $pos_beg; - if ( $pos <= $$rtoken_map[$i] ) { + # handle non-blank line; package name, if any, must follow + if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) { + $package = $1; + $package = ( defined($1) && $1 ) ? $1 : 'main'; + $package =~ s/\'/::/g; + if ( $package =~ /^\:/ ) { $package = 'main' . $package } + $package =~ s/::$//; + my $pos = pos($input_line); + my $numc = $pos - $pos_beg; + $tok = 'package ' . substr( $input_line, $pos_beg, $numc ); + $type = 'i'; - # Let the calling routine handle errors in which we do not - # land on a pre-token boundary. It can happen by running - # perltidy on some non-perl scripts, for example. - if ( $pos < $$rtoken_map[$i] ) { $error = 1 } - $i--; - last; + # Now we must convert back from character position + # to pre_token index. + # I don't think an error flag can occur here ..but ? + my $error; + ( $i, $error ) = + inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index ); + if ($error) { warning("Possibly invalid package\n") } + $current_package = $package; + + # check for error + my ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $i, $rtokens, $max_token_index ); + if ( $next_nonblank_token !~ /^[;\}]$/ ) { + warning( + "Unexpected '$next_nonblank_token' after package name '$tok'\n" + ); } } - return ( $i, $error ); -} - -sub guess_if_pattern_or_conditional { - - # this routine is called when we have encountered a ? following an - # unknown bareword, and we must decide if it starts a pattern or not - # input parameters: - # $i - token index of the ? starting possible pattern - # output parameters: - # $is_pattern = 0 if probably not pattern, =1 if probably a pattern - # msg = a warning or diagnostic message - my ( $i, $rtokens, $rtoken_map ) = @_; - my $is_pattern = 0; - my $msg = "guessing that ? after $last_nonblank_token starts a "; - if ( $i >= $max_token_index ) { - $msg .= "conditional (no end to pattern found on the line)\n"; - } + # no match but line not blank -- + # could be a label with name package, like package: , for example. else { - my $ibeg = $i; - $i = $ibeg + 1; - my $next_token = $$rtokens[$i]; # first token after ? - - # look for a possible ending ? on this line.. - my $in_quote = 1; - my $quote_depth = 0; - my $quote_character = ''; - my $quote_pos = 0; - ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) = - follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, - $quote_pos, $quote_depth ); + $type = 'k'; + } - if ($in_quote) { + return ( $i, $tok, $type ); +} - # we didn't find an ending ? on this line, - # so we bias towards conditional - $is_pattern = 0; - $msg .= "conditional (no ending ? on this line)\n"; +sub scan_identifier_do { - # we found an ending ?, so we bias towards a pattern - } + # This routine assembles tokens into identifiers. It maintains a + # scan state, id_scan_state. It updates id_scan_state based upon + # current id_scan_state and token, and returns an updated + # id_scan_state and the next index after the identifier. + # USES GLOBAL VARIABLES: $context, $last_nonblank_token, + # $last_nonblank_type + + my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index ) = @_; + my $i_begin = $i; + my $type = ''; + my $tok_begin = $$rtokens[$i_begin]; + if ( $tok_begin eq ':' ) { $tok_begin = '::' } + my $id_scan_state_begin = $id_scan_state; + my $identifier_begin = $identifier; + my $tok = $tok_begin; + my $message = ""; + + # these flags will be used to help figure out the type: + my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ ); + my $saw_type; + + # allow old package separator (') except in 'use' statement + my $allow_tick = ( $last_nonblank_token ne 'use' ); + + # get started by defining a type and a state if necessary + unless ($id_scan_state) { + $context = UNKNOWN_CONTEXT; + + # fixup for digraph + if ( $tok eq '>' ) { + $tok = '->'; + $tok_begin = $tok; + } + $identifier = $tok; + + if ( $tok eq '$' || $tok eq '*' ) { + $id_scan_state = '$'; + $context = SCALAR_CONTEXT; + } + elsif ( $tok eq '%' || $tok eq '@' ) { + $id_scan_state = '$'; + $context = LIST_CONTEXT; + } + elsif ( $tok eq '&' ) { + $id_scan_state = '&'; + } + elsif ( $tok eq 'sub' or $tok eq 'package' ) { + $saw_alpha = 0; # 'sub' is considered type info here + $id_scan_state = '$'; + $identifier .= ' '; # need a space to separate sub from sub name + } + elsif ( $tok eq '::' ) { + $id_scan_state = 'A'; + } + elsif ( $tok =~ /^[A-Za-z_]/ ) { + $id_scan_state = ':'; + } + elsif ( $tok eq '->' ) { + $id_scan_state = '$'; + } else { - if ( pattern_expected( $i, $rtokens ) >= 0 ) { - $is_pattern = 1; - $msg .= "pattern (found ending ? and pattern expected)\n"; - } - else { - $msg .= "pattern (uncertain, but found ending ?)\n"; - } + # shouldn't happen + my ( $a, $b, $c ) = caller; + warning("Program Bug: scan_identifier given bad token = $tok \n"); + warning(" called from sub $a line: $c\n"); + report_definite_bug(); } + $saw_type = !$saw_alpha; + } + else { + $i--; + $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); } - return ( $is_pattern, $msg ); -} -sub guess_if_pattern_or_division { + # now loop to gather the identifier + my $i_save = $i; - # this routine is called when we have encountered a / following an - # unknown bareword, and we must decide if it starts a pattern or is a - # division - # input parameters: - # $i - token index of the / starting possible pattern - # output parameters: - # $is_pattern = 0 if probably division, =1 if probably a pattern - # msg = a warning or diagnostic message - my ( $i, $rtokens, $rtoken_map ) = @_; - my $is_pattern = 0; - my $msg = "guessing that / after $last_nonblank_token starts a "; + while ( $i < $max_token_index ) { + $i_save = $i unless ( $tok =~ /^\s*$/ ); + $tok = $$rtokens[ ++$i ]; - if ( $i >= $max_token_index ) { - "division (no end to pattern found on the line)\n"; - } - else { - my $ibeg = $i; - my $divide_expected = numerator_expected( $i, $rtokens ); - $i = $ibeg + 1; - my $next_token = $$rtokens[$i]; # first token after slash + if ( ( $tok eq ':' ) && ( $$rtokens[ $i + 1 ] eq ':' ) ) { + $tok = '::'; + $i++; + } - # look for a possible ending / on this line.. - my $in_quote = 1; - my $quote_depth = 0; - my $quote_character = ''; - my $quote_pos = 0; - ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) = - follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, - $quote_pos, $quote_depth ); + if ( $id_scan_state eq '$' ) { # starting variable name - if ($in_quote) { + if ( $tok eq '$' ) { - # we didn't find an ending / on this line, - # so we bias towards division - if ( $divide_expected >= 0 ) { - $is_pattern = 0; - $msg .= "division (no ending / on this line)\n"; + $identifier .= $tok; + + # we've got a punctuation variable if end of line (punct.t) + if ( $i == $max_token_index ) { + $type = 'i'; + $id_scan_state = ''; + last; + } } - else { - $msg = "multi-line pattern (division not possible)\n"; - $is_pattern = 1; + elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric .. + $saw_alpha = 1; + $id_scan_state = ':'; # now need :: + $identifier .= $tok; } + elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. + $saw_alpha = 1; + $id_scan_state = ':'; # now need :: + $identifier .= $tok; - } + # Perl will accept leading digits in identifiers, + # although they may not always produce useful results. + # Something like $main::0 is ok. But this also works: + # + # sub howdy::123::bubba{ print "bubba $54321!\n" } + # howdy::123::bubba(); + # + } + elsif ( $tok =~ /^[0-9]/ ) { # numeric + $saw_alpha = 1; + $id_scan_state = ':'; # now need :: + $identifier .= $tok; + } + elsif ( $tok eq '::' ) { + $id_scan_state = 'A'; + $identifier .= $tok; + } + elsif ( ( $tok eq '#' ) && ( $identifier eq '$' ) ) { # $#array + $identifier .= $tok; # keep same state, a $ could follow + } + elsif ( $tok eq '{' ) { - # we found an ending /, so we bias towards a pattern - else { + # check for something like ${#} or ${©} + if ( $identifier eq '$' + && $i + 2 <= $max_token_index + && $$rtokens[ $i + 2 ] eq '}' + && $$rtokens[ $i + 1 ] !~ /[\s\w]/ ) + { + my $next2 = $$rtokens[ $i + 2 ]; + my $next1 = $$rtokens[ $i + 1 ]; + $identifier .= $tok . $next1 . $next2; + $i += 2; + $id_scan_state = ''; + last; + } - if ( pattern_expected( $i, $rtokens ) >= 0 ) { + # skip something like ${xxx} or ->{ + $id_scan_state = ''; - if ( $divide_expected >= 0 ) { + # if this is the first token of a line, any tokens for this + # identifier have already been accumulated + if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; } + $i = $i_save; + last; + } - if ( $i - $ibeg > 60 ) { - $msg .= "division (matching / too distant)\n"; - $is_pattern = 0; + # space ok after leading $ % * & @ + elsif ( $tok =~ /^\s*$/ ) { + + if ( $identifier =~ /^[\$\%\*\&\@]/ ) { + + if ( length($identifier) > 1 ) { + $id_scan_state = ''; + $i = $i_save; + $type = 'i'; # probably punctuation variable + last; } else { - $msg .= "pattern (but division possible too)\n"; - $is_pattern = 1; + + # spaces after $'s are common, and space after @ + # is harmless, so only complain about space + # after other type characters. Space after $ and + # @ will be removed in formatting. Report space + # after % and * because they might indicate a + # parsing error. In other words '% ' might be a + # modulo operator. Delete this warning if it + # gets annoying. + if ( $identifier !~ /^[\@\$]$/ ) { + $message = + "Space in identifier, following $identifier\n"; + } } } - else { - $is_pattern = 1; - $msg .= "pattern (division not possible)\n"; - } + + # else: + # space after '->' is ok } - else { + elsif ( $tok eq '^' ) { - if ( $divide_expected >= 0 ) { - $is_pattern = 0; - $msg .= "division (pattern not possible)\n"; + # check for some special variables like $^W + if ( $identifier =~ /^[\$\*\@\%]$/ ) { + $identifier .= $tok; + $id_scan_state = 'A'; + + # Perl accepts '$^]' or '@^]', but + # there must not be a space before the ']'. + my $next1 = $$rtokens[ $i + 1 ]; + if ( $next1 eq ']' ) { + $i++; + $identifier .= $next1; + $id_scan_state = ""; + last; + } } else { - $is_pattern = 1; - $msg .= - "pattern (uncertain, but division would not work here)\n"; + $id_scan_state = ''; } } - } - } - return ( $is_pattern, $msg ); -} + else { # something else -sub find_here_doc { + # check for various punctuation variables + if ( $identifier =~ /^[\$\*\@\%]$/ ) { + $identifier .= $tok; + } - # find the target of a here document, if any - # input parameters: - # $i - token index of the second < of << - # ($i must be less than the last token index if this is called) - # output parameters: - # $found_target = 0 didn't find target; =1 found target - # HERE_TARGET - the target string (may be empty string) - # $i - unchanged if not here doc, - # or index of the last token of the here target - my ( $expecting, $i, $rtokens, $rtoken_map ) = @_; - my $ibeg = $i; - my $found_target = 0; - my $here_doc_target = ''; - my $here_quote_character = ''; - my ( $next_nonblank_token, $i_next_nonblank, $next_token ); - $next_token = $$rtokens[ $i + 1 ]; - - # perl allows a backslash before the target string (heredoc.t) - my $backslash = 0; - if ( $next_token eq '\\' ) { - $backslash = 1; - $next_token = $$rtokens[ $i + 2 ]; - } - - ( $next_nonblank_token, $i_next_nonblank ) = - find_next_nonblank_token_on_this_line( $i, $rtokens ); - - if ( $next_nonblank_token =~ /[\'\"\`]/ ) { - - my $in_quote = 1; - my $quote_depth = 0; - my $quote_pos = 0; - - ( $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth ) = - follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens, - $here_quote_character, $quote_pos, $quote_depth ); - - if ($in_quote) { # didn't find end of quote, so no target found - $i = $ibeg; - } - else { # found ending quote - my $j; - $found_target = 1; - - my $tokj; - for ( $j = $i_next_nonblank + 1 ; $j < $i ; $j++ ) { - $tokj = $$rtokens[$j]; - - # we have to remove any backslash before the quote character - # so that the here-doc-target exactly matches this string - next - if ( $tokj eq "\\" - && $j < $i - 1 - && $$rtokens[ $j + 1 ] eq $here_quote_character ); - $here_doc_target .= $tokj; - } - } - } - - elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) { - $found_target = 1; - write_logfile_entry( - "found blank here-target after <<; suggest using \"\"\n"); - $i = $ibeg; - } - elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after << - - my $here_doc_expected; - if ( $expecting == UNKNOWN ) { - $here_doc_expected = guess_if_here_doc($next_token); - } - else { - $here_doc_expected = 1; - } - - if ($here_doc_expected) { - $found_target = 1; - $here_doc_target = $next_token; - $i = $ibeg + 1; - } - - } - else { - - if ( $expecting == TERM ) { - $found_target = 1; - write_logfile_entry("Note: bare here-doc operator <<\n"); - } - else { - $i = $ibeg; - } - } - - # patch to neglect any prepended backslash - if ( $found_target && $backslash ) { $i++ } - - return ( $found_target, $here_doc_target, $here_quote_character, $i ); -} - -# try to resolve here-doc vs. shift by looking ahead for -# non-code or the end token (currently only looks for end token) -# returns 1 if it is probably a here doc, 0 if not -sub guess_if_here_doc { - - # This is how many lines we will search for a target as part of the - # guessing strategy. It is a constant because there is probably - # little reason to change it. - use constant HERE_DOC_WINDOW => 40; - - my $next_token = shift; - my $here_doc_expected = 0; - my $line; - my $k = 0; - my $msg = "checking <<"; - - while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) ) - { - chomp $line; - - if ( $line =~ /^$next_token$/ ) { - $msg .= " -- found target $next_token ahead $k lines\n"; - $here_doc_expected = 1; # got it - last; - } - last if ( $k >= HERE_DOC_WINDOW ); - } - - unless ($here_doc_expected) { - - if ( !defined($line) ) { - $here_doc_expected = -1; # hit eof without seeing target - $msg .= " -- must be shift; target $next_token not in file\n"; - - } - else { # still unsure..taking a wild guess - - if ( !$is_constant{$current_package}{$next_token} ) { - $here_doc_expected = 1; - $msg .= - " -- guessing it's a here-doc ($next_token not a constant)\n"; - } - else { - $msg .= - " -- guessing it's a shift ($next_token is a constant)\n"; - } - } - } - write_logfile_entry($msg); - return $here_doc_expected; -} - -sub do_quote { - - # follow (or continue following) quoted string or pattern - # $in_quote return code: - # 0 - ok, found end - # 1 - still must find end of quote whose target is $quote_character - # 2 - still looking for end of first of two quotes - my ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth, $rtokens, - $rtoken_map ) - = @_; - - if ( $in_quote == 2 ) { # two quotes/patterns to follow - my $ibeg = $i; - ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) = - follow_quoted_string( $i, $in_quote, $rtokens, $quote_character, - $quote_pos, $quote_depth ); - - if ( $in_quote == 1 ) { - if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; } - $quote_character = ''; - } - } - - if ( $in_quote == 1 ) { # one (more) quote to follow - my $ibeg = $i; - ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) = - follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, - $quote_pos, $quote_depth ); - } - return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ); -} - -sub scan_number_do { - - # scan a number in any of the formats that Perl accepts - # Underbars (_) are allowed in decimal numbers. - # input parameters - - # $input_line - the string to scan - # $i - pre_token index to start scanning - # $rtoken_map - reference to the pre_token map giving starting - # character position in $input_line of token $i - # output parameters - - # $i - last pre_token index of the number just scanned - # number - the number (characters); or undef if not a number - - my ( $input_line, $i, $rtoken_map, $input_type ) = @_; - my $pos_beg = $$rtoken_map[$i]; - my $pos; - my $i_begin = $i; - my $number = undef; - my $type = $input_type; - - my $first_char = substr( $input_line, $pos_beg, 1 ); - - # Look for bad starting characters; Shouldn't happen.. - if ( $first_char !~ /[\d\.\+\-Ee]/ ) { - warning("Program bug - scan_number given character $first_char\n"); - report_definite_bug(); - return ( $i, $type, $number ); - } - - # handle v-string without leading 'v' character ('Two Dot' rule) - # (vstring.t) - # TODO: v-strings may contain underscores - pos($input_line) = $pos_beg; - if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) { - $pos = pos($input_line); - my $numc = $pos - $pos_beg; - $number = substr( $input_line, $pos_beg, $numc ); - $type = 'v'; - unless ($saw_v_string) { report_v_string($number) } - } - - # handle octal, hex, binary - if ( !defined($number) ) { - pos($input_line) = $pos_beg; - if ( $input_line =~ /\G[+-]?0((x[0-9a-fA-F_]+)|([0-7_]+)|(b[01_]+))/g ) - { - $pos = pos($input_line); - my $numc = $pos - $pos_beg; - $number = substr( $input_line, $pos_beg, $numc ); - $type = 'n'; - } - } - - # handle decimal - if ( !defined($number) ) { - pos($input_line) = $pos_beg; - - if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) { - $pos = pos($input_line); - - # watch out for things like 0..40 which would give 0. by this; - if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' ) - && ( substr( $input_line, $pos, 1 ) eq '.' ) ) - { - $pos--; - } - my $numc = $pos - $pos_beg; - $number = substr( $input_line, $pos_beg, $numc ); - $type = 'n'; - } - } - - # filter out non-numbers like e + - . e2 .e3 +e6 - # the rule: at least one digit, and any 'e' must be preceded by a digit - if ( - $number !~ /\d/ # no digits - || ( $number =~ /^(.*)[eE]/ - && $1 !~ /\d/ ) # or no digits before the 'e' - ) - { - $number = undef; - $type = $input_type; - return ( $i, $type, $number ); - } - - # Found a number; now we must convert back from character position - # to pre_token index. An error here implies user syntax error. - # An example would be an invalid octal number like '009'. - my $error; - ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map ); - if ($error) { warning("Possibly invalid number\n") } - - return ( $i, $type, $number ); -} - -sub scan_bare_identifier_do { - - # this routine is called to scan a token starting with an alphanumeric - # variable or package separator, :: or '. - - my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map ) = @_; - my $i_begin = $i; - my $package = undef; - - my $i_beg = $i; - - # we have to back up one pretoken at a :: since each : is one pretoken - if ( $tok eq '::' ) { $i_beg-- } - if ( $tok eq '->' ) { $i_beg-- } - my $pos_beg = $$rtoken_map[$i_beg]; - pos($input_line) = $pos_beg; - - # Examples: - # A::B::C - # A:: - # ::A - # A'B - if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) { - - my $pos = pos($input_line); - my $numc = $pos - $pos_beg; - $tok = substr( $input_line, $pos_beg, $numc ); - - # type 'w' includes anything without leading type info - # ($,%,@,*) including something like abc::def::ghi - $type = 'w'; + elsif ( $identifier eq '$#' ) { - my $sub_name = ""; - if ( defined($2) ) { $sub_name = $2; } - if ( defined($1) ) { - $package = $1; + if ( $tok eq '{' ) { $type = 'i'; $i = $i_save } - # patch: don't allow isolated package name which just ends - # in the old style package separator (single quote). Example: - # use CGI':all'; - if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) { - $pos--; - } + # perl seems to allow just these: $#: $#- $#+ + elsif ( $tok =~ /^[\:\-\+]$/ ) { + $type = 'i'; + $identifier .= $tok; + } + else { + $i = $i_save; + write_logfile_entry( 'Use of $# is deprecated' . "\n" ); + } + } + elsif ( $identifier eq '$$' ) { - $package =~ s/\'/::/g; - if ( $package =~ /^\:/ ) { $package = 'main' . $package } - $package =~ s/::$//; - } - else { - $package = $current_package; + # perl does not allow references to punctuation + # variables without braces. For example, this + # won't work: + # $:=\4; + # $a = $$:; + # You would have to use + # $a = ${$:}; - if ( $is_keyword{$tok} ) { - $type = 'k'; + $i = $i_save; + if ( $tok eq '{' ) { $type = 't' } + else { $type = 'i' } + } + elsif ( $identifier eq '->' ) { + $i = $i_save; + } + else { + $i = $i_save; + if ( length($identifier) == 1 ) { $identifier = ''; } + } + $id_scan_state = ''; + last; } } + elsif ( $id_scan_state eq '&' ) { # starting sub call? - # if it is a bareword.. - if ( $type eq 'w' ) { - - # check for v-string with leading 'v' type character - # (This seems to have presidence over filehandle, type 'Y') - if ( $tok =~ /^v\d[_\d]*$/ ) { + if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + $identifier .= $tok; + } + elsif ( $tok =~ /^\s*$/ ) { # allow space + } + elsif ( $tok eq '::' ) { # leading :: + $id_scan_state = 'A'; # accept alpha next + $identifier .= $tok; + } + elsif ( $tok eq '{' ) { + if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } + $i = $i_save; + $id_scan_state = ''; + last; + } + else { - # we only have the first part - something like 'v101' - - # look for more - if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) { - $pos = pos($input_line); - $numc = $pos - $pos_beg; - $tok = substr( $input_line, $pos_beg, $numc ); + # punctuation variable? + # testfile: cunningham4.pl + if ( $identifier eq '&' ) { + $identifier .= $tok; } - $type = 'v'; - - # warn if this version can't handle v-strings - unless ($saw_v_string) { report_v_string($tok) } + else { + $identifier = ''; + $i = $i_save; + $type = '&'; + } + $id_scan_state = ''; + last; } + } + elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::) - elsif ( $is_constant{$package}{$sub_name} ) { - $type = 'C'; + if ( $tok =~ /^[A-Za-z_]/ ) { # found it + $identifier .= $tok; + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; } - - # bareword after sort has implied empty prototype; for example: - # @sorted = sort numerically ( 53, 29, 11, 32, 7 ); - # This has priority over whatever the user has specified. - elsif ($last_nonblank_token eq 'sort' - && $last_nonblank_type eq 'k' ) - { - $type = 'Z'; + elsif ( $tok eq "'" && $allow_tick ) { + $identifier .= $tok; + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; } - - # Note: strangely, perl does not seem to really let you create - # functions which act like eval and do, in the sense that eval - # and do may have operators following the final }, but any operators - # that you create with prototype (&) apparently do not allow - # trailing operators, only terms. This seems strange. - # If this ever changes, here is the update - # to make perltidy behave accordingly: - - # elsif ( $is_block_function{$package}{$tok} ) { - # $tok='eval'; # patch to do braces like eval - doesn't work - # $type = 'k'; - #} - # FIXME: This could become a separate type to allow for different - # future behavior: - elsif ( $is_block_function{$package}{$sub_name} ) { - $type = 'G'; + elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above + $identifier .= $tok; + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; } - - elsif ( $is_block_list_function{$package}{$sub_name} ) { - $type = 'G'; + elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + $id_scan_state = '('; + $identifier .= $tok; } - elsif ( $is_user_function{$package}{$sub_name} ) { - $type = 'U'; - $prototype = $user_function_prototype{$package}{$sub_name}; + elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + $id_scan_state = ')'; + $identifier .= $tok; } + else { + $id_scan_state = ''; + $i = $i_save; + last; + } + } + elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha - # check for indirect object - elsif ( - - # added 2001-03-27: must not be followed immediately by '(' - # see fhandle.t - ( $input_line !~ m/\G\(/gc ) - - # and - && ( - - # preceded by keyword like 'print', 'printf' and friends - $is_indirect_object_taker{$last_nonblank_token} - - # or preceded by something like 'print(' or 'printf(' - || ( - ( $last_nonblank_token eq '(' ) - && $is_indirect_object_taker{ $paren_type[$paren_depth] - } - - ) - ) - ) - { - - # may not be indirect object unless followed by a space - if ( $input_line =~ m/\G\s+/gc ) { - $type = 'Y'; - - # Abandon Hope ... - # Perl's indirect object notation is a very bad - # thing and can cause subtle bugs, especially for - # beginning programmers. And I haven't even been - # able to figure out a sane warning scheme which - # doesn't get in the way of good scripts. - - # Complain if a filehandle has any lower case - # letters. This is suggested good practice, but the - # main reason for this warning is that prior to - # release 20010328, perltidy incorrectly parsed a - # function call after a print/printf, with the - # result that a space got added before the opening - # paren, thereby converting the function name to a - # filehandle according to perl's weird rules. This - # will not usually generate a syntax error, so this - # is a potentially serious bug. By warning - # of filehandles with any lower case letters, - # followed by opening parens, we will help the user - # find almost all of these older errors. - # use 'sub_name' because something like - # main::MYHANDLE is ok for filehandle - if ( $sub_name =~ /[a-z]/ ) { + if ( $tok eq '::' ) { # got it + $identifier .= $tok; + $id_scan_state = 'A'; # now require alpha + } + elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here + $identifier .= $tok; + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + } + elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above + $identifier .= $tok; + $id_scan_state = ':'; # now need :: + $saw_alpha = 1; + } + elsif ( $tok eq "'" && $allow_tick ) { # tick - # could be bug caused by older perltidy if - # followed by '(' - if ( $input_line =~ m/\G\s*\(/gc ) { - complain( -"Caution: unknown word '$tok' in indirect object slot\n" - ); - } - } + if ( $is_keyword{$identifier} ) { + $id_scan_state = ''; # that's all + $i = $i_save; + } + else { + $identifier .= $tok; } + } + elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { + $id_scan_state = '('; + $identifier .= $tok; + } + elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { + $id_scan_state = ')'; + $identifier .= $tok; + } + else { + $id_scan_state = ''; # that's all + $i = $i_save; + last; + } + } + elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype - # bareword not followed by a space -- may not be filehandle - # (may be function call defined in a 'use' statement) - else { - $type = 'Z'; - } + if ( $tok eq '(' ) { # got it + $identifier .= $tok; + $id_scan_state = ')'; # now find the end of it + } + elsif ( $tok =~ /^\s*$/ ) { # blank - keep going + $identifier .= $tok; + } + else { + $id_scan_state = ''; # that's all - no prototype + $i = $i_save; + last; } } + elsif ( $id_scan_state eq ')' ) { # looking for ) to end - # Now we must convert back from character position - # to pre_token index. - # I don't think an error flag can occur here ..but who knows - my $error; - ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map ); - if ($error) { - warning("scan_bare_identifier: Possibly invalid tokenization\n"); + if ( $tok eq ')' ) { # got it + $identifier .= $tok; + $id_scan_state = ''; # all done + last; + } + elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) { + $identifier .= $tok; + } + else { # probable error in script, but keep going + warning("Unexpected '$tok' while seeking end of prototype\n"); + $identifier .= $tok; + } + } + else { # can get here due to error in initialization + $id_scan_state = ''; + $i = $i_save; + last; } } - # no match but line not blank - could be syntax error - # perl will take '::' alone without complaint - else { - $type = 'w'; - - # change this warning to log message if it becomes annoying - warning("didn't find identifier after leading ::\n"); + if ( $id_scan_state eq ')' ) { + warning("Hit end of line while seeking ) to end prototype\n"); } - return ( $i, $tok, $type, $prototype ); -} - -sub scan_id_do { - - # This is the new scanner and will eventually replace scan_identifier. - # Only type 'sub' and 'package' are implemented. - # Token types $ * % @ & -> are not yet implemented. - # - # Scan identifier following a type token. - # The type of call depends on $id_scan_state: $id_scan_state = '' - # for starting call, in which case $tok must be the token defining - # the type. - # - # If the type token is the last nonblank token on the line, a value - # of $id_scan_state = $tok is returned, indicating that further - # calls must be made to get the identifier. If the type token is - # not the last nonblank token on the line, the identifier is - # scanned and handled and a value of '' is returned. - - my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state ) = @_; - my $type = ''; - my ( $i_beg, $pos_beg ); - - #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n"; - #my ($a,$b,$c) = caller; - #print "NSCAN: scan_id called with tok=$tok $a $b $c\n"; - # on re-entry, start scanning at first token on the line - if ($id_scan_state) { - $i_beg = $i; - $type = ''; + # once we enter the actual identifier, it may not extend beyond + # the end of the current line + if ( $id_scan_state =~ /^[A\:\(\)]/ ) { + $id_scan_state = ''; } + if ( $i < 0 ) { $i = 0 } - # on initial entry, start scanning just after type token - else { - $i_beg = $i + 1; - $id_scan_state = $tok; - $type = 't'; - } + unless ($type) { - # find $i_beg = index of next nonblank token, - # and handle empty lines - my $blank_line = 0; - my $next_nonblank_token = $$rtokens[$i_beg]; - if ( $i_beg > $max_token_index ) { - $blank_line = 1; - } - else { + if ($saw_type) { - # only a '#' immediately after a '$' is not a comment - if ( $next_nonblank_token eq '#' ) { - unless ( $tok eq '$' ) { - $blank_line = 1; + if ($saw_alpha) { + if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) { + $type = 'w'; + } + else { $type = 'i' } } - } - - if ( $next_nonblank_token =~ /^\s/ ) { - ( $next_nonblank_token, $i_beg ) = - find_next_nonblank_token_on_this_line( $i_beg, $rtokens ); - if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) { - $blank_line = 1; + elsif ( $identifier eq '->' ) { + $type = '->'; } - } - } - - # handle non-blank line; identifier, if any, must follow - unless ($blank_line) { + elsif ( + ( length($identifier) > 1 ) - if ( $id_scan_state eq 'sub' ) { - ( $i, $tok, $type, $id_scan_state ) = - do_scan_sub( $input_line, $i, $i_beg, $tok, $type, $rtokens, - $rtoken_map, $id_scan_state ); + # In something like '@$=' we have an identifier '@$' + # In something like '$${' we have type '$$' (and only + # part of an identifier) + && !( $identifier =~ /\$$/ && $tok eq '{' ) + && ( $identifier !~ /^(sub |package )$/ ) + ) + { + $type = 'i'; + } + else { $type = 't' } } + elsif ($saw_alpha) { - elsif ( $id_scan_state eq 'package' ) { - ( $i, $tok, $type ) = - do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens, - $rtoken_map ); - $id_scan_state = ''; + # type 'w' includes anything without leading type info + # ($,%,@,*) including something like abc::def::ghi + $type = 'w'; } - else { - warning("invalid token in scan_id: $tok\n"); - $id_scan_state = ''; - } + $type = ''; + } # this can happen on a restart } - if ( $id_scan_state && ( !defined($type) || !$type ) ) { - - # shouldn't happen: - warning( -"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n" - ); - report_definite_bug(); + if ($identifier) { + $tok = $identifier; + if ($message) { write_logfile_entry($message) } + } + else { + $tok = $tok_begin; + $i = $i_begin; } - TOKENIZER_DEBUG_FLAG_NSCAN && do { + TOKENIZER_DEBUG_FLAG_SCAN_ID && do { + my ( $a, $b, $c ) = caller; print - "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n"; +"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n"; + print +"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n"; }; - return ( $i, $tok, $type, $id_scan_state ); + return ( $i, $tok, $type, $id_scan_state, $identifier ); } { @@ -24080,10 +25238,15 @@ sub scan_id_do { # sub name. For example, 'sub &doit' is wrong. Also, be sure # a name is given if and only if a non-anonymous sub is # appropriate. + # USES GLOBAL VARS: $current_package, $last_nonblank_token, + # $in_attribute_list, %saw_function_definition, + # $statement_type - my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map, - $id_scan_state ) - = @_; + my ( + $input_line, $i, $i_beg, + $tok, $type, $rtokens, + $rtoken_map, $id_scan_state, $max_token_index + ) = @_; $id_scan_state = ""; # normally we get everything in one call my $subname = undef; my $package = undef; @@ -24165,12 +25328,14 @@ sub scan_id_do { # I don't think an error flag can occur here ..but ? my $error; - ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map ); + ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map, + $max_token_index ); if ($error) { warning("Possibly invalid sub\n") } # check for multiple definitions of a sub ( $next_nonblank_token, my $i_next ) = - find_next_nonblank_token_on_this_line( $i, $rtokens ); + find_next_nonblank_token_on_this_line( $i, $rtokens, + $max_token_index ); } if ( $next_nonblank_token =~ /^(\s*|#)$/ ) @@ -24188,14 +25353,20 @@ sub scan_id_do { $subname_saved = ""; if ( $next_nonblank_token eq '{' ) { if ($subname) { - if ( $saw_function_definition{$package}{$subname} ) { + + # Check for multiple definitions of a sub, but + # it is ok to have multiple sub BEGIN, etc, + # so we do not complain if name is all caps + if ( $saw_function_definition{$package}{$subname} + && $subname !~ /^[A-Z]+$/ ) + { my $lno = $saw_function_definition{$package}{$subname}; warning( "already saw definition of 'sub $subname' in package '$package' at line $lno\n" ); } $saw_function_definition{$package}{$subname} = - $input_line_number; + $tokenizer_self->{_last_line_number}; } } elsif ( $next_nonblank_token eq ';' ) { @@ -24240,554 +25411,537 @@ sub scan_id_do { } } -sub check_prototype { - my ( $proto, $package, $subname ) = @_; - return unless ( defined($package) && defined($subname) ); - if ( defined($proto) ) { - $proto =~ s/^\s*\(\s*//; - $proto =~ s/\s*\)$//; - if ($proto) { - $is_user_function{$package}{$subname} = 1; - $user_function_prototype{$package}{$subname} = "($proto)"; - - # prototypes containing '&' must be treated specially.. - if ( $proto =~ /\&/ ) { - - # right curly braces of prototypes ending in - # '&' may be followed by an operator - if ( $proto =~ /\&$/ ) { - $is_block_function{$package}{$subname} = 1; - } - - # right curly braces of prototypes NOT ending in - # '&' may NOT be followed by an operator - elsif ( $proto !~ /\&$/ ) { - $is_block_list_function{$package}{$subname} = 1; - } - } - } - else { - $is_constant{$package}{$subname} = 1; - } - } - else { - $is_user_function{$package}{$subname} = 1; - } -} - -sub do_scan_package { - - # do_scan_package parses a package name - # it is called with $i_beg equal to the index of the first nonblank - # token following a 'package' token. - - my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map ) = @_; - my $package = undef; - my $pos_beg = $$rtoken_map[$i_beg]; - pos($input_line) = $pos_beg; - - # handle non-blank line; package name, if any, must follow - if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) { - $package = $1; - $package = ( defined($1) && $1 ) ? $1 : 'main'; - $package =~ s/\'/::/g; - if ( $package =~ /^\:/ ) { $package = 'main' . $package } - $package =~ s/::$//; - my $pos = pos($input_line); - my $numc = $pos - $pos_beg; - $tok = 'package ' . substr( $input_line, $pos_beg, $numc ); - $type = 'i'; - - # Now we must convert back from character position - # to pre_token index. - # I don't think an error flag can occur here ..but ? - my $error; - ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map ); - if ($error) { warning("Possibly invalid package\n") } - $current_package = $package; - - # check for error - my ( $next_nonblank_token, $i_next ) = - find_next_nonblank_token( $i, $rtokens ); - if ( $next_nonblank_token !~ /^[;\}]$/ ) { - warning( - "Unexpected '$next_nonblank_token' after package name '$tok'\n" - ); - } - } - - # no match but line not blank -- - # could be a label with name package, like package: , for example. - else { - $type = 'k'; - } - - return ( $i, $tok, $type ); -} - -sub scan_identifier_do { +#########i############################################################### +# Tokenizer utility routines which may use CONSTANTS but no other GLOBALS +######################################################################### - # This routine assembles tokens into identifiers. It maintains a - # scan state, id_scan_state. It updates id_scan_state based upon - # current id_scan_state and token, and returns an updated - # id_scan_state and the next index after the identifier. +sub find_next_nonblank_token { + my ( $i, $rtokens, $max_token_index ) = @_; - my ( $i, $id_scan_state, $identifier, $rtokens ) = @_; - my $i_begin = $i; - my $type = ''; - my $tok_begin = $$rtokens[$i_begin]; - if ( $tok_begin eq ':' ) { $tok_begin = '::' } - my $id_scan_state_begin = $id_scan_state; - my $identifier_begin = $identifier; - my $tok = $tok_begin; - my $message = ""; + if ( $i >= $max_token_index ) { + if ( !peeked_ahead() ) { + peeked_ahead(1); + $rtokens = + peek_ahead_for_nonblank_token( $rtokens, $max_token_index ); + } + } + my $next_nonblank_token = $$rtokens[ ++$i ]; - # these flags will be used to help figure out the type: - my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ ); - my $saw_type; + if ( $next_nonblank_token =~ /^\s*$/ ) { + $next_nonblank_token = $$rtokens[ ++$i ]; + } + return ( $next_nonblank_token, $i ); +} - # allow old package separator (') except in 'use' statement - my $allow_tick = ( $last_nonblank_token ne 'use' ); +sub numerator_expected { - # get started by defining a type and a state if necessary - unless ($id_scan_state) { - $context = UNKNOWN_CONTEXT; + # this is a filter for a possible numerator, in support of guessing + # for the / pattern delimiter token. + # returns - + # 1 - yes + # 0 - can't tell + # -1 - no + # Note: I am using the convention that variables ending in + # _expected have these 3 possible values. + my ( $i, $rtokens, $max_token_index ) = @_; + my $next_token = $$rtokens[ $i + 1 ]; + if ( $next_token eq '=' ) { $i++; } # handle /= + my ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $i, $rtokens, $max_token_index ); - # fixup for digraph - if ( $tok eq '>' ) { - $tok = '->'; - $tok_begin = $tok; - } - $identifier = $tok; + if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) { + 1; + } + else { - if ( $tok eq '$' || $tok eq '*' ) { - $id_scan_state = '$'; - $context = SCALAR_CONTEXT; - } - elsif ( $tok eq '%' || $tok eq '@' ) { - $id_scan_state = '$'; - $context = LIST_CONTEXT; - } - elsif ( $tok eq '&' ) { - $id_scan_state = '&'; - } - elsif ( $tok eq 'sub' or $tok eq 'package' ) { - $saw_alpha = 0; # 'sub' is considered type info here - $id_scan_state = '$'; - $identifier .= ' '; # need a space to separate sub from sub name - } - elsif ( $tok eq '::' ) { - $id_scan_state = 'A'; - } - elsif ( $tok =~ /^[A-Za-z_]/ ) { - $id_scan_state = ':'; - } - elsif ( $tok eq '->' ) { - $id_scan_state = '$'; + if ( $next_nonblank_token =~ /^\s*$/ ) { + 0; } else { - - # shouldn't happen - my ( $a, $b, $c ) = caller; - warning("Program Bug: scan_identifier given bad token = $tok \n"); - warning(" called from sub $a line: $c\n"); - report_definite_bug(); + -1; } - $saw_type = !$saw_alpha; - } - else { - $i--; - $saw_type = ( $tok =~ /([\$\%\@\*\&])/ ); } +} - # now loop to gather the identifier - my $i_save = $i; +sub pattern_expected { - while ( $i < $max_token_index ) { - $i_save = $i unless ( $tok =~ /^\s*$/ ); - $tok = $$rtokens[ ++$i ]; + # This is the start of a filter for a possible pattern. + # It looks at the token after a possbible pattern and tries to + # determine if that token could end a pattern. + # returns - + # 1 - yes + # 0 - can't tell + # -1 - no + my ( $i, $rtokens, $max_token_index ) = @_; + my $next_token = $$rtokens[ $i + 1 ]; + if ( $next_token =~ /^[cgimosx]/ ) { $i++; } # skip possible modifier + my ( $next_nonblank_token, $i_next ) = + find_next_nonblank_token( $i, $rtokens, $max_token_index ); - if ( ( $tok eq ':' ) && ( $$rtokens[ $i + 1 ] eq ':' ) ) { - $tok = '::'; - $i++; - } + # list of tokens which may follow a pattern + # (can probably be expanded) + if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ ) + { + 1; + } + else { - if ( $id_scan_state eq '$' ) { # starting variable name + if ( $next_nonblank_token =~ /^\s*$/ ) { + 0; + } + else { + -1; + } + } +} - if ( $tok eq '$' ) { +sub find_next_nonblank_token_on_this_line { + my ( $i, $rtokens, $max_token_index ) = @_; + my $next_nonblank_token; - $identifier .= $tok; + if ( $i < $max_token_index ) { + $next_nonblank_token = $$rtokens[ ++$i ]; - # we've got a punctuation variable if end of line (punct.t) - if ( $i == $max_token_index ) { - $type = 'i'; - $id_scan_state = ''; - last; - } - } - elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric .. - $saw_alpha = 1; - $id_scan_state = ':'; # now need :: - $identifier .= $tok; - } - elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. - $saw_alpha = 1; - $id_scan_state = ':'; # now need :: - $identifier .= $tok; + if ( $next_nonblank_token =~ /^\s*$/ ) { - # Perl will accept leading digits in identifiers, - # although they may not always produce useful results. - # Something like $main::0 is ok. But this also works: - # - # sub howdy::123::bubba{ print "bubba $54321!\n" } - # howdy::123::bubba(); - # - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric - $saw_alpha = 1; - $id_scan_state = ':'; # now need :: - $identifier .= $tok; - } - elsif ( $tok eq '::' ) { - $id_scan_state = 'A'; - $identifier .= $tok; - } - elsif ( ( $tok eq '#' ) && ( $identifier eq '$' ) ) { # $#array - $identifier .= $tok; # keep same state, a $ could follow + if ( $i < $max_token_index ) { + $next_nonblank_token = $$rtokens[ ++$i ]; } - elsif ( $tok eq '{' ) { - - # check for something like ${#} or ${©} - if ( $identifier eq '$' - && $i + 2 <= $max_token_index - && $$rtokens[ $i + 2 ] eq '}' - && $$rtokens[ $i + 1 ] !~ /[\s\w]/ ) - { - my $next2 = $$rtokens[ $i + 2 ]; - my $next1 = $$rtokens[ $i + 1 ]; - $identifier .= $tok . $next1 . $next2; - $i += 2; - $id_scan_state = ''; - last; - } + } + } + else { + $next_nonblank_token = ""; + } + return ( $next_nonblank_token, $i ); +} - # skip something like ${xxx} or ->{ - $id_scan_state = ''; +sub find_angle_operator_termination { - # if this is the first token of a line, any tokens for this - # identifier have already been accumulated - if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; } - $i = $i_save; - last; - } + # We are looking at a '<' and want to know if it is an angle operator. + # We are to return: + # $i = pretoken index of ending '>' if found, current $i otherwise + # $type = 'Q' if found, '>' otherwise + my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_; + my $i = $i_beg; + my $type = '<'; + pos($input_line) = 1 + $$rtoken_map[$i]; - # space ok after leading $ % * & @ - elsif ( $tok =~ /^\s*$/ ) { + my $filter; - if ( $identifier =~ /^[\$\%\*\&\@]/ ) { + # we just have to find the next '>' if a term is expected + if ( $expecting == TERM ) { $filter = '[\>]' } - if ( length($identifier) > 1 ) { - $id_scan_state = ''; - $i = $i_save; - $type = 'i'; # probably punctuation variable - last; - } - else { + # we have to guess if we don't know what is expected + elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' } - # spaces after $'s are common, and space after @ - # is harmless, so only complain about space - # after other type characters. Space after $ and - # @ will be removed in formatting. Report space - # after % and * because they might indicate a - # parsing error. In other words '% ' might be a - # modulo operator. Delete this warning if it - # gets annoying. - if ( $identifier !~ /^[\@\$]$/ ) { - $message = - "Space in identifier, following $identifier\n"; - } - } - } + # shouldn't happen - we shouldn't be here if operator is expected + else { warning("Program Bug in find_angle_operator_termination\n") } - # else: - # space after '->' is ok - } - elsif ( $tok eq '^' ) { + # To illustrate what we might be looking at, in case we are + # guessing, here are some examples of valid angle operators + # (or file globs): + # + # + # <$fh> + # <*.c *.h> + # <_> + # ( glob.t) + # <${PREFIX}*img*.$IMAGE_TYPE> + # + # + # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl> + # + # Here are some examples of lines which do not have angle operators: + # return undef unless $self->[2]++ < $#{$self->[1]}; + # < 2 || @$t > + # + # the following line from dlister.pl caused trouble: + # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n"; + # + # If the '<' starts an angle operator, it must end on this line and + # it must not have certain characters like ';' and '=' in it. I use + # this to limit the testing. This filter should be improved if + # possible. - # check for some special variables like $^W - if ( $identifier =~ /^[\$\*\@\%]$/ ) { - $identifier .= $tok; - $id_scan_state = 'A'; - } - else { - $id_scan_state = ''; - } - } - else { # something else + if ( $input_line =~ /($filter)/g ) { - # check for various punctuation variables - if ( $identifier =~ /^[\$\*\@\%]$/ ) { - $identifier .= $tok; - } + if ( $1 eq '>' ) { - elsif ( $identifier eq '$#' ) { + # We MAY have found an angle operator termination if we get + # here, but we need to do more to be sure we haven't been + # fooled. + my $pos = pos($input_line); - if ( $tok eq '{' ) { $type = 'i'; $i = $i_save } + my $pos_beg = $$rtoken_map[$i]; + my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) ); - # perl seems to allow just these: $#: $#- $#+ - elsif ( $tok =~ /^[\:\-\+]$/ ) { - $type = 'i'; - $identifier .= $tok; - } - else { - $i = $i_save; - write_logfile_entry( 'Use of $# is deprecated' . "\n" ); - } + # Reject if the closing '>' follows a '-' as in: + # if ( VERSION < 5.009 && $op-> name eq 'aassign' ) { } + if ( $expecting eq UNKNOWN ) { + my $check = substr( $input_line, $pos - 2, 1 ); + if ( $check eq '-' ) { + return ( $i, $type ); } - elsif ( $identifier eq '$$' ) { + } - # perl does not allow references to punctuation - # variables without braces. For example, this - # won't work: - # $:=\4; - # $a = $$:; - # You would have to use - # $a = ${$:}; + ######################################debug##### + #write_diagnostics( "ANGLE? :$str\n"); + #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n"; + ######################################debug##### + $type = 'Q'; + my $error; + ( $i, $error ) = + inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index ); - $i = $i_save; - if ( $tok eq '{' ) { $type = 't' } - else { $type = 'i' } - } - elsif ( $identifier eq '->' ) { - $i = $i_save; - } - else { - $i = $i_save; - if ( length($identifier) == 1 ) { $identifier = ''; } - } - $id_scan_state = ''; - last; + # It may be possible that a quote ends midway in a pretoken. + # If this happens, it may be necessary to split the pretoken. + if ($error) { + warning( + "Possible tokinization error..please check this line\n"); + report_possible_bug(); } - } - elsif ( $id_scan_state eq '&' ) { # starting sub call? - if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric .. - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric .. - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - $identifier .= $tok; - } - elsif ( $tok =~ /^\s*$/ ) { # allow space - } - elsif ( $tok eq '::' ) { # leading :: - $id_scan_state = 'A'; # accept alpha next - $identifier .= $tok; + # Now let's see where we stand.... + # OK if math op not possible + if ( $expecting == TERM ) { } - elsif ( $tok eq '{' ) { - if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; } - $i = $i_save; - $id_scan_state = ''; - last; + + # OK if there are no more than 2 pre-tokens inside + # (not possible to write 2 token math between < and >) + # This catches most common cases + elsif ( $i <= $i_beg + 3 ) { + write_diagnostics("ANGLE(1 or 2 tokens): $str\n"); } + + # Not sure.. else { - # punctuation variable? - # testfile: cunningham4.pl - if ( $identifier eq '&' ) { - $identifier .= $tok; + # Let's try a Brace Test: any braces inside must balance + my $br = 0; + while ( $str =~ /\{/g ) { $br++ } + while ( $str =~ /\}/g ) { $br-- } + my $sb = 0; + while ( $str =~ /\[/g ) { $sb++ } + while ( $str =~ /\]/g ) { $sb-- } + my $pr = 0; + while ( $str =~ /\(/g ) { $pr++ } + while ( $str =~ /\)/g ) { $pr-- } + + # if braces do not balance - not angle operator + if ( $br || $sb || $pr ) { + $i = $i_beg; + $type = '<'; + write_diagnostics( + "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n"); } + + # we should keep doing more checks here...to be continued + # Tentatively accepting this as a valid angle operator. + # There are lots more things that can be checked. else { - $identifier = ''; - $i = $i_save; - $type = '&'; + write_diagnostics( + "ANGLE-Guessing yes: $str expecting=$expecting\n"); + write_logfile_entry("Guessing angle operator here: $str\n"); } - $id_scan_state = ''; - last; } } - elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::) - if ( $tok =~ /^[A-Za-z_]/ ) { # found it - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok eq "'" && $allow_tick ) { - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { - $id_scan_state = '('; - $identifier .= $tok; - } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { - $id_scan_state = ')'; - $identifier .= $tok; - } - else { - $id_scan_state = ''; - $i = $i_save; - last; + # didn't find ending > + else { + if ( $expecting == TERM ) { + warning("No ending > for angle operator\n"); } } - elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha + } + return ( $i, $type ); +} - if ( $tok eq '::' ) { # got it - $identifier .= $tok; - $id_scan_state = 'A'; # now require alpha - } - elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above - $identifier .= $tok; - $id_scan_state = ':'; # now need :: - $saw_alpha = 1; - } - elsif ( $tok eq "'" && $allow_tick ) { # tick +sub scan_number_do { - if ( $is_keyword{$identifier} ) { - $id_scan_state = ''; # that's all - $i = $i_save; - } - else { - $identifier .= $tok; - } - } - elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) { - $id_scan_state = '('; - $identifier .= $tok; - } - elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) { - $id_scan_state = ')'; - $identifier .= $tok; - } - else { - $id_scan_state = ''; # that's all - $i = $i_save; - last; - } - } - elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype + # scan a number in any of the formats that Perl accepts + # Underbars (_) are allowed in decimal numbers. + # input parameters - + # $input_line - the string to scan + # $i - pre_token index to start scanning + # $rtoken_map - reference to the pre_token map giving starting + # character position in $input_line of token $i + # output parameters - + # $i - last pre_token index of the number just scanned + # number - the number (characters); or undef if not a number + + my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_; + my $pos_beg = $$rtoken_map[$i]; + my $pos; + my $i_begin = $i; + my $number = undef; + my $type = $input_type; + + my $first_char = substr( $input_line, $pos_beg, 1 ); + + # Look for bad starting characters; Shouldn't happen.. + if ( $first_char !~ /[\d\.\+\-Ee]/ ) { + warning("Program bug - scan_number given character $first_char\n"); + report_definite_bug(); + return ( $i, $type, $number ); + } + + # handle v-string without leading 'v' character ('Two Dot' rule) + # (vstring.t) + # TODO: v-strings may contain underscores + pos($input_line) = $pos_beg; + if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) { + $pos = pos($input_line); + my $numc = $pos - $pos_beg; + $number = substr( $input_line, $pos_beg, $numc ); + $type = 'v'; + report_v_string($number); + } - if ( $tok eq '(' ) { # got it - $identifier .= $tok; - $id_scan_state = ')'; # now find the end of it - } - elsif ( $tok =~ /^\s*$/ ) { # blank - keep going - $identifier .= $tok; - } - else { - $id_scan_state = ''; # that's all - no prototype - $i = $i_save; - last; - } + # handle octal, hex, binary + if ( !defined($number) ) { + pos($input_line) = $pos_beg; + if ( $input_line =~ /\G[+-]?0((x[0-9a-fA-F_]+)|([0-7_]+)|(b[01_]+))/g ) + { + $pos = pos($input_line); + my $numc = $pos - $pos_beg; + $number = substr( $input_line, $pos_beg, $numc ); + $type = 'n'; } - elsif ( $id_scan_state eq ')' ) { # looking for ) to end + } - if ( $tok eq ')' ) { # got it - $identifier .= $tok; - $id_scan_state = ''; # all done - last; - } - elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) { - $identifier .= $tok; - } - else { # probable error in script, but keep going - warning("Unexpected '$tok' while seeking end of prototype\n"); - $identifier .= $tok; + # handle decimal + if ( !defined($number) ) { + pos($input_line) = $pos_beg; + + if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) { + $pos = pos($input_line); + + # watch out for things like 0..40 which would give 0. by this; + if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' ) + && ( substr( $input_line, $pos, 1 ) eq '.' ) ) + { + $pos--; } + my $numc = $pos - $pos_beg; + $number = substr( $input_line, $pos_beg, $numc ); + $type = 'n'; } - else { # can get here due to error in initialization - $id_scan_state = ''; - $i = $i_save; + } + + # filter out non-numbers like e + - . e2 .e3 +e6 + # the rule: at least one digit, and any 'e' must be preceded by a digit + if ( + $number !~ /\d/ # no digits + || ( $number =~ /^(.*)[eE]/ + && $1 !~ /\d/ ) # or no digits before the 'e' + ) + { + $number = undef; + $type = $input_type; + return ( $i, $type, $number ); + } + + # Found a number; now we must convert back from character position + # to pre_token index. An error here implies user syntax error. + # An example would be an invalid octal number like '009'. + my $error; + ( $i, $error ) = + inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index ); + if ($error) { warning("Possibly invalid number\n") } + + return ( $i, $type, $number ); +} + +sub inverse_pretoken_map { + + # Starting with the current pre_token index $i, scan forward until + # finding the index of the next pre_token whose position is $pos. + my ( $i, $pos, $rtoken_map, $max_token_index ) = @_; + my $error = 0; + + while ( ++$i <= $max_token_index ) { + + if ( $pos <= $$rtoken_map[$i] ) { + + # Let the calling routine handle errors in which we do not + # land on a pre-token boundary. It can happen by running + # perltidy on some non-perl scripts, for example. + if ( $pos < $$rtoken_map[$i] ) { $error = 1 } + $i--; last; } } + return ( $i, $error ); +} - if ( $id_scan_state eq ')' ) { - warning("Hit end of line while seeking ) to end prototype\n"); - } +sub find_here_doc { - # once we enter the actual identifier, it may not extend beyond - # the end of the current line - if ( $id_scan_state =~ /^[A\:\(\)]/ ) { - $id_scan_state = ''; + # find the target of a here document, if any + # input parameters: + # $i - token index of the second < of << + # ($i must be less than the last token index if this is called) + # output parameters: + # $found_target = 0 didn't find target; =1 found target + # HERE_TARGET - the target string (may be empty string) + # $i - unchanged if not here doc, + # or index of the last token of the here target + # $saw_error - flag noting unbalanced quote on here target + my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_; + my $ibeg = $i; + my $found_target = 0; + my $here_doc_target = ''; + my $here_quote_character = ''; + my $saw_error = 0; + my ( $next_nonblank_token, $i_next_nonblank, $next_token ); + $next_token = $$rtokens[ $i + 1 ]; + + # perl allows a backslash before the target string (heredoc.t) + my $backslash = 0; + if ( $next_token eq '\\' ) { + $backslash = 1; + $next_token = $$rtokens[ $i + 2 ]; } - if ( $i < 0 ) { $i = 0 } - unless ($type) { + ( $next_nonblank_token, $i_next_nonblank ) = + find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index ); - if ($saw_type) { + if ( $next_nonblank_token =~ /[\'\"\`]/ ) { - if ($saw_alpha) { - if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) { - $type = 'w'; - } - else { $type = 'i' } - } - elsif ( $identifier eq '->' ) { - $type = '->'; + my $in_quote = 1; + my $quote_depth = 0; + my $quote_pos = 0; + my $quoted_string; + + ( + $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth, + $quoted_string + ) + = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens, + $here_quote_character, $quote_pos, $quote_depth, $max_token_index ); + + if ($in_quote) { # didn't find end of quote, so no target found + $i = $ibeg; + if ( $expecting == TERM ) { + warning( +"Did not find here-doc string terminator ($here_quote_character) before end of line \n" + ); + $saw_error = 1; } - elsif ( - ( length($identifier) > 1 ) + } + else { # found ending quote + my $j; + $found_target = 1; - # In something like '@$=' we have an identifier '@$' - # In something like '$${' we have type '$$' (and only - # part of an identifier) - && !( $identifier =~ /\$$/ && $tok eq '{' ) - && ( $identifier !~ /^(sub |package )$/ ) - ) - { - $type = 'i'; + my $tokj; + for ( $j = $i_next_nonblank + 1 ; $j < $i ; $j++ ) { + $tokj = $$rtokens[$j]; + + # we have to remove any backslash before the quote character + # so that the here-doc-target exactly matches this string + next + if ( $tokj eq "\\" + && $j < $i - 1 + && $$rtokens[ $j + 1 ] eq $here_quote_character ); + $here_doc_target .= $tokj; } - else { $type = 't' } } - elsif ($saw_alpha) { + } - # type 'w' includes anything without leading type info - # ($,%,@,*) including something like abc::def::ghi - $type = 'w'; + elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) { + $found_target = 1; + write_logfile_entry( + "found blank here-target after <<; suggest using \"\"\n"); + $i = $ibeg; + } + elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after << + + my $here_doc_expected; + if ( $expecting == UNKNOWN ) { + $here_doc_expected = guess_if_here_doc($next_token); } else { - $type = ''; - } # this can happen on a restart - } + $here_doc_expected = 1; + } + + if ($here_doc_expected) { + $found_target = 1; + $here_doc_target = $next_token; + $i = $ibeg + 1; + } - if ($identifier) { - $tok = $identifier; - if ($message) { write_logfile_entry($message) } } else { - $tok = $tok_begin; - $i = $i_begin; + + if ( $expecting == TERM ) { + $found_target = 1; + write_logfile_entry("Note: bare here-doc operator <<\n"); + } + else { + $i = $ibeg; + } } - TOKENIZER_DEBUG_FLAG_SCAN_ID && do { - my ( $a, $b, $c ) = caller; - print -"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n"; - print -"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n"; - }; - return ( $i, $tok, $type, $id_scan_state, $identifier ); + # patch to neglect any prepended backslash + if ( $found_target && $backslash ) { $i++ } + + return ( $found_target, $here_doc_target, $here_quote_character, $i, + $saw_error ); +} + +sub do_quote { + + # follow (or continue following) quoted string(s) + # $in_quote return code: + # 0 - ok, found end + # 1 - still must find end of quote whose target is $quote_character + # 2 - still looking for end of first of two quotes + # + # Returns updated strings: + # $quoted_string_1 = quoted string seen while in_quote=1 + # $quoted_string_2 = quoted string seen while in_quote=2 + my ( + $i, $in_quote, $quote_character, + $quote_pos, $quote_depth, $quoted_string_1, + $quoted_string_2, $rtokens, $rtoken_map, + $max_token_index + ) = @_; + + my $in_quote_starting = $in_quote; + + my $quoted_string; + if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow + my $ibeg = $i; + ( + $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string + ) + = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character, + $quote_pos, $quote_depth, $max_token_index ); + $quoted_string_2 .= $quoted_string; + if ( $in_quote == 1 ) { + if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; } + $quote_character = ''; + } + else { + $quoted_string_2 .= "\n"; + } + } + + if ( $in_quote == 1 ) { # one (more) quote to follow + my $ibeg = $i; + ( + $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string + ) + = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character, + $quote_pos, $quote_depth, $max_token_index ); + $quoted_string_1 .= $quoted_string; + if ( $in_quote == 1 ) { + $quoted_string_1 .= "\n"; + } + } + return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth, + $quoted_string_1, $quoted_string_2 ); } sub follow_quoted_string { @@ -24806,10 +25960,13 @@ sub follow_quoted_string { # $beginning_tok = the starting quote character # $quote_pos = index to check next for alphanumeric delimiter # $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested. - my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth ) + # $quoted_string = the text of the quote (without quotation tokens) + my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth, + $max_token_index ) = @_; my ( $tok, $end_tok ); - my $i = $i_beg - 1; + my $i = $i_beg - 1; + my $quoted_string = ""; TOKENIZER_DEBUG_FLAG_QUOTE && do { print @@ -24861,8 +26018,10 @@ sub follow_quoted_string { # characters, whereas for a non-alphanumeric delimiter, only tokens of # length 1 can match. - # loop for case of alphanumeric quote delimiter.. + ################################################################### + # Case 1 (rare): loop for case of alphanumeric quote delimiter.. # "quote_pos" is the position the current word to begin searching + ################################################################### if ( $beginning_tok =~ /\w/ ) { # Note this because it is not recommended practice except @@ -24879,10 +26038,12 @@ sub follow_quoted_string { if ( $tok eq '\\' ) { + # retain backslash unless it hides the end token + $quoted_string .= $tok + unless $$rtokens[ $i + 1 ] eq $end_tok; $quote_pos++; last if ( $i >= $max_token_index ); $tok = $$rtokens[ ++$i ]; - } } my $old_pos = $quote_pos; @@ -24895,6 +26056,9 @@ sub follow_quoted_string { if ( $quote_pos > 0 ) { + $quoted_string .= + substr( $tok, $old_pos, $quote_pos - $old_pos - 1 ); + $quote_depth--; if ( $quote_depth == 0 ) { @@ -24902,10 +26066,15 @@ sub follow_quoted_string { last; } } + else { + $quoted_string .= substr( $tok, $old_pos ); + } } } - # loop for case of a non-alphanumeric quote delimiter.. + ######################################################################## + # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter.. + ######################################################################## else { while ( $i < $max_token_index ) { @@ -24923,12 +26092,188 @@ sub follow_quoted_string { $quote_depth++; } elsif ( $tok eq '\\' ) { - $i++; + + # retain backslash unless it hides the beginning or end token + $tok = $$rtokens[ ++$i ]; + $quoted_string .= '\\' + unless ( $tok eq $end_tok || $tok eq $beginning_tok ); } + $quoted_string .= $tok; } } if ( $i > $max_token_index ) { $i = $max_token_index } - return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth ); + return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth, + $quoted_string ); +} + +sub indicate_error { + my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_; + interrupt_logfile(); + warning($msg); + write_error_indicator_pair( $line_number, $input_line, $pos, $carrat ); + resume_logfile(); +} + +sub write_error_indicator_pair { + my ( $line_number, $input_line, $pos, $carrat ) = @_; + my ( $offset, $numbered_line, $underline ) = + make_numbered_line( $line_number, $input_line, $pos ); + $underline = write_on_underline( $underline, $pos - $offset, $carrat ); + warning( $numbered_line . "\n" ); + $underline =~ s/\s*$//; + warning( $underline . "\n" ); +} + +sub make_numbered_line { + + # Given an input line, its line number, and a character position of + # interest, create a string not longer than 80 characters of the form + # $lineno: sub_string + # such that the sub_string of $str contains the position of interest + # + # Here is an example of what we want, in this case we add trailing + # '...' because the line is long. + # + # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ... + # + # Here is another example, this time in which we used leading '...' + # because of excessive length: + # + # 2: ... er of the World Wide Web Consortium's + # + # input parameters are: + # $lineno = line number + # $str = the text of the line + # $pos = position of interest (the error) : 0 = first character + # + # We return : + # - $offset = an offset which corrects the position in case we only + # display part of a line, such that $pos-$offset is the effective + # position from the start of the displayed line. + # - $numbered_line = the numbered line as above, + # - $underline = a blank 'underline' which is all spaces with the same + # number of characters as the numbered line. + + my ( $lineno, $str, $pos ) = @_; + my $offset = ( $pos < 60 ) ? 0 : $pos - 40; + my $excess = length($str) - $offset - 68; + my $numc = ( $excess > 0 ) ? 68 : undef; + + if ( defined($numc) ) { + if ( $offset == 0 ) { + $str = substr( $str, $offset, $numc - 4 ) . " ..."; + } + else { + $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ..."; + } + } + else { + + if ( $offset == 0 ) { + } + else { + $str = "... " . substr( $str, $offset + 4 ); + } + } + + my $numbered_line = sprintf( "%d: ", $lineno ); + $offset -= length($numbered_line); + $numbered_line .= $str; + my $underline = " " x length($numbered_line); + return ( $offset, $numbered_line, $underline ); +} + +sub write_on_underline { + + # The "underline" is a string that shows where an error is; it starts + # out as a string of blanks with the same length as the numbered line of + # code above it, and we have to add marking to show where an error is. + # In the example below, we want to write the string '--^' just below + # the line of bad code: + # + # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ... + # ---^ + # We are given the current underline string, plus a position and a + # string to write on it. + # + # In the above example, there will be 2 calls to do this: + # First call: $pos=19, pos_chr=^ + # Second call: $pos=16, pos_chr=--- + # + # This is a trivial thing to do with substr, but there is some + # checking to do. + + my ( $underline, $pos, $pos_chr ) = @_; + + # check for error..shouldn't happen + unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) { + return $underline; + } + my $excess = length($pos_chr) + $pos - length($underline); + if ( $excess > 0 ) { + $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess ); + } + substr( $underline, $pos, length($pos_chr) ) = $pos_chr; + return ($underline); +} + +sub pre_tokenize { + + # Break a string, $str, into a sequence of preliminary tokens. We + # are interested in these types of tokens: + # words (type='w'), example: 'max_tokens_wanted' + # digits (type = 'd'), example: '0755' + # whitespace (type = 'b'), example: ' ' + # any other single character (i.e. punct; type = the character itself). + # We cannot do better than this yet because we might be in a quoted + # string or pattern. Caller sets $max_tokens_wanted to 0 to get all + # tokens. + my ( $str, $max_tokens_wanted ) = @_; + + # we return references to these 3 arrays: + my @tokens = (); # array of the tokens themselves + my @token_map = (0); # string position of start of each token + my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct + + do { + + # whitespace + if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; } + + # numbers + # note that this must come before words! + elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; } + + # words + elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; } + + # single-character punctuation + elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; } + + # that's all.. + else { + return ( \@tokens, \@token_map, \@type ); + } + + push @tokens, $1; + push @token_map, pos($str); + + } while ( --$max_tokens_wanted != 0 ); + + return ( \@tokens, \@token_map, \@type ); +} + +sub show_tokens { + + # this is an old debug routine + my ( $rtokens, $rtoken_map ) = @_; + my $num = scalar(@$rtokens); + my $i; + + for ( $i = 0 ; $i < $num ; $i++ ) { + my $len = length( $$rtokens[$i] ); + print "$i:$len:$$rtoken_map[$i]:$$rtokens[$i]:\n"; + } } sub matching_end_token { @@ -24953,6 +26298,81 @@ sub matching_end_token { } } +sub dump_token_types { + my $class = shift; + my $fh = shift; + + # This should be the latest list of token types in use + # adding NEW_TOKENS: add a comment here + print $fh <<'END_OF_LIST'; + +Here is a list of the token types currently used for lines of type 'CODE'. +For the following tokens, the "type" of a token is just the token itself. + +.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <> +( ) <= >= == =~ !~ != ++ -- /= x= +... **= <<= >>= &&= ||= //= <=> +, + - / * | % ! x ~ = \ ? : . < > ^ & + +The following additional token types are defined: + + type meaning + b blank (white space) + { indent: opening structural curly brace or square bracket or paren + (code block, anonymous hash reference, or anonymous array reference) + } outdent: right structural curly brace or square bracket or paren + [ left non-structural square bracket (enclosing an array index) + ] right non-structural square bracket + ( left non-structural paren (all but a list right of an =) + ) right non-structural parena + L left non-structural curly brace (enclosing a key) + R right non-structural curly brace + ; terminal semicolon + f indicates a semicolon in a "for" statement + h here_doc operator << + # a comment + Q indicates a quote or pattern + q indicates a qw quote block + k a perl keyword + C user-defined constant or constant function (with void prototype = ()) + U user-defined function taking parameters + G user-defined function taking block parameter (like grep/map/eval) + M (unused, but reserved for subroutine definition name) + P (unused, but -html uses it to label pod text) + t type indicater such as %,$,@,*,&,sub + w bare word (perhaps a subroutine call) + i identifier of some type (with leading %, $, @, *, &, sub, -> ) + n a number + v a v-string + F a file test operator (like -e) + Y File handle + Z identifier in indirect object slot: may be file handle, object + J LABEL: code block label + j LABEL after next, last, redo, goto + p unary + + m unary - + pp pre-increment operator ++ + mm pre-decrement operator -- + A : used as attribute separator + + Here are the '_line_type' codes used internally: + SYSTEM - system-specific code before hash-bang line + CODE - line of perl code (including comments) + POD_START - line starting pod, such as '=head' + POD - pod documentation text + POD_END - last line of pod section, '=cut' + HERE - text of here-document + HERE_END - last line of here-doc (target word) + FORMAT - format section + FORMAT_END - last line of format section, '.' + DATA_START - __DATA__ line + DATA - unidentified text following __DATA__ + END_START - __END__ line + END - unidentified text following __END__ + ERROR - we are in big trouble, probably not a perl script +END_OF_LIST +} + BEGIN { # These names are used in error messages @@ -24961,11 +26381,11 @@ BEGIN { my @digraphs = qw( .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <> - <= >= == =~ !~ != ++ -- /= x= + <= >= == =~ !~ != ++ -- /= x= ~~ ); @is_digraph{@digraphs} = (1) x scalar(@digraphs); - my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> ); + my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ ); @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs); # make a hash of all valid token types for self-checking the tokenizer @@ -24992,7 +26412,7 @@ BEGIN { @is_block_operator{@_} = (1) x scalar(@_); # these functions allow an identifier in the indirect object slot - @_ = qw( print printf sort exec system ); + @_ = qw( print printf sort exec system say); @is_indirect_object_taker{@_} = (1) x scalar(@_); # These tokens may precede a code block @@ -25225,9 +26645,14 @@ BEGIN { given when err + say ); - # patched above for SWITCH/CASE + # patched above for SWITCH/CASE given/when err say + # 'err' is a fairly safe addition. + # TODO: 'default' still needed if appropriate + # 'use feature' seen, but perltidy works ok without it. + # Concerned that 'default' could break code. push( @Keywords, @value_requestor ); # These are treated the same but are not keywords: @@ -25282,7 +26707,7 @@ BEGIN { # these token TYPES expect trailing operator but not a term # note: ++ and -- are post-increment and decrement, 'C' = constant - my @operator_requestor_types = qw( ++ -- C ); + my @operator_requestor_types = qw( ++ -- C <> q ); @expecting_operator_types{@operator_requestor_types} = (1) x scalar(@operator_requestor_types); @@ -25292,14 +26717,19 @@ BEGIN { my @value_requestor_type = qw# L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //= - <= >= == != => \ > < % * / ? & | ** <=> - f F pp mm Y p m U J G + <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ + f F pp mm Y p m U J G j >> << ^ t #; push( @value_requestor_type, ',' ) ; # (perl doesn't like a ',' in a qw block) @expecting_term_types{@value_requestor_type} = (1) x scalar(@value_requestor_type); + # Note: the following valid token types are not assigned here to + # hashes requesting to be followed by values or terms, but are + # instead currently hard-coded into sub operator_expected: + # ) -> :: Q R Z ] b h i k n v w } # + # For simple syntax checking, it is nice to have a list of operators which # will really be unhappy if not followed by a term. This includes most # of the above... @@ -25730,7 +27160,7 @@ might run, from the command line, where F is a short script of interest. This will produce F with interleaved lines of text and their token types. -The -D flag has been in perltidy from the beginning for this purpose. +The B<-D> flag has been in perltidy from the beginning for this purpose. If you want to see the code which creates this file, it is C in Tidy.pm. @@ -25745,7 +27175,7 @@ to perltidy. =head1 VERSION -This man page documents Perl::Tidy version 20060614. +This man page documents Perl::Tidy version 20070424. =head1 AUTHOR