use File::Basename;
BEGIN {
- ( $VERSION = q($Id: Tidy.pm,v 1.49 2006/06/14 01:56:24 perltidy Exp $) ) =~ s/^.*\s+(\d+)\/(\d+)\/(\d+).*$/$1$2$3/; # all one line for MakeMaker
+ ( $VERSION = q($Id: Tidy.pm,v 1.56 2006/07/19 23:13:33 perltidy Exp $) ) =~ s/^.*\s+(\d+)\/(\d+)\/(\d+).*$/$1$2$3/; # all one line for MakeMaker
}
sub streamhandle {
}
if ($input_file) {
- if ( ref $input_file ) { print STDERR " of reference to:" }
- else { print STDERR " of file:" }
+ if ( ref $input_file ) { print STDERR " of reference to:" }
+ else { print STDERR " of file:" }
print STDERR " $input_file";
}
print STDERR "\n";
my $hash_ref = $input_hash{$key};
if ( defined($hash_ref) ) {
unless ( ref($hash_ref) eq 'HASH' ) {
- my $what = ref($hash_ref);
+ my $what = ref($hash_ref);
my $but_is =
$what ? "but is ref to $what" : "but is not a reference";
croak <<EOM;
# chk --> check-multiline-quotes # check for old bug; to be deleted
# scl --> short-concatenation-item-length # helps break at '.'
# recombine # for debugging line breaks
+ # valign # for debugging vertical alignment
# I --> DIAGNOSTICS # for debugging
######################################################################
no-profile
npro
recombine!
+ valign!
);
my $category = 13; # Debugging
########################################
$add_option->( 'break-at-old-keyword-breakpoints', 'bok', '!' );
$add_option->( 'break-at-old-logical-breakpoints', 'bol', '!' );
- $add_option->( 'break-at-old-trinary-breakpoints', 'bot', '!' );
+ $add_option->( 'break-at-old-ternary-breakpoints', 'bot', '!' );
$add_option->( 'ignore-old-breakpoints', 'iob', '!' );
########################################
brace-vertical-tightness-closing=0
brace-vertical-tightness=0
break-at-old-logical-breakpoints
- break-at-old-trinary-breakpoints
+ break-at-old-ternary-breakpoints
break-at-old-keyword-breakpoints
comma-arrow-breakpoints=1
nocheck-syntax
paren-vertical-tightness=0
pass-version-line
recombine
+ valign
short-concatenation-item-length=8
space-for-semicolon
square-bracket-tightness=1
'outdent-long-lines' => [qw(outdent-long-quotes outdent-long-comments)],
'nooutdent-long-lines' =>
[qw(nooutdent-long-quotes nooutdent-long-comments)],
- 'noll' => [qw(nooutdent-long-lines)],
- 'io' => [qw(indent-only)],
+ 'noll' => [qw(nooutdent-long-lines)],
+ 'io' => [qw(indent-only)],
'delete-all-comments' =>
[qw(delete-block-comments delete-side-comments delete-pod)],
'nodelete-all-comments' =>
[qw(nodelete-block-comments nodelete-side-comments nodelete-pod)],
- 'dac' => [qw(delete-all-comments)],
- 'ndac' => [qw(nodelete-all-comments)],
- 'gnu' => [qw(gnu-style)],
+ 'dac' => [qw(delete-all-comments)],
+ 'ndac' => [qw(nodelete-all-comments)],
+ 'gnu' => [qw(gnu-style)],
+ 'pbp' => [qw(perl-best-practices)],
'tee-all-comments' =>
[qw(tee-block-comments tee-side-comments tee-pod)],
'notee-all-comments' =>
'baa' => [qw(cab=0)],
'nbaa' => [qw(cab=1)],
+ 'break-at-old-trinary-breakpoints' => [qw(bot)],
+
'cti=0' => [qw(cpi=0 cbi=0 csbi=0)],
'cti=1' => [qw(cpi=1 cbi=1 csbi=1)],
'cti=2' => [qw(cpi=2 cbi=2 csbi=2)],
)
],
+ # Style suggested in Damian Conway's Perl Best Practices
+ 'perl-best-practices' => [
+ qw(l=78 i=4 ci=4 st se vt=2 cti=0 pt=1 bt=1 sbt=1 bbt=1 nsfs nolq),
+q(wbb=% + - * / x != == >= <= =~ !~ < > | & >= < = **= += *= &= <<= &&= -= /= |= >>= ||= .= %= ^= x=)
+ ],
+
# Additional styles can be added here
);
# look for a config file if we don't have one yet
my $rconfig_file_chatter;
$$rconfig_file_chatter = "";
- $config_file =
+ $config_file =
find_config_file( $is_Windows, $Windows_type, $rconfig_file_chatter,
$rpending_complaint )
unless $config_file;
}
)
{
+
if ( defined( $Opts{$_} ) ) {
delete $Opts{$_};
warn "ignoring --$_ in config file: $config_file\n";
-boc break at old comma breaks: turns off all automatic list formatting
-bol break at old logical breakpoints: or, and, ||, && (default)
-bok break at old list keyword breakpoints such as map, sort (default)
- -bot break at old conditional (trinary ?:) operator breakpoints (default)
+ -bot break at old conditional (ternary ?:) operator breakpoints (default)
-cab=n break at commas after a comma-arrow (=>):
n=0 break at all commas after =>
n=1 stable: break unless this breaks an existing one-line container
my $line_information_string = "";
if ($input_line_number) {
- my $output_line_number = $self->{_output_line_number};
- my $brace_depth = $line_of_tokens->{_curly_brace_depth};
- my $paren_depth = $line_of_tokens->{_paren_depth};
- my $square_bracket_depth = $line_of_tokens->{_square_bracket_depth};
+ my $output_line_number = $self->{_output_line_number};
+ my $brace_depth = $line_of_tokens->{_curly_brace_depth};
+ my $paren_depth = $line_of_tokens->{_paren_depth};
+ my $square_bracket_depth = $line_of_tokens->{_square_bracket_depth};
my $python_indentation_level =
$line_of_tokens->{_python_indentation_level};
my $rlevels = $line_of_tokens->{_rlevels};
# for longer scripts it doesn't really matter
my $extra_space = "";
$extra_space .=
- ( $input_line_number < 10 ) ? " "
+ ( $input_line_number < 10 ) ? " "
: ( $input_line_number < 100 ) ? " "
- : "";
+ : "";
$extra_space .=
- ( $output_line_number < 10 ) ? " "
+ ( $output_line_number < 10 ) ? " "
: ( $output_line_number < 100 ) ? " "
- : "";
+ : "";
# there are 2 possible nesting strings:
# the original which looks like this: (0 [1 {2
elsif ( $saw_code_bug == 1 ) {
if ( $self->{_saw_extrude} ) {
$self->warning(<<EOM);
-You may have encountered a bug in perltidy. However, since you are
-using the -extrude option, the problem may be with perl itself, which
-has occasional parsing problems with this type of file. If you believe
-that the problem is with perltidy, and the problem is not listed in the
-BUGS file at http://perltidy.sourceforge.net, please report it so that
-it can be corrected. Include the smallest possible script which has the
-problem, along with the .LOG file. See the manual pages for contact
-information.
+
+You may have encountered a bug in perltidy. However, since you are using the
+-extrude option, the problem may be with perl or one of its modules, which have
+occasional problems with this type of file. If you believe that the
+problem is with perltidy, and the problem is not listed in the BUGS file at
+http://perltidy.sourceforge.net, please report it so that it can be corrected.
+Include the smallest possible script which has the problem, along with the .LOG
+file. See the manual pages for contact information.
Thank you!
EOM
}
# 1. Make the table of contents panel, with appropriate changes
# to the anchor names
my $src_frame_name = 'SRC';
- my $first_anchor =
+ my $first_anchor =
write_toc_html( $title, $toc_filename, $src_basename, $rtoc,
$src_frame_name );
elsif ( $line_type eq 'FORMAT' ) { $line_character = 'H' }
elsif ( $line_type eq 'FORMAT_END' ) { $line_character = 'h' }
elsif ( $line_type eq 'SYSTEM' ) { $line_character = 'c' }
- elsif ( $line_type eq 'END_START' ) {
+ elsif ( $line_type eq 'END_START' ) {
$line_character = 'k';
$self->add_toc_item( '__END__', '__END__' );
}
# add the line number if requested
if ( $rOpts->{'html-line-numbers'} ) {
my $extra_space .=
- ( $line_number < 10 ) ? " "
+ ( $line_number < 10 ) ? " "
: ( $line_number < 100 ) ? " "
: ( $line_number < 1000 ) ? " "
- : "";
+ : "";
$html_line = $extra_space . $line_number . " " . $html_line;
}
$last_last_nonblank_token_to_go
@nonblank_lines_at_depth
$starting_in_quote
+ $ending_in_quote
$in_format_skipping_section
$format_skipping_pattern_begin
$added_semicolon_count
$first_added_semicolon_at
$last_added_semicolon_at
- $saw_negative_indentation
$first_tabbing_disagreement
$last_tabbing_disagreement
$in_tabbing_disagreement
%is_assignment
%is_chain_operator
%is_if_unless_and_or_last_next_redo_return
+ %is_until_while_for_if_elsif_else
@has_broken_sublist
@dont_align
$rOpts_break_at_old_keyword_breakpoints
$rOpts_break_at_old_comma_breakpoints
$rOpts_break_at_old_logical_breakpoints
- $rOpts_break_at_old_trinary_breakpoints
+ $rOpts_break_at_old_ternary_breakpoints
$rOpts_closing_side_comment_else_flag
$rOpts_closing_side_comment_maximum_text
$rOpts_continuation_indentation
@_ = qw(is if unless and or err last next redo return);
@is_if_unless_and_or_last_next_redo_return{@_} = (1) x scalar(@_);
+ # always break after a closing curly of these block types:
+ @_ = qw(until while for if elsif else);
+ @is_until_while_for_if_elsif_else{@_} = (1) x scalar(@_);
+
@_ = qw(last next redo return);
@is_last_next_redo_return{@_} = (1) x scalar(@_);
@want_comma_break = ();
@ci_stack = ("");
- $saw_negative_indentation = 0;
$first_tabbing_disagreement = 0;
$last_tabbing_disagreement = 0;
$tabbing_disagreement_count = 0;
my $space_count = 0;
my $available_space = 0;
$level = -1; # flag to prevent storing in item_list
- $leading_spaces_to_go[$max_index_to_go] =
+ $leading_spaces_to_go[$max_index_to_go] =
$reduced_spaces_to_go[$max_index_to_go] =
new_lp_indentation_item( $space_count, $level, $ci_level,
$available_space, 0 );
# find the position if we break at the '='
my $i_test = $last_equals;
if ( $types_to_go[ $i_test + 1 ] eq 'b' ) { $i_test++ }
+
+ # TESTING
+ ##my $too_close = ($i_test==$max_index_to_go-1);
+
my $test_position = total_line_length( $i_test, $max_index_to_go );
if (
+ # the equals is not just before an open paren (testing)
+ ##!$too_close &&
+
# if we are beyond the midpoint
$gnu_position_predictor > $half_maximum_line_length
- # or if we can save some space by breaking at the '='
- # without obscuring the second line by the first
- || ( $test_position > 1 +
- total_line_length( $line_start_index_to_go, $last_equals ) )
+ # or we are beyont the 1/4 point and there was an old
+ # break at the equals
+ || (
+ $gnu_position_predictor > $half_maximum_line_length / 2
+ && (
+ $old_breakpoint_to_go[$last_equals]
+ || ( $last_equals > 0
+ && $old_breakpoint_to_go[ $last_equals - 1 ] )
+ || ( $last_equals > 1
+ && $types_to_go[ $last_equals - 1 ] eq 'b'
+ && $old_breakpoint_to_go[ $last_equals - 2 ] )
+ )
+ )
)
{
# Define here tokens which may follow the closing brace of a do statement
# on the same line, as in:
# } while ( $something);
- @_ = qw(until while unless if ; );
+ @_ = qw(until while unless if ; : );
push @_, ',';
@is_do_follower{@_} = (1) x scalar(@_);
%is_else_brace_follower = ();
# what can follow a multi-line anonymous sub definition closing curly:
- @_ = qw# ; : => or and && || ) #;
+ @_ = qw# ; : => or and && || ~~ ) #;
push @_, ',';
@is_anon_sub_brace_follower{@_} = (1) x scalar(@_);
# what can follow a one-line anonynomous sub closing curly:
# one-line anonumous subs also have ']' here...
# see tk3.t and PP.pm
- @_ = qw# ; : => or and && || ) ] #;
+ @_ = qw# ; : => or and && || ) ] ~~ #;
push @_, ',';
@is_anon_sub_1_brace_follower{@_} = (1) x scalar(@_);
);
# frequently used parameters
- $rOpts_add_newlines = $rOpts->{'add-newlines'};
- $rOpts_add_whitespace = $rOpts->{'add-whitespace'};
- $rOpts_block_brace_tightness = $rOpts->{'block-brace-tightness'};
+ $rOpts_add_newlines = $rOpts->{'add-newlines'};
+ $rOpts_add_whitespace = $rOpts->{'add-whitespace'};
+ $rOpts_block_brace_tightness = $rOpts->{'block-brace-tightness'};
$rOpts_block_brace_vertical_tightness =
$rOpts->{'block-brace-vertical-tightness'};
$rOpts_brace_left_and_indent = $rOpts->{'brace-left-and-indent'};
$rOpts_comma_arrow_breakpoints = $rOpts->{'comma-arrow-breakpoints'};
- $rOpts_break_at_old_trinary_breakpoints =
- $rOpts->{'break-at-old-trinary-breakpoints'};
+ $rOpts_break_at_old_ternary_breakpoints =
+ $rOpts->{'break-at-old-ternary-breakpoints'};
$rOpts_break_at_old_comma_breakpoints =
$rOpts->{'break-at-old-comma-breakpoints'};
$rOpts_break_at_old_keyword_breakpoints =
# for avoiding syntax problems rather than for formatting.
my ( $tokenll, $typell, $tokenl, $typel, $tokenr, $typer ) = @_;
- # never combine two bare words or numbers
- my $result = ( ( $tokenr =~ /^[\'\w]/ ) && ( $tokenl =~ /[\'\w]$/ ) )
+ my $result =
+
+ # never combine two bare words or numbers
+ # examples: and ::ok(1)
+ # return ::spw(...)
+ # for bla::bla:: abc
+ # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl
+ # $input eq"quit" to make $inputeq"quit"
+ # my $size=-s::SINK if $file; <==OK but we won't do it
+ # don't join something like: for bla::bla:: abc
+ # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl
+ ( ( $tokenl =~ /([\'\w]|\:\:)$/ ) && ( $tokenr =~ /^([\'\w]|\:\:)/ ) )
# do not combine a number with a concatination dot
# example: pom.caputo:
# retain any space after possible filehandle
# (testfiles prnterr1.t with --extrude and mangle.t with --mangle)
- || ( $typel eq 'Z' || $typell eq 'Z' )
+ || ( $typel eq 'Z' )
+
+ # Perl is sensitive to whitespace after the + here:
+ # $b = xvals $a + 0.1 * yvals $a;
+ || ( $typell eq 'Z' && $typel =~ /^[\/\?\+\-\*]$/ )
# keep paren separate in 'use Foo::Bar ()'
|| ( $tokenr eq '('
#use Mail::Internet 1.28 (); (see Entity.pm, Head.pm, Test.pm)
|| ( ( $typel eq 'n' ) && ( $tokenr eq '(' ) )
- # don't join something like: for bla::bla:: abc
- # example is "%overload:: and" in files Dumpvalue.pm or colonbug.pl
- || ( $tokenl =~ /\:\:$/ && ( $tokenr =~ /^[\'\w]/ ) )
; # the value of this long logic sequence is the result we want
return $result;
}
my @spaces_both_sides = qw"
+ - * / % ? = . : x < > | & ^ .. << >> ** && .. || // => += -=
- .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>=
+ .= %= x= &= |= ^= *= <> <= >= == =~ !~ /= != ... <<= >>= ~~
&&= ||= //= <=> A k f w F n C Y U G v
";
# patch for SWITCH/CASE: make space at ']{' optional
# since the '{' might begin a case or when block
- elsif ( $token eq '{' && $last_token eq ']' ) {
+ elsif ( ( $token eq '{' && $type ne 'L' ) && $last_token eq ']' ) {
$ws = WS_OPTIONAL;
}
elsif ( $type eq '#' ) { $ws = WS_YES if $j > 0 }
# always preserver whatever space was used after a possible
- # filehandle or here doc operator
- if ( $type ne '#' && ( $last_type eq 'Z' || $last_type eq 'h' ) ) {
+ # filehandle (except _) or here doc operator
+ if (
+ $type ne '#'
+ && ( ( $last_type eq 'Z' && $last_token ne '_' )
+ || $last_type eq 'h' )
+ )
+ {
$ws = WS_OPTIONAL;
}
# If this becomes too much of a problem, we might give up and just clip
# them at zero.
## $levels_to_go[$max_index_to_go] = ( $level > 0 ) ? $level : 0;
- $levels_to_go[$max_index_to_go] = $level;
+ $levels_to_go[$max_index_to_go] = $level;
$nesting_depth_to_go[$max_index_to_go] = ( $slevel >= 0 ) ? $slevel : 0;
$lengths_to_go[ $max_index_to_go + 1 ] =
$lengths_to_go[$max_index_to_go] + length($token);
return;
}
- my %is_until_while_for_if_elsif_else;
-
- BEGIN {
-
- # always break after a closing curly of these block types:
- @_ = qw(until while for if elsif else);
- @is_until_while_for_if_elsif_else{@_} = (1) x scalar(@_);
-
- }
-
sub print_line_of_tokens {
my $line_of_tokens = shift;
$in_continued_quote = $starting_in_quote =
$line_of_tokens->{_starting_in_quote};
- $in_quote = $line_of_tokens->{_ending_in_quote};
+ $in_quote = $line_of_tokens->{_ending_in_quote};
+ $ending_in_quote = $in_quote;
$python_indentation_level =
$line_of_tokens->{_python_indentation_level};
&& $rOpts->{'static-block-comments'}
&& $input_line =~ /$static_block_comment_pattern/o )
{
- $is_static_block_comment = 1;
+ $is_static_block_comment = 1;
$is_static_block_comment_without_leading_space =
substr( $input_line, 0, 1 ) eq '#';
}
# /([\$*])(([\w\:\']*)\bVERSION)\b.*\=/
# Examples:
# *VERSION = \'1.01';
- # ( $VERSION ) = '$Revision: 1.49 $ ' =~ /\$Revision:\s+([^\s]+)/;
+ # ( $VERSION ) = '$Revision: 1.56 $ ' =~ /\$Revision:\s+([^\s]+)/;
# We will pass such a line straight through without breaking
# it unless -npvl is used
}
# take care of indentation-only
- # also write a line which is entirely a 'qw' list
- if ( $rOpts->{'indent-only'}
- || ( ( $jmax == 0 ) && ( $$rtoken_type[0] eq 'q' ) ) )
- {
+ # NOTE: In previous versions we sent all qw lines out immediately here.
+ # No longer doing this: also write a line which is entirely a 'qw' list
+ # to allow stacking of opening and closing tokens. Note that interior
+ # qw lines will still go out at the end of this routine.
+ if ( $rOpts->{'indent-only'} ) {
flush();
$input_line =~ s/^\s*//; # trim left end
$input_line =~ s/\s*$//; # trim right end
#
# But make a line break if the curly ends a
# significant block:
- if ( $is_until_while_for_if_elsif_else{$block_type} ) {
+ ##if ( $is_until_while_for_if_elsif_else{$block_type} ) {
+ if (
+ $is_block_without_semicolon{$block_type}
+
+ # if needless semicolon follows we handle it later
+ && $next_nonblank_token ne ';'
+ )
+ {
output_line_to_go() unless ($no_internal_newlines);
}
}
}
}
- # TESTING ONLY for SWITCH/CASE - this is where to start
- # recoding to retain else's on the same line as a case,
- # but there is a lot more that would need to be done.
- ##elsif ($block_type eq 'case') {$rbrace_follower = {else=>1};}
-
# None of the above: specify what can follow a closing
# brace of a block which is not an
# if/elsif/else/do/sort/map/grep/eval
# if there is a side comment
( ( $type eq '#' ) && !$rOpts->{'delete-side-comments'} )
- # if this line which ends in a quote
+ # if this line ends in a quote
+ # NOTE: This is critically important for insuring that quoted lines
+ # do not get processed by things like -sot and -sct
|| $in_quote
# if this is a VERSION statement
for ( $i = $j + 1 ; $i <= $jmax ; $i++ ) {
# old whitespace could be arbitrarily large, so don't use it
- if ( $$rtoken_type[$i] eq 'b' ) { $pos += 1 }
- else { $pos += length( $$rtokens[$i] ) }
+ if ( $$rtoken_type[$i] eq 'b' ) { $pos += 1 }
+ else { $pos += length( $$rtokens[$i] ) }
# Return false result if we exceed the maximum line length,
if ( $pos > $rOpts_maximum_line_length ) {
# we might pad token $ibeg, so be sure that it
# is at the same depth as the next line.
next
- if ( $nesting_depth_to_go[ $ibeg + 1 ] !=
+ if ( $nesting_depth_to_go[$ibeg] !=
$nesting_depth_to_go[$ibeg_next] );
# We can pad on line 1 of a statement if at least 3
# then we are probably vertically aligned. We could set
# an exact flag in sub scan_list, but this is good
# enough.
- my $indentation_count = keys %saw_indentation;
+ my $indentation_count = keys %saw_indentation;
my $is_vertically_aligned =
( $i == $ibeg
&& $first_line_comma_count > 1
Perl::Tidy::VerticalAligner::flush();
}
-# output_line_to_go sends one logical line of tokens on down the
+# sub output_line_to_go sends one logical line of tokens on down the
# pipeline to the VerticalAligner package, breaking the line into continuation
# lines as necessary. The line of tokens is ready to go in the "to_go"
# arrays.
-
sub output_line_to_go {
# debug stuff; this routine can be called from many points
# any unfinished items in its stack
finish_lp_batch();
+ # If this line ends in a code block brace, set breaks at any
+ # previous closing code block braces to breakup a chain of code
+ # blocks on one line. This is very rare but can happen for
+ # user-defined subs. For example we might be looking at this:
+ # BOOL { $server_data{uptime} > 0; } NUM { $server_data{load}; } STR {
+ my $saw_good_break = 0; # flag to force breaks even if short line
+ if (
+
+ # looking for opening or closing block brace
+ $block_type_to_go[$max_index_to_go]
+
+ # but not one of these which are never duplicated on a line:
+ ##&& !$is_until_while_for_if_elsif_else{ $block_type_to_go
+ ## [$max_index_to_go] }
+ && !$is_block_without_semicolon{ $block_type_to_go[$max_index_to_go] }
+ )
+ {
+ my $lev = $nesting_depth_to_go[$max_index_to_go];
+
+ # Walk backwards from the end and
+ # set break at any closing block braces at the same level.
+ # But quit if we are not in a chain of blocks.
+ for ( my $i = $max_index_to_go - 1 ; $i >= 0 ; $i-- ) {
+ last if ( $levels_to_go[$i] < $lev ); # stop at a lower level
+ next if ( $levels_to_go[$i] > $lev ); # skip past higher level
+
+ if ( $block_type_to_go[$i] ) {
+ if ( $tokens_to_go[$i] eq '}' ) {
+ set_forced_breakpoint($i);
+ $saw_good_break = 1;
+ }
+ }
+
+ # quit if we see anything besides words, function, blanks
+ # at this level
+ elsif ( $types_to_go[$i] !~ /^[\(\)Gwib]$/ ) { last }
+ }
+ }
+
my $imin = 0;
my $imax = $max_index_to_go;
# break before all package declarations
# MCONVERSION LOCATION - for tokenizaton change
- elsif ( $leading_token =~ /^(package\s)/ && $leading_type eq 'i' ) {
+ elsif ($leading_token =~ /^(package\s)/
+ && $leading_type eq 'i' )
+ {
$want_blank = ( $rOpts->{'blanks-before-subs'} );
}
);
}
- # Break before certain block types if we haven't had a break at this
- # level for a while. This is the difficult decision..
+ # Break before certain block types if we haven't had a
+ # break at this level for a while. This is the
+ # difficult decision..
elsif ($leading_token =~ /^(unless|if|while|until|for|foreach)$/
&& $leading_type eq 'k' )
{
pad_array_to_go();
# set all forced breakpoints for good list formatting
- my $saw_good_break = 0;
- my $is_long_line = excess_line_length( $imin, $max_index_to_go ) > 0;
+ my $is_long_line = excess_line_length( $imin, $max_index_to_go ) > 0;
if (
$max_index_to_go > 0
)
)
{
- $saw_good_break = scan_list();
+ $saw_good_break ||= scan_list();
}
# let $ri_first and $ri_last be references to lists of
if ( $accumulating_text_for_block !~ /^els/ ) {
$rleading_block_if_elsif_text = [];
}
- $leading_block_text = "";
- $leading_block_text_level = $levels_to_go[$i];
+ $leading_block_text = "";
+ $leading_block_text_level = $levels_to_go[$i];
$leading_block_text_line_number =
$vertical_aligner_object->get_output_line_number();
$leading_block_text_length_exceeded = 0;
&& $block_type_to_go[$i_terminal] =~
/$closing_side_comment_list_pattern/o
+ # .. but not an anonymous sub
+ # These are not normally of interest, and their closing braces are
+ # often followed by commas or semicolons anyway. This also avoids
+ # possible erratic output due to line numbering inconsistencies
+ # in the cases where their closing braces terminate a line.
+ && $block_type_to_go[$i_terminal] ne 'sub'
+
# ..and the corresponding opening brace must is not in this batch
# (because we do not need to tag one-line blocks, although this
# should also be caught with a positive -csci value)
else {
# insert the new side comment into the output token stream
- my $type = '#';
- my $block_type = '';
- my $type_sequence = '';
+ my $type = '#';
+ my $block_type = '';
+ my $type_sequence = '';
my $container_environment =
$container_environment_to_go[$max_index_to_go];
my $level = $levels_to_go[$max_index_to_go];
my $rindentation_list = [0]; # ref to indentations for each line
+ # define the array @matching_token_to_go for the output tokens
+ # which will be non-blank for each special token (such as =>)
+ # for which alignment is required.
set_vertical_alignment_markers( $ri_first, $ri_last );
# flush if necessary to avoid unwanted alignment
# Mark most things before arrows as a quote to
# get them to line up. Testfile: mixed.pl.
if ( ( $i < $iend - 1 ) && ( $type =~ /^[wnC]$/ ) ) {
- my $next_type = $types_to_go[ $i + 1 ];
+ my $next_type = $types_to_go[ $i + 1 ];
my $i_next_nonblank =
( ( $next_type eq 'b' ) ? $i + 2 : $i + 1 );
# done with this line .. join text of tokens to make the last field
push( @fields, join( '', @tokens_to_go[ $i_start .. $iend ] ) );
- my ( $indentation, $lev, $level_end, $is_semicolon_terminated,
- $is_outdented_line )
+ my ( $indentation, $lev, $level_end, $terminal_type,
+ $is_semicolon_terminated, $is_outdented_line )
= set_adjusted_indentation( $ibeg, $iend, \@fields, \@patterns,
$ri_first, $ri_last, $rindentation_list );
# flush an outdented line to avoid any unwanted vertical alignment
Perl::Tidy::VerticalAligner::flush() if ($is_outdented_line);
+ my $is_terminal_ternary = 0;
+ if ( $tokens_to_go[$ibeg] eq ':'
+ || $n > 0 && $tokens_to_go[ $$ri_last[ $n - 1 ] ] eq ':' )
+ {
+ if ( ( $terminal_type eq ';' && $level_end <= $lev )
+ || ( $level_end < $lev ) )
+ {
+ $is_terminal_ternary = 1;
+ }
+ }
+
# send this new line down the pipe
my $forced_breakpoint = $forced_breakpoint_to_go[$iend];
Perl::Tidy::VerticalAligner::append_line(
\@patterns,
$forced_breakpoint_to_go[$iend] || $in_comma_list,
$outdent_long_lines,
+ $is_terminal_ternary,
$is_semicolon_terminated,
$do_not_pad,
$rvertical_tightness_flags,
if ( $saved_opening_indentation{$seqno} ) {
( $indent, $offset ) = @{ $saved_opening_indentation{$seqno} };
}
+
+ # some kind of serious error
+ # (example is badfile.t)
+ else {
+ $indent = 0;
+ $offset = 0;
+ }
}
# if no sequence number it must be an unbalanced container
if ( $types_to_go[$i_terminal] =~ /^[\}\]\)R]$/
&& $i_terminal == $ibeg )
{
- my $ci = $ci_levels_to_go[$ibeg];
- my $lev = $levels_to_go[$ibeg];
- my $next_type = $types_to_go[ $ibeg + 1 ];
+ my $ci = $ci_levels_to_go[$ibeg];
+ my $lev = $levels_to_go[$ibeg];
+ my $next_type = $types_to_go[ $ibeg + 1 ];
my $i_next_nonblank =
( ( $next_type eq 'b' ) ? $ibeg + 2 : $ibeg + 1 );
if ( $i_next_nonblank <= $max_index_to_go
}
}
- return ( $indentation, $lev, $level_end, $is_semicolon_terminated,
- $is_outdented_line );
+ return ( $indentation, $lev, $level_end, $terminal_type,
+ $is_semicolon_terminated, $is_outdented_line );
}
}
# These flags are used by sub set_leading_whitespace in
# the vertical aligner
- my $rvertical_tightness_flags;
+ my $rvertical_tightness_flags = [ 0, 0, 0, 0, 0, 0 ];
# For non-BLOCK tokens, we will need to examine the next line
# too, so we won't consider the last line.
# patch to make something like 'qw(' behave like an opening paren
# (aran.t)
if ( $types_to_go[$ibeg_next] eq 'q' ) {
- if ( $token_beg_next =~ /^q.([\[\(\{])$/ ) {
+ if ( $token_beg_next =~ /^qw\s*([\[\(\{])$/ ) {
$token_beg_next = $1;
}
}
( 3, $rOpts_block_brace_vertical_tightness, 0, 1 );
}
+ # pack in the sequence numbers of the ends of this line
+ $rvertical_tightness_flags->[4] = get_seqno($ibeg);
+ $rvertical_tightness_flags->[5] = get_seqno($iend);
return $rvertical_tightness_flags;
}
+sub get_seqno {
+
+ # get opening and closing sequence numbers of a token for the vertical
+ # aligner. Assign qw quotes a value to allow qw opening and closing tokens
+ # to be treated somewhat like opening and closing tokens for stacking
+ # tokens by the vertical aligner.
+ my ($ii) = @_;
+ my $seqno = $type_sequence_to_go[$ii];
+ if ( $types_to_go[$ii] eq 'q' ) {
+ my $SEQ_QW = -1;
+ if ( $ii > 0 ) {
+ $seqno = $SEQ_QW if ( $tokens_to_go[$ii] =~ /^qw\s*[\(\{\[]/ );
+ }
+ else {
+ if ( !$ending_in_quote ) {
+ $seqno = $SEQ_QW if ( $tokens_to_go[$ii] =~ /[\)\}\]]$/ );
+ }
+ }
+ }
+ return ($seqno);
+}
+
{
my %is_vertical_alignment_type;
my %is_vertical_alignment_keyword;
@_ = qw#
= **= += *= &= <<= &&= -= /= |= >>= ||= //= .= %= ^= x=
- { ? : => =~ && || //
+ { ? : => =~ && || // ~~
#;
@is_vertical_alignment_type{@_} = (1) x scalar(@_);
sub set_vertical_alignment_markers {
- # Look at the tokens in this output batch and define the array
- # 'matching_token_to_go' which marks tokens at which we would
+ # This routine takes the first step toward vertical alignment of the
+ # lines of output text. It looks for certain tokens which can serve as
+ # vertical alignment markers (such as an '=').
+ #
+ # Method: We look at each token $i in this output batch and set
+ # $matching_token_to_go[$i] equal to those tokens at which we would
# accept vertical alignment.
# nothing to do if we aren't allowed to change whitespace
my ( $ri_first, $ri_last ) = @_;
+ # remember the index of last nonblank token before any sidecomment
+ my $i_terminal = $max_index_to_go;
+ if ( $types_to_go[$i_terminal] eq '#' ) {
+ if ( $i_terminal > 0 && $types_to_go[ --$i_terminal ] eq 'b' ) {
+ if ( $i_terminal > 0 ) { --$i_terminal }
+ }
+ }
+
# look at each line of this batch..
my $last_vertical_alignment_before_index;
my $vert_last_nonblank_type;
my $max_line = @$ri_first - 1;
my ( $i, $type, $token, $block_type, $alignment_type );
my ( $ibeg, $iend, $line );
+
foreach $line ( 0 .. $max_line ) {
$ibeg = $$ri_first[$line];
$iend = $$ri_last[$line];
# align before the first token and 2) the second
# token must be a blank if we are to align before
# the third
- if ( $i < $ibeg + 2 ) {
- }
+ if ( $i < $ibeg + 2 ) { }
# must follow a blank token
- elsif ( $types_to_go[ $i - 1 ] ne 'b' ) {
- }
+ elsif ( $types_to_go[ $i - 1 ] ne 'b' ) { }
# align a side comment --
elsif ( $type eq '#' ) {
# otherwise, do not align two in a row to create a
# blank field
- elsif ( $last_vertical_alignment_before_index == $i - 2 ) {
- }
+ elsif ( $last_vertical_alignment_before_index == $i - 2 ) { }
# align before one of these keywords
# (within a line, since $i>1)
elsif ( $is_vertical_alignment_type{$type} ) {
$alignment_type = $token;
+ # Do not align a terminal token. Although it might
+ # occasionally look ok to do this, it has been found to be
+ # a good general rule. The main problems are:
+ # (1) that the terminal token (such as an = or :) might get
+ # moved far to the right where it is hard to see because
+ # nothing follows it, and
+ # (2) doing so may prevent other good alignments.
+ if ( $i == $iend || $i >= $i_terminal ) {
+ $alignment_type = "";
+ }
+
+ # Do not align leading ': ('. This would prevent
+ # alignment in something like the following:
+ # $extra_space .=
+ # ( $input_line_number < 10 ) ? " "
+ # : ( $input_line_number < 100 ) ? " "
+ # : "";
+ if ( $i == $ibeg + 2
+ && $types_to_go[$ibeg] eq ':'
+ && $types_to_go[ $i - 1 ] eq 'b' )
+ {
+ $alignment_type = "";
+ }
+
# For a paren after keyword, only align something like this:
# if ( $a ) { &a }
# elsif ( $b ) { &b }
# if ($token ne $type) {$alignment_type .= $type}
}
- # NOTE: This is deactivated until the new vertical aligner
- # is finished because it causes the previous if/elsif alignment
- # to fail
- #elsif ( $type eq '}' && $token eq '}' && $block_type_to_go[$i]) {
- # $alignment_type = $type;
- #}
+ # NOTE: This is deactivated because it causes the previous
+ # if/elsif alignment to fail
+ #elsif ( $type eq '}' && $token eq '}' && $block_type_to_go[$i])
+ #{ $alignment_type = $type; }
if ($alignment_type) {
$last_vertical_alignment_before_index = $i;
# make these a little weaker than nominal so that they get
# favored for end-of-line characters
- @_ = qw"!= == =~ !~";
- @left_bond_strength{@_} = (STRONG) x scalar(@_);
+ @_ = qw"!= == =~ !~ ~~";
+ @left_bond_strength{@_} = (STRONG) x scalar(@_);
@right_bond_strength{@_} =
( 0.9 * NOMINAL + 0.1 * WEAK ) x scalar(@_);
# break AFTER these
- @_ = qw" < > | & >= <=";
- @left_bond_strength{@_} = (VERY_STRONG) x scalar(@_);
+ @_ = qw" < > | & >= <=";
+ @left_bond_strength{@_} = (VERY_STRONG) x scalar(@_);
@right_bond_strength{@_} =
( 0.8 * NOMINAL + 0.2 * WEAK ) x scalar(@_);
$left_bond_strength{'G'} = NOMINAL;
$right_bond_strength{'G'} = STRONG;
- # it is very good to break AFTER various assignment operators
+ # it is good to break AFTER various assignment operators
@_ = qw(
= **= += *= &= <<= &&=
-= /= |= >>= ||= //=
.= %= ^=
x=
);
- @left_bond_strength{@_} = (STRONG) x scalar(@_);
+ @left_bond_strength{@_} = (STRONG) x scalar(@_);
@right_bond_strength{@_} =
( 0.4 * WEAK + 0.6 * VERY_WEAK ) x scalar(@_);
# to simplify coding in scan_list and set_bond_strengths, it helps
# to create some extra blank tokens at the end of the arrays
- $tokens_to_go[ $max_index_to_go + 1 ] = '';
- $tokens_to_go[ $max_index_to_go + 2 ] = '';
- $types_to_go[ $max_index_to_go + 1 ] = 'b';
- $types_to_go[ $max_index_to_go + 2 ] = 'b';
+ $tokens_to_go[ $max_index_to_go + 1 ] = '';
+ $tokens_to_go[ $max_index_to_go + 2 ] = '';
+ $types_to_go[ $max_index_to_go + 1 ] = 'b';
+ $types_to_go[ $max_index_to_go + 2 ] = 'b';
$nesting_depth_to_go[ $max_index_to_go + 1 ] =
$nesting_depth_to_go[$max_index_to_go];
# TESTING: retain break at a ':' line break
if ( ( $i == $i_line_start || $i == $i_line_end )
- && $rOpts_break_at_old_trinary_breakpoints )
+ && $rOpts_break_at_old_ternary_breakpoints )
{
# TESTING:
$rfor_semicolon_list[$depth] = [];
$i_equals[$depth] = -1;
$want_comma_break[$depth] = 0;
- $container_type[$depth] =
+ $container_type[$depth] =
( $last_nonblank_type =~ /^(k|=>|&&|\|\||\?|\:|\.)$/ )
? $last_nonblank_token
: "";
if ( $rOpts_line_up_parentheses && $saw_opening_structure )
{
my $item = $leading_spaces_to_go[ $i_opening + 1 ];
+ if ( $i_opening + 1 < $max_index_to_go
+ && $types_to_go[ $i_opening + 1 ] eq 'b' )
+ {
+ $item = $leading_spaces_to_go[ $i_opening + 2 ];
+ }
if ( defined($item) ) {
my $i_start_2 = $item->get_STARTING_INDEX();
if (
# Looks like a list of items. We have to look at it and size it up.
#---------------------------------------------------------------
- my $opening_token = $tokens_to_go[$i_opening_paren];
+ my $opening_token = $tokens_to_go[$i_opening_paren];
my $opening_environment =
$container_environment_to_go[$i_opening_paren];
# Field width parameters
my $pair_width = ( $max_length[0] + $max_length[1] );
- my $max_width =
+ my $max_width =
( $max_length[0] > $max_length[1] ) ? $max_length[0] : $max_length[1];
# Number of free columns across the page width for laying out tables
# )
# if $style eq 'all';
- my $i_last_comma = $$rcomma_index[ $comma_count - 1 ];
- my $long_last_term = excess_line_length( 0, $i_last_comma ) <= 0;
+ my $i_last_comma = $$rcomma_index[ $comma_count - 1 ];
+ my $long_last_term = excess_line_length( 0, $i_last_comma ) <= 0;
my $long_first_term =
excess_line_length( $i_first_comma + 1, $max_index_to_go ) <= 0;
# align; high sparsity does not look good, especially with few lines
my $sparsity = ($unused_columns) / ($formatted_columns);
my $max_allowed_sparsity =
- ( $item_count < 3 ) ? 0.1
+ ( $item_count < 3 ) ? 0.1
: ( $packed_lines == 1 ) ? 0.15
: ( $packed_lines == 2 ) ? 0.4
- : 0.7;
+ : 0.7;
# Begin check for shortcut methods, which avoid treating a list
# as a table for relatively small parenthesized lists. These
sub table_columns_available {
my $i_first_comma = shift;
- my $columns =
+ my $columns =
$rOpts_maximum_line_length - leading_spaces_to_go($i_first_comma);
# Patch: the vertical formatter does not line up lines whose lengths
my $ifff = $n + 2 <= $nmax ? $$ri_first[ $n + 2 ] : -1;
my $imm = $n > 1 ? $$ri_first[ $n - 2 ] : -1;
my $seqno = $type_sequence_to_go[$imidr];
- my $f_ok =
+ my $f_ok =
( $types_to_go[$if] eq ':'
&& $type_sequence_to_go[$if] ==
$seqno - TYPE_SEQUENCE_INCREMENT );
)
);
-
- # override breakpoint
- ##$forced_breakpoint_to_go[$imid] = 0;
}
# handle leading "if" and "unless"
&& $is_and_or{ $tokens_to_go[$if] }
);
-
- # override breakpoint
- ##$forced_breakpoint_to_go[$imid] = 0;
-
}
# handle all other leading keywords
&& $is_if_unless{ $tokens_to_go[$if] }
);
-
- # override breakpoint
- ##$forced_breakpoint_to_go[$imid] = 0;
}
#----------------------------------------------------------
# loop to find next break point
for ( $i_test = $i_begin ; $i_test <= $imax ; $i_test++ ) {
- my $type = $types_to_go[$i_test];
- my $token = $tokens_to_go[$i_test];
- my $next_type = $types_to_go[ $i_test + 1 ];
- my $next_token = $tokens_to_go[ $i_test + 1 ];
+ my $type = $types_to_go[$i_test];
+ my $token = $tokens_to_go[$i_test];
+ my $next_type = $types_to_go[ $i_test + 1 ];
+ my $next_token = $tokens_to_go[ $i_test + 1 ];
my $i_next_nonblank =
( ( $next_type eq 'b' ) ? $i_test + 2 : $i_test + 1 );
my $next_nonblank_type = $types_to_go[$i_next_nonblank];
# There is an implied forced break at a terminal opening brace
|| ( ( $type eq '{' ) && ( $i_test == $imax ) )
-
)
{
my ( $item, $spaces_needed ) = @_;
my $available_spaces = $item->get_AVAILABLE_SPACES();
- my $deleted_spaces =
+ my $deleted_spaces =
( $available_spaces > $spaces_needed )
? $spaces_needed
: $available_spaces;
# caller.
my ( $item, $spaces_needed ) = @_;
my $available_spaces = $item->get_AVAILABLE_SPACES();
- my $deleted_spaces =
+ my $deleted_spaces =
( $available_spaces > $spaces_needed )
? $spaces_needed
: $available_spaces;
use constant VALIGN_DEBUG_FLAG_APPEND => 0;
use constant VALIGN_DEBUG_FLAG_APPEND0 => 0;
+ use constant VALIGN_DEBUG_FLAG_TERNARY => 0;
my $debug_warning = sub {
print "VALIGN_DEBUGGING with key $_[0]\n";
$cached_seqno
$cached_line_valid
$cached_line_leading_space_count
+ $cached_seqno_string
+
+ $seqno_string
+ $last_nonblank_seqno_string
$rOpts
$rOpts_indent_columns
$rOpts_tabs
$rOpts_entab_leading_whitespace
+ $rOpts_valign
$rOpts_minimum_space_to_comment
$cached_seqno = 0;
$cached_line_valid = 0;
$cached_line_leading_space_count = 0;
+ $cached_seqno_string = "";
+
+ # string of sequence numbers joined together
+ $seqno_string = "";
+ $last_nonblank_seqno_string = "";
# frequently used parameters
$rOpts_indent_columns = $rOpts->{'indent-columns'};
$rOpts_entab_leading_whitespace = $rOpts->{'entab-leading-whitespace'};
$rOpts_minimum_space_to_comment = $rOpts->{'minimum-space-to-comment'};
$rOpts_maximum_line_length = $rOpts->{'maximum-line-length'};
+ $rOpts_valign = $rOpts->{'valign'};
forget_side_comment();
# The log file warns the user if there are any such tabs.
my (
- $level, $level_end,
- $indentation, $rfields,
- $rtokens, $rpatterns,
- $is_forced_break, $outdent_long_lines,
- $is_terminal_statement, $do_not_pad,
- $rvertical_tightness_flags, $level_jump,
+ $level, $level_end,
+ $indentation, $rfields,
+ $rtokens, $rpatterns,
+ $is_forced_break, $outdent_long_lines,
+ $is_terminal_ternary, $is_terminal_statement,
+ $do_not_pad, $rvertical_tightness_flags,
+ $level_jump,
) = @_;
# number of fields is $jmax
# number of tokens between fields is $jmax-1
my $jmax = $#{$rfields};
- $previous_minimum_jmax_seen = $minimum_jmax_seen;
- $previous_maximum_jmax_seen = $maximum_jmax_seen;
my $leading_space_count = get_SPACES($indentation);
if ($rvertical_tightness_flags) {
if ( $maximum_line_index <= 0
&& $cached_line_type
+ && $cached_seqno
+ && $rvertical_tightness_flags->[2]
&& $rvertical_tightness_flags->[2] == $cached_seqno )
{
$rvertical_tightness_flags->[3] ||= 1;
if ( $level < 0 ) { $level = 0 }
# do not align code across indentation level changes
- if ( $level != $group_level || $is_outdented ) {
+ # or if vertical alignment is turned off for debugging
+ if ( $level != $group_level || $is_outdented || !$rOpts_valign ) {
# we are allowed to shift a group of lines to the right if its
# level is greater than the previous and next group
}
}
+ # --------------------------------------------------------------------
+ # add dummy fields for terminal ternary
+ # --------------------------------------------------------------------
+ if ( $is_terminal_ternary && $current_line ) {
+ fix_terminal_ternary( $rfields, $rtokens, $rpatterns );
+ $jmax = @{$rfields} - 1;
+ }
+
+ # --------------------------------------------------------------------
+ # add dummy fields for else statement
+ # --------------------------------------------------------------------
+ if ( $rfields->[0] =~ /^else\s*$/
+ && $current_line
+ && $level_jump == 0 )
+ {
+ fix_terminal_else( $rfields, $rtokens, $rpatterns );
+ $jmax = @{$rfields} - 1;
+ }
+
# --------------------------------------------------------------------
# Step 1. Handle simple line of code with no fields to match.
# --------------------------------------------------------------------
# Future update to allow this to vary:
$current_line = $new_line if ( $maximum_line_index == 0 );
+ my_flush() if ( $group_type eq "TERMINAL" );
+
# --------------------------------------------------------------------
# Step 8. Some old debugging stuff
# --------------------------------------------------------------------
my $old_line = shift;
my $jmax = $new_line->get_jmax();
- my $old_rtokens = $old_line->get_rtokens();
- my $rtokens = $new_line->get_rtokens();
+ my $old_rtokens = $old_line->get_rtokens();
+ my $rtokens = $new_line->get_rtokens();
my $is_assignment =
- ( $rtokens->[0] =~ /^=\d*$/ && ( $old_rtokens->[0] eq $rtokens->[0] ) );
+ ( $rtokens->[0] =~ /^=\d*$/ && ( $old_rtokens->[0] eq $rtokens->[0] )
+ || $group_type eq "TERMINAL" );
# must be monotonic variation
return unless ( $is_assignment || $previous_maximum_jmax_seen <= $jmax );
my $rpatterns = $new_line->get_rpatterns();
my $old_rpatterns = $old_line->get_rpatterns();
- # loop over all old tokens except comment
+ # loop over all OLD tokens except comment and check match
my $match = 1;
my $k;
for ( $k = 0 ; $k < $maximum_field_index - 1 ; $k++ ) {
if ( ( $$old_rtokens[$k] ne $$rtokens[$k] )
- || ( $$old_rpatterns[$k] ne $$rpatterns[$k] ) )
+ || ( $$old_rpatterns[$k] ne $$rpatterns[$k] )
+ && $group_type ne "TERMINAL" )
{
$match = 0;
last;
}
}
- # first tokens agree, so combine new tokens
+ # first tokens agree, so combine extra new tokens
if ($match) {
for $k ( $maximum_field_index .. $jmax - 1 ) {
$new_line->set_jmax($jmax);
}
+sub fix_terminal_ternary {
+
+ # Add empty fields as necessary to align a ternary term
+ # like this:
+ #
+ # my $leapyear =
+ # $year % 4 ? 0
+ # : $year % 100 ? 1
+ # : $year % 400 ? 0
+ # : 1;
+ #
+ my ( $rfields, $rtokens, $rpatterns ) = @_;
+
+ my $jmax = @{$rfields} - 1;
+ my $old_line = $group_lines[$maximum_line_index];
+ my $rfields_old = $old_line->get_rfields();
+
+ my $rpatterns_old = $old_line->get_rpatterns();
+ my $rtokens_old = $old_line->get_rtokens();
+ my $maximum_field_index = $old_line->get_jmax();
+
+ # look for the question mark after the :
+ my ($jquestion);
+ my $depth_question;
+ my $pad = "";
+ for ( my $j = 0 ; $j < $maximum_field_index ; $j++ ) {
+ my $tok = $rtokens_old->[$j];
+ if ( $tok =~ /^\?(\d+)$/ ) {
+ $depth_question = $1;
+
+ # depth must be correct
+ next unless ( $depth_question eq $group_level );
+
+ $jquestion = $j;
+ if ( $rfields_old->[ $j + 1 ] =~ /^(\?\s*)/ ) {
+ $pad = " " x length($1);
+ }
+ else {
+ return; # shouldn't happen
+ }
+ last;
+ }
+ }
+ return unless ( defined($jquestion) ); # shouldn't happen
+
+ # Now splice the tokens and patterns of the previous line
+ # into the else line to insure a match. Add empty fields
+ # as necessary.
+ my $jadd = $jquestion;
+
+ # Work on copies of the actual arrays in case we have
+ # to return due to an error
+ my @fields = @{$rfields};
+ my @patterns = @{$rpatterns};
+ my @tokens = @{$rtokens};
+
+ VALIGN_DEBUG_FLAG_TERNARY && do {
+ local $" = '><';
+ print "CURRENT FIELDS=<@{$rfields_old}>\n";
+ print "CURRENT TOKENS=<@{$rtokens_old}>\n";
+ print "CURRENT PATTERNS=<@{$rpatterns_old}>\n";
+ print "UNMODIFIED FIELDS=<@{$rfields}>\n";
+ print "UNMODIFIED TOKENS=<@{$rtokens}>\n";
+ print "UNMODIFIED PATTERNS=<@{$rpatterns}>\n";
+ };
+
+ # handle cases of leading colon on this line
+ if ( $fields[0] =~ /^(:\s*)(.*)$/ ) {
+
+ my ( $colon, $therest ) = ( $1, $2 );
+
+ # Handle sub-case of first field with leading colon plus additional code
+ # This is the usual situation as at the '1' below:
+ # ...
+ # : $year % 400 ? 0
+ # : 1;
+ if ($therest) {
+
+ # Split the first field after the leading colon and insert padding.
+ # Note that this padding will remain even if the terminal value goes
+ # out on a separate line. This does not seem to look to bad, so no
+ # mechanism has been included to undo it.
+ my $field1 = shift @fields;
+ unshift @fields, ( $colon, $pad . $therest );
+
+ # change the leading pattern from : to ?
+ return unless ( $patterns[0] =~ s/^\:/?/ );
+
+ # install leading tokens and patterns of existing line
+ unshift( @tokens, @{$rtokens_old}[ 0 .. $jquestion ] );
+ unshift( @patterns, @{$rpatterns_old}[ 0 .. $jquestion ] );
+
+ # insert appropriate number of empty fields
+ splice( @fields, 1, 0, ('') x $jadd ) if $jadd;
+ }
+
+ # handle sub-case of first field just equal to leading colon.
+ # This can happen for example in the example below where
+ # the leading '(' would create a new alignment token
+ # : ( $name =~ /[]}]$/ ) ? ( $mname = $name )
+ # : ( $mname = $name . '->' );
+ else {
+
+ return unless ( $jmax > 0 && $tokens[0] ne '#' ); # shouldn't happen
+
+ # prepend a leading ? onto the second pattern
+ $patterns[1] = "?b" . $patterns[1];
+
+ # pad the second field
+ $fields[1] = $pad . $fields[1];
+
+ # install leading tokens and patterns of existing line, replacing
+ # leading token and inserting appropriate number of empty fields
+ splice( @tokens, 0, 1, @{$rtokens_old}[ 0 .. $jquestion ] );
+ splice( @patterns, 1, 0, @{$rpatterns_old}[ 1 .. $jquestion ] );
+ splice( @fields, 1, 0, ('') x $jadd ) if $jadd;
+ }
+ }
+
+ # Handle case of no leading colon on this line. This will
+ # be the case when -wba=':' is used. For example,
+ # $year % 400 ? 0 :
+ # 1;
+ else {
+
+ # install leading tokens and patterns of existing line
+ $patterns[0] = '?' . 'b' . $patterns[0];
+ unshift( @tokens, @{$rtokens_old}[ 0 .. $jquestion ] );
+ unshift( @patterns, @{$rpatterns_old}[ 0 .. $jquestion ] );
+
+ # insert appropriate number of empty fields
+ $jadd = $jquestion + 1;
+ $fields[0] = $pad . $fields[0];
+ splice( @fields, 0, 0, ('') x $jadd ) if $jadd;
+ }
+
+ VALIGN_DEBUG_FLAG_TERNARY && do {
+ local $" = '><';
+ print "MODIFIED TOKENS=<@tokens>\n";
+ print "MODIFIED PATTERNS=<@patterns>\n";
+ print "MODIFIED FIELDS=<@fields>\n";
+ };
+
+ # all ok .. update the arrays
+ @{$rfields} = @fields;
+ @{$rtokens} = @tokens;
+ @{$rpatterns} = @patterns;
+
+ # force a flush after this line
+ $group_type = "TERMINAL";
+ return;
+}
+
+sub fix_terminal_else {
+
+ # Add empty fields as necessary to align a balanced terminal
+ # else block to a previous if/elsif/unless block,
+ # like this:
+ #
+ # if ( 1 || $x ) { print "ok 13\n"; }
+ # else { print "not ok 13\n"; }
+ #
+ my ( $rfields, $rtokens, $rpatterns ) = @_;
+ my $jmax = @{$rfields} - 1;
+ return unless ( $jmax > 0 );
+
+ # check for balanced else block following if/elsif/unless
+ my $rfields_old = $current_line->get_rfields();
+
+ # TBD: add handling for 'case'
+ return unless ( $rfields_old->[0] =~ /^(if|elsif|unless)\s*$/ );
+
+ # look for the opening brace after the else, and extrace the depth
+ my $tok_brace = $rtokens->[0];
+ my $depth_brace;
+ if ( $tok_brace =~ /^\{(\d+)$/ ) { $depth_brace = $1; }
+
+ # probably: "else # side_comment"
+ else { return }
+
+ my $rpatterns_old = $current_line->get_rpatterns();
+ my $rtokens_old = $current_line->get_rtokens();
+ my $maximum_field_index = $current_line->get_jmax();
+
+ # be sure the previous if/elsif is followed by an opening paren
+ my $jparen = 0;
+ my $tok_paren = '(' . $depth_brace;
+ my $tok_test = $rtokens_old->[$jparen];
+ return unless ( $tok_test eq $tok_paren ); # shouldn't happen
+
+ # Now find the opening block brace
+ my ($jbrace);
+ for ( my $j = 1 ; $j < $maximum_field_index ; $j++ ) {
+ my $tok = $rtokens_old->[$j];
+ if ( $tok eq $tok_brace ) {
+ $jbrace = $j;
+ last;
+ }
+ }
+ return unless ( defined($jbrace) ); # shouldn't happen
+
+ # Now splice the tokens and patterns of the previous line
+ # into the else line to insure a match. Add empty fields
+ # as necessary.
+ my $jadd = $jbrace - $jparen;
+ splice( @{$rtokens}, 0, 0, @{$rtokens_old}[ $jparen .. $jbrace - 1 ] );
+ splice( @{$rpatterns}, 1, 0, @{$rpatterns_old}[ $jparen + 1 .. $jbrace ] );
+ splice( @{$rfields}, 1, 0, ('') x $jadd );
+
+ # force a flush after this line if it does not follow a case
+ $group_type = "TERMINAL"
+ unless ( $rfields_old->[0] =~ /^case\s*$/ );
+ return;
+}
+
sub check_match {
my $new_line = shift;
my $old_tok = $$old_rtokens[$j];
my $new_tok = $$rtokens[$j];
- # dumb down the match after an equals
+ # Dumb down the match AFTER an equals and
+ # also dumb down after seeing a ? ternary operator ...
+ # Everything after a + is the token which preceded the previous
+ # opening paren (container name). We won't require them to match.
if ( $saw_equals && $new_tok =~ /(.*)\+/ ) {
$new_tok = $1;
$old_tok =~ s/\+.*$//;
}
- if ( $new_tok =~ /^=\d*$/ ) { $saw_equals = 1 }
+
+ if ( $new_tok =~ /^[\?=]\d*$/ ) { $saw_equals = 1 }
# we never match if the matching tokens differ
if ( $j < $jlimit
my $maximum_field_index = $old_line->get_jmax();
for $j ( 0 .. $jmax ) {
- ## testing patch to avoid excessive gaps in previous lines,
- # due to a line of fewer fields.
- # return join( ".",
- # $self->{"dfi"}, $self->{"aa"}, $self->rsvd, $self->{"rd"},
- # $self->{"area"}, $self->{"id"}, $self->{"sel"} );
- ## MOVED BELOW AS A TEST
- ##next if ($jmax < $maximum_field_index && $j==$jmax-1);
-
$pad = length( $$rfields[$j] ) - $old_line->current_field_width($j);
if ( $j == 0 ) {
last;
}
- # TESTING PATCH moved from above to be sure we fit
+ # patch to avoid excessive gaps in previous lines,
+ # due to a line of fewer fields.
+ # return join( ".",
+ # $self->{"dfi"}, $self->{"aa"}, $self->rsvd, $self->{"rd"},
+ # $self->{"area"}, $self->{"id"}, $self->{"sel"} );
next if ( $jmax < $maximum_field_index && $j == $jmax - 1 );
# looks ok, squeeze this field in
sub accept_line {
+ # The current line either starts a new alignment group or is
+ # accepted into the current alignment group.
my $new_line = shift;
$group_lines[ ++$maximum_line_index ] = $new_line;
$group_lines[ $maximum_line_index - 1 ]->get_alignments();
$new_line->set_alignments(@new_alignments);
}
+
+ # remember group jmax extremes for next call to append_line
+ $previous_minimum_jmax_seen = $minimum_jmax_seen;
+ $previous_maximum_jmax_seen = $maximum_jmax_seen;
}
sub dump_array {
if ( $maximum_line_index < 0 ) {
if ($cached_line_type) {
+ $seqno_string = $cached_seqno_string;
entab_and_output( $cached_line_text,
$cached_line_leading_space_count,
$last_group_level_written );
- $cached_line_type = 0;
- $cached_line_text = "";
+ $cached_line_type = 0;
+ $cached_line_text = "";
+ $cached_seqno_string = "";
}
}
else {
# zero leading space count if any lines are too long
my $max_excess = 0;
for my $i ( 0 .. $maximum_line_index ) {
- my $str = $group_lines[$i];
+ my $str = $group_lines[$i];
my $excess =
length($str) + $leading_space_count - $rOpts_maximum_line_length;
if ( $excess > $max_excess ) {
# Do not try to align two lines which are not really similar
return unless $maximum_line_index == 1;
+ return if ( $group_type eq "TERMINAL" );
my $group_list_type = $group_lines[0]->get_list_type();
|| $group_maximum_gap > 12
# or lines with differing number of alignment tokens
+ # TODO: this could be improved. It occasionally rejects
+ # good matches.
|| $previous_maximum_jmax_seen != $previous_minimum_jmax_seen
)
);
$total_pad_count = 0;
$str .= $$rfields[$j];
}
+ else {
+ $total_pad_count = 0;
+ }
# update side comment history buffer
if ( $j == $maximum_field_index ) {
length($str) - $side_comment_length + $leading_space_count -
$rOpts_maximum_line_length;
if ( $excess > 0 ) {
- $leading_space_count = 0;
+ $leading_space_count = 0;
$last_outdented_line_at =
$file_writer_object->get_output_line_number();
# [2] sequence number of container
# [3] valid flag: do not append if this flag is false
#
- my ( $open_or_close, $tightness_flag, $seqno, $valid );
+ my ( $open_or_close, $tightness_flag, $seqno, $valid, $seqno_beg,
+ $seqno_end );
if ($rvertical_tightness_flags) {
- ( $open_or_close, $tightness_flag, $seqno, $valid ) =
- @{$rvertical_tightness_flags};
+ (
+ $open_or_close, $tightness_flag, $seqno, $valid, $seqno_beg,
+ $seqno_end
+ ) = @{$rvertical_tightness_flags};
}
+ $seqno_string = $seqno_end;
+
# handle any cached line ..
# either append this line to it or write it out
if ( length($cached_line_text) ) {
if ( $gap >= 0 ) {
$leading_string = $cached_line_text . ' ' x $gap;
$leading_space_count = $cached_line_leading_space_count;
+ $seqno_string = $cached_seqno_string . ':' . $seqno_beg;
}
else {
entab_and_output( $cached_line_text,
my $test_line = $cached_line_text . ' ' x $cached_line_flag . $str;
if ( length($test_line) <= $rOpts_maximum_line_length ) {
+
+ $seqno_string = $cached_seqno_string . ':' . $seqno_beg;
+
+ # Patch to outdent closing tokens ending # in ');'
+ # If we are joining a line like ');' to a previous stacked
+ # set of closing tokens, then decide if we may outdent the
+ # combined stack to the indentation of the ');'. Since we
+ # should not normally outdent any of the other tokens more than
+ # the indentation of the lines that contained them, we will
+ # only do this if all of the corresponding opening
+ # tokens were on the same line. This can happen with
+ # -sot and -sct. For example, it is ok here:
+ # __PACKAGE__->load_components( qw(
+ # PK::Auto
+ # Core
+ # ));
+ #
+ # But, for example, we do not outdent in this example because
+ # that would put the closing sub brace out farther than the
+ # opening sub brace:
+ #
+ # perltidy -sot -sct
+ # $c->Tk::bind(
+ # '<Control-f>' => sub {
+ # my ($c) = @_;
+ # my $e = $c->XEvent;
+ # itemsUnderArea $c;
+ # } );
+ #
+ if ( $str =~ /^\);/ && $cached_line_text =~ /^[\)\}\]\s]*$/ ) {
+
+ # The way to tell this is if the stacked sequence numbers
+ # of this output line are the reverse of the stacked
+ # sequence numbers of the previous non-blank line of
+ # sequence numbers. So we can join if the previous
+ # nonblank string of tokens is the mirror image. For
+ # example if stack )}] is 13:8:6 then we are looking for a
+ # leading stack like [{( which is 6:8:13 We only need to
+ # check the two ends, because the intermediate tokens must
+ # fall in order. Note on speed: having to split on colons
+ # and eliminate multiple colons might appear to be slow,
+ # but it's not an issue because we almost never come
+ # through here. In a typical file we don't.
+ $seqno_string =~ s/^:+//;
+ $last_nonblank_seqno_string =~ s/^:+//;
+ $seqno_string =~ s/:+/:/g;
+ $last_nonblank_seqno_string =~ s/:+/:/g;
+
+ # how many spaces can we outdent?
+ my $diff =
+ $cached_line_leading_space_count - $leading_space_count;
+ if ( $diff > 0
+ && length($seqno_string)
+ && length($last_nonblank_seqno_string) ==
+ length($seqno_string) )
+ {
+ my @seqno_last =
+ ( split ':', $last_nonblank_seqno_string );
+ my @seqno_now = ( split ':', $seqno_string );
+ if ( $seqno_now[-1] == $seqno_last[0]
+ && $seqno_now[0] == $seqno_last[-1] )
+ {
+
+ # OK to outdent ..
+ # for absolute safety, be sure we only remove
+ # whitespace
+ my $ws = substr( $test_line, 0, $diff );
+ if ( ( length($ws) == $diff ) && $ws =~ /^\s+$/ ) {
+
+ $test_line = substr( $test_line, $diff );
+ $cached_line_leading_space_count -= $diff;
+ }
+
+ # shouldn't happen, but not critical:
+ ##else {
+ ## ERROR transferring indentation here
+ ##}
+ }
+ }
+ }
+
$str = $test_line;
$leading_string = "";
$leading_space_count = $cached_line_leading_space_count;
my $line = $leading_string . $str;
# write or cache this line
- if ( !$rvertical_tightness_flags || $side_comment_length > 0 ) {
+ if ( !$open_or_close || $side_comment_length > 0 ) {
entab_and_output( $line, $leading_space_count, $group_level );
}
else {
$cached_seqno = $seqno;
$cached_line_valid = $valid;
$cached_line_leading_space_count = $leading_space_count;
+ $cached_seqno_string = $seqno_string;
}
$last_group_level_written = $group_level;
# Handle option of one tab per level
else {
my $leading_string = ( "\t" x $level );
- my $space_count =
+ my $space_count =
$leading_space_count - $level * $rOpts_indent_columns;
# shouldn't happen:
}
}
$file_writer_object->write_code_line( $line . "\n" );
+ if ($seqno_string) {
+ $last_nonblank_seqno_string = $seqno_string;
+ }
}
{ # begin get_leading_string
}
use Carp;
+
+# PACKAGE VARIABLES for for processing an entire FILE.
use vars qw{
$tokenizer_self
- $level_in_tokenizer
- $slevel_in_tokenizer
- $nesting_token_string
- $nesting_type_string
- $nesting_block_string
- $nesting_block_flag
- $nesting_list_string
- $nesting_list_flag
- $saw_negative_indentation
- $id_scan_state
+
$last_nonblank_token
$last_nonblank_type
$last_nonblank_block_type
- $last_nonblank_container_type
- $last_nonblank_type_sequence
- $last_last_nonblank_token
- $last_last_nonblank_type
- $last_last_nonblank_block_type
- $last_last_nonblank_container_type
- $last_last_nonblank_type_sequence
- $last_nonblank_prototype
$statement_type
- $identifier
$in_attribute_list
- $in_quote
- $quote_type
- $quote_character
- $quote_pos
- $quote_depth
- $allowed_quote_modifiers
+ $current_package
+ $context
+
+ %is_constant
+ %is_user_function
+ %user_function_prototype
+ %is_block_function
+ %is_block_list_function
+ %saw_function_definition
+
+ $brace_depth
$paren_depth
+ $square_bracket_depth
+
+ @current_depth
+ @nesting_sequence_number
+ @current_sequence_number
@paren_type
@paren_semicolon_count
@paren_structural_type
- $brace_depth
@brace_type
@brace_structural_type
@brace_statement_type
@brace_context
@brace_package
- $square_bracket_depth
@square_bracket_type
@square_bracket_structural_type
@depth_array
@starting_line_of_current_depth
- @current_depth
- @current_sequence_number
- @nesting_sequence_number
- @lower_case_labels_at
- $saw_v_string
- %is_constant
- %is_user_function
- %user_function_prototype
- %saw_function_definition
- $max_token_index
- $peeked_ahead
- $current_package
- $unexpected_error_count
- $input_line
- $input_line_number
- $rpretokens
- $rpretoken_map
- $rpretoken_type
- $want_paren
- $context
- @slevel_stack
- $ci_string_in_tokenizer
- $continuation_string_in_tokenizer
- $in_statement_continuation
- $started_looking_for_here_target_at
- $nearly_matched_here_target_at
+};
+# GLOBAL CONSTANTS for routines in this package
+use vars qw{
%is_indirect_object_taker
%is_block_operator
%expecting_operator_token
%expecting_operator_types
%expecting_term_types
%expecting_term_token
- %is_block_function
- %is_block_list_function
%is_digraph
%is_file_test_operator
%is_trigraph
# Note: 'tabs' and 'indent_columns' are temporary and should be
# removed asap
my %defaults = (
- source_object => undef,
- debugger_object => undef,
- diagnostics_object => undef,
- logger_object => undef,
- starting_level => undef,
- indent_columns => 4,
- tabs => 0,
- look_for_hash_bang => 0,
- trim_qw => 1,
- look_for_autoloader => 1,
- look_for_selfloader => 1,
+ source_object => undef,
+ debugger_object => undef,
+ diagnostics_object => undef,
+ logger_object => undef,
+ starting_level => undef,
+ indent_columns => 4,
+ tabs => 0,
+ look_for_hash_bang => 0,
+ trim_qw => 1,
+ look_for_autoloader => 1,
+ look_for_selfloader => 1,
+ starting_line_number => 1,
);
my %args = ( %defaults, @_ );
# _know_input_tabstr flag indicating if we know _input_tabstr
# _line_buffer_object object with get_line() method to supply source code
# _diagnostics_object place to write debugging information
+ # _unexpected_error_count error count used to limit output
+ # _lower_case_labels_at line numbers where lower case labels seen
$tokenizer_self = {
- _rhere_target_list => undef,
- _in_here_doc => 0,
- _here_doc_target => "",
- _here_quote_character => "",
- _in_data => 0,
- _in_end => 0,
- _in_format => 0,
- _in_error => 0,
- _in_pod => 0,
- _in_attribute_list => 0,
- _in_quote => 0,
- _quote_target => "",
- _line_start_quote => -1,
- _starting_level => $args{starting_level},
- _know_starting_level => defined( $args{starting_level} ),
- _tabs => $args{tabs},
- _indent_columns => $args{indent_columns},
- _look_for_hash_bang => $args{look_for_hash_bang},
- _trim_qw => $args{trim_qw},
- _input_tabstr => "",
- _know_input_tabstr => -1,
- _last_line_number => 0,
- _saw_perl_dash_P => 0,
- _saw_perl_dash_w => 0,
- _saw_use_strict => 0,
- _look_for_autoloader => $args{look_for_autoloader},
- _look_for_selfloader => $args{look_for_selfloader},
- _saw_autoloader => 0,
- _saw_selfloader => 0,
- _saw_hash_bang => 0,
- _saw_end => 0,
- _saw_data => 0,
- _saw_lc_filehandle => 0,
- _started_tokenizing => 0,
- _line_buffer_object => $line_buffer_object,
- _debugger_object => $args{debugger_object},
- _diagnostics_object => $args{diagnostics_object},
- _logger_object => $args{logger_object},
+ _rhere_target_list => [],
+ _in_here_doc => 0,
+ _here_doc_target => "",
+ _here_quote_character => "",
+ _in_data => 0,
+ _in_end => 0,
+ _in_format => 0,
+ _in_error => 0,
+ _in_pod => 0,
+ _in_attribute_list => 0,
+ _in_quote => 0,
+ _quote_target => "",
+ _line_start_quote => -1,
+ _starting_level => $args{starting_level},
+ _know_starting_level => defined( $args{starting_level} ),
+ _tabs => $args{tabs},
+ _indent_columns => $args{indent_columns},
+ _look_for_hash_bang => $args{look_for_hash_bang},
+ _trim_qw => $args{trim_qw},
+ _input_tabstr => "",
+ _know_input_tabstr => -1,
+ _last_line_number => $args{starting_line_number} - 1,
+ _saw_perl_dash_P => 0,
+ _saw_perl_dash_w => 0,
+ _saw_use_strict => 0,
+ _saw_v_string => 0,
+ _look_for_autoloader => $args{look_for_autoloader},
+ _look_for_selfloader => $args{look_for_selfloader},
+ _saw_autoloader => 0,
+ _saw_selfloader => 0,
+ _saw_hash_bang => 0,
+ _saw_end => 0,
+ _saw_data => 0,
+ _saw_negative_indentation => 0,
+ _started_tokenizing => 0,
+ _line_buffer_object => $line_buffer_object,
+ _debugger_object => $args{debugger_object},
+ _diagnostics_object => $args{diagnostics_object},
+ _logger_object => $args{logger_object},
+ _unexpected_error_count => 0,
+ _started_looking_for_here_target_at => 0,
+ _nearly_matched_here_target_at => undef,
+ _line_text => "",
+ _rlower_case_labels_at => undef,
};
prepare_for_a_new_file();
warning("hit EOF while in format description\n");
}
- # this check may be removed after a year or so
- if ( $tokenizer_self->{_saw_lc_filehandle} ) {
-
- warning( <<'EOM' );
-------------------------------------------------------------------------
-PLEASE NOTE: If you get this message, it is because perltidy noticed
-possible ambiguous syntax at one or more places in your script, as
-noted above. The problem is with statements accepting indirect objects,
-such as print and printf statements of the form
-
- print bareword ( $etc
-
-Perltidy needs your help in deciding if 'bareword' is a filehandle or a
-function call. The problem is the space between 'bareword' and '('. If
-'bareword' is a function call, you should remove the trailing space. If
-'bareword' is a filehandle, you should avoid the opening paren or else
-globally capitalize 'bareword' to be BAREWORD. So the above line
-would be:
-
- print bareword( $etc # function
-or
- print bareword @list # filehandle
-or
- print BAREWORD ( $etc # filehandle
-
-If you want to keep the line as it is, and are sure it is correct,
-you can use -w=0 to prevent this message.
-------------------------------------------------------------------------
-EOM
-
- }
-
if ( $tokenizer_self->{_in_pod} ) {
# Just write log entry if this is after __END__ or __DATA__
if ( $tokenizer_self->{_in_here_doc} ) {
my $here_doc_target = $tokenizer_self->{_here_doc_target};
+ my $started_looking_for_here_target_at =
+ $tokenizer_self->{_started_looking_for_here_target_at};
if ($here_doc_target) {
warning(
"hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
"hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
);
}
+ my $nearly_matched_here_target_at =
+ $tokenizer_self->{_nearly_matched_here_target_at};
if ($nearly_matched_here_target_at) {
warning(
"NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
if ( $tokenizer_self->{_in_quote} ) {
my $line_start_quote = $tokenizer_self->{_line_start_quote};
my $quote_target = $tokenizer_self->{_quote_target};
- my $what =
+ my $what =
( $tokenizer_self->{_in_attribute_list} )
? "attribute list"
: "quote/pattern";
# it is suggested that lables have at least one upper case character
# for legibility and to avoid code breakage as new keywords are introduced
- if (@lower_case_labels_at) {
- my $num = @lower_case_labels_at;
+ if ( $tokenizer_self->{_rlower_case_labels_at} ) {
+ my @lower_case_labels_at =
+ @{ $tokenizer_self->{_rlower_case_labels_at} };
write_logfile_entry(
"Suggest using upper case characters in label(s)\n");
local $" = ')(';
# warn if this version can't handle v-strings
my $tok = shift;
- $saw_v_string = $input_line_number;
+ unless ( $tokenizer_self->{_saw_v_string} ) {
+ $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number};
+ }
if ( $] < 5.006 ) {
warning(
"Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
my $self = shift;
+ # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
+ # $square_bracket_depth, $paren_depth
+
my $input_line = $tokenizer_self->{_line_buffer_object}->get_line();
+ $tokenizer_self->{_line_text} = $input_line;
return undef unless ($input_line);
- $tokenizer_self->{_last_line_number}++;
+ my $input_line_number = ++$tokenizer_self->{_last_line_number};
# Find and remove what characters terminate this line, including any
# control r
# for backwards compatability we keep the line text terminated with
# a newline character
$input_line .= "\n";
-
- my $input_line_number = $tokenizer_self->{_last_line_number};
+ $tokenizer_self->{_line_text} = $input_line; # update
# create a data structure describing this line which will be
# returned to the caller.
_rci_levels => undef,
_rnesting_blocks => undef,
_python_indentation_level => -1, ## 0,
- _starting_in_quote =>
- ( $tokenizer_self->{_in_quote} && ( $quote_type eq 'Q' ) ),
+ _starting_in_quote => 0, # to be set by subroutine
_ending_in_quote => 0,
_curly_brace_depth => $brace_depth,
_square_bracket_depth => $square_bracket_depth,
my $candidate_target = $input_line;
chomp $candidate_target;
if ( $candidate_target eq $here_doc_target ) {
- $nearly_matched_here_target_at = undef;
- $line_of_tokens->{_line_type} = 'HERE_END';
+ $tokenizer_self->{_nearly_matched_here_target_at} = undef;
+ $line_of_tokens->{_line_type} = 'HERE_END';
write_logfile_entry("Exiting HERE document $here_doc_target\n");
my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
if (@$rhere_target_list) { # there can be multiple here targets
( $here_doc_target, $here_quote_character ) =
@{ shift @$rhere_target_list };
- $tokenizer_self->{_here_doc_target} = $here_doc_target;
+ $tokenizer_self->{_here_doc_target} = $here_doc_target;
$tokenizer_self->{_here_quote_character} =
$here_quote_character;
write_logfile_entry(
"Entering HERE document $here_doc_target\n");
- $nearly_matched_here_target_at = undef;
- $started_looking_for_here_target_at = $input_line_number;
+ $tokenizer_self->{_nearly_matched_here_target_at} = undef;
+ $tokenizer_self->{_started_looking_for_here_target_at} =
+ $input_line_number;
}
else {
$tokenizer_self->{_in_here_doc} = 0;
$candidate_target =~ s/\s*$//;
$candidate_target =~ s/^\s*//;
if ( $candidate_target eq $here_doc_target ) {
- $nearly_matched_here_target_at = $input_line_number;
+ $tokenizer_self->{_nearly_matched_here_target_at} =
+ $input_line_number;
}
}
return $line_of_tokens;
my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
if (@$rhere_target_list) {
- #my $here_doc_target = shift @$rhere_target_list;
my ( $here_doc_target, $here_quote_character ) =
@{ shift @$rhere_target_list };
$tokenizer_self->{_in_here_doc} = 1;
$tokenizer_self->{_here_doc_target} = $here_doc_target;
$tokenizer_self->{_here_quote_character} = $here_quote_character;
write_logfile_entry("Entering HERE document $here_doc_target\n");
- $started_looking_for_here_target_at = $input_line_number;
+ $tokenizer_self->{_started_looking_for_here_target_at} =
+ $input_line_number;
}
# NOTE: __END__ and __DATA__ statements are written unformatted
and ( $tokenizer_self->{_line_start_quote} < 0 ) )
{
- if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
+ #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
+ if (
+ ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ )
+ {
$tokenizer_self->{_line_start_quote} = $input_line_number;
- $tokenizer_self->{_quote_target} = $quote_target;
write_logfile_entry(
"Start multi-line quote or pattern ending in $quote_target\n");
}
sub find_starting_indentation_level {
+ # USES GLOBAL VARIABLES: $tokenizer_self
my $starting_level = 0;
my $know_input_tabstr = -1; # flag for find_indentation_level
sub find_indentation_level {
my ( $line, $structural_indentation_level ) = @_;
+
+ # USES GLOBAL VARIABLES: $tokenizer_self
my $level = 0;
my $msg = "";
}
else {
$columns = int $columns;
- $msg =
+ $msg =
"old indentation is unclear, using $columns $entabbed spaces\n";
}
$input_tabstr = " " x $columns;
return ( $level, $msg );
}
-sub dump_token_types {
- my $class = shift;
- my $fh = shift;
-
- # This should be the latest list of token types in use
- # adding NEW_TOKENS: add a comment here
- print $fh <<'END_OF_LIST';
-
-Here is a list of the token types currently used for lines of type 'CODE'.
-For the following tokens, the "type" of a token is just the token itself.
-
-.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
-( ) <= >= == =~ !~ != ++ -- /= x=
-... **= <<= >>= &&= ||= //= <=>
-, + - / * | % ! x ~ = \ ? : . < > ^ &
-
-The following additional token types are defined:
-
- type meaning
- b blank (white space)
- { indent: opening structural curly brace or square bracket or paren
- (code block, anonymous hash reference, or anonymous array reference)
- } outdent: right structural curly brace or square bracket or paren
- [ left non-structural square bracket (enclosing an array index)
- ] right non-structural square bracket
- ( left non-structural paren (all but a list right of an =)
- ) right non-structural parena
- L left non-structural curly brace (enclosing a key)
- R right non-structural curly brace
- ; terminal semicolon
- f indicates a semicolon in a "for" statement
- h here_doc operator <<
- # a comment
- Q indicates a quote or pattern
- q indicates a qw quote block
- k a perl keyword
- C user-defined constant or constant function (with void prototype = ())
- U user-defined function taking parameters
- G user-defined function taking block parameter (like grep/map/eval)
- M (unused, but reserved for subroutine definition name)
- P (unused, but -html uses it to label pod text)
- t type indicater such as %,$,@,*,&,sub
- w bare word (perhaps a subroutine call)
- i identifier of some type (with leading %, $, @, *, &, sub, -> )
- n a number
- v a v-string
- F a file test operator (like -e)
- Y File handle
- Z identifier in indirect object slot: may be file handle, object
- J LABEL: code block label
- j LABEL after next, last, redo, goto
- p unary +
- m unary -
- pp pre-increment operator ++
- mm pre-decrement operator --
- A : used as attribute separator
-
- Here are the '_line_type' codes used internally:
- SYSTEM - system-specific code before hash-bang line
- CODE - line of perl code (including comments)
- POD_START - line starting pod, such as '=head'
- POD - pod documentation text
- POD_END - last line of pod section, '=cut'
- HERE - text of here-document
- HERE_END - last line of here-doc (target word)
- FORMAT - format section
- FORMAT_END - last line of format section, '.'
- DATA_START - __DATA__ line
- DATA - unidentified text following __DATA__
- END_START - __END__ line
- END - unidentified text following __END__
- ERROR - we are in big trouble, probably not a perl script
-END_OF_LIST
-}
-
# This is a currently unused debug routine
sub dump_functions {
}
sub prepare_for_a_new_file {
- $saw_negative_indentation = 0;
- $id_scan_state = '';
- $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
+
+ # previous tokens needed to determine what to expect next
$last_nonblank_token = ';'; # the only possible starting state which
$last_nonblank_type = ';'; # will make a leading brace a code block
$last_nonblank_block_type = '';
- $last_nonblank_container_type = '';
- $last_nonblank_type_sequence = '';
- $last_last_nonblank_token = ';';
- $last_last_nonblank_type = ';';
- $last_last_nonblank_block_type = '';
- $last_last_nonblank_container_type = '';
- $last_last_nonblank_type_sequence = '';
- $last_nonblank_prototype = "";
- $identifier = '';
- $in_attribute_list = 0; # ATTRS
- $in_quote = 0; # flag telling if we are chasing a quote, and what kind
- $quote_type = 'Q';
- $quote_character = ""; # character we seek if chasing a quote
- $quote_pos = 0; # next character index to check for case of alphanum char
- $quote_depth = 0;
- $allowed_quote_modifiers = "";
- $paren_depth = 0;
- $brace_depth = 0;
- $square_bracket_depth = 0;
- $current_package = "main";
+
+ # scalars for remembering statement types across multiple lines
+ $statement_type = ''; # '' or 'use' or 'sub..' or 'case..'
+ $in_attribute_list = 0;
+
+ # scalars for remembering where we are in the file
+ $current_package = "main";
+ $context = UNKNOWN_CONTEXT;
+
+ # hashes used to remember function information
+ %is_constant = (); # user-defined constants
+ %is_user_function = (); # user-defined functions
+ %user_function_prototype = (); # their prototypes
+ %is_block_function = ();
+ %is_block_list_function = ();
+ %saw_function_definition = ();
+
+ # variables used to track depths of various containers
+ # and report nesting errors
+ $paren_depth = 0;
+ $brace_depth = 0;
+ $square_bracket_depth = 0;
@current_depth[ 0 .. $#closing_brace_names ] =
(0) x scalar @closing_brace_names;
@nesting_sequence_number[ 0 .. $#closing_brace_names ] =
( 0 .. $#closing_brace_names );
- @current_sequence_number = ();
-
+ @current_sequence_number = ();
$paren_type[$paren_depth] = '';
$paren_semicolon_count[$paren_depth] = 0;
+ $paren_structural_type[$brace_depth] = '';
$brace_type[$brace_depth] = ';'; # identify opening brace as code block
$brace_structural_type[$brace_depth] = '';
$brace_statement_type[$brace_depth] = "";
$brace_context[$brace_depth] = UNKNOWN_CONTEXT;
- $paren_structural_type[$brace_depth] = '';
+ $brace_package[$paren_depth] = $current_package;
$square_bracket_type[$square_bracket_depth] = '';
$square_bracket_structural_type[$square_bracket_depth] = '';
- $brace_package[$paren_depth] = $current_package;
- %is_constant = (); # user-defined constants
- %is_user_function = (); # user-defined functions
- %user_function_prototype = (); # their prototypes
- %is_block_function = ();
- %is_block_list_function = ();
- %saw_function_definition = ();
- $unexpected_error_count = 0;
- $want_paren = "";
- $context = UNKNOWN_CONTEXT;
- @slevel_stack = ();
- $ci_string_in_tokenizer = "";
- $continuation_string_in_tokenizer = "0";
- $in_statement_continuation = 0;
- @lower_case_labels_at = ();
- $saw_v_string = 0; # for warning of v-strings on older perl
- $nesting_token_string = "";
- $nesting_type_string = "";
- $nesting_block_string = '1'; # initially in a block
- $nesting_block_flag = 1;
- $nesting_list_string = '0'; # initially not in a list
- $nesting_list_flag = 0; # initially not in a list
- $nearly_matched_here_target_at = undef;
-}
-
-sub get_quote_target {
- return matching_end_token($quote_character);
-}
-
-sub get_indentation_level {
- return $level_in_tokenizer;
-}
-
-sub reset_indentation_level {
- $level_in_tokenizer = $_[0];
- $slevel_in_tokenizer = $_[0];
- push @slevel_stack, $slevel_in_tokenizer;
-}
-
-{ # begin tokenize_this_line
+
+ initialize_tokenizer_state();
+}
+
+{ # begin tokenize_this_line
use constant BRACE => 0;
use constant SQUARE_BRACKET => 1;
use constant PAREN => 2;
use constant QUESTION_COLON => 3;
+ # TV1: scalars for processing one LINE.
+ # Re-initialized on each entry to sub tokenize_this_line.
+ my (
+ $block_type, $container_type, $expecting,
+ $i, $i_tok, $input_line,
+ $input_line_number, $last_nonblank_i, $max_token_index,
+ $next_tok, $next_type, $peeked_ahead,
+ $prototype, $rhere_target_list, $rtoken_map,
+ $rtoken_type, $rtokens, $tok,
+ $type, $type_sequence,
+ );
+
+ # TV2: refs to ARRAYS for processing one LINE
+ # Re-initialized on each call.
+ my $routput_token_list = []; # stack of output token indexes
+ my $routput_token_type = []; # token types
+ my $routput_block_type = []; # types of code block
+ my $routput_container_type = []; # paren types, such as if, elsif, ..
+ my $routput_type_sequence = []; # nesting sequential number
+
+ # TV3: SCALARS for quote variables. These are initialized with a
+ # subroutine call and continually updated as lines are processed.
+ my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
+
+ # TV4: SCALARS for multi-line identifiers and
+ # statements. These are initialized with a subroutine call
+ # and continually updated as lines are processed.
+ my ( $id_scan_state, $identifier, $want_paren, );
+
+ # TV5: SCALARS for tracking indentation level.
+ # Initialized once and continually updated as lines are
+ # processed.
my (
- $block_type, $container_type, $expecting,
- $here_doc_target, $here_quote_character, $i,
- $i_tok, $last_nonblank_i, $next_tok,
- $next_type, $prototype, $rtoken_map,
- $rtoken_type, $rtokens, $tok,
- $type, $type_sequence,
+ $nesting_token_string, $nesting_type_string,
+ $nesting_block_string, $nesting_block_flag,
+ $nesting_list_string, $nesting_list_flag,
+ $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
+ $in_statement_continuation, $level_in_tokenizer,
+ $slevel_in_tokenizer, $rslevel_stack,
);
- my @output_token_list = (); # stack of output token indexes
- my @output_token_type = (); # token types
- my @output_block_type = (); # types of code block
- my @output_container_type = (); # paren types, such as if, elsif, ..
- my @output_type_sequence = (); # nesting sequential number
+ # TV6: SCALARS for remembering several previous
+ # tokens. Initialized once and continually updated as
+ # lines are processed.
+ my (
+ $last_nonblank_container_type, $last_nonblank_type_sequence,
+ $last_last_nonblank_token, $last_last_nonblank_type,
+ $last_last_nonblank_block_type, $last_last_nonblank_container_type,
+ $last_last_nonblank_type_sequence, $last_nonblank_prototype,
+ );
+
+ # ----------------------------------------------------------------
+ # beginning of tokenizer variable access and manipulation routines
+ # ----------------------------------------------------------------
+
+ sub initialize_tokenizer_state {
+
+ # TV1: initialized on each call
+ # TV2: initialized on each call
+ # TV3:
+ $in_quote = 0;
+ $quote_type = 'Q';
+ $quote_character = "";
+ $quote_pos = 0;
+ $quote_depth = 0;
+ $quoted_string_1 = "";
+ $quoted_string_2 = "";
+ $allowed_quote_modifiers = "";
+
+ # TV4:
+ $id_scan_state = '';
+ $identifier = '';
+ $want_paren = "";
+
+ # TV5:
+ $nesting_token_string = "";
+ $nesting_type_string = "";
+ $nesting_block_string = '1'; # initially in a block
+ $nesting_block_flag = 1;
+ $nesting_list_string = '0'; # initially not in a list
+ $nesting_list_flag = 0; # initially not in a list
+ $ci_string_in_tokenizer = "";
+ $continuation_string_in_tokenizer = "0";
+ $in_statement_continuation = 0;
+ $level_in_tokenizer = 0;
+ $slevel_in_tokenizer = 0;
+ $rslevel_stack = [];
+
+ # TV6:
+ $last_nonblank_container_type = '';
+ $last_nonblank_type_sequence = '';
+ $last_last_nonblank_token = ';';
+ $last_last_nonblank_type = ';';
+ $last_last_nonblank_block_type = '';
+ $last_last_nonblank_container_type = '';
+ $last_last_nonblank_type_sequence = '';
+ $last_nonblank_prototype = "";
+ }
+
+ sub save_tokenizer_state {
+
+ my $rTV1 = [
+ $block_type, $container_type, $expecting,
+ $i, $i_tok, $input_line,
+ $input_line_number, $last_nonblank_i, $max_token_index,
+ $next_tok, $next_type, $peeked_ahead,
+ $prototype, $rhere_target_list, $rtoken_map,
+ $rtoken_type, $rtokens, $tok,
+ $type, $type_sequence,
+ ];
+
+ my $rTV2 = [
+ $routput_token_list, $routput_token_type,
+ $routput_block_type, $routput_container_type,
+ $routput_type_sequence,
+ ];
+
+ my $rTV3 = [
+ $in_quote, $quote_type,
+ $quote_character, $quote_pos,
+ $quote_depth, $quoted_string_1,
+ $quoted_string_2, $allowed_quote_modifiers,
+ ];
+
+ my $rTV4 = [ $id_scan_state, $identifier, $want_paren, ];
+
+ my $rTV5 = [
+ $nesting_token_string, $nesting_type_string,
+ $nesting_block_string, $nesting_block_flag,
+ $nesting_list_string, $nesting_list_flag,
+ $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
+ $in_statement_continuation, $level_in_tokenizer,
+ $slevel_in_tokenizer, $rslevel_stack,
+ ];
+
+ my $rTV6 = [
+ $last_nonblank_container_type,
+ $last_nonblank_type_sequence,
+ $last_last_nonblank_token,
+ $last_last_nonblank_type,
+ $last_last_nonblank_block_type,
+ $last_last_nonblank_container_type,
+ $last_last_nonblank_type_sequence,
+ $last_nonblank_prototype,
+ ];
+ return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
+ }
+
+ sub restore_tokenizer_state {
+ my ($rstate) = @_;
+ my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
+ (
+ $block_type, $container_type, $expecting,
+ $i, $i_tok, $input_line,
+ $input_line_number, $last_nonblank_i, $max_token_index,
+ $next_tok, $next_type, $peeked_ahead,
+ $prototype, $rhere_target_list, $rtoken_map,
+ $rtoken_type, $rtokens, $tok,
+ $type, $type_sequence,
+ ) = @{$rTV1};
+
+ (
+ $routput_token_list, $routput_token_type,
+ $routput_block_type, $routput_container_type,
+ $routput_type_sequence,
+ ) = @{$rTV2};
+
+ (
+ $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
+ ) = @{$rTV3};
+
+ ( $id_scan_state, $identifier, $want_paren, ) = @{$rTV4};
+
+ (
+ $nesting_token_string, $nesting_type_string,
+ $nesting_block_string, $nesting_block_flag,
+ $nesting_list_string, $nesting_list_flag,
+ $ci_string_in_tokenizer, $continuation_string_in_tokenizer,
+ $in_statement_continuation, $level_in_tokenizer,
+ $slevel_in_tokenizer, $rslevel_stack,
+ ) = @{$rTV5};
+
+ (
+ $last_nonblank_container_type,
+ $last_nonblank_type_sequence,
+ $last_last_nonblank_token,
+ $last_last_nonblank_type,
+ $last_last_nonblank_block_type,
+ $last_last_nonblank_container_type,
+ $last_last_nonblank_type_sequence,
+ $last_nonblank_prototype,
+ ) = @{$rTV6};
+ }
+
+ sub get_indentation_level {
+ return $level_in_tokenizer;
+ }
+
+ sub reset_indentation_level {
+ $level_in_tokenizer = $_[0];
+ $slevel_in_tokenizer = $_[0];
+ push @{$rslevel_stack}, $slevel_in_tokenizer;
+ }
+
+ sub peeked_ahead {
+ $peeked_ahead = defined( $_[0] ) ? $_[0] : $peeked_ahead;
+ }
- my @here_target_list = (); # list of here-doc target strings
+ # ------------------------------------------------------------
+ # end of tokenizer variable access and manipulation routines
+ # ------------------------------------------------------------
# ------------------------------------------------------------
- # beginning of various scanner interfaces to simplify coding
+ # beginning of various scanner interface routines
# ------------------------------------------------------------
+ sub scan_replacement_text {
+
+ # check for here-docs in replacement text invoked by
+ # a substitution operator with executable modifier 'e'.
+ #
+ # given:
+ # $replacement_text
+ # return:
+ # $rht = reference to any here-doc targets
+ my ($replacement_text) = @_;
+
+ # quick check
+ return undef unless ( $replacement_text =~ /<</ );
+
+ write_logfile_entry("scanning replacement text for here-doc targets\n");
+
+ # save the logger object for error messages
+ my $logger_object = $tokenizer_self->{_logger_object};
+
+ # localize all package variables
+ local (
+ $tokenizer_self, $last_nonblank_token,
+ $last_nonblank_type, $last_nonblank_block_type,
+ $statement_type, $in_attribute_list,
+ $current_package, $context,
+ %is_constant, %is_user_function,
+ %user_function_prototype, %is_block_function,
+ %is_block_list_function, %saw_function_definition,
+ $brace_depth, $paren_depth,
+ $square_bracket_depth, @current_depth,
+ @nesting_sequence_number, @current_sequence_number,
+ @paren_type, @paren_semicolon_count,
+ @paren_structural_type, @brace_type,
+ @brace_structural_type, @brace_statement_type,
+ @brace_context, @brace_package,
+ @square_bracket_type, @square_bracket_structural_type,
+ @depth_array, @starting_line_of_current_depth,
+ );
+
+ # save all lexical variables
+ my $rstate = save_tokenizer_state();
+ _decrement_count(); # avoid error check for multiple tokenizers
+
+ # make a new tokenizer
+ my $rOpts = {};
+ my $rpending_logfile_message;
+ my $source_object =
+ Perl::Tidy::LineSource->new( \$replacement_text, $rOpts,
+ $rpending_logfile_message );
+ my $tokenizer = Perl::Tidy::Tokenizer->new(
+ source_object => $source_object,
+ logger_object => $logger_object,
+ starting_line_number => $input_line_number,
+ );
+
+ # scan the replacement text
+ 1 while ( $tokenizer->get_line() );
+
+ # remove any here doc targets
+ my $rht = undef;
+ if ( $tokenizer_self->{_in_here_doc} ) {
+ $rht = [];
+ push @{$rht},
+ [
+ $tokenizer_self->{_here_doc_target},
+ $tokenizer_self->{_here_quote_character}
+ ];
+ if ( $tokenizer_self->{_rhere_target_list} ) {
+ push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} };
+ $tokenizer_self->{_rhere_target_list} = undef;
+ }
+ $tokenizer_self->{_in_here_doc} = undef;
+ }
+
+ # now its safe to report errors
+ $tokenizer->report_tokenization_errors();
+
+ # restore all tokenizer lexical variables
+ restore_tokenizer_state($rstate);
+
+ # return the here doc targets
+ return $rht;
+ }
+
sub scan_bare_identifier {
( $i, $tok, $type, $prototype ) =
scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
- $rtoken_map );
+ $rtoken_map, $max_token_index );
}
sub scan_identifier {
( $i, $tok, $type, $id_scan_state, $identifier ) =
- scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens );
+ scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
+ $max_token_index );
}
sub scan_id {
( $i, $tok, $type, $id_scan_state ) =
scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
- $id_scan_state );
+ $id_scan_state, $max_token_index );
}
- my $number;
-
sub scan_number {
+ my $number;
( $i, $type, $number ) =
- scan_number_do( $input_line, $i, $rtoken_map, $type );
+ scan_number_do( $input_line, $i, $rtoken_map, $type,
+ $max_token_index );
+ return $number;
}
# a sub to warn if token found where term expected
sub error_if_expecting_TERM {
if ( $expecting == TERM ) {
if ( $really_want_term{$last_nonblank_type} ) {
- unexpected( $tok, "term", $i_tok, $last_nonblank_i );
+ unexpected( $tok, "term", $i_tok, $last_nonblank_i, $rtoken_map,
+ $rtoken_type, $input_line );
1;
}
}
sub error_if_expecting_OPERATOR {
if ( $expecting == OPERATOR ) {
my $thing = defined $_[0] ? $_[0] : $tok;
- unexpected( $thing, "operator", $i_tok, $last_nonblank_i );
+ unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
+ $rtoken_map, $rtoken_type, $input_line );
if ( $i_tok == 0 ) {
interrupt_logfile();
warning("Missing ';' above?\n");
## '||=' => undef,
## '//=' => undef,
## '~' => undef,
+## '~~' => undef,
'>' => sub {
error_if_expecting_TERM()
# error; for example, we might have a constant pi and
# invoke it with pi() or just pi;
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens,
+ $max_token_index );
if ( $next_nonblank_token ne ')' ) {
my $hint;
error_if_expecting_OPERATOR('(');
} ## end if ( $expecting == OPERATOR...
}
$paren_type[$paren_depth] = $container_type;
- $type_sequence = increase_nesting_depth( PAREN, $i_tok );
+ $type_sequence =
+ increase_nesting_depth( PAREN, $$rtoken_map[$i_tok] );
# propagate types down through nested parens
# for example: the second paren in 'if ((' would be structural
},
')' => sub {
- $type_sequence = decrease_nesting_depth( PAREN, $i_tok );
+ $type_sequence =
+ decrease_nesting_depth( PAREN, $$rtoken_map[$i_tok] );
if ( $paren_structural_type[$paren_depth] eq '{' ) {
$type = '}';
if ( $expecting == UNKNOWN ) { # indeterminte, must guess..
my $msg;
( $is_pattern, $msg ) =
- guess_if_pattern_or_division( $i, $rtokens, $rtoken_map );
+ guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
+ $max_token_index );
if ($msg) {
write_diagnostics("DIVIDE:$msg\n");
$type = $tok;
}
- #DEBUG - collecting info on what tokens follow a divide
- # for development of guessing algorithm
- #if ( numerator_expected( $i, $rtokens ) < 0 ) {
- # #write_diagnostics( "DIVIDE? $input_line\n" );
- #}
+ #DEBUG - collecting info on what tokens follow a divide
+ # for development of guessing algorithm
+ #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
+ # #write_diagnostics( "DIVIDE? $input_line\n" );
+ #}
}
},
'{' => sub {
# which will be blank for an anonymous hash
else {
- $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type );
+ $block_type =
+ code_block_type( $i_tok, $rtokens, $rtoken_type,
+ $max_token_index );
# patch to promote bareword type to function taking block
if ( $block_type
&& $last_nonblank_type eq 'w'
&& $last_nonblank_i >= 0 )
{
- if ( $output_token_type[$last_nonblank_i] eq 'w' ) {
- $output_token_type[$last_nonblank_i] = 'G';
+ if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
+ $routput_token_type->[$last_nonblank_i] = 'G';
}
}
}
$brace_type[ ++$brace_depth ] = $block_type;
$brace_package[$brace_depth] = $current_package;
- $type_sequence = increase_nesting_depth( BRACE, $i_tok );
+ $type_sequence =
+ increase_nesting_depth( BRACE, $$rtoken_map[$i_tok] );
$brace_structural_type[$brace_depth] = $type;
$brace_context[$brace_depth] = $context;
$brace_statement_type[$brace_depth] = $statement_type;
# can happen on brace error (caught elsewhere)
else {
}
- $type_sequence = decrease_nesting_depth( BRACE, $i_tok );
+ $type_sequence =
+ decrease_nesting_depth( BRACE, $$rtoken_map[$i_tok] );
if ( $brace_structural_type[$brace_depth] eq 'L' ) {
$type = 'R';
if ( $expecting != OPERATOR ) {
( $i, $type ) =
find_angle_operator_termination( $input_line, $i, $rtoken_map,
- $expecting );
+ $expecting, $max_token_index );
}
else {
my $msg;
( $is_pattern, $msg ) =
- guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map );
+ guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
+ $max_token_index );
if ($msg) { write_logfile_entry($msg) }
}
$allowed_quote_modifiers = '[cgimosx]'; # TBD:check this
}
else {
-
$type_sequence =
- increase_nesting_depth( QUESTION_COLON, $i_tok );
+ increase_nesting_depth( QUESTION_COLON,
+ $$rtoken_map[$i_tok] );
}
},
'*' => sub { # typeglob, or multiply?
# otherwise, it should be part of a ?/: operator
else {
$type_sequence =
- decrease_nesting_depth( QUESTION_COLON, $i_tok );
+ decrease_nesting_depth( QUESTION_COLON,
+ $$rtoken_map[$i_tok] );
if ( $last_nonblank_token eq '?' ) {
warning("Syntax error near ? :\n");
}
'+' => sub { # what kind of plus?
if ( $expecting == TERM ) {
- scan_number();
+ my $number = scan_number();
# unary plus is safest assumption if not a number
if ( !defined($number) ) { $type = 'p'; }
'[' => sub {
$square_bracket_type[ ++$square_bracket_depth ] =
$last_nonblank_token;
- $type_sequence = increase_nesting_depth( SQUARE_BRACKET, $i_tok );
+ $type_sequence =
+ increase_nesting_depth( SQUARE_BRACKET, $$rtoken_map[$i_tok] );
# It may seem odd, but structural square brackets have
# type '{' and '}'. This simplifies the indentation logic.
$square_bracket_structural_type[$square_bracket_depth] = $type;
},
']' => sub {
- $type_sequence = decrease_nesting_depth( SQUARE_BRACKET, $i_tok );
+ $type_sequence =
+ decrease_nesting_depth( SQUARE_BRACKET, $$rtoken_map[$i_tok] );
if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
{
$type = 'F';
}
elsif ( $expecting == TERM ) {
- scan_number();
+ my $number = scan_number();
# maybe part of bareword token? unary is safest
if ( !defined($number) ) { $type = 'm'; }
; # here-doc not possible if end of line
if ( $expecting != OPERATOR ) {
- my ($found_target);
- ( $found_target, $here_doc_target, $here_quote_character, $i ) =
- find_here_doc( $expecting, $i, $rtokens, $rtoken_map );
+ my ( $found_target, $here_doc_target, $here_quote_character,
+ $saw_error );
+ (
+ $found_target, $here_doc_target, $here_quote_character, $i,
+ $saw_error
+ )
+ = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
+ $max_token_index );
if ($found_target) {
- push @here_target_list,
+ push @{$rhere_target_list},
[ $here_doc_target, $here_quote_character ];
$type = 'h';
if ( length($here_doc_target) > 80 ) {
}
}
elsif ( $expecting == TERM ) {
+ unless ($saw_error) {
- # shouldn't happen..
- warning("Program bug; didn't find here doc target\n");
- report_definite_bug();
+ # shouldn't happen..
+ warning("Program bug; didn't find here doc target\n");
+ report_definite_bug();
+ }
}
}
else {
if ( $expecting == TERM ) { $type = 'pp' }
elsif ( $expecting == UNKNOWN ) {
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
}
},
if ( $expecting == TERM ) { $type = 'mm' }
elsif ( $expecting == UNKNOWN ) {
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
}
},
# *, then run diff between the output of the previous version and the
# current version.
#
+ # *. For another example, search for the smartmatch operator '~~'
+ # with your editor to see where updates were made for it.
+ #
# -----------------------------------------------------------------------
my $line_of_tokens = shift;
# extract line number for use in error messages
$input_line_number = $line_of_tokens->{_line_number};
+ # reinitialize for multi-line quote
+ $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
+
# check for pod documentation
if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
$input_line =~ s/^\s*//; # trim left end
}
+ # update the copy of the line for use in error messages
+ # This must be exactly what we give the pre_tokenizer
+ $tokenizer_self->{_line_text} = $input_line;
+
# re-initialize for the main loop
- @output_token_list = (); # stack of output token indexes
- @output_token_type = (); # token types
- @output_block_type = (); # types of code block
- @output_container_type = (); # paren types, such as if, elsif, ..
- @output_type_sequence = (); # nesting sequential number
+ $routput_token_list = []; # stack of output token indexes
+ $routput_token_type = []; # token types
+ $routput_block_type = []; # types of code block
+ $routput_container_type = []; # paren types, such as if, elsif, ..
+ $routput_type_sequence = []; # nesting sequential number
+
+ $rhere_target_list = [];
$tok = $last_nonblank_token;
$type = $last_nonblank_type;
$block_type = $last_nonblank_block_type;
$container_type = $last_nonblank_container_type;
$type_sequence = $last_nonblank_type_sequence;
- @here_target_list = (); # list of here-doc target strings
-
- $peeked_ahead = 0;
+ $peeked_ahead = 0;
# tokenization is done in two stages..
# stage 1 is a very simple pre-tokenization
}
# start by breaking the line into pre-tokens
- ( $rpretokens, $rpretoken_map, $rpretoken_type ) =
+ ( $rtokens, $rtoken_map, $rtoken_type ) =
pre_tokenize( $input_line, $max_tokens_wanted );
- $max_token_index = scalar(@$rpretokens) - 1;
- push( @$rpretokens, ' ', ' ', ' ' ); # extra whitespace simplifies logic
- push( @$rpretoken_map, 0, 0, 0 ); # shouldn't be referenced
- push( @$rpretoken_type, 'b', 'b', 'b' );
-
- # temporary copies while coding change is underway
- ( $rtokens, $rtoken_map, $rtoken_type ) =
- ( $rpretokens, $rpretoken_map, $rpretoken_type );
+ $max_token_index = scalar(@$rtokens) - 1;
+ push( @$rtokens, ' ', ' ', ' ' ); # extra whitespace simplifies logic
+ push( @$rtoken_map, 0, 0, 0 ); # shouldn't be referenced
+ push( @$rtoken_type, 'b', 'b', 'b' );
# initialize for main loop
for $i ( 0 .. $max_token_index + 3 ) {
- $output_token_type[$i] = "";
- $output_block_type[$i] = "";
- $output_container_type[$i] = "";
- $output_type_sequence[$i] = "";
+ $routput_token_type->[$i] = "";
+ $routput_block_type->[$i] = "";
+ $routput_container_type->[$i] = "";
+ $routput_type_sequence->[$i] = "";
}
$i = -1;
$i_tok = -1;
if ($in_quote) { # continue looking for end of a quote
$type = $quote_type;
- unless (@output_token_list) { # initialize if continuation line
- push( @output_token_list, $i );
- $output_token_type[$i] = $type;
+ unless ( @{$routput_token_list} )
+ { # initialize if continuation line
+ push( @{$routput_token_list}, $i );
+ $routput_token_type->[$i] = $type;
}
$tok = $quote_character unless ( $quote_character =~ /^\s*$/ );
# scan for the end of the quote or pattern
- ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) =
- do_quote( $i, $in_quote, $quote_character, $quote_pos,
- $quote_depth, $rtokens, $rtoken_map );
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string_1, $quoted_string_2
+ )
+ = do_quote(
+ $i, $in_quote, $quote_character,
+ $quote_pos, $quote_depth, $quoted_string_1,
+ $quoted_string_2, $rtokens, $rtoken_map,
+ $max_token_index
+ );
# all done if we didn't find it
last if ($in_quote);
+ # save pattern and replacement text for rescanning
+ my $qs1 = $quoted_string_1;
+ my $qs2 = $quoted_string_2;
+
# re-initialize for next search
$quote_character = '';
$quote_pos = 0;
$quote_type = 'Q';
+ $quoted_string_1 = "";
+ $quoted_string_2 = "";
last if ( ++$i > $max_token_index );
# look for any modifiers
# check for exact quote modifiers
if ( $$rtokens[$i] =~ /^[A-Za-z_]/ ) {
my $str = $$rtokens[$i];
- while ( $str =~ /\G$allowed_quote_modifiers/gc ) { }
+ my $saw_modifier_e;
+ while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
+ my $pos = pos($str);
+ my $char = substr( $str, $pos - 1, 1 );
+ $saw_modifier_e ||= ( $char eq 'e' );
+ }
+
+ # For an 'e' quote modifier we must scan the replacement
+ # text for here-doc targets.
+ if ($saw_modifier_e) {
+
+ my $rht = scan_replacement_text($qs1);
+
+ # Change type from 'Q' to 'h' for quotes with
+ # here-doc targets so that the formatter (see sub
+ # print_line_of_tokens) will not make any line
+ # breaks after this point.
+ if ($rht) {
+ push @{$rhere_target_list}, @{$rht};
+ $type = 'h';
+ if ( $i_tok < 0 ) {
+ my $ilast = $routput_token_list->[-1];
+ $routput_token_type->[$ilast] = $type;
+ }
+ }
+ }
if ( defined( pos($str) ) ) {
}
}
- $last_last_nonblank_token = $last_nonblank_token;
- $last_last_nonblank_type = $last_nonblank_type;
- $last_last_nonblank_block_type = $last_nonblank_block_type;
+ $last_last_nonblank_token = $last_nonblank_token;
+ $last_last_nonblank_type = $last_nonblank_type;
+ $last_last_nonblank_block_type = $last_nonblank_block_type;
$last_last_nonblank_container_type =
$last_nonblank_container_type;
$last_last_nonblank_type_sequence =
# store previous token type
if ( $i_tok >= 0 ) {
- $output_token_type[$i_tok] = $type;
- $output_block_type[$i_tok] = $block_type;
- $output_container_type[$i_tok] = $container_type;
- $output_type_sequence[$i_tok] = $type_sequence;
+ $routput_token_type->[$i_tok] = $type;
+ $routput_block_type->[$i_tok] = $block_type;
+ $routput_container_type->[$i_tok] = $container_type;
+ $routput_type_sequence->[$i_tok] = $type_sequence;
}
my $pre_tok = $$rtokens[$i]; # get the next pre-token
my $pre_type = $$rtoken_type[$i]; # and type
$i_tok = $i;
# this pre-token will start an output token
- push( @output_token_list, $i_tok );
+ push( @{$routput_token_list}, $i_tok );
# continue gathering identifier if necessary
# but do not start on blanks and comments
if ( $pre_type eq 'w' ) {
$expecting = operator_expected( $prev_type, $tok, $next_type );
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
# ATTRS: handle sub and variable attributes
if ($in_attribute_list) {
$type = 'C';
}
elsif ( $is_user_function{$current_package}{$tok} ) {
- $type = 'U';
+ $type = 'U';
$prototype =
$user_function_prototype{$current_package}{$tok};
}
elsif ( $tok =~ /^v\d+$/ ) {
$type = 'v';
- unless ($saw_v_string) { report_v_string($tok) }
+ report_v_string($tok);
}
else { $type = 'w' }
{
scan_bare_identifier();
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens,
+ $max_token_index );
if ($next_nonblank_token) {
)
{
if ( $tok !~ /A-Z/ ) {
- push @lower_case_labels_at, $input_line_number;
+ push @{ $tokenizer_self->{_rlower_case_labels_at} },
+ $input_line_number;
}
$type = 'J';
$tok .= ':';
$type = 'U';
}
- # mark bare words following a file test operator as
- # something that will expect an operator next.
- # patch 072901: unless followed immediately by a paren,
- # in which case it must be a function call (pid.t)
- if ( $last_nonblank_type eq 'F' && $next_tok ne '(' ) {
- $type = 'C';
+ # underscore after file test operator is file handle
+ if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
+ $type = 'Z';
}
# patch for SWITCH/CASE if 'case' and 'when are
$expecting = operator_expected( $prev_type, $tok, $next_type );
error_if_expecting_OPERATOR("Number")
if ( $expecting == OPERATOR );
- scan_number();
+ my $number = scan_number();
if ( !defined($number) ) {
# shouldn't happen - we should always get a number
# -----------------------------
if ( $i_tok >= 0 ) {
- $output_token_type[$i_tok] = $type;
- $output_block_type[$i_tok] = $block_type;
- $output_container_type[$i_tok] = $container_type;
- $output_type_sequence[$i_tok] = $type_sequence;
+ $routput_token_type->[$i_tok] = $type;
+ $routput_block_type->[$i_tok] = $block_type;
+ $routput_container_type->[$i_tok] = $container_type;
+ $routput_type_sequence->[$i_tok] = $type_sequence;
}
unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
# indentation level, if it is is appropriate for list formatting.
# If so, continuation indentation is used to indent long list items.
# $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
-# @slevel_stack = a stack of total nesting depths at each
+# @{$rslevel_stack} = a stack of total nesting depths at each
# structural indentation level, where "total nesting depth" means
# the nesting depth that would occur if every nesting token -- '{', '[',
# and '(' -- , regardless of context, is used to compute a nesting
$nesting_list_string_i, $nesting_token_string_i,
$nesting_type_string_i, );
- foreach $i (@output_token_list) { # scan the list of pre-tokens indexes
+ foreach $i ( @{$routput_token_list} )
+ { # scan the list of pre-tokens indexes
# self-checking for valid token types
- my $type = $output_token_type[$i];
+ my $type = $routput_token_type->[$i];
my $tok = $$rtokens[$i]; # the token, but ONLY if same as pretoken
$level_i = $level_in_tokenizer;
$container_environment =
$nesting_block_flag ? 'BLOCK'
: $nesting_list_flag ? 'LIST'
- : "";
+ : "";
# if the difference between total nesting levels is not 1,
# there are intervening non-structural nesting types between
# this '{' and the previous unclosed '{'
my $intervening_secondary_structure = 0;
- if (@slevel_stack) {
+ if ( @{$rslevel_stack} ) {
$intervening_secondary_structure =
- $slevel_in_tokenizer - $slevel_stack[-1];
+ $slevel_in_tokenizer - $rslevel_stack->[-1];
}
# =head1 Continuation Indentation
# variable.
# save the current states
- push( @slevel_stack, 1 + $slevel_in_tokenizer );
+ push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
$level_in_tokenizer++;
- if ( $output_block_type[$i] ) {
+ if ( $routput_block_type->[$i] ) {
$nesting_block_flag = 1;
$nesting_block_string .= '1';
}
# we will use continuation indentation within containers
# which are not blocks and not logical expressions
my $bit = 0;
- if ( !$output_block_type[$i] ) {
+ if ( !$routput_block_type->[$i] ) {
# propagate flag down at nested open parens
- if ( $output_container_type[$i] eq '(' ) {
+ if ( $routput_container_type->[$i] eq '(' ) {
$bit = 1 if $nesting_list_flag;
}
else {
$bit = 1
unless
- $is_logical_container{ $output_container_type[$i] };
+ $is_logical_container{ $routput_container_type->[$i]
+ };
}
}
$nesting_list_string .= $bit;
my $total_ci = $ci_string_sum;
if (
- !$output_block_type[$i] # patch: skip for BLOCK
+ !$routput_block_type->[$i] # patch: skip for BLOCK
&& ($in_statement_continuation)
)
{
elsif ( $type eq '}' || $type eq 'R' ) {
# only a nesting error in the script would prevent popping here
- if ( @slevel_stack > 1 ) { pop(@slevel_stack); }
+ if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
$level_i = --$level_in_tokenizer;
# zero continuation flag at terminal BLOCK '}' which
# ends a statement.
- if ( $output_block_type[$i] ) {
+ if ( $routput_block_type->[$i] ) {
# ...These include non-anonymous subs
# note: could be sub ::abc { or sub 'abc
- if ( $output_block_type[$i] =~ m/^sub\s*/gc ) {
+ if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
# note: older versions of perl require the /gc modifier
# here or else the \G does not work.
- if ( $output_block_type[$i] =~ /\G('|::|\w)/gc ) {
+ if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
+ {
$in_statement_continuation = 0;
}
}
# block prototypes and these: (sort|grep|map|do|eval)
# /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
elsif (
- $is_zero_continuation_block_type{ $output_block_type
- [$i] } )
+ $is_zero_continuation_block_type{
+ $routput_block_type->[$i] } )
{
$in_statement_continuation = 0;
}
# /^(sort|grep|map|do|eval)$/ )
elsif (
$is_not_zero_continuation_block_type{
- $output_block_type[$i] } )
+ $routput_block_type->[$i] } )
{
}
# ..and a block introduced by a label
# /^\w+\s*:$/gc ) {
- elsif ( $output_block_type[$i] =~ /:$/ ) {
+ elsif ( $routput_block_type->[$i] =~ /:$/ ) {
$in_statement_continuation = 0;
}
- # ..nor user function with block prototype
+ # user function with block prototype
else {
+ $in_statement_continuation = 0;
}
}
# );
elsif ( $tok eq ')' ) {
$in_statement_continuation = 1
- if $output_container_type[$i] =~ /^[;,\{\}]$/;
+ if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
}
}
$container_environment =
$nesting_block_flag ? 'BLOCK'
: $nesting_list_flag ? 'LIST'
- : "";
+ : "";
$ci_string_i = $ci_string_sum + $in_statement_continuation;
$nesting_block_string_i = $nesting_block_string;
$nesting_list_string_i = $nesting_list_string;
$container_environment =
$nesting_block_flag ? 'BLOCK'
: $nesting_list_flag ? 'LIST'
- : "";
+ : "";
# zero the continuation indentation at certain tokens so
# that they will be at the same level as its container. For
}
if ( $level_in_tokenizer < 0 ) {
- unless ($saw_negative_indentation) {
- $saw_negative_indentation = 1;
+ unless ( $tokenizer_self->{_saw_negative_indentation} ) {
+ $tokenizer_self->{_saw_negative_indentation} = 1;
warning("Starting negative indentation\n");
}
}
}
}
- push( @block_type, $output_block_type[$i] );
+ push( @block_type, $routput_block_type->[$i] );
push( @ci_string, $ci_string_i );
push( @container_environment, $container_environment );
- push( @container_type, $output_container_type[$i] );
+ push( @container_type, $routput_container_type->[$i] );
push( @levels, $level_i );
push( @nesting_tokens, $nesting_token_string_i );
push( @nesting_types, $nesting_type_string_i );
push( @slevels, $slevel_i );
push( @token_type, $fix_type );
- push( @type_sequence, $output_type_sequence[$i] );
+ push( @type_sequence, $routput_type_sequence->[$i] );
push( @nesting_blocks, $nesting_block_string );
push( @nesting_lists, $nesting_list_string );
$tokenizer_self->{_in_attribute_list} = $in_attribute_list;
$tokenizer_self->{_in_quote} = $in_quote;
- $tokenizer_self->{_rhere_target_list} = \@here_target_list;
+ $tokenizer_self->{_quote_target} =
+ $in_quote ? matching_end_token($quote_character) : "";
+ $tokenizer_self->{_rhere_target_list} = $rhere_target_list;
$line_of_tokens->{_rtoken_type} = \@token_type;
$line_of_tokens->{_rtokens} = \@tokens;
}
} # end tokenize_this_line
-sub new_statement_ok {
-
- # return true if the current token can start a new statement
+#########i#############################################################
+# Tokenizer routines which assist in identifying token types
+#######################################################################
- return label_ok() # a label would be ok here
+sub operator_expected {
- || $last_nonblank_type eq 'J'; # or we follow a label
+ # Many perl symbols have two or more meanings. For example, '<<'
+ # can be a shift operator or a here-doc operator. The
+ # interpretation of these symbols depends on the current state of
+ # the tokenizer, which may either be expecting a term or an
+ # operator. For this example, a << would be a shift if an operator
+ # is expected, and a here-doc if a term is expected. This routine
+ # is called to make this decision for any current token. It returns
+ # one of three possible values:
+ #
+ # OPERATOR - operator expected (or at least, not a term)
+ # UNKNOWN - can't tell
+ # TERM - a term is expected (or at least, not an operator)
+ #
+ # The decision is based on what has been seen so far. This
+ # information is stored in the "$last_nonblank_type" and
+ # "$last_nonblank_token" variables. For example, if the
+ # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
+ # if $last_nonblank_type is 'n' (numeric), we are expecting an
+ # OPERATOR.
+ #
+ # If a UNKNOWN is returned, the calling routine must guess. A major
+ # goal of this tokenizer is to minimize the possiblity of returning
+ # UNKNOWN, because a wrong guess can spoil the formatting of a
+ # script.
+ #
+ # adding NEW_TOKENS: it is critically important that this routine be
+ # updated to allow it to determine if an operator or term is to be
+ # expected after the new token. Doing this simply involves adding
+ # the new token character to one of the regexes in this routine or
+ # to one of the hash lists
+ # that it uses, which are initialized in the BEGIN section.
+ # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
+ # $statement_type
-}
+ my ( $prev_type, $tok, $next_type ) = @_;
-sub label_ok {
+ my $op_expected = UNKNOWN;
- # Decide if a bare word followed by a colon here is a label
+#print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
- # if it follows an opening or closing code block curly brace..
- if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
- && $last_nonblank_type eq $last_nonblank_token )
- {
+# Note: function prototype is available for token type 'U' for future
+# program development. It contains the leading and trailing parens,
+# and no blanks. It might be used to eliminate token type 'C', for
+# example (prototype = '()'). Thus:
+# if ($last_nonblank_type eq 'U') {
+# print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
+# }
- # it is a label if and only if the curly encloses a code block
- return $brace_type[$brace_depth];
- }
+ # A possible filehandle (or object) requires some care...
+ if ( $last_nonblank_type eq 'Z' ) {
- # otherwise, it is a label if and only if it follows a ';'
- # (real or fake)
- else {
- return ( $last_nonblank_type eq ';' );
- }
-}
+ # angle.t
+ if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
+ $op_expected = UNKNOWN;
+ }
-sub code_block_type {
+ # For possible file handle like "$a", Perl uses weird parsing rules.
+ # For example:
+ # print $a/2,"/hi"; - division
+ # print $a / 2,"/hi"; - division
+ # print $a/ 2,"/hi"; - division
+ # print $a /2,"/hi"; - pattern (and error)!
+ elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
+ $op_expected = TERM;
+ }
- # Decide if this is a block of code, and its type.
- # Must be called only when $type = $token = '{'
- # The problem is to distinguish between the start of a block of code
- # and the start of an anonymous hash reference
- # Returns "" if not code block, otherwise returns 'last_nonblank_token'
- # to indicate the type of code block. (For example, 'last_nonblank_token'
- # might be 'if' for an if block, 'else' for an else block, etc).
+ # Note when an operation is being done where a
+ # filehandle might be expected, since a change in whitespace
+ # could change the interpretation of the statement.
+ else {
+ if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
+ complain("operator in print statement not recommended\n");
+ $op_expected = OPERATOR;
+ }
+ }
+ }
- # handle case of multiple '{'s
+ # handle something after 'do' and 'eval'
+ elsif ( $is_block_operator{$last_nonblank_token} ) {
+
+ # something like $a = eval "expression";
+ # ^
+ if ( $last_nonblank_type eq 'k' ) {
+ $op_expected = TERM; # expression or list mode following keyword
+ }
+
+ # something like $a = do { BLOCK } / 2;
+ # ^
+ else {
+ $op_expected = OPERATOR; # block mode following }
+ }
+ }
+
+ # handle bare word..
+ elsif ( $last_nonblank_type eq 'w' ) {
+
+ # unfortunately, we can't tell what type of token to expect next
+ # after most bare words
+ $op_expected = UNKNOWN;
+ }
+
+ # operator, but not term possible after these types
+ # Note: moved ')' from type to token because parens in list context
+ # get marked as '{' '}' now. This is a minor glitch in the following:
+ # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
+ #
+ elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
+ || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
+ {
+ $op_expected = OPERATOR;
+
+ # in a 'use' statement, numbers and v-strings are not true
+ # numbers, so to avoid incorrect error messages, we will
+ # mark them as unknown for now (use.t)
+ # TODO: it would be much nicer to create a new token V for VERSION
+ # number in a use statement. Then this could be a check on type V
+ # and related patches which change $statement_type for '=>'
+ # and ',' could be removed. Further, it would clean things up to
+ # scan the 'use' statement with a separate subroutine.
+ if ( ( $statement_type eq 'use' )
+ && ( $last_nonblank_type =~ /^[nv]$/ ) )
+ {
+ $op_expected = UNKNOWN;
+ }
+ }
+
+ # no operator after many keywords, such as "die", "warn", etc
+ elsif ( $expecting_term_token{$last_nonblank_token} ) {
+
+ # patch for dor.t (defined or).
+ # perl functions which may be unary operators
+ # TODO: This list is incomplete, and these should be put
+ # into a hash.
+ if ( $tok eq '/'
+ && $next_type eq '/'
+ && $last_nonblank_type eq 'k'
+ && $last_nonblank_token =~ /^eof|undef|shift|pop$/ )
+ {
+ $op_expected = OPERATOR;
+ }
+ else {
+ $op_expected = TERM;
+ }
+ }
+
+ # no operator after things like + - ** (i.e., other operators)
+ elsif ( $expecting_term_types{$last_nonblank_type} ) {
+ $op_expected = TERM;
+ }
+
+ # a few operators, like "time", have an empty prototype () and so
+ # take no parameters but produce a value to operate on
+ elsif ( $expecting_operator_token{$last_nonblank_token} ) {
+ $op_expected = OPERATOR;
+ }
+
+ # post-increment and decrement produce values to be operated on
+ elsif ( $expecting_operator_types{$last_nonblank_type} ) {
+ $op_expected = OPERATOR;
+ }
+
+ # no value to operate on after sub block
+ elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
+
+ # a right brace here indicates the end of a simple block.
+ # all non-structural right braces have type 'R'
+ # all braces associated with block operator keywords have been given those
+ # keywords as "last_nonblank_token" and caught above.
+ # (This statement is order dependent, and must come after checking
+ # $last_nonblank_token).
+ elsif ( $last_nonblank_type eq '}' ) {
+
+ # patch for dor.t (defined or).
+ if ( $tok eq '/'
+ && $next_type eq '/'
+ && $last_nonblank_token eq ']' )
+ {
+ $op_expected = OPERATOR;
+ }
+ else {
+ $op_expected = TERM;
+ }
+ }
+
+ # something else..what did I forget?
+ else {
+
+ # collecting diagnostics on unknown operator types..see what was missed
+ $op_expected = UNKNOWN;
+ write_diagnostics(
+"OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n"
+ );
+ }
+
+ TOKENIZER_DEBUG_FLAG_EXPECT && do {
+ print
+"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
+ };
+ return $op_expected;
+}
+
+sub new_statement_ok {
+
+ # return true if the current token can start a new statement
+ # USES GLOBAL VARIABLES: $last_nonblank_type
+
+ return label_ok() # a label would be ok here
+
+ || $last_nonblank_type eq 'J'; # or we follow a label
+
+}
+
+sub label_ok {
+
+ # Decide if a bare word followed by a colon here is a label
+ # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
+ # $brace_depth, @brace_type
+
+ # if it follows an opening or closing code block curly brace..
+ if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
+ && $last_nonblank_type eq $last_nonblank_token )
+ {
+
+ # it is a label if and only if the curly encloses a code block
+ return $brace_type[$brace_depth];
+ }
+
+ # otherwise, it is a label if and only if it follows a ';'
+ # (real or fake)
+ else {
+ return ( $last_nonblank_type eq ';' );
+ }
+}
+
+sub code_block_type {
+
+ # Decide if this is a block of code, and its type.
+ # Must be called only when $type = $token = '{'
+ # The problem is to distinguish between the start of a block of code
+ # and the start of an anonymous hash reference
+ # Returns "" if not code block, otherwise returns 'last_nonblank_token'
+ # to indicate the type of code block. (For example, 'last_nonblank_token'
+ # might be 'if' for an if block, 'else' for an else block, etc).
+ # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
+ # $last_nonblank_block_type, $brace_depth, @brace_type
+
+ # handle case of multiple '{'s
# print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
- my ( $i, $rtokens, $rtoken_type ) = @_;
+ my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
if ( $last_nonblank_token eq '{'
&& $last_nonblank_type eq $last_nonblank_token )
{
# opening brace where a statement may appear is probably
# a code block but might be and anonymous hash reference
if ( $brace_type[$brace_depth] ) {
- return decide_if_code_block( $i, $rtokens, $rtoken_type );
+ return decide_if_code_block( $i, $rtokens, $rtoken_type,
+ $max_token_index );
}
# cannot start a code block within an anonymous hash
# an opening brace where a statement may appear is probably
# a code block but might be and anonymous hash reference
- return decide_if_code_block( $i, $rtokens, $rtoken_type );
+ return decide_if_code_block( $i, $rtokens, $rtoken_type,
+ $max_token_index );
}
# handle case of '}{'
# a } { situation ...
# could be hash reference after code block..(blktype1.t)
if ($last_nonblank_block_type) {
- return decide_if_code_block( $i, $rtokens, $rtoken_type );
+ return decide_if_code_block( $i, $rtokens, $rtoken_type,
+ $max_token_index );
}
# must be a block if it follows a closing hash reference
# check bareword
elsif ( $last_nonblank_type eq 'w' ) {
- return decide_if_code_block( $i, $rtokens, $rtoken_type );
+ return decide_if_code_block( $i, $rtokens, $rtoken_type,
+ $max_token_index );
}
# anything else must be anonymous hash reference
sub decide_if_code_block {
- my ( $i, $rtokens, $rtoken_type ) = @_;
+ # USES GLOBAL VARIABLES: $last_nonblank_token
+ my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
# we are at a '{' where a statement may appear.
# We must decide if this brace starts an anonymous hash or a code
sub unexpected {
# report unexpected token type and show where it is
- my ( $found, $expecting, $i_tok, $last_nonblank_i ) = @_;
- $unexpected_error_count++;
- if ( $unexpected_error_count <= MAX_NAG_MESSAGES ) {
+ # USES GLOBAL VARIABLES: $tokenizer_self
+ my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
+ $rpretoken_type, $input_line )
+ = @_;
+
+ if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) {
my $msg = "found $found where $expecting expected";
my $pos = $$rpretoken_map[$i_tok];
interrupt_logfile();
+ my $input_line_number = $tokenizer_self->{_last_line_number};
my ( $offset, $numbered_line, $underline ) =
make_numbered_line( $input_line_number, $input_line, $pos );
$underline = write_on_underline( $underline, $pos - $offset, '^' );
}
}
-sub indicate_error {
- my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
- interrupt_logfile();
- warning($msg);
- write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
- resume_logfile();
-}
+sub is_non_structural_brace {
-sub write_error_indicator_pair {
- my ( $line_number, $input_line, $pos, $carrat ) = @_;
- my ( $offset, $numbered_line, $underline ) =
- make_numbered_line( $line_number, $input_line, $pos );
- $underline = write_on_underline( $underline, $pos - $offset, $carrat );
- warning( $numbered_line . "\n" );
- $underline =~ s/\s*$//;
- warning( $underline . "\n" );
-}
+ # Decide if a brace or bracket is structural or non-structural
+ # by looking at the previous token and type
+ # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
-sub make_numbered_line {
+ # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
+ # Tentatively deactivated because it caused the wrong operator expectation
+ # for this code:
+ # $user = @vars[1] / 100;
+ # Must update sub operator_expected before re-implementing.
+ # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
+ # return 0;
+ # }
- # Given an input line, its line number, and a character position of
- # interest, create a string not longer than 80 characters of the form
- # $lineno: sub_string
- # such that the sub_string of $str contains the position of interest
- #
- # Here is an example of what we want, in this case we add trailing
- # '...' because the line is long.
- #
- # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
- #
- # Here is another example, this time in which we used leading '...'
- # because of excessive length:
- #
- # 2: ... er of the World Wide Web Consortium's
- #
- # input parameters are:
- # $lineno = line number
- # $str = the text of the line
- # $pos = position of interest (the error) : 0 = first character
- #
- # We return :
- # - $offset = an offset which corrects the position in case we only
- # display part of a line, such that $pos-$offset is the effective
- # position from the start of the displayed line.
- # - $numbered_line = the numbered line as above,
- # - $underline = a blank 'underline' which is all spaces with the same
- # number of characters as the numbered line.
+ # NOTE: braces after type characters start code blocks, but for
+ # simplicity these are not identified as such. See also
+ # sub code_block_type
+ # if ($last_nonblank_type eq 't') {return 0}
- my ( $lineno, $str, $pos ) = @_;
- my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
- my $excess = length($str) - $offset - 68;
- my $numc = ( $excess > 0 ) ? 68 : undef;
+ # otherwise, it is non-structural if it is decorated
+ # by type information.
+ # For example, the '{' here is non-structural: ${xxx}
+ (
+ $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
- if ( defined($numc) ) {
- if ( $offset == 0 ) {
- $str = substr( $str, $offset, $numc - 4 ) . " ...";
- }
- else {
- $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
- }
- }
- else {
+ # or if we follow a hash or array closing curly brace or bracket
+ # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
+ # because the first '}' would have been given type 'R'
+ || $last_nonblank_type =~ /^([R\]])$/
+ );
+}
- if ( $offset == 0 ) {
- }
- else {
- $str = "... " . substr( $str, $offset + 4 );
- }
- }
+#########i#############################################################
+# Tokenizer routines for tracking container nesting depths
+#######################################################################
- my $numbered_line = sprintf( "%d: ", $lineno );
- $offset -= length($numbered_line);
- $numbered_line .= $str;
- my $underline = " " x length($numbered_line);
- return ( $offset, $numbered_line, $underline );
-}
+# The following routines keep track of nesting depths of the nesting
+# types, ( [ { and ?. This is necessary for determining the indentation
+# level, and also for debugging programs. Not only do they keep track of
+# nesting depths of the individual brace types, but they check that each
+# of the other brace types is balanced within matching pairs. For
+# example, if the program sees this sequence:
+#
+# { ( ( ) }
+#
+# then it can determine that there is an extra left paren somewhere
+# between the { and the }. And so on with every other possible
+# combination of outer and inner brace types. For another
+# example:
+#
+# ( [ ..... ] ] )
+#
+# which has an extra ] within the parens.
+#
+# The brace types have indexes 0 .. 3 which are indexes into
+# the matrices.
+#
+# The pair ? : are treated as just another nesting type, with ? acting
+# as the opening brace and : acting as the closing brace.
+#
+# The matrix
+#
+# $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
+#
+# saves the nesting depth of brace type $b (where $b is either of the other
+# nesting types) when brace type $a enters a new depth. When this depth
+# decreases, a check is made that the current depth of brace types $b is
+# unchanged, or otherwise there must have been an error. This can
+# be very useful for localizing errors, particularly when perl runs to
+# the end of a large file (such as this one) and announces that there
+# is a problem somewhere.
+#
+# A numerical sequence number is maintained for every nesting type,
+# so that each matching pair can be uniquely identified in a simple
+# way.
-sub write_on_underline {
+sub increase_nesting_depth {
+ my ( $a, $pos ) = @_;
- # The "underline" is a string that shows where an error is; it starts
- # out as a string of blanks with the same length as the numbered line of
- # code above it, and we have to add marking to show where an error is.
- # In the example below, we want to write the string '--^' just below
- # the line of bad code:
- #
- # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
- # ---^
- # We are given the current underline string, plus a position and a
- # string to write on it.
- #
- # In the above example, there will be 2 calls to do this:
- # First call: $pos=19, pos_chr=^
- # Second call: $pos=16, pos_chr=---
- #
- # This is a trivial thing to do with substr, but there is some
- # checking to do.
+ # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
+ # @current_sequence_number, @depth_array, @starting_line_of_current_depth
+ my $b;
+ $current_depth[$a]++;
+ my $input_line_number = $tokenizer_self->{_last_line_number};
+ my $input_line = $tokenizer_self->{_line_text};
- my ( $underline, $pos, $pos_chr ) = @_;
+ # Sequence numbers increment by number of items. This keeps
+ # a unique set of numbers but still allows the relative location
+ # of any type to be determined.
+ $nesting_sequence_number[$a] += scalar(@closing_brace_names);
+ my $seqno = $nesting_sequence_number[$a];
+ $current_sequence_number[$a][ $current_depth[$a] ] = $seqno;
- # check for error..shouldn't happen
- unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
- return $underline;
- }
- my $excess = length($pos_chr) + $pos - length($underline);
- if ( $excess > 0 ) {
- $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
+ $starting_line_of_current_depth[$a][ $current_depth[$a] ] =
+ [ $input_line_number, $input_line, $pos ];
+
+ for $b ( 0 .. $#closing_brace_names ) {
+ next if ( $b == $a );
+ $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
}
- substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
- return ($underline);
+ return $seqno;
}
-sub is_non_structural_brace {
-
- # Decide if a brace or bracket is structural or non-structural
- # by looking at the previous token and type
-
- # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
- # Tentatively deactivated because it caused the wrong operator expectation
- # for this code:
- # $user = @vars[1] / 100;
- # Must update sub operator_expected before re-implementing.
- # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
- # return 0;
- # }
-
- # NOTE: braces after type characters start code blocks, but for
- # simplicity these are not identified as such. See also
- # sub code_block_type
- # if ($last_nonblank_type eq 't') {return 0}
+sub decrease_nesting_depth {
- # otherwise, it is non-structural if it is decorated
- # by type information.
- # For example, the '{' here is non-structural: ${xxx}
- (
- $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
+ my ( $a, $pos ) = @_;
- # or if we follow a hash or array closing curly brace or bracket
- # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
- # because the first '}' would have been given type 'R'
- || $last_nonblank_type =~ /^([R\]])$/
- );
-}
+ # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
+ # @current_sequence_number, @depth_array, @starting_line_of_current_depth
+ my $b;
+ my $seqno = 0;
+ my $input_line_number = $tokenizer_self->{_last_line_number};
+ my $input_line = $tokenizer_self->{_line_text};
-sub operator_expected {
+ if ( $current_depth[$a] > 0 ) {
- # Many perl symbols have two or more meanings. For example, '<<'
- # can be a shift operator or a here-doc operator. The
- # interpretation of these symbols depends on the current state of
- # the tokenizer, which may either be expecting a term or an
- # operator. For this example, a << would be a shift if an operator
- # is expected, and a here-doc if a term is expected. This routine
- # is called to make this decision for any current token. It returns
- # one of three possible values:
- #
- # OPERATOR - operator expected (or at least, not a term)
- # UNKNOWN - can't tell
- # TERM - a term is expected (or at least, not an operator)
- #
- # The decision is based on what has been seen so far. This
- # information is stored in the "$last_nonblank_type" and
- # "$last_nonblank_token" variables. For example, if the
- # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
- # if $last_nonblank_type is 'n' (numeric), we are expecting an
- # OPERATOR.
- #
- # If a UNKNOWN is returned, the calling routine must guess. A major
- # goal of this tokenizer is to minimize the possiblity of returning
- # UNKNOWN, because a wrong guess can spoil the formatting of a
- # script.
- #
- # adding NEW_TOKENS: it is critically important that this routine be
- # updated to allow it to determine if an operator or term is to be
- # expected after the new token. Doing this simply involves adding
- # the new token character to one of the regexes in this routine or
- # to one of the hash lists
- # that it uses, which are initialized in the BEGIN section.
+ $seqno = $current_sequence_number[$a][ $current_depth[$a] ];
- my ( $prev_type, $tok, $next_type ) = @_;
- my $op_expected = UNKNOWN;
+ # check that any brace types $b contained within are balanced
+ for $b ( 0 .. $#closing_brace_names ) {
+ next if ( $b == $a );
-#print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
+ unless ( $depth_array[$a][$b][ $current_depth[$a] ] ==
+ $current_depth[$b] )
+ {
+ my $diff = $current_depth[$b] -
+ $depth_array[$a][$b][ $current_depth[$a] ];
-# Note: function prototype is available for token type 'U' for future
-# program development. It contains the leading and trailing parens,
-# and no blanks. It might be used to eliminate token type 'C', for
-# example (prototype = '()'). Thus:
-# if ($last_nonblank_type eq 'U') {
-# print "previous token=$last_nonblank_token type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
-# }
+ # don't whine too many times
+ my $saw_brace_error = get_saw_brace_error();
+ if (
+ $saw_brace_error <= MAX_NAG_MESSAGES
- # A possible filehandle (or object) requires some care...
- if ( $last_nonblank_type eq 'Z' ) {
+ # if too many closing types have occured, we probably
+ # already caught this error
+ && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
+ )
+ {
+ interrupt_logfile();
+ my $rsl =
+ $starting_line_of_current_depth[$a][ $current_depth[$a] ];
+ my $sl = $$rsl[0];
+ my $rel = [ $input_line_number, $input_line, $pos ];
+ my $el = $$rel[0];
+ my ($ess);
- # angle.t
- if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
- $op_expected = UNKNOWN;
- }
+ if ( $diff == 1 || $diff == -1 ) {
+ $ess = '';
+ }
+ else {
+ $ess = 's';
+ }
+ my $bname =
+ ( $diff > 0 )
+ ? $opening_brace_names[$b]
+ : $closing_brace_names[$b];
+ write_error_indicator_pair( @$rsl, '^' );
+ my $msg = <<"EOM";
+Found $diff extra $bname$ess between $opening_brace_names[$a] on line $sl and $closing_brace_names[$a] on line $el
+EOM
- # For possible file handle like "$a", Perl uses weird parsing rules.
- # For example:
- # print $a/2,"/hi"; - division
- # print $a / 2,"/hi"; - division
- # print $a/ 2,"/hi"; - division
- # print $a /2,"/hi"; - pattern (and error)!
- elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
- $op_expected = TERM;
+ if ( $diff > 0 ) {
+ my $rml =
+ $starting_line_of_current_depth[$b]
+ [ $current_depth[$b] ];
+ my $ml = $$rml[0];
+ $msg .=
+" The most recent un-matched $bname is on line $ml\n";
+ write_error_indicator_pair( @$rml, '^' );
+ }
+ write_error_indicator_pair( @$rel, '^' );
+ warning($msg);
+ resume_logfile();
+ }
+ increment_brace_error();
+ }
}
+ $current_depth[$a]--;
+ }
+ else {
- # Note when an operation is being done where a
- # filehandle might be expected, since a change in whitespace
- # could change the interpretation of the statement.
- else {
- if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
- complain("operator in print statement not recommended\n");
- $op_expected = OPERATOR;
- }
+ my $saw_brace_error = get_saw_brace_error();
+ if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
+ my $msg = <<"EOM";
+There is no previous $opening_brace_names[$a] to match a $closing_brace_names[$a] on line $input_line_number
+EOM
+ indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
}
+ increment_brace_error();
}
+ return $seqno;
+}
- # handle something after 'do' and 'eval'
- elsif ( $is_block_operator{$last_nonblank_token} ) {
+sub check_final_nesting_depths {
+ my ($a);
- # something like $a = eval "expression";
- # ^
- if ( $last_nonblank_type eq 'k' ) {
- $op_expected = TERM; # expression or list mode following keyword
- }
+ # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
- # something like $a = do { BLOCK } / 2;
- # ^
- else {
- $op_expected = OPERATOR; # block mode following }
+ for $a ( 0 .. $#closing_brace_names ) {
+
+ if ( $current_depth[$a] ) {
+ my $rsl = $starting_line_of_current_depth[$a][ $current_depth[$a] ];
+ my $sl = $$rsl[0];
+ my $msg = <<"EOM";
+Final nesting depth of $opening_brace_names[$a]s is $current_depth[$a]
+The most recent un-matched $opening_brace_names[$a] is on line $sl
+EOM
+ indicate_error( $msg, @$rsl, '^' );
+ increment_brace_error();
}
}
+}
- # handle bare word..
- elsif ( $last_nonblank_type eq 'w' ) {
+#########i#############################################################
+# Tokenizer routines for looking ahead in input stream
+#######################################################################
- # unfortunately, we can't tell what type of token to expect next
- # after most bare words
- $op_expected = UNKNOWN;
+sub peek_ahead_for_n_nonblank_pre_tokens {
+
+ # returns next n pretokens if they exist
+ # returns undef's if hits eof without seeing any pretokens
+ # USES GLOBAL VARIABLES: $tokenizer_self
+ my $max_pretokens = shift;
+ my $line;
+ my $i = 0;
+ my ( $rpre_tokens, $rmap, $rpre_types );
+
+ while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
+ {
+ $line =~ s/^\s*//; # trim leading blanks
+ next if ( length($line) <= 0 ); # skip blank
+ next if ( $line =~ /^#/ ); # skip comment
+ ( $rpre_tokens, $rmap, $rpre_types ) =
+ pre_tokenize( $line, $max_pretokens );
+ last;
}
+ return ( $rpre_tokens, $rpre_types );
+}
- # operator, but not term possible after these types
- # Note: moved ')' from type to token because parens in list context
- # get marked as '{' '}' now. This is a minor glitch in the following:
- # my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
- #
- elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
- || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
+# look ahead for next non-blank, non-comment line of code
+sub peek_ahead_for_nonblank_token {
+
+ # USES GLOBAL VARIABLES: $tokenizer_self
+ my ( $rtokens, $max_token_index ) = @_;
+ my $line;
+ my $i = 0;
+
+ while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
{
- $op_expected = OPERATOR;
+ $line =~ s/^\s*//; # trim leading blanks
+ next if ( length($line) <= 0 ); # skip blank
+ next if ( $line =~ /^#/ ); # skip comment
+ my ( $rtok, $rmap, $rtype ) =
+ pre_tokenize( $line, 2 ); # only need 2 pre-tokens
+ my $j = $max_token_index + 1;
+ my $tok;
- # in a 'use' statement, numbers and v-strings are not true
- # numbers, so to avoid incorrect error messages, we will
- # mark them as unknown for now (use.t)
- # TODO: it would be much nicer to create a new token V for VERSION
- # number in a use statement. Then this could be a check on type V
- # and related patches which change $statement_type for '=>'
- # and ',' could be removed. Further, it would clean things up to
- # scan the 'use' statement with a separate subroutine.
- if ( ( $statement_type eq 'use' )
- && ( $last_nonblank_type =~ /^[nv]$/ ) )
- {
- $op_expected = UNKNOWN;
+ foreach $tok (@$rtok) {
+ last if ( $tok =~ "\n" );
+ $$rtokens[ ++$j ] = $tok;
}
+ last;
}
+ return $rtokens;
+}
- # no operator after many keywords, such as "die", "warn", etc
- elsif ( $expecting_term_token{$last_nonblank_token} ) {
+#########i#############################################################
+# Tokenizer guessing routines for ambiguous situations
+#######################################################################
- # patch for dor.t (defined or).
- # perl functions which may be unary operators
- # TODO: This list is incomplete, and these should be put
- # into a hash.
- if ( $tok eq '/'
- && $next_type eq '/'
- && $last_nonblank_type eq 'k'
- && $last_nonblank_token =~ /^eof|undef|shift|pop$/ )
- {
- $op_expected = OPERATOR;
+sub guess_if_pattern_or_conditional {
+
+ # this routine is called when we have encountered a ? following an
+ # unknown bareword, and we must decide if it starts a pattern or not
+ # input parameters:
+ # $i - token index of the ? starting possible pattern
+ # output parameters:
+ # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
+ # msg = a warning or diagnostic message
+ # USES GLOBAL VARIABLES: $last_nonblank_token
+ my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
+ my $is_pattern = 0;
+ my $msg = "guessing that ? after $last_nonblank_token starts a ";
+
+ if ( $i >= $max_token_index ) {
+ $msg .= "conditional (no end to pattern found on the line)\n";
+ }
+ else {
+ my $ibeg = $i;
+ $i = $ibeg + 1;
+ my $next_token = $$rtokens[$i]; # first token after ?
+
+ # look for a possible ending ? on this line..
+ my $in_quote = 1;
+ my $quote_depth = 0;
+ my $quote_character = '';
+ my $quote_pos = 0;
+ my $quoted_string;
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string
+ )
+ = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
+ $quote_pos, $quote_depth, $max_token_index );
+
+ if ($in_quote) {
+
+ # we didn't find an ending ? on this line,
+ # so we bias towards conditional
+ $is_pattern = 0;
+ $msg .= "conditional (no ending ? on this line)\n";
+
+ # we found an ending ?, so we bias towards a pattern
}
else {
- $op_expected = TERM;
+
+ if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
+ $is_pattern = 1;
+ $msg .= "pattern (found ending ? and pattern expected)\n";
+ }
+ else {
+ $msg .= "pattern (uncertain, but found ending ?)\n";
+ }
}
}
+ return ( $is_pattern, $msg );
+}
- # no operator after things like + - ** (i.e., other operators)
- elsif ( $expecting_term_types{$last_nonblank_type} ) {
- $op_expected = TERM;
- }
+sub guess_if_pattern_or_division {
- # a few operators, like "time", have an empty prototype () and so
- # take no parameters but produce a value to operate on
- elsif ( $expecting_operator_token{$last_nonblank_token} ) {
- $op_expected = OPERATOR;
- }
+ # this routine is called when we have encountered a / following an
+ # unknown bareword, and we must decide if it starts a pattern or is a
+ # division
+ # input parameters:
+ # $i - token index of the / starting possible pattern
+ # output parameters:
+ # $is_pattern = 0 if probably division, =1 if probably a pattern
+ # msg = a warning or diagnostic message
+ # USES GLOBAL VARIABLES: $last_nonblank_token
+ my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
+ my $is_pattern = 0;
+ my $msg = "guessing that / after $last_nonblank_token starts a ";
- # post-increment and decrement produce values to be operated on
- elsif ( $expecting_operator_types{$last_nonblank_type} ) {
- $op_expected = OPERATOR;
+ if ( $i >= $max_token_index ) {
+ "division (no end to pattern found on the line)\n";
}
+ else {
+ my $ibeg = $i;
+ my $divide_expected =
+ numerator_expected( $i, $rtokens, $max_token_index );
+ $i = $ibeg + 1;
+ my $next_token = $$rtokens[$i]; # first token after slash
- # no value to operate on after sub block
- elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
+ # look for a possible ending / on this line..
+ my $in_quote = 1;
+ my $quote_depth = 0;
+ my $quote_character = '';
+ my $quote_pos = 0;
+ my $quoted_string;
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string
+ )
+ = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
+ $quote_pos, $quote_depth, $max_token_index );
- # a right brace here indicates the end of a simple block.
- # all non-structural right braces have type 'R'
- # all braces associated with block operator keywords have been given those
- # keywords as "last_nonblank_token" and caught above.
- # (This statement is order dependent, and must come after checking
- # $last_nonblank_token).
- elsif ( $last_nonblank_type eq '}' ) {
+ if ($in_quote) {
+
+ # we didn't find an ending / on this line,
+ # so we bias towards division
+ if ( $divide_expected >= 0 ) {
+ $is_pattern = 0;
+ $msg .= "division (no ending / on this line)\n";
+ }
+ else {
+ $msg = "multi-line pattern (division not possible)\n";
+ $is_pattern = 1;
+ }
- # patch for dor.t (defined or).
- if ( $tok eq '/'
- && $next_type eq '/'
- && $last_nonblank_token eq ']' )
- {
- $op_expected = OPERATOR;
}
+
+ # we found an ending /, so we bias towards a pattern
else {
- $op_expected = TERM;
- }
- }
- # something else..what did I forget?
- else {
+ if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
- # collecting diagnostics on unknown operator types..see what was missed
- $op_expected = UNKNOWN;
- write_diagnostics(
-"OP: unknown after type=$last_nonblank_type token=$last_nonblank_token\n"
- );
- }
+ if ( $divide_expected >= 0 ) {
- TOKENIZER_DEBUG_FLAG_EXPECT && do {
- print
-"EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
- };
- return $op_expected;
+ if ( $i - $ibeg > 60 ) {
+ $msg .= "division (matching / too distant)\n";
+ $is_pattern = 0;
+ }
+ else {
+ $msg .= "pattern (but division possible too)\n";
+ $is_pattern = 1;
+ }
+ }
+ else {
+ $is_pattern = 1;
+ $msg .= "pattern (division not possible)\n";
+ }
+ }
+ else {
+
+ if ( $divide_expected >= 0 ) {
+ $is_pattern = 0;
+ $msg .= "division (pattern not possible)\n";
+ }
+ else {
+ $is_pattern = 1;
+ $msg .=
+ "pattern (uncertain, but division would not work here)\n";
+ }
+ }
+ }
+ }
+ return ( $is_pattern, $msg );
}
-# The following routines keep track of nesting depths of the nesting
-# types, ( [ { and ?. This is necessary for determining the indentation
-# level, and also for debugging programs. Not only do they keep track of
-# nesting depths of the individual brace types, but they check that each
-# of the other brace types is balanced within matching pairs. For
-# example, if the program sees this sequence:
-#
-# { ( ( ) }
-#
-# then it can determine that there is an extra left paren somewhere
-# between the { and the }. And so on with every other possible
-# combination of outer and inner brace types. For another
-# example:
-#
-# ( [ ..... ] ] )
-#
-# which has an extra ] within the parens.
-#
-# The brace types have indexes 0 .. 3 which are indexes into
-# the matrices.
-#
-# The pair ? : are treated as just another nesting type, with ? acting
-# as the opening brace and : acting as the closing brace.
-#
-# The matrix
-#
-# $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
-#
-# saves the nesting depth of brace type $b (where $b is either of the other
-# nesting types) when brace type $a enters a new depth. When this depth
-# decreases, a check is made that the current depth of brace types $b is
-# unchanged, or otherwise there must have been an error. This can
-# be very useful for localizing errors, particularly when perl runs to
-# the end of a large file (such as this one) and announces that there
-# is a problem somewhere.
-#
-# A numerical sequence number is maintained for every nesting type,
-# so that each matching pair can be uniquely identified in a simple
-# way.
+# try to resolve here-doc vs. shift by looking ahead for
+# non-code or the end token (currently only looks for end token)
+# returns 1 if it is probably a here doc, 0 if not
+sub guess_if_here_doc {
-sub increase_nesting_depth {
- my ( $a, $i_tok ) = @_;
- my $b;
- $current_depth[$a]++;
+ # This is how many lines we will search for a target as part of the
+ # guessing strategy. It is a constant because there is probably
+ # little reason to change it.
+ # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
+ # %is_constant,
+ use constant HERE_DOC_WINDOW => 40;
- # Sequence numbers increment by number of items. This keeps
- # a unique set of numbers but still allows the relative location
- # of any type to be determined.
- $nesting_sequence_number[$a] += scalar(@closing_brace_names);
- my $seqno = $nesting_sequence_number[$a];
- $current_sequence_number[$a][ $current_depth[$a] ] = $seqno;
+ my $next_token = shift;
+ my $here_doc_expected = 0;
+ my $line;
+ my $k = 0;
+ my $msg = "checking <<";
- my $pos = $$rpretoken_map[$i_tok];
- $starting_line_of_current_depth[$a][ $current_depth[$a] ] =
- [ $input_line_number, $input_line, $pos ];
+ while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
+ {
+ chomp $line;
- for $b ( 0 .. $#closing_brace_names ) {
- next if ( $b == $a );
- $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
+ if ( $line =~ /^$next_token$/ ) {
+ $msg .= " -- found target $next_token ahead $k lines\n";
+ $here_doc_expected = 1; # got it
+ last;
+ }
+ last if ( $k >= HERE_DOC_WINDOW );
}
- return $seqno;
-}
-
-sub decrease_nesting_depth {
- my ( $a, $i_tok ) = @_;
- my $pos = $$rpretoken_map[$i_tok];
- my $b;
- my $seqno = 0;
-
- if ( $current_depth[$a] > 0 ) {
+ unless ($here_doc_expected) {
- $seqno = $current_sequence_number[$a][ $current_depth[$a] ];
+ if ( !defined($line) ) {
+ $here_doc_expected = -1; # hit eof without seeing target
+ $msg .= " -- must be shift; target $next_token not in file\n";
- # check that any brace types $b contained within are balanced
- for $b ( 0 .. $#closing_brace_names ) {
- next if ( $b == $a );
+ }
+ else { # still unsure..taking a wild guess
- unless ( $depth_array[$a][$b][ $current_depth[$a] ] ==
- $current_depth[$b] )
- {
- my $diff = $current_depth[$b] -
- $depth_array[$a][$b][ $current_depth[$a] ];
+ if ( !$is_constant{$current_package}{$next_token} ) {
+ $here_doc_expected = 1;
+ $msg .=
+ " -- guessing it's a here-doc ($next_token not a constant)\n";
+ }
+ else {
+ $msg .=
+ " -- guessing it's a shift ($next_token is a constant)\n";
+ }
+ }
+ }
+ write_logfile_entry($msg);
+ return $here_doc_expected;
+}
- # don't whine too many times
- my $saw_brace_error = get_saw_brace_error();
- if (
- $saw_brace_error <= MAX_NAG_MESSAGES
+#########i#############################################################
+# Tokenizer Routines for scanning identifiers and related items
+#######################################################################
- # if too many closing types have occured, we probably
- # already caught this error
- && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
- )
- {
- interrupt_logfile();
- my $rsl =
- $starting_line_of_current_depth[$a][ $current_depth[$a] ];
- my $sl = $$rsl[0];
- my $rel = [ $input_line_number, $input_line, $pos ];
- my $el = $$rel[0];
- my ($ess);
+sub scan_bare_identifier_do {
- if ( $diff == 1 || $diff == -1 ) {
- $ess = '';
- }
- else {
- $ess = 's';
- }
- my $bname =
- ( $diff > 0 )
- ? $opening_brace_names[$b]
- : $closing_brace_names[$b];
- write_error_indicator_pair( @$rsl, '^' );
- my $msg = <<"EOM";
-Found $diff extra $bname$ess between $opening_brace_names[$a] on line $sl and $closing_brace_names[$a] on line $el
-EOM
+ # this routine is called to scan a token starting with an alphanumeric
+ # variable or package separator, :: or '.
+ # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
+ # $last_nonblank_type,@paren_type, $paren_depth
- if ( $diff > 0 ) {
- my $rml =
- $starting_line_of_current_depth[$b]
- [ $current_depth[$b] ];
- my $ml = $$rml[0];
- $msg .=
-" The most recent un-matched $bname is on line $ml\n";
- write_error_indicator_pair( @$rml, '^' );
- }
- write_error_indicator_pair( @$rel, '^' );
- warning($msg);
- resume_logfile();
- }
- increment_brace_error();
- }
- }
- $current_depth[$a]--;
- }
- else {
+ my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
+ $max_token_index )
+ = @_;
+ my $i_begin = $i;
+ my $package = undef;
- my $saw_brace_error = get_saw_brace_error();
- if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
- my $msg = <<"EOM";
-There is no previous $opening_brace_names[$a] to match a $closing_brace_names[$a] on line $input_line_number
-EOM
- indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
- }
- increment_brace_error();
- }
- return $seqno;
-}
+ my $i_beg = $i;
-sub check_final_nesting_depths {
- my ($a);
+ # we have to back up one pretoken at a :: since each : is one pretoken
+ if ( $tok eq '::' ) { $i_beg-- }
+ if ( $tok eq '->' ) { $i_beg-- }
+ my $pos_beg = $$rtoken_map[$i_beg];
+ pos($input_line) = $pos_beg;
- for $a ( 0 .. $#closing_brace_names ) {
+ # Examples:
+ # A::B::C
+ # A::
+ # ::A
+ # A'B
+ if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
- if ( $current_depth[$a] ) {
- my $rsl = $starting_line_of_current_depth[$a][ $current_depth[$a] ];
- my $sl = $$rsl[0];
- my $msg = <<"EOM";
-Final nesting depth of $opening_brace_names[$a]s is $current_depth[$a]
-The most recent un-matched $opening_brace_names[$a] is on line $sl
-EOM
- indicate_error( $msg, @$rsl, '^' );
- increment_brace_error();
- }
- }
-}
+ my $pos = pos($input_line);
+ my $numc = $pos - $pos_beg;
+ $tok = substr( $input_line, $pos_beg, $numc );
-sub numerator_expected {
+ # type 'w' includes anything without leading type info
+ # ($,%,@,*) including something like abc::def::ghi
+ $type = 'w';
- # this is a filter for a possible numerator, in support of guessing
- # for the / pattern delimiter token.
- # returns -
- # 1 - yes
- # 0 - can't tell
- # -1 - no
- # Note: I am using the convention that variables ending in
- # _expected have these 3 possible values.
- my ( $i, $rtokens ) = @_;
- my $next_token = $$rtokens[ $i + 1 ];
- if ( $next_token eq '=' ) { $i++; } # handle /=
- my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ my $sub_name = "";
+ if ( defined($2) ) { $sub_name = $2; }
+ if ( defined($1) ) {
+ $package = $1;
- if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
- 1;
- }
- else {
+ # patch: don't allow isolated package name which just ends
+ # in the old style package separator (single quote). Example:
+ # use CGI':all';
+ if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
+ $pos--;
+ }
- if ( $next_nonblank_token =~ /^\s*$/ ) {
- 0;
+ $package =~ s/\'/::/g;
+ if ( $package =~ /^\:/ ) { $package = 'main' . $package }
+ $package =~ s/::$//;
}
else {
- -1;
+ $package = $current_package;
+
+ if ( $is_keyword{$tok} ) {
+ $type = 'k';
+ }
}
- }
-}
-sub pattern_expected {
+ # if it is a bareword..
+ if ( $type eq 'w' ) {
- # This is the start of a filter for a possible pattern.
- # It looks at the token after a possbible pattern and tries to
- # determine if that token could end a pattern.
- # returns -
- # 1 - yes
- # 0 - can't tell
- # -1 - no
- my ( $i, $rtokens ) = @_;
- my $next_token = $$rtokens[ $i + 1 ];
- if ( $next_token =~ /^[cgimosx]/ ) { $i++; } # skip possible modifier
- my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
+ # check for v-string with leading 'v' type character
+ # (This seems to have presidence over filehandle, type 'Y')
+ if ( $tok =~ /^v\d[_\d]*$/ ) {
- # list of tokens which may follow a pattern
- # (can probably be expanded)
- if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ )
- {
- 1;
- }
- else {
+ # we only have the first part - something like 'v101' -
+ # look for more
+ if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
+ $pos = pos($input_line);
+ $numc = $pos - $pos_beg;
+ $tok = substr( $input_line, $pos_beg, $numc );
+ }
+ $type = 'v';
- if ( $next_nonblank_token =~ /^\s*$/ ) {
- 0;
- }
- else {
- -1;
- }
- }
-}
+ # warn if this version can't handle v-strings
+ report_v_string($tok);
+ }
-sub find_next_nonblank_token_on_this_line {
- my ( $i, $rtokens ) = @_;
- my $next_nonblank_token;
+ elsif ( $is_constant{$package}{$sub_name} ) {
+ $type = 'C';
+ }
- if ( $i < $max_token_index ) {
- $next_nonblank_token = $$rtokens[ ++$i ];
+ # bareword after sort has implied empty prototype; for example:
+ # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
+ # This has priority over whatever the user has specified.
+ elsif ($last_nonblank_token eq 'sort'
+ && $last_nonblank_type eq 'k' )
+ {
+ $type = 'Z';
+ }
- if ( $next_nonblank_token =~ /^\s*$/ ) {
+ # Note: strangely, perl does not seem to really let you create
+ # functions which act like eval and do, in the sense that eval
+ # and do may have operators following the final }, but any operators
+ # that you create with prototype (&) apparently do not allow
+ # trailing operators, only terms. This seems strange.
+ # If this ever changes, here is the update
+ # to make perltidy behave accordingly:
- if ( $i < $max_token_index ) {
- $next_nonblank_token = $$rtokens[ ++$i ];
+ # elsif ( $is_block_function{$package}{$tok} ) {
+ # $tok='eval'; # patch to do braces like eval - doesn't work
+ # $type = 'k';
+ #}
+ # FIXME: This could become a separate type to allow for different
+ # future behavior:
+ elsif ( $is_block_function{$package}{$sub_name} ) {
+ $type = 'G';
}
- }
- }
- else {
- $next_nonblank_token = "";
- }
- return ( $next_nonblank_token, $i );
-}
-
-sub find_next_nonblank_token {
- my ( $i, $rtokens ) = @_;
- if ( $i >= $max_token_index ) {
+ elsif ( $is_block_list_function{$package}{$sub_name} ) {
+ $type = 'G';
+ }
+ elsif ( $is_user_function{$package}{$sub_name} ) {
+ $type = 'U';
+ $prototype = $user_function_prototype{$package}{$sub_name};
+ }
- if ( !$peeked_ahead ) {
- $peeked_ahead = 1;
- $rtokens = peek_ahead_for_nonblank_token($rtokens);
- }
- }
- my $next_nonblank_token = $$rtokens[ ++$i ];
+ # check for indirect object
+ elsif (
- if ( $next_nonblank_token =~ /^\s*$/ ) {
- $next_nonblank_token = $$rtokens[ ++$i ];
- }
- return ( $next_nonblank_token, $i );
-}
+ # added 2001-03-27: must not be followed immediately by '('
+ # see fhandle.t
+ ( $input_line !~ m/\G\(/gc )
-sub peek_ahead_for_n_nonblank_pre_tokens {
+ # and
+ && (
- # returns next n pretokens if they exist
- # returns undef's if hits eof without seeing any pretokens
- my $max_pretokens = shift;
- my $line;
- my $i = 0;
- my ( $rpre_tokens, $rmap, $rpre_types );
+ # preceded by keyword like 'print', 'printf' and friends
+ $is_indirect_object_taker{$last_nonblank_token}
- while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
- {
- $line =~ s/^\s*//; # trim leading blanks
- next if ( length($line) <= 0 ); # skip blank
- next if ( $line =~ /^#/ ); # skip comment
- ( $rpre_tokens, $rmap, $rpre_types ) =
- pre_tokenize( $line, $max_pretokens );
- last;
- }
- return ( $rpre_tokens, $rpre_types );
-}
+ # or preceded by something like 'print(' or 'printf('
+ || (
+ ( $last_nonblank_token eq '(' )
+ && $is_indirect_object_taker{ $paren_type[$paren_depth]
+ }
-# look ahead for next non-blank, non-comment line of code
-sub peek_ahead_for_nonblank_token {
- my $rtokens = shift;
- my $line;
- my $i = 0;
+ )
+ )
+ )
+ {
- while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
- {
- $line =~ s/^\s*//; # trim leading blanks
- next if ( length($line) <= 0 ); # skip blank
- next if ( $line =~ /^#/ ); # skip comment
- my ( $rtok, $rmap, $rtype ) =
- pre_tokenize( $line, 2 ); # only need 2 pre-tokens
- my $j = $max_token_index + 1;
- my $tok;
+ # may not be indirect object unless followed by a space
+ if ( $input_line =~ m/\G\s+/gc ) {
+ $type = 'Y';
- foreach $tok (@$rtok) {
- last if ( $tok =~ "\n" );
- $$rtokens[ ++$j ] = $tok;
- }
- last;
- }
- return $rtokens;
-}
+ # Abandon Hope ...
+ # Perl's indirect object notation is a very bad
+ # thing and can cause subtle bugs, especially for
+ # beginning programmers. And I haven't even been
+ # able to figure out a sane warning scheme which
+ # doesn't get in the way of good scripts.
-sub pre_tokenize {
+ # Complain if a filehandle has any lower case
+ # letters. This is suggested good practice, but the
+ # main reason for this warning is that prior to
+ # release 20010328, perltidy incorrectly parsed a
+ # function call after a print/printf, with the
+ # result that a space got added before the opening
+ # paren, thereby converting the function name to a
+ # filehandle according to perl's weird rules. This
+ # will not usually generate a syntax error, so this
+ # is a potentially serious bug. By warning
+ # of filehandles with any lower case letters,
+ # followed by opening parens, we will help the user
+ # find almost all of these older errors.
+ # use 'sub_name' because something like
+ # main::MYHANDLE is ok for filehandle
+ if ( $sub_name =~ /[a-z]/ ) {
- # Break a string, $str, into a sequence of preliminary tokens. We
- # are interested in these types of tokens:
- # words (type='w'), example: 'max_tokens_wanted'
- # digits (type = 'd'), example: '0755'
- # whitespace (type = 'b'), example: ' '
- # any other single character (i.e. punct; type = the character itself).
- # We cannot do better than this yet because we might be in a quoted
- # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
- # tokens.
- my ( $str, $max_tokens_wanted ) = @_;
+ # could be bug caused by older perltidy if
+ # followed by '('
+ if ( $input_line =~ m/\G\s*\(/gc ) {
+ complain(
+"Caution: unknown word '$tok' in indirect object slot\n"
+ );
+ }
+ }
+ }
- # we return references to these 3 arrays:
- my @tokens = (); # array of the tokens themselves
- my @token_map = (0); # string position of start of each token
- my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
+ # bareword not followed by a space -- may not be filehandle
+ # (may be function call defined in a 'use' statement)
+ else {
+ $type = 'Z';
+ }
+ }
+ }
- do {
+ # Now we must convert back from character position
+ # to pre_token index.
+ # I don't think an error flag can occur here ..but who knows
+ my $error;
+ ( $i, $error ) =
+ inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
+ if ($error) {
+ warning("scan_bare_identifier: Possibly invalid tokenization\n");
+ }
+ }
- # whitespace
- if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
+ # no match but line not blank - could be syntax error
+ # perl will take '::' alone without complaint
+ else {
+ $type = 'w';
- # numbers
- # note that this must come before words!
- elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
+ # change this warning to log message if it becomes annoying
+ warning("didn't find identifier after leading ::\n");
+ }
+ return ( $i, $tok, $type, $prototype );
+}
- # words
- elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
+sub scan_id_do {
- # single-character punctuation
- elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
+# This is the new scanner and will eventually replace scan_identifier.
+# Only type 'sub' and 'package' are implemented.
+# Token types $ * % @ & -> are not yet implemented.
+#
+# Scan identifier following a type token.
+# The type of call depends on $id_scan_state: $id_scan_state = ''
+# for starting call, in which case $tok must be the token defining
+# the type.
+#
+# If the type token is the last nonblank token on the line, a value
+# of $id_scan_state = $tok is returned, indicating that further
+# calls must be made to get the identifier. If the type token is
+# not the last nonblank token on the line, the identifier is
+# scanned and handled and a value of '' is returned.
+# USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
+# $statement_type, $tokenizer_self
+
+ my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
+ $max_token_index )
+ = @_;
+ my $type = '';
+ my ( $i_beg, $pos_beg );
- # that's all..
- else {
- return ( \@tokens, \@token_map, \@type );
- }
+ #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
+ #my ($a,$b,$c) = caller;
+ #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
- push @tokens, $1;
- push @token_map, pos($str);
+ # on re-entry, start scanning at first token on the line
+ if ($id_scan_state) {
+ $i_beg = $i;
+ $type = '';
+ }
- } while ( --$max_tokens_wanted != 0 );
+ # on initial entry, start scanning just after type token
+ else {
+ $i_beg = $i + 1;
+ $id_scan_state = $tok;
+ $type = 't';
+ }
- return ( \@tokens, \@token_map, \@type );
-}
-
-sub show_tokens {
-
- # this is an old debug routine
- my ( $rtokens, $rtoken_map ) = @_;
- my $num = scalar(@$rtokens);
- my $i;
-
- for ( $i = 0 ; $i < $num ; $i++ ) {
- my $len = length( $$rtokens[$i] );
- print "$i:$len:$$rtoken_map[$i]:$$rtokens[$i]:\n";
+ # find $i_beg = index of next nonblank token,
+ # and handle empty lines
+ my $blank_line = 0;
+ my $next_nonblank_token = $$rtokens[$i_beg];
+ if ( $i_beg > $max_token_index ) {
+ $blank_line = 1;
}
-}
-
-sub find_angle_operator_termination {
-
- # We are looking at a '<' and want to know if it is an angle operator.
- # We are to return:
- # $i = pretoken index of ending '>' if found, current $i otherwise
- # $type = 'Q' if found, '>' otherwise
- my ( $input_line, $i_beg, $rtoken_map, $expecting ) = @_;
- my $i = $i_beg;
- my $type = '<';
- pos($input_line) = 1 + $$rtoken_map[$i];
-
- my $filter;
-
- # we just have to find the next '>' if a term is expected
- if ( $expecting == TERM ) { $filter = '[\>]' }
-
- # we have to guess if we don't know what is expected
- elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
-
- # shouldn't happen - we shouldn't be here if operator is expected
- else { warning("Program Bug in find_angle_operator_termination\n") }
-
- # To illustrate what we might be looking at, in case we are
- # guessing, here are some examples of valid angle operators
- # (or file globs):
- # <tmp_imp/*>
- # <FH>
- # <$fh>
- # <*.c *.h>
- # <_>
- # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
- # <${PREFIX}*img*.$IMAGE_TYPE>
- # <img*.$IMAGE_TYPE>
- # <Timg*.$IMAGE_TYPE>
- # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
- #
- # Here are some examples of lines which do not have angle operators:
- # return undef unless $self->[2]++ < $#{$self->[1]};
- # < 2 || @$t >
- #
- # the following line from dlister.pl caused trouble:
- # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
- #
- # If the '<' starts an angle operator, it must end on this line and
- # it must not have certain characters like ';' and '=' in it. I use
- # this to limit the testing. This filter should be improved if
- # possible.
+ else {
- if ( $input_line =~ /($filter)/g ) {
+ # only a '#' immediately after a '$' is not a comment
+ if ( $next_nonblank_token eq '#' ) {
+ unless ( $tok eq '$' ) {
+ $blank_line = 1;
+ }
+ }
- if ( $1 eq '>' ) {
+ if ( $next_nonblank_token =~ /^\s/ ) {
+ ( $next_nonblank_token, $i_beg ) =
+ find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
+ $max_token_index );
+ if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
+ $blank_line = 1;
+ }
+ }
+ }
- # We MAY have found an angle operator termination if we get
- # here, but we need to do more to be sure we haven't been
- # fooled.
- my $pos = pos($input_line);
+ # handle non-blank line; identifier, if any, must follow
+ unless ($blank_line) {
- my $pos_beg = $$rtoken_map[$i];
- my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
+ if ( $id_scan_state eq 'sub' ) {
+ ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
+ $input_line, $i, $i_beg,
+ $tok, $type, $rtokens,
+ $rtoken_map, $id_scan_state, $max_token_index
+ );
+ }
- # Reject if the closing '>' follows a '-' as in:
- # if ( VERSION < 5.009 && $op-> name eq 'aassign' ) { }
- if ( $expecting eq UNKNOWN ) {
- my $check = substr( $input_line, $pos - 2, 1 );
- if ( $check eq '-' ) {
- return ( $i, $type );
- }
- }
+ elsif ( $id_scan_state eq 'package' ) {
+ ( $i, $tok, $type ) =
+ do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
+ $rtoken_map, $max_token_index );
+ $id_scan_state = '';
+ }
- ######################################debug#####
- #write_diagnostics( "ANGLE? :$str\n");
- #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
- ######################################debug#####
- $type = 'Q';
- my $error;
- ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map );
+ else {
+ warning("invalid token in scan_id: $tok\n");
+ $id_scan_state = '';
+ }
+ }
- # It may be possible that a quote ends midway in a pretoken.
- # If this happens, it may be necessary to split the pretoken.
- if ($error) {
- warning(
- "Possible tokinization error..please check this line\n");
- report_possible_bug();
- }
+ if ( $id_scan_state && ( !defined($type) || !$type ) ) {
- # Now let's see where we stand....
- # OK if math op not possible
- if ( $expecting == TERM ) {
- }
+ # shouldn't happen:
+ warning(
+"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
+ );
+ report_definite_bug();
+ }
- # OK if there are no more than 2 pre-tokens inside
- # (not possible to write 2 token math between < and >)
- # This catches most common cases
- elsif ( $i <= $i_beg + 3 ) {
- write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
- }
+ TOKENIZER_DEBUG_FLAG_NSCAN && do {
+ print
+ "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
+ };
+ return ( $i, $tok, $type, $id_scan_state );
+}
- # Not sure..
- else {
+sub check_prototype {
+ my ( $proto, $package, $subname ) = @_;
+ return unless ( defined($package) && defined($subname) );
+ if ( defined($proto) ) {
+ $proto =~ s/^\s*\(\s*//;
+ $proto =~ s/\s*\)$//;
+ if ($proto) {
+ $is_user_function{$package}{$subname} = 1;
+ $user_function_prototype{$package}{$subname} = "($proto)";
- # Let's try a Brace Test: any braces inside must balance
- my $br = 0;
- while ( $str =~ /\{/g ) { $br++ }
- while ( $str =~ /\}/g ) { $br-- }
- my $sb = 0;
- while ( $str =~ /\[/g ) { $sb++ }
- while ( $str =~ /\]/g ) { $sb-- }
- my $pr = 0;
- while ( $str =~ /\(/g ) { $pr++ }
- while ( $str =~ /\)/g ) { $pr-- }
+ # prototypes containing '&' must be treated specially..
+ if ( $proto =~ /\&/ ) {
- # if braces do not balance - not angle operator
- if ( $br || $sb || $pr ) {
- $i = $i_beg;
- $type = '<';
- write_diagnostics(
- "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
+ # right curly braces of prototypes ending in
+ # '&' may be followed by an operator
+ if ( $proto =~ /\&$/ ) {
+ $is_block_function{$package}{$subname} = 1;
}
- # we should keep doing more checks here...to be continued
- # Tentatively accepting this as a valid angle operator.
- # There are lots more things that can be checked.
- else {
- write_diagnostics(
- "ANGLE-Guessing yes: $str expecting=$expecting\n");
- write_logfile_entry("Guessing angle operator here: $str\n");
+ # right curly braces of prototypes NOT ending in
+ # '&' may NOT be followed by an operator
+ elsif ( $proto !~ /\&$/ ) {
+ $is_block_list_function{$package}{$subname} = 1;
}
}
}
-
- # didn't find ending >
else {
- if ( $expecting == TERM ) {
- warning("No ending > for angle operator\n");
- }
+ $is_constant{$package}{$subname} = 1;
}
}
- return ( $i, $type );
+ else {
+ $is_user_function{$package}{$subname} = 1;
+ }
}
-sub inverse_pretoken_map {
+sub do_scan_package {
- # Starting with the current pre_token index $i, scan forward until
- # finding the index of the next pre_token whose position is $pos.
- my ( $i, $pos, $rtoken_map ) = @_;
- my $error = 0;
+ # do_scan_package parses a package name
+ # it is called with $i_beg equal to the index of the first nonblank
+ # token following a 'package' token.
+ # USES GLOBAL VARIABLES: $current_package,
- while ( ++$i <= $max_token_index ) {
+ my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
+ $max_token_index )
+ = @_;
+ my $package = undef;
+ my $pos_beg = $$rtoken_map[$i_beg];
+ pos($input_line) = $pos_beg;
- if ( $pos <= $$rtoken_map[$i] ) {
+ # handle non-blank line; package name, if any, must follow
+ if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) {
+ $package = $1;
+ $package = ( defined($1) && $1 ) ? $1 : 'main';
+ $package =~ s/\'/::/g;
+ if ( $package =~ /^\:/ ) { $package = 'main' . $package }
+ $package =~ s/::$//;
+ my $pos = pos($input_line);
+ my $numc = $pos - $pos_beg;
+ $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
+ $type = 'i';
- # Let the calling routine handle errors in which we do not
- # land on a pre-token boundary. It can happen by running
- # perltidy on some non-perl scripts, for example.
- if ( $pos < $$rtoken_map[$i] ) { $error = 1 }
- $i--;
- last;
+ # Now we must convert back from character position
+ # to pre_token index.
+ # I don't think an error flag can occur here ..but ?
+ my $error;
+ ( $i, $error ) =
+ inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
+ if ($error) { warning("Possibly invalid package\n") }
+ $current_package = $package;
+
+ # check for error
+ my ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
+ if ( $next_nonblank_token !~ /^[;\}]$/ ) {
+ warning(
+ "Unexpected '$next_nonblank_token' after package name '$tok'\n"
+ );
}
}
- return ( $i, $error );
+
+ # no match but line not blank --
+ # could be a label with name package, like package: , for example.
+ else {
+ $type = 'k';
+ }
+
+ return ( $i, $tok, $type );
}
-sub guess_if_pattern_or_conditional {
+sub scan_identifier_do {
- # this routine is called when we have encountered a ? following an
- # unknown bareword, and we must decide if it starts a pattern or not
- # input parameters:
- # $i - token index of the ? starting possible pattern
- # output parameters:
- # $is_pattern = 0 if probably not pattern, =1 if probably a pattern
- # msg = a warning or diagnostic message
- my ( $i, $rtokens, $rtoken_map ) = @_;
- my $is_pattern = 0;
- my $msg = "guessing that ? after $last_nonblank_token starts a ";
+ # This routine assembles tokens into identifiers. It maintains a
+ # scan state, id_scan_state. It updates id_scan_state based upon
+ # current id_scan_state and token, and returns an updated
+ # id_scan_state and the next index after the identifier.
+ # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
+ # $last_nonblank_type
- if ( $i >= $max_token_index ) {
- $msg .= "conditional (no end to pattern found on the line)\n";
- }
- else {
- my $ibeg = $i;
- $i = $ibeg + 1;
- my $next_token = $$rtokens[$i]; # first token after ?
+ my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index ) = @_;
+ my $i_begin = $i;
+ my $type = '';
+ my $tok_begin = $$rtokens[$i_begin];
+ if ( $tok_begin eq ':' ) { $tok_begin = '::' }
+ my $id_scan_state_begin = $id_scan_state;
+ my $identifier_begin = $identifier;
+ my $tok = $tok_begin;
+ my $message = "";
- # look for a possible ending ? on this line..
- my $in_quote = 1;
- my $quote_depth = 0;
- my $quote_character = '';
- my $quote_pos = 0;
- ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) =
- follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
- $quote_pos, $quote_depth );
+ # these flags will be used to help figure out the type:
+ my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ );
+ my $saw_type;
- if ($in_quote) {
+ # allow old package separator (') except in 'use' statement
+ my $allow_tick = ( $last_nonblank_token ne 'use' );
- # we didn't find an ending ? on this line,
- # so we bias towards conditional
- $is_pattern = 0;
- $msg .= "conditional (no ending ? on this line)\n";
+ # get started by defining a type and a state if necessary
+ unless ($id_scan_state) {
+ $context = UNKNOWN_CONTEXT;
- # we found an ending ?, so we bias towards a pattern
+ # fixup for digraph
+ if ( $tok eq '>' ) {
+ $tok = '->';
+ $tok_begin = $tok;
+ }
+ $identifier = $tok;
+
+ if ( $tok eq '$' || $tok eq '*' ) {
+ $id_scan_state = '$';
+ $context = SCALAR_CONTEXT;
+ }
+ elsif ( $tok eq '%' || $tok eq '@' ) {
+ $id_scan_state = '$';
+ $context = LIST_CONTEXT;
+ }
+ elsif ( $tok eq '&' ) {
+ $id_scan_state = '&';
+ }
+ elsif ( $tok eq 'sub' or $tok eq 'package' ) {
+ $saw_alpha = 0; # 'sub' is considered type info here
+ $id_scan_state = '$';
+ $identifier .= ' '; # need a space to separate sub from sub name
+ }
+ elsif ( $tok eq '::' ) {
+ $id_scan_state = 'A';
+ }
+ elsif ( $tok =~ /^[A-Za-z_]/ ) {
+ $id_scan_state = ':';
+ }
+ elsif ( $tok eq '->' ) {
+ $id_scan_state = '$';
}
else {
- if ( pattern_expected( $i, $rtokens ) >= 0 ) {
- $is_pattern = 1;
- $msg .= "pattern (found ending ? and pattern expected)\n";
- }
- else {
- $msg .= "pattern (uncertain, but found ending ?)\n";
- }
+ # shouldn't happen
+ my ( $a, $b, $c ) = caller;
+ warning("Program Bug: scan_identifier given bad token = $tok \n");
+ warning(" called from sub $a line: $c\n");
+ report_definite_bug();
}
+ $saw_type = !$saw_alpha;
+ }
+ else {
+ $i--;
+ $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
}
- return ( $is_pattern, $msg );
-}
-sub guess_if_pattern_or_division {
+ # now loop to gather the identifier
+ my $i_save = $i;
- # this routine is called when we have encountered a / following an
- # unknown bareword, and we must decide if it starts a pattern or is a
- # division
- # input parameters:
- # $i - token index of the / starting possible pattern
- # output parameters:
- # $is_pattern = 0 if probably division, =1 if probably a pattern
- # msg = a warning or diagnostic message
- my ( $i, $rtokens, $rtoken_map ) = @_;
- my $is_pattern = 0;
- my $msg = "guessing that / after $last_nonblank_token starts a ";
+ while ( $i < $max_token_index ) {
+ $i_save = $i unless ( $tok =~ /^\s*$/ );
+ $tok = $$rtokens[ ++$i ];
- if ( $i >= $max_token_index ) {
- "division (no end to pattern found on the line)\n";
- }
- else {
- my $ibeg = $i;
- my $divide_expected = numerator_expected( $i, $rtokens );
- $i = $ibeg + 1;
- my $next_token = $$rtokens[$i]; # first token after slash
+ if ( ( $tok eq ':' ) && ( $$rtokens[ $i + 1 ] eq ':' ) ) {
+ $tok = '::';
+ $i++;
+ }
- # look for a possible ending / on this line..
- my $in_quote = 1;
- my $quote_depth = 0;
- my $quote_character = '';
- my $quote_pos = 0;
- ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) =
- follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
- $quote_pos, $quote_depth );
+ if ( $id_scan_state eq '$' ) { # starting variable name
- if ($in_quote) {
+ if ( $tok eq '$' ) {
- # we didn't find an ending / on this line,
- # so we bias towards division
- if ( $divide_expected >= 0 ) {
- $is_pattern = 0;
- $msg .= "division (no ending / on this line)\n";
+ $identifier .= $tok;
+
+ # we've got a punctuation variable if end of line (punct.t)
+ if ( $i == $max_token_index ) {
+ $type = 'i';
+ $id_scan_state = '';
+ last;
+ }
}
- else {
- $msg = "multi-line pattern (division not possible)\n";
- $is_pattern = 1;
+ elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric ..
+ $saw_alpha = 1;
+ $id_scan_state = ':'; # now need ::
+ $identifier .= $tok;
}
+ elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
+ $saw_alpha = 1;
+ $id_scan_state = ':'; # now need ::
+ $identifier .= $tok;
- }
+ # Perl will accept leading digits in identifiers,
+ # although they may not always produce useful results.
+ # Something like $main::0 is ok. But this also works:
+ #
+ # sub howdy::123::bubba{ print "bubba $54321!\n" }
+ # howdy::123::bubba();
+ #
+ }
+ elsif ( $tok =~ /^[0-9]/ ) { # numeric
+ $saw_alpha = 1;
+ $id_scan_state = ':'; # now need ::
+ $identifier .= $tok;
+ }
+ elsif ( $tok eq '::' ) {
+ $id_scan_state = 'A';
+ $identifier .= $tok;
+ }
+ elsif ( ( $tok eq '#' ) && ( $identifier eq '$' ) ) { # $#array
+ $identifier .= $tok; # keep same state, a $ could follow
+ }
+ elsif ( $tok eq '{' ) {
- # we found an ending /, so we bias towards a pattern
- else {
+ # check for something like ${#} or ${©}
+ if ( $identifier eq '$'
+ && $i + 2 <= $max_token_index
+ && $$rtokens[ $i + 2 ] eq '}'
+ && $$rtokens[ $i + 1 ] !~ /[\s\w]/ )
+ {
+ my $next2 = $$rtokens[ $i + 2 ];
+ my $next1 = $$rtokens[ $i + 1 ];
+ $identifier .= $tok . $next1 . $next2;
+ $i += 2;
+ $id_scan_state = '';
+ last;
+ }
+
+ # skip something like ${xxx} or ->{
+ $id_scan_state = '';
- if ( pattern_expected( $i, $rtokens ) >= 0 ) {
+ # if this is the first token of a line, any tokens for this
+ # identifier have already been accumulated
+ if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
+ $i = $i_save;
+ last;
+ }
- if ( $divide_expected >= 0 ) {
+ # space ok after leading $ % * & @
+ elsif ( $tok =~ /^\s*$/ ) {
- if ( $i - $ibeg > 60 ) {
- $msg .= "division (matching / too distant)\n";
- $is_pattern = 0;
+ if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
+
+ if ( length($identifier) > 1 ) {
+ $id_scan_state = '';
+ $i = $i_save;
+ $type = 'i'; # probably punctuation variable
+ last;
}
else {
- $msg .= "pattern (but division possible too)\n";
- $is_pattern = 1;
+
+ # spaces after $'s are common, and space after @
+ # is harmless, so only complain about space
+ # after other type characters. Space after $ and
+ # @ will be removed in formatting. Report space
+ # after % and * because they might indicate a
+ # parsing error. In other words '% ' might be a
+ # modulo operator. Delete this warning if it
+ # gets annoying.
+ if ( $identifier !~ /^[\@\$]$/ ) {
+ $message =
+ "Space in identifier, following $identifier\n";
+ }
}
}
- else {
- $is_pattern = 1;
- $msg .= "pattern (division not possible)\n";
- }
+
+ # else:
+ # space after '->' is ok
}
- else {
+ elsif ( $tok eq '^' ) {
- if ( $divide_expected >= 0 ) {
- $is_pattern = 0;
- $msg .= "division (pattern not possible)\n";
+ # check for some special variables like $^W
+ if ( $identifier =~ /^[\$\*\@\%]$/ ) {
+ $identifier .= $tok;
+ $id_scan_state = 'A';
+
+ # Perl accepts '$^]' or '@^]', but
+ # there must not be a space before the ']'.
+ my $next1 = $$rtokens[ $i + 1 ];
+ if ( $next1 eq ']' ) {
+ $i++;
+ $identifier .= $next1;
+ $id_scan_state = "";
+ last;
+ }
}
else {
- $is_pattern = 1;
- $msg .=
- "pattern (uncertain, but division would not work here)\n";
+ $id_scan_state = '';
}
}
- }
- }
- return ( $is_pattern, $msg );
-}
-
-sub find_here_doc {
-
- # find the target of a here document, if any
- # input parameters:
- # $i - token index of the second < of <<
- # ($i must be less than the last token index if this is called)
- # output parameters:
- # $found_target = 0 didn't find target; =1 found target
- # HERE_TARGET - the target string (may be empty string)
- # $i - unchanged if not here doc,
- # or index of the last token of the here target
- my ( $expecting, $i, $rtokens, $rtoken_map ) = @_;
- my $ibeg = $i;
- my $found_target = 0;
- my $here_doc_target = '';
- my $here_quote_character = '';
- my ( $next_nonblank_token, $i_next_nonblank, $next_token );
- $next_token = $$rtokens[ $i + 1 ];
-
- # perl allows a backslash before the target string (heredoc.t)
- my $backslash = 0;
- if ( $next_token eq '\\' ) {
- $backslash = 1;
- $next_token = $$rtokens[ $i + 2 ];
- }
-
- ( $next_nonblank_token, $i_next_nonblank ) =
- find_next_nonblank_token_on_this_line( $i, $rtokens );
-
- if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
-
- my $in_quote = 1;
- my $quote_depth = 0;
- my $quote_pos = 0;
-
- ( $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth ) =
- follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
- $here_quote_character, $quote_pos, $quote_depth );
-
- if ($in_quote) { # didn't find end of quote, so no target found
- $i = $ibeg;
- }
- else { # found ending quote
- my $j;
- $found_target = 1;
-
- my $tokj;
- for ( $j = $i_next_nonblank + 1 ; $j < $i ; $j++ ) {
- $tokj = $$rtokens[$j];
-
- # we have to remove any backslash before the quote character
- # so that the here-doc-target exactly matches this string
- next
- if ( $tokj eq "\\"
- && $j < $i - 1
- && $$rtokens[ $j + 1 ] eq $here_quote_character );
- $here_doc_target .= $tokj;
- }
- }
- }
-
- elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
- $found_target = 1;
- write_logfile_entry(
- "found blank here-target after <<; suggest using \"\"\n");
- $i = $ibeg;
- }
- elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
-
- my $here_doc_expected;
- if ( $expecting == UNKNOWN ) {
- $here_doc_expected = guess_if_here_doc($next_token);
- }
- else {
- $here_doc_expected = 1;
- }
-
- if ($here_doc_expected) {
- $found_target = 1;
- $here_doc_target = $next_token;
- $i = $ibeg + 1;
- }
-
- }
- else {
-
- if ( $expecting == TERM ) {
- $found_target = 1;
- write_logfile_entry("Note: bare here-doc operator <<\n");
- }
- else {
- $i = $ibeg;
- }
- }
-
- # patch to neglect any prepended backslash
- if ( $found_target && $backslash ) { $i++ }
-
- return ( $found_target, $here_doc_target, $here_quote_character, $i );
-}
-
-# try to resolve here-doc vs. shift by looking ahead for
-# non-code or the end token (currently only looks for end token)
-# returns 1 if it is probably a here doc, 0 if not
-sub guess_if_here_doc {
-
- # This is how many lines we will search for a target as part of the
- # guessing strategy. It is a constant because there is probably
- # little reason to change it.
- use constant HERE_DOC_WINDOW => 40;
-
- my $next_token = shift;
- my $here_doc_expected = 0;
- my $line;
- my $k = 0;
- my $msg = "checking <<";
-
- while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
- {
- chomp $line;
-
- if ( $line =~ /^$next_token$/ ) {
- $msg .= " -- found target $next_token ahead $k lines\n";
- $here_doc_expected = 1; # got it
- last;
- }
- last if ( $k >= HERE_DOC_WINDOW );
- }
-
- unless ($here_doc_expected) {
-
- if ( !defined($line) ) {
- $here_doc_expected = -1; # hit eof without seeing target
- $msg .= " -- must be shift; target $next_token not in file\n";
-
- }
- else { # still unsure..taking a wild guess
-
- if ( !$is_constant{$current_package}{$next_token} ) {
- $here_doc_expected = 1;
- $msg .=
- " -- guessing it's a here-doc ($next_token not a constant)\n";
- }
- else {
- $msg .=
- " -- guessing it's a shift ($next_token is a constant)\n";
- }
- }
- }
- write_logfile_entry($msg);
- return $here_doc_expected;
-}
-
-sub do_quote {
-
- # follow (or continue following) quoted string or pattern
- # $in_quote return code:
- # 0 - ok, found end
- # 1 - still must find end of quote whose target is $quote_character
- # 2 - still looking for end of first of two quotes
- my ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth, $rtokens,
- $rtoken_map )
- = @_;
-
- if ( $in_quote == 2 ) { # two quotes/patterns to follow
- my $ibeg = $i;
- ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) =
- follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
- $quote_pos, $quote_depth );
-
- if ( $in_quote == 1 ) {
- if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
- $quote_character = '';
- }
- }
-
- if ( $in_quote == 1 ) { # one (more) quote to follow
- my $ibeg = $i;
- ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth ) =
- follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
- $quote_pos, $quote_depth );
- }
- return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth );
-}
-
-sub scan_number_do {
-
- # scan a number in any of the formats that Perl accepts
- # Underbars (_) are allowed in decimal numbers.
- # input parameters -
- # $input_line - the string to scan
- # $i - pre_token index to start scanning
- # $rtoken_map - reference to the pre_token map giving starting
- # character position in $input_line of token $i
- # output parameters -
- # $i - last pre_token index of the number just scanned
- # number - the number (characters); or undef if not a number
-
- my ( $input_line, $i, $rtoken_map, $input_type ) = @_;
- my $pos_beg = $$rtoken_map[$i];
- my $pos;
- my $i_begin = $i;
- my $number = undef;
- my $type = $input_type;
-
- my $first_char = substr( $input_line, $pos_beg, 1 );
-
- # Look for bad starting characters; Shouldn't happen..
- if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
- warning("Program bug - scan_number given character $first_char\n");
- report_definite_bug();
- return ( $i, $type, $number );
- }
-
- # handle v-string without leading 'v' character ('Two Dot' rule)
- # (vstring.t)
- # TODO: v-strings may contain underscores
- pos($input_line) = $pos_beg;
- if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
- $pos = pos($input_line);
- my $numc = $pos - $pos_beg;
- $number = substr( $input_line, $pos_beg, $numc );
- $type = 'v';
- unless ($saw_v_string) { report_v_string($number) }
- }
-
- # handle octal, hex, binary
- if ( !defined($number) ) {
- pos($input_line) = $pos_beg;
- if ( $input_line =~ /\G[+-]?0((x[0-9a-fA-F_]+)|([0-7_]+)|(b[01_]+))/g )
- {
- $pos = pos($input_line);
- my $numc = $pos - $pos_beg;
- $number = substr( $input_line, $pos_beg, $numc );
- $type = 'n';
- }
- }
-
- # handle decimal
- if ( !defined($number) ) {
- pos($input_line) = $pos_beg;
-
- if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
- $pos = pos($input_line);
-
- # watch out for things like 0..40 which would give 0. by this;
- if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
- && ( substr( $input_line, $pos, 1 ) eq '.' ) )
- {
- $pos--;
- }
- my $numc = $pos - $pos_beg;
- $number = substr( $input_line, $pos_beg, $numc );
- $type = 'n';
- }
- }
-
- # filter out non-numbers like e + - . e2 .e3 +e6
- # the rule: at least one digit, and any 'e' must be preceded by a digit
- if (
- $number !~ /\d/ # no digits
- || ( $number =~ /^(.*)[eE]/
- && $1 !~ /\d/ ) # or no digits before the 'e'
- )
- {
- $number = undef;
- $type = $input_type;
- return ( $i, $type, $number );
- }
-
- # Found a number; now we must convert back from character position
- # to pre_token index. An error here implies user syntax error.
- # An example would be an invalid octal number like '009'.
- my $error;
- ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map );
- if ($error) { warning("Possibly invalid number\n") }
-
- return ( $i, $type, $number );
-}
-
-sub scan_bare_identifier_do {
-
- # this routine is called to scan a token starting with an alphanumeric
- # variable or package separator, :: or '.
-
- my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map ) = @_;
- my $i_begin = $i;
- my $package = undef;
-
- my $i_beg = $i;
-
- # we have to back up one pretoken at a :: since each : is one pretoken
- if ( $tok eq '::' ) { $i_beg-- }
- if ( $tok eq '->' ) { $i_beg-- }
- my $pos_beg = $$rtoken_map[$i_beg];
- pos($input_line) = $pos_beg;
-
- # Examples:
- # A::B::C
- # A::
- # ::A
- # A'B
- if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
-
- my $pos = pos($input_line);
- my $numc = $pos - $pos_beg;
- $tok = substr( $input_line, $pos_beg, $numc );
-
- # type 'w' includes anything without leading type info
- # ($,%,@,*) including something like abc::def::ghi
- $type = 'w';
-
- my $sub_name = "";
- if ( defined($2) ) { $sub_name = $2; }
- if ( defined($1) ) {
- $package = $1;
+ else { # something else
- # patch: don't allow isolated package name which just ends
- # in the old style package separator (single quote). Example:
- # use CGI':all';
- if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
- $pos--;
- }
+ # check for various punctuation variables
+ if ( $identifier =~ /^[\$\*\@\%]$/ ) {
+ $identifier .= $tok;
+ }
- $package =~ s/\'/::/g;
- if ( $package =~ /^\:/ ) { $package = 'main' . $package }
- $package =~ s/::$//;
- }
- else {
- $package = $current_package;
+ elsif ( $identifier eq '$#' ) {
- if ( $is_keyword{$tok} ) {
- $type = 'k';
- }
- }
+ if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
- # if it is a bareword..
- if ( $type eq 'w' ) {
+ # perl seems to allow just these: $#: $#- $#+
+ elsif ( $tok =~ /^[\:\-\+]$/ ) {
+ $type = 'i';
+ $identifier .= $tok;
+ }
+ else {
+ $i = $i_save;
+ write_logfile_entry( 'Use of $# is deprecated' . "\n" );
+ }
+ }
+ elsif ( $identifier eq '$$' ) {
- # check for v-string with leading 'v' type character
- # (This seems to have presidence over filehandle, type 'Y')
- if ( $tok =~ /^v\d[_\d]*$/ ) {
+ # perl does not allow references to punctuation
+ # variables without braces. For example, this
+ # won't work:
+ # $:=\4;
+ # $a = $$:;
+ # You would have to use
+ # $a = ${$:};
- # we only have the first part - something like 'v101' -
- # look for more
- if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
- $pos = pos($input_line);
- $numc = $pos - $pos_beg;
- $tok = substr( $input_line, $pos_beg, $numc );
+ $i = $i_save;
+ if ( $tok eq '{' ) { $type = 't' }
+ else { $type = 'i' }
}
- $type = 'v';
-
- # warn if this version can't handle v-strings
- unless ($saw_v_string) { report_v_string($tok) }
+ elsif ( $identifier eq '->' ) {
+ $i = $i_save;
+ }
+ else {
+ $i = $i_save;
+ if ( length($identifier) == 1 ) { $identifier = ''; }
+ }
+ $id_scan_state = '';
+ last;
}
+ }
+ elsif ( $id_scan_state eq '&' ) { # starting sub call?
- elsif ( $is_constant{$package}{$sub_name} ) {
- $type = 'C';
+ if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric ..
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ $identifier .= $tok;
}
-
- # bareword after sort has implied empty prototype; for example:
- # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
- # This has priority over whatever the user has specified.
- elsif ($last_nonblank_token eq 'sort'
- && $last_nonblank_type eq 'k' )
- {
- $type = 'Z';
+ elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ $identifier .= $tok;
}
-
- # Note: strangely, perl does not seem to really let you create
- # functions which act like eval and do, in the sense that eval
- # and do may have operators following the final }, but any operators
- # that you create with prototype (&) apparently do not allow
- # trailing operators, only terms. This seems strange.
- # If this ever changes, here is the update
- # to make perltidy behave accordingly:
-
- # elsif ( $is_block_function{$package}{$tok} ) {
- # $tok='eval'; # patch to do braces like eval - doesn't work
- # $type = 'k';
- #}
- # FIXME: This could become a separate type to allow for different
- # future behavior:
- elsif ( $is_block_function{$package}{$sub_name} ) {
- $type = 'G';
+ elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ $identifier .= $tok;
}
-
- elsif ( $is_block_list_function{$package}{$sub_name} ) {
- $type = 'G';
+ elsif ( $tok =~ /^\s*$/ ) { # allow space
}
- elsif ( $is_user_function{$package}{$sub_name} ) {
- $type = 'U';
- $prototype = $user_function_prototype{$package}{$sub_name};
+ elsif ( $tok eq '::' ) { # leading ::
+ $id_scan_state = 'A'; # accept alpha next
+ $identifier .= $tok;
}
+ elsif ( $tok eq '{' ) {
+ if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
+ $i = $i_save;
+ $id_scan_state = '';
+ last;
+ }
+ else {
- # check for indirect object
- elsif (
-
- # added 2001-03-27: must not be followed immediately by '('
- # see fhandle.t
- ( $input_line !~ m/\G\(/gc )
-
- # and
- && (
-
- # preceded by keyword like 'print', 'printf' and friends
- $is_indirect_object_taker{$last_nonblank_token}
-
- # or preceded by something like 'print(' or 'printf('
- || (
- ( $last_nonblank_token eq '(' )
- && $is_indirect_object_taker{ $paren_type[$paren_depth]
- }
-
- )
- )
- )
- {
-
- # may not be indirect object unless followed by a space
- if ( $input_line =~ m/\G\s+/gc ) {
- $type = 'Y';
+ # punctuation variable?
+ # testfile: cunningham4.pl
+ if ( $identifier eq '&' ) {
+ $identifier .= $tok;
+ }
+ else {
+ $identifier = '';
+ $i = $i_save;
+ $type = '&';
+ }
+ $id_scan_state = '';
+ last;
+ }
+ }
+ elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::)
- # Abandon Hope ...
- # Perl's indirect object notation is a very bad
- # thing and can cause subtle bugs, especially for
- # beginning programmers. And I haven't even been
- # able to figure out a sane warning scheme which
- # doesn't get in the way of good scripts.
+ if ( $tok =~ /^[A-Za-z_]/ ) { # found it
+ $identifier .= $tok;
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ }
+ elsif ( $tok eq "'" && $allow_tick ) {
+ $identifier .= $tok;
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ }
+ elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
+ $identifier .= $tok;
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ }
+ elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
+ $id_scan_state = '(';
+ $identifier .= $tok;
+ }
+ elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
+ $id_scan_state = ')';
+ $identifier .= $tok;
+ }
+ else {
+ $id_scan_state = '';
+ $i = $i_save;
+ last;
+ }
+ }
+ elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
- # Complain if a filehandle has any lower case
- # letters. This is suggested good practice, but the
- # main reason for this warning is that prior to
- # release 20010328, perltidy incorrectly parsed a
- # function call after a print/printf, with the
- # result that a space got added before the opening
- # paren, thereby converting the function name to a
- # filehandle according to perl's weird rules. This
- # will not usually generate a syntax error, so this
- # is a potentially serious bug. By warning
- # of filehandles with any lower case letters,
- # followed by opening parens, we will help the user
- # find almost all of these older errors.
- # use 'sub_name' because something like
- # main::MYHANDLE is ok for filehandle
- if ( $sub_name =~ /[a-z]/ ) {
+ if ( $tok eq '::' ) { # got it
+ $identifier .= $tok;
+ $id_scan_state = 'A'; # now require alpha
+ }
+ elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here
+ $identifier .= $tok;
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ }
+ elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
+ $identifier .= $tok;
+ $id_scan_state = ':'; # now need ::
+ $saw_alpha = 1;
+ }
+ elsif ( $tok eq "'" && $allow_tick ) { # tick
- # could be bug caused by older perltidy if
- # followed by '('
- if ( $input_line =~ m/\G\s*\(/gc ) {
- complain(
-"Caution: unknown word '$tok' in indirect object slot\n"
- );
- }
- }
+ if ( $is_keyword{$identifier} ) {
+ $id_scan_state = ''; # that's all
+ $i = $i_save;
+ }
+ else {
+ $identifier .= $tok;
}
+ }
+ elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
+ $id_scan_state = '(';
+ $identifier .= $tok;
+ }
+ elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
+ $id_scan_state = ')';
+ $identifier .= $tok;
+ }
+ else {
+ $id_scan_state = ''; # that's all
+ $i = $i_save;
+ last;
+ }
+ }
+ elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype
- # bareword not followed by a space -- may not be filehandle
- # (may be function call defined in a 'use' statement)
- else {
- $type = 'Z';
- }
+ if ( $tok eq '(' ) { # got it
+ $identifier .= $tok;
+ $id_scan_state = ')'; # now find the end of it
+ }
+ elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
+ $identifier .= $tok;
+ }
+ else {
+ $id_scan_state = ''; # that's all - no prototype
+ $i = $i_save;
+ last;
}
}
+ elsif ( $id_scan_state eq ')' ) { # looking for ) to end
- # Now we must convert back from character position
- # to pre_token index.
- # I don't think an error flag can occur here ..but who knows
- my $error;
- ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map );
- if ($error) {
- warning("scan_bare_identifier: Possibly invalid tokenization\n");
+ if ( $tok eq ')' ) { # got it
+ $identifier .= $tok;
+ $id_scan_state = ''; # all done
+ last;
+ }
+ elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
+ $identifier .= $tok;
+ }
+ else { # probable error in script, but keep going
+ warning("Unexpected '$tok' while seeking end of prototype\n");
+ $identifier .= $tok;
+ }
+ }
+ else { # can get here due to error in initialization
+ $id_scan_state = '';
+ $i = $i_save;
+ last;
}
}
- # no match but line not blank - could be syntax error
- # perl will take '::' alone without complaint
- else {
- $type = 'w';
-
- # change this warning to log message if it becomes annoying
- warning("didn't find identifier after leading ::\n");
+ if ( $id_scan_state eq ')' ) {
+ warning("Hit end of line while seeking ) to end prototype\n");
}
- return ( $i, $tok, $type, $prototype );
-}
-
-sub scan_id_do {
-
- # This is the new scanner and will eventually replace scan_identifier.
- # Only type 'sub' and 'package' are implemented.
- # Token types $ * % @ & -> are not yet implemented.
- #
- # Scan identifier following a type token.
- # The type of call depends on $id_scan_state: $id_scan_state = ''
- # for starting call, in which case $tok must be the token defining
- # the type.
- #
- # If the type token is the last nonblank token on the line, a value
- # of $id_scan_state = $tok is returned, indicating that further
- # calls must be made to get the identifier. If the type token is
- # not the last nonblank token on the line, the identifier is
- # scanned and handled and a value of '' is returned.
-
- my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state ) = @_;
- my $type = '';
- my ( $i_beg, $pos_beg );
-
- #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
- #my ($a,$b,$c) = caller;
- #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
- # on re-entry, start scanning at first token on the line
- if ($id_scan_state) {
- $i_beg = $i;
- $type = '';
+ # once we enter the actual identifier, it may not extend beyond
+ # the end of the current line
+ if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
+ $id_scan_state = '';
}
+ if ( $i < 0 ) { $i = 0 }
- # on initial entry, start scanning just after type token
- else {
- $i_beg = $i + 1;
- $id_scan_state = $tok;
- $type = 't';
- }
+ unless ($type) {
- # find $i_beg = index of next nonblank token,
- # and handle empty lines
- my $blank_line = 0;
- my $next_nonblank_token = $$rtokens[$i_beg];
- if ( $i_beg > $max_token_index ) {
- $blank_line = 1;
- }
- else {
+ if ($saw_type) {
- # only a '#' immediately after a '$' is not a comment
- if ( $next_nonblank_token eq '#' ) {
- unless ( $tok eq '$' ) {
- $blank_line = 1;
+ if ($saw_alpha) {
+ if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
+ $type = 'w';
+ }
+ else { $type = 'i' }
}
- }
-
- if ( $next_nonblank_token =~ /^\s/ ) {
- ( $next_nonblank_token, $i_beg ) =
- find_next_nonblank_token_on_this_line( $i_beg, $rtokens );
- if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
- $blank_line = 1;
+ elsif ( $identifier eq '->' ) {
+ $type = '->';
}
- }
- }
-
- # handle non-blank line; identifier, if any, must follow
- unless ($blank_line) {
+ elsif (
+ ( length($identifier) > 1 )
- if ( $id_scan_state eq 'sub' ) {
- ( $i, $tok, $type, $id_scan_state ) =
- do_scan_sub( $input_line, $i, $i_beg, $tok, $type, $rtokens,
- $rtoken_map, $id_scan_state );
+ # In something like '@$=' we have an identifier '@$'
+ # In something like '$${' we have type '$$' (and only
+ # part of an identifier)
+ && !( $identifier =~ /\$$/ && $tok eq '{' )
+ && ( $identifier !~ /^(sub |package )$/ )
+ )
+ {
+ $type = 'i';
+ }
+ else { $type = 't' }
}
+ elsif ($saw_alpha) {
- elsif ( $id_scan_state eq 'package' ) {
- ( $i, $tok, $type ) =
- do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
- $rtoken_map );
- $id_scan_state = '';
+ # type 'w' includes anything without leading type info
+ # ($,%,@,*) including something like abc::def::ghi
+ $type = 'w';
}
-
else {
- warning("invalid token in scan_id: $tok\n");
- $id_scan_state = '';
- }
+ $type = '';
+ } # this can happen on a restart
}
- if ( $id_scan_state && ( !defined($type) || !$type ) ) {
-
- # shouldn't happen:
- warning(
-"Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
- );
- report_definite_bug();
+ if ($identifier) {
+ $tok = $identifier;
+ if ($message) { write_logfile_entry($message) }
+ }
+ else {
+ $tok = $tok_begin;
+ $i = $i_begin;
}
- TOKENIZER_DEBUG_FLAG_NSCAN && do {
+ TOKENIZER_DEBUG_FLAG_SCAN_ID && do {
+ my ( $a, $b, $c ) = caller;
print
- "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
+"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
+ print
+"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
};
- return ( $i, $tok, $type, $id_scan_state );
+ return ( $i, $tok, $type, $id_scan_state, $identifier );
}
{
# sub name. For example, 'sub &doit' is wrong. Also, be sure
# a name is given if and only if a non-anonymous sub is
# appropriate.
+ # USES GLOBAL VARS: $current_package, $last_nonblank_token,
+ # $in_attribute_list, %saw_function_definition,
+ # $statement_type
- my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
- $id_scan_state )
- = @_;
+ my (
+ $input_line, $i, $i_beg,
+ $tok, $type, $rtokens,
+ $rtoken_map, $id_scan_state, $max_token_index
+ ) = @_;
$id_scan_state = ""; # normally we get everything in one call
my $subname = undef;
my $package = undef;
# I don't think an error flag can occur here ..but ?
my $error;
- ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map );
+ ( $i, $error ) =
+ inverse_pretoken_map( $i, $pos, $rtoken_map,
+ $max_token_index );
if ($error) { warning("Possibly invalid sub\n") }
# check for multiple definitions of a sub
( $next_nonblank_token, my $i_next ) =
- find_next_nonblank_token_on_this_line( $i, $rtokens );
+ find_next_nonblank_token_on_this_line( $i, $rtokens,
+ $max_token_index );
}
if ( $next_nonblank_token =~ /^(\s*|#)$/ )
);
}
$saw_function_definition{$package}{$subname} =
- $input_line_number;
+ $tokenizer_self->{_last_line_number};
}
}
elsif ( $next_nonblank_token eq ';' ) {
}
}
-sub check_prototype {
- my ( $proto, $package, $subname ) = @_;
- return unless ( defined($package) && defined($subname) );
- if ( defined($proto) ) {
- $proto =~ s/^\s*\(\s*//;
- $proto =~ s/\s*\)$//;
- if ($proto) {
- $is_user_function{$package}{$subname} = 1;
- $user_function_prototype{$package}{$subname} = "($proto)";
-
- # prototypes containing '&' must be treated specially..
- if ( $proto =~ /\&/ ) {
-
- # right curly braces of prototypes ending in
- # '&' may be followed by an operator
- if ( $proto =~ /\&$/ ) {
- $is_block_function{$package}{$subname} = 1;
- }
-
- # right curly braces of prototypes NOT ending in
- # '&' may NOT be followed by an operator
- elsif ( $proto !~ /\&$/ ) {
- $is_block_list_function{$package}{$subname} = 1;
- }
- }
- }
- else {
- $is_constant{$package}{$subname} = 1;
- }
- }
- else {
- $is_user_function{$package}{$subname} = 1;
- }
-}
-
-sub do_scan_package {
-
- # do_scan_package parses a package name
- # it is called with $i_beg equal to the index of the first nonblank
- # token following a 'package' token.
-
- my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map ) = @_;
- my $package = undef;
- my $pos_beg = $$rtoken_map[$i_beg];
- pos($input_line) = $pos_beg;
-
- # handle non-blank line; package name, if any, must follow
- if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) {
- $package = $1;
- $package = ( defined($1) && $1 ) ? $1 : 'main';
- $package =~ s/\'/::/g;
- if ( $package =~ /^\:/ ) { $package = 'main' . $package }
- $package =~ s/::$//;
- my $pos = pos($input_line);
- my $numc = $pos - $pos_beg;
- $tok = 'package ' . substr( $input_line, $pos_beg, $numc );
- $type = 'i';
-
- # Now we must convert back from character position
- # to pre_token index.
- # I don't think an error flag can occur here ..but ?
- my $error;
- ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map );
- if ($error) { warning("Possibly invalid package\n") }
- $current_package = $package;
-
- # check for error
- my ( $next_nonblank_token, $i_next ) =
- find_next_nonblank_token( $i, $rtokens );
- if ( $next_nonblank_token !~ /^[;\}]$/ ) {
- warning(
- "Unexpected '$next_nonblank_token' after package name '$tok'\n"
- );
- }
- }
-
- # no match but line not blank --
- # could be a label with name package, like package: , for example.
- else {
- $type = 'k';
- }
-
- return ( $i, $tok, $type );
-}
-
-sub scan_identifier_do {
+#########i###############################################################
+# Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
+#########################################################################
- # This routine assembles tokens into identifiers. It maintains a
- # scan state, id_scan_state. It updates id_scan_state based upon
- # current id_scan_state and token, and returns an updated
- # id_scan_state and the next index after the identifier.
+sub find_next_nonblank_token {
+ my ( $i, $rtokens, $max_token_index ) = @_;
- my ( $i, $id_scan_state, $identifier, $rtokens ) = @_;
- my $i_begin = $i;
- my $type = '';
- my $tok_begin = $$rtokens[$i_begin];
- if ( $tok_begin eq ':' ) { $tok_begin = '::' }
- my $id_scan_state_begin = $id_scan_state;
- my $identifier_begin = $identifier;
- my $tok = $tok_begin;
- my $message = "";
+ if ( $i >= $max_token_index ) {
+ if ( !peeked_ahead() ) {
+ peeked_ahead(1);
+ $rtokens =
+ peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
+ }
+ }
+ my $next_nonblank_token = $$rtokens[ ++$i ];
- # these flags will be used to help figure out the type:
- my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ );
- my $saw_type;
+ if ( $next_nonblank_token =~ /^\s*$/ ) {
+ $next_nonblank_token = $$rtokens[ ++$i ];
+ }
+ return ( $next_nonblank_token, $i );
+}
- # allow old package separator (') except in 'use' statement
- my $allow_tick = ( $last_nonblank_token ne 'use' );
+sub numerator_expected {
- # get started by defining a type and a state if necessary
- unless ($id_scan_state) {
- $context = UNKNOWN_CONTEXT;
+ # this is a filter for a possible numerator, in support of guessing
+ # for the / pattern delimiter token.
+ # returns -
+ # 1 - yes
+ # 0 - can't tell
+ # -1 - no
+ # Note: I am using the convention that variables ending in
+ # _expected have these 3 possible values.
+ my ( $i, $rtokens, $max_token_index ) = @_;
+ my $next_token = $$rtokens[ $i + 1 ];
+ if ( $next_token eq '=' ) { $i++; } # handle /=
+ my ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
- # fixup for digraph
- if ( $tok eq '>' ) {
- $tok = '->';
- $tok_begin = $tok;
- }
- $identifier = $tok;
+ if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
+ 1;
+ }
+ else {
- if ( $tok eq '$' || $tok eq '*' ) {
- $id_scan_state = '$';
- $context = SCALAR_CONTEXT;
- }
- elsif ( $tok eq '%' || $tok eq '@' ) {
- $id_scan_state = '$';
- $context = LIST_CONTEXT;
- }
- elsif ( $tok eq '&' ) {
- $id_scan_state = '&';
- }
- elsif ( $tok eq 'sub' or $tok eq 'package' ) {
- $saw_alpha = 0; # 'sub' is considered type info here
- $id_scan_state = '$';
- $identifier .= ' '; # need a space to separate sub from sub name
- }
- elsif ( $tok eq '::' ) {
- $id_scan_state = 'A';
- }
- elsif ( $tok =~ /^[A-Za-z_]/ ) {
- $id_scan_state = ':';
- }
- elsif ( $tok eq '->' ) {
- $id_scan_state = '$';
+ if ( $next_nonblank_token =~ /^\s*$/ ) {
+ 0;
}
else {
-
- # shouldn't happen
- my ( $a, $b, $c ) = caller;
- warning("Program Bug: scan_identifier given bad token = $tok \n");
- warning(" called from sub $a line: $c\n");
- report_definite_bug();
+ -1;
}
- $saw_type = !$saw_alpha;
- }
- else {
- $i--;
- $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
}
+}
- # now loop to gather the identifier
- my $i_save = $i;
+sub pattern_expected {
- while ( $i < $max_token_index ) {
- $i_save = $i unless ( $tok =~ /^\s*$/ );
- $tok = $$rtokens[ ++$i ];
+ # This is the start of a filter for a possible pattern.
+ # It looks at the token after a possbible pattern and tries to
+ # determine if that token could end a pattern.
+ # returns -
+ # 1 - yes
+ # 0 - can't tell
+ # -1 - no
+ my ( $i, $rtokens, $max_token_index ) = @_;
+ my $next_token = $$rtokens[ $i + 1 ];
+ if ( $next_token =~ /^[cgimosx]/ ) { $i++; } # skip possible modifier
+ my ( $next_nonblank_token, $i_next ) =
+ find_next_nonblank_token( $i, $rtokens, $max_token_index );
- if ( ( $tok eq ':' ) && ( $$rtokens[ $i + 1 ] eq ':' ) ) {
- $tok = '::';
- $i++;
- }
+ # list of tokens which may follow a pattern
+ # (can probably be expanded)
+ if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ )
+ {
+ 1;
+ }
+ else {
- if ( $id_scan_state eq '$' ) { # starting variable name
+ if ( $next_nonblank_token =~ /^\s*$/ ) {
+ 0;
+ }
+ else {
+ -1;
+ }
+ }
+}
- if ( $tok eq '$' ) {
+sub find_next_nonblank_token_on_this_line {
+ my ( $i, $rtokens, $max_token_index ) = @_;
+ my $next_nonblank_token;
- $identifier .= $tok;
+ if ( $i < $max_token_index ) {
+ $next_nonblank_token = $$rtokens[ ++$i ];
- # we've got a punctuation variable if end of line (punct.t)
- if ( $i == $max_token_index ) {
- $type = 'i';
- $id_scan_state = '';
- last;
- }
- }
- elsif ( $tok =~ /^[A-Za-z_]/ ) { # alphanumeric ..
- $saw_alpha = 1;
- $id_scan_state = ':'; # now need ::
- $identifier .= $tok;
- }
- elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
- $saw_alpha = 1;
- $id_scan_state = ':'; # now need ::
- $identifier .= $tok;
+ if ( $next_nonblank_token =~ /^\s*$/ ) {
- # Perl will accept leading digits in identifiers,
- # although they may not always produce useful results.
- # Something like $main::0 is ok. But this also works:
- #
- # sub howdy::123::bubba{ print "bubba $54321!\n" }
- # howdy::123::bubba();
- #
- }
- elsif ( $tok =~ /^[0-9]/ ) { # numeric
- $saw_alpha = 1;
- $id_scan_state = ':'; # now need ::
- $identifier .= $tok;
- }
- elsif ( $tok eq '::' ) {
- $id_scan_state = 'A';
- $identifier .= $tok;
- }
- elsif ( ( $tok eq '#' ) && ( $identifier eq '$' ) ) { # $#array
- $identifier .= $tok; # keep same state, a $ could follow
+ if ( $i < $max_token_index ) {
+ $next_nonblank_token = $$rtokens[ ++$i ];
}
- elsif ( $tok eq '{' ) {
-
- # check for something like ${#} or ${©}
- if ( $identifier eq '$'
- && $i + 2 <= $max_token_index
- && $$rtokens[ $i + 2 ] eq '}'
- && $$rtokens[ $i + 1 ] !~ /[\s\w]/ )
- {
- my $next2 = $$rtokens[ $i + 2 ];
- my $next1 = $$rtokens[ $i + 1 ];
- $identifier .= $tok . $next1 . $next2;
- $i += 2;
- $id_scan_state = '';
- last;
- }
+ }
+ }
+ else {
+ $next_nonblank_token = "";
+ }
+ return ( $next_nonblank_token, $i );
+}
- # skip something like ${xxx} or ->{
- $id_scan_state = '';
+sub find_angle_operator_termination {
- # if this is the first token of a line, any tokens for this
- # identifier have already been accumulated
- if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
- $i = $i_save;
- last;
- }
+ # We are looking at a '<' and want to know if it is an angle operator.
+ # We are to return:
+ # $i = pretoken index of ending '>' if found, current $i otherwise
+ # $type = 'Q' if found, '>' otherwise
+ my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
+ my $i = $i_beg;
+ my $type = '<';
+ pos($input_line) = 1 + $$rtoken_map[$i];
- # space ok after leading $ % * & @
- elsif ( $tok =~ /^\s*$/ ) {
+ my $filter;
- if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
+ # we just have to find the next '>' if a term is expected
+ if ( $expecting == TERM ) { $filter = '[\>]' }
- if ( length($identifier) > 1 ) {
- $id_scan_state = '';
- $i = $i_save;
- $type = 'i'; # probably punctuation variable
- last;
- }
- else {
+ # we have to guess if we don't know what is expected
+ elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
- # spaces after $'s are common, and space after @
- # is harmless, so only complain about space
- # after other type characters. Space after $ and
- # @ will be removed in formatting. Report space
- # after % and * because they might indicate a
- # parsing error. In other words '% ' might be a
- # modulo operator. Delete this warning if it
- # gets annoying.
- if ( $identifier !~ /^[\@\$]$/ ) {
- $message =
- "Space in identifier, following $identifier\n";
- }
- }
- }
+ # shouldn't happen - we shouldn't be here if operator is expected
+ else { warning("Program Bug in find_angle_operator_termination\n") }
- # else:
- # space after '->' is ok
- }
- elsif ( $tok eq '^' ) {
+ # To illustrate what we might be looking at, in case we are
+ # guessing, here are some examples of valid angle operators
+ # (or file globs):
+ # <tmp_imp/*>
+ # <FH>
+ # <$fh>
+ # <*.c *.h>
+ # <_>
+ # <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
+ # <${PREFIX}*img*.$IMAGE_TYPE>
+ # <img*.$IMAGE_TYPE>
+ # <Timg*.$IMAGE_TYPE>
+ # <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
+ #
+ # Here are some examples of lines which do not have angle operators:
+ # return undef unless $self->[2]++ < $#{$self->[1]};
+ # < 2 || @$t >
+ #
+ # the following line from dlister.pl caused trouble:
+ # print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
+ #
+ # If the '<' starts an angle operator, it must end on this line and
+ # it must not have certain characters like ';' and '=' in it. I use
+ # this to limit the testing. This filter should be improved if
+ # possible.
- # check for some special variables like $^W
- if ( $identifier =~ /^[\$\*\@\%]$/ ) {
- $identifier .= $tok;
- $id_scan_state = 'A';
- }
- else {
- $id_scan_state = '';
- }
- }
- else { # something else
+ if ( $input_line =~ /($filter)/g ) {
- # check for various punctuation variables
- if ( $identifier =~ /^[\$\*\@\%]$/ ) {
- $identifier .= $tok;
- }
+ if ( $1 eq '>' ) {
- elsif ( $identifier eq '$#' ) {
+ # We MAY have found an angle operator termination if we get
+ # here, but we need to do more to be sure we haven't been
+ # fooled.
+ my $pos = pos($input_line);
- if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
+ my $pos_beg = $$rtoken_map[$i];
+ my $str = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
- # perl seems to allow just these: $#: $#- $#+
- elsif ( $tok =~ /^[\:\-\+]$/ ) {
- $type = 'i';
- $identifier .= $tok;
- }
- else {
- $i = $i_save;
- write_logfile_entry( 'Use of $# is deprecated' . "\n" );
- }
+ # Reject if the closing '>' follows a '-' as in:
+ # if ( VERSION < 5.009 && $op-> name eq 'aassign' ) { }
+ if ( $expecting eq UNKNOWN ) {
+ my $check = substr( $input_line, $pos - 2, 1 );
+ if ( $check eq '-' ) {
+ return ( $i, $type );
}
- elsif ( $identifier eq '$$' ) {
+ }
- # perl does not allow references to punctuation
- # variables without braces. For example, this
- # won't work:
- # $:=\4;
- # $a = $$:;
- # You would have to use
- # $a = ${$:};
+ ######################################debug#####
+ #write_diagnostics( "ANGLE? :$str\n");
+ #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
+ ######################################debug#####
+ $type = 'Q';
+ my $error;
+ ( $i, $error ) =
+ inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
- $i = $i_save;
- if ( $tok eq '{' ) { $type = 't' }
- else { $type = 'i' }
- }
- elsif ( $identifier eq '->' ) {
- $i = $i_save;
- }
- else {
- $i = $i_save;
- if ( length($identifier) == 1 ) { $identifier = ''; }
- }
- $id_scan_state = '';
- last;
+ # It may be possible that a quote ends midway in a pretoken.
+ # If this happens, it may be necessary to split the pretoken.
+ if ($error) {
+ warning(
+ "Possible tokinization error..please check this line\n");
+ report_possible_bug();
}
- }
- elsif ( $id_scan_state eq '&' ) { # starting sub call?
- if ( $tok =~ /^[\$A-Za-z_]/ ) { # alphanumeric ..
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- $identifier .= $tok;
- }
- elsif ( $tok eq "'" && $allow_tick ) { # alphanumeric ..
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- $identifier .= $tok;
- }
- elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- $identifier .= $tok;
- }
- elsif ( $tok =~ /^\s*$/ ) { # allow space
- }
- elsif ( $tok eq '::' ) { # leading ::
- $id_scan_state = 'A'; # accept alpha next
- $identifier .= $tok;
+ # Now let's see where we stand....
+ # OK if math op not possible
+ if ( $expecting == TERM ) {
}
- elsif ( $tok eq '{' ) {
- if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
- $i = $i_save;
- $id_scan_state = '';
- last;
+
+ # OK if there are no more than 2 pre-tokens inside
+ # (not possible to write 2 token math between < and >)
+ # This catches most common cases
+ elsif ( $i <= $i_beg + 3 ) {
+ write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
}
+
+ # Not sure..
else {
- # punctuation variable?
- # testfile: cunningham4.pl
- if ( $identifier eq '&' ) {
- $identifier .= $tok;
+ # Let's try a Brace Test: any braces inside must balance
+ my $br = 0;
+ while ( $str =~ /\{/g ) { $br++ }
+ while ( $str =~ /\}/g ) { $br-- }
+ my $sb = 0;
+ while ( $str =~ /\[/g ) { $sb++ }
+ while ( $str =~ /\]/g ) { $sb-- }
+ my $pr = 0;
+ while ( $str =~ /\(/g ) { $pr++ }
+ while ( $str =~ /\)/g ) { $pr-- }
+
+ # if braces do not balance - not angle operator
+ if ( $br || $sb || $pr ) {
+ $i = $i_beg;
+ $type = '<';
+ write_diagnostics(
+ "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
}
+
+ # we should keep doing more checks here...to be continued
+ # Tentatively accepting this as a valid angle operator.
+ # There are lots more things that can be checked.
else {
- $identifier = '';
- $i = $i_save;
- $type = '&';
+ write_diagnostics(
+ "ANGLE-Guessing yes: $str expecting=$expecting\n");
+ write_logfile_entry("Guessing angle operator here: $str\n");
}
- $id_scan_state = '';
- last;
}
}
- elsif ( $id_scan_state eq 'A' ) { # looking for alpha (after ::)
- if ( $tok =~ /^[A-Za-z_]/ ) { # found it
- $identifier .= $tok;
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- }
- elsif ( $tok eq "'" && $allow_tick ) {
- $identifier .= $tok;
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- }
- elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
- $identifier .= $tok;
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- }
- elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
- $id_scan_state = '(';
- $identifier .= $tok;
- }
- elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
- $id_scan_state = ')';
- $identifier .= $tok;
- }
- else {
- $id_scan_state = '';
- $i = $i_save;
- last;
+ # didn't find ending >
+ else {
+ if ( $expecting == TERM ) {
+ warning("No ending > for angle operator\n");
}
}
- elsif ( $id_scan_state eq ':' ) { # looking for :: after alpha
+ }
+ return ( $i, $type );
+}
- if ( $tok eq '::' ) { # got it
- $identifier .= $tok;
- $id_scan_state = 'A'; # now require alpha
- }
- elsif ( $tok =~ /^[A-Za-z_]/ ) { # more alphanumeric is ok here
- $identifier .= $tok;
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- }
- elsif ( $tok =~ /^[0-9]/ ) { # numeric..see comments above
- $identifier .= $tok;
- $id_scan_state = ':'; # now need ::
- $saw_alpha = 1;
- }
- elsif ( $tok eq "'" && $allow_tick ) { # tick
+sub scan_number_do {
- if ( $is_keyword{$identifier} ) {
- $id_scan_state = ''; # that's all
- $i = $i_save;
- }
- else {
- $identifier .= $tok;
- }
- }
- elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
- $id_scan_state = '(';
- $identifier .= $tok;
- }
- elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
- $id_scan_state = ')';
- $identifier .= $tok;
- }
- else {
- $id_scan_state = ''; # that's all
- $i = $i_save;
- last;
- }
- }
- elsif ( $id_scan_state eq '(' ) { # looking for ( of prototype
+ # scan a number in any of the formats that Perl accepts
+ # Underbars (_) are allowed in decimal numbers.
+ # input parameters -
+ # $input_line - the string to scan
+ # $i - pre_token index to start scanning
+ # $rtoken_map - reference to the pre_token map giving starting
+ # character position in $input_line of token $i
+ # output parameters -
+ # $i - last pre_token index of the number just scanned
+ # number - the number (characters); or undef if not a number
+
+ my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
+ my $pos_beg = $$rtoken_map[$i];
+ my $pos;
+ my $i_begin = $i;
+ my $number = undef;
+ my $type = $input_type;
+
+ my $first_char = substr( $input_line, $pos_beg, 1 );
+
+ # Look for bad starting characters; Shouldn't happen..
+ if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
+ warning("Program bug - scan_number given character $first_char\n");
+ report_definite_bug();
+ return ( $i, $type, $number );
+ }
+
+ # handle v-string without leading 'v' character ('Two Dot' rule)
+ # (vstring.t)
+ # TODO: v-strings may contain underscores
+ pos($input_line) = $pos_beg;
+ if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
+ $pos = pos($input_line);
+ my $numc = $pos - $pos_beg;
+ $number = substr( $input_line, $pos_beg, $numc );
+ $type = 'v';
+ report_v_string($number);
+ }
- if ( $tok eq '(' ) { # got it
- $identifier .= $tok;
- $id_scan_state = ')'; # now find the end of it
- }
- elsif ( $tok =~ /^\s*$/ ) { # blank - keep going
- $identifier .= $tok;
- }
- else {
- $id_scan_state = ''; # that's all - no prototype
- $i = $i_save;
- last;
- }
+ # handle octal, hex, binary
+ if ( !defined($number) ) {
+ pos($input_line) = $pos_beg;
+ if ( $input_line =~ /\G[+-]?0((x[0-9a-fA-F_]+)|([0-7_]+)|(b[01_]+))/g )
+ {
+ $pos = pos($input_line);
+ my $numc = $pos - $pos_beg;
+ $number = substr( $input_line, $pos_beg, $numc );
+ $type = 'n';
}
- elsif ( $id_scan_state eq ')' ) { # looking for ) to end
+ }
- if ( $tok eq ')' ) { # got it
- $identifier .= $tok;
- $id_scan_state = ''; # all done
- last;
- }
- elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
- $identifier .= $tok;
- }
- else { # probable error in script, but keep going
- warning("Unexpected '$tok' while seeking end of prototype\n");
- $identifier .= $tok;
+ # handle decimal
+ if ( !defined($number) ) {
+ pos($input_line) = $pos_beg;
+
+ if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
+ $pos = pos($input_line);
+
+ # watch out for things like 0..40 which would give 0. by this;
+ if ( ( substr( $input_line, $pos - 1, 1 ) eq '.' )
+ && ( substr( $input_line, $pos, 1 ) eq '.' ) )
+ {
+ $pos--;
}
+ my $numc = $pos - $pos_beg;
+ $number = substr( $input_line, $pos_beg, $numc );
+ $type = 'n';
}
- else { # can get here due to error in initialization
- $id_scan_state = '';
- $i = $i_save;
+ }
+
+ # filter out non-numbers like e + - . e2 .e3 +e6
+ # the rule: at least one digit, and any 'e' must be preceded by a digit
+ if (
+ $number !~ /\d/ # no digits
+ || ( $number =~ /^(.*)[eE]/
+ && $1 !~ /\d/ ) # or no digits before the 'e'
+ )
+ {
+ $number = undef;
+ $type = $input_type;
+ return ( $i, $type, $number );
+ }
+
+ # Found a number; now we must convert back from character position
+ # to pre_token index. An error here implies user syntax error.
+ # An example would be an invalid octal number like '009'.
+ my $error;
+ ( $i, $error ) =
+ inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
+ if ($error) { warning("Possibly invalid number\n") }
+
+ return ( $i, $type, $number );
+}
+
+sub inverse_pretoken_map {
+
+ # Starting with the current pre_token index $i, scan forward until
+ # finding the index of the next pre_token whose position is $pos.
+ my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
+ my $error = 0;
+
+ while ( ++$i <= $max_token_index ) {
+
+ if ( $pos <= $$rtoken_map[$i] ) {
+
+ # Let the calling routine handle errors in which we do not
+ # land on a pre-token boundary. It can happen by running
+ # perltidy on some non-perl scripts, for example.
+ if ( $pos < $$rtoken_map[$i] ) { $error = 1 }
+ $i--;
last;
}
}
+ return ( $i, $error );
+}
- if ( $id_scan_state eq ')' ) {
- warning("Hit end of line while seeking ) to end prototype\n");
- }
+sub find_here_doc {
- # once we enter the actual identifier, it may not extend beyond
- # the end of the current line
- if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
- $id_scan_state = '';
+ # find the target of a here document, if any
+ # input parameters:
+ # $i - token index of the second < of <<
+ # ($i must be less than the last token index if this is called)
+ # output parameters:
+ # $found_target = 0 didn't find target; =1 found target
+ # HERE_TARGET - the target string (may be empty string)
+ # $i - unchanged if not here doc,
+ # or index of the last token of the here target
+ # $saw_error - flag noting unbalanced quote on here target
+ my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
+ my $ibeg = $i;
+ my $found_target = 0;
+ my $here_doc_target = '';
+ my $here_quote_character = '';
+ my $saw_error = 0;
+ my ( $next_nonblank_token, $i_next_nonblank, $next_token );
+ $next_token = $$rtokens[ $i + 1 ];
+
+ # perl allows a backslash before the target string (heredoc.t)
+ my $backslash = 0;
+ if ( $next_token eq '\\' ) {
+ $backslash = 1;
+ $next_token = $$rtokens[ $i + 2 ];
}
- if ( $i < 0 ) { $i = 0 }
- unless ($type) {
+ ( $next_nonblank_token, $i_next_nonblank ) =
+ find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
- if ($saw_type) {
+ if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
- if ($saw_alpha) {
- if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
- $type = 'w';
- }
- else { $type = 'i' }
- }
- elsif ( $identifier eq '->' ) {
- $type = '->';
+ my $in_quote = 1;
+ my $quote_depth = 0;
+ my $quote_pos = 0;
+ my $quoted_string;
+
+ (
+ $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
+ $quoted_string
+ )
+ = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
+ $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
+
+ if ($in_quote) { # didn't find end of quote, so no target found
+ $i = $ibeg;
+ if ( $expecting == TERM ) {
+ warning(
+"Did not find here-doc string terminator ($here_quote_character) before end of line \n"
+ );
+ $saw_error = 1;
}
- elsif (
- ( length($identifier) > 1 )
+ }
+ else { # found ending quote
+ my $j;
+ $found_target = 1;
- # In something like '@$=' we have an identifier '@$'
- # In something like '$${' we have type '$$' (and only
- # part of an identifier)
- && !( $identifier =~ /\$$/ && $tok eq '{' )
- && ( $identifier !~ /^(sub |package )$/ )
- )
- {
- $type = 'i';
+ my $tokj;
+ for ( $j = $i_next_nonblank + 1 ; $j < $i ; $j++ ) {
+ $tokj = $$rtokens[$j];
+
+ # we have to remove any backslash before the quote character
+ # so that the here-doc-target exactly matches this string
+ next
+ if ( $tokj eq "\\"
+ && $j < $i - 1
+ && $$rtokens[ $j + 1 ] eq $here_quote_character );
+ $here_doc_target .= $tokj;
}
- else { $type = 't' }
}
- elsif ($saw_alpha) {
+ }
- # type 'w' includes anything without leading type info
- # ($,%,@,*) including something like abc::def::ghi
- $type = 'w';
+ elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
+ $found_target = 1;
+ write_logfile_entry(
+ "found blank here-target after <<; suggest using \"\"\n");
+ $i = $ibeg;
+ }
+ elsif ( $next_token =~ /^\w/ ) { # simple bareword or integer after <<
+
+ my $here_doc_expected;
+ if ( $expecting == UNKNOWN ) {
+ $here_doc_expected = guess_if_here_doc($next_token);
}
else {
- $type = '';
- } # this can happen on a restart
- }
+ $here_doc_expected = 1;
+ }
+
+ if ($here_doc_expected) {
+ $found_target = 1;
+ $here_doc_target = $next_token;
+ $i = $ibeg + 1;
+ }
- if ($identifier) {
- $tok = $identifier;
- if ($message) { write_logfile_entry($message) }
}
else {
- $tok = $tok_begin;
- $i = $i_begin;
+
+ if ( $expecting == TERM ) {
+ $found_target = 1;
+ write_logfile_entry("Note: bare here-doc operator <<\n");
+ }
+ else {
+ $i = $ibeg;
+ }
}
- TOKENIZER_DEBUG_FLAG_SCAN_ID && do {
- my ( $a, $b, $c ) = caller;
- print
-"SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
- print
-"SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
- };
- return ( $i, $tok, $type, $id_scan_state, $identifier );
+ # patch to neglect any prepended backslash
+ if ( $found_target && $backslash ) { $i++ }
+
+ return ( $found_target, $here_doc_target, $here_quote_character, $i,
+ $saw_error );
+}
+
+sub do_quote {
+
+ # follow (or continue following) quoted string(s)
+ # $in_quote return code:
+ # 0 - ok, found end
+ # 1 - still must find end of quote whose target is $quote_character
+ # 2 - still looking for end of first of two quotes
+ #
+ # Returns updated strings:
+ # $quoted_string_1 = quoted string seen while in_quote=1
+ # $quoted_string_2 = quoted string seen while in_quote=2
+ my (
+ $i, $in_quote, $quote_character,
+ $quote_pos, $quote_depth, $quoted_string_1,
+ $quoted_string_2, $rtokens, $rtoken_map,
+ $max_token_index
+ ) = @_;
+
+ my $in_quote_starting = $in_quote;
+
+ my $quoted_string;
+ if ( $in_quote == 2 ) { # two quotes/quoted_string_1s to follow
+ my $ibeg = $i;
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string
+ )
+ = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
+ $quote_pos, $quote_depth, $max_token_index );
+ $quoted_string_2 .= $quoted_string;
+ if ( $in_quote == 1 ) {
+ if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
+ $quote_character = '';
+ }
+ else {
+ $quoted_string_2 .= "\n";
+ }
+ }
+
+ if ( $in_quote == 1 ) { # one (more) quote to follow
+ my $ibeg = $i;
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string
+ )
+ = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
+ $quote_pos, $quote_depth, $max_token_index );
+ $quoted_string_1 .= $quoted_string;
+ if ( $in_quote == 1 ) {
+ $quoted_string_1 .= "\n";
+ }
+ }
+ return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string_1, $quoted_string_2 );
}
sub follow_quoted_string {
# $beginning_tok = the starting quote character
# $quote_pos = index to check next for alphanumeric delimiter
# $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
- my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth )
+ # $quoted_string = the text of the quote (without quotation tokens)
+ my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
+ $max_token_index )
= @_;
my ( $tok, $end_tok );
- my $i = $i_beg - 1;
+ my $i = $i_beg - 1;
+ my $quoted_string = "";
TOKENIZER_DEBUG_FLAG_QUOTE && do {
print
# characters, whereas for a non-alphanumeric delimiter, only tokens of
# length 1 can match.
- # loop for case of alphanumeric quote delimiter..
+ ###################################################################
+ # Case 1 (rare): loop for case of alphanumeric quote delimiter..
# "quote_pos" is the position the current word to begin searching
+ ###################################################################
if ( $beginning_tok =~ /\w/ ) {
# Note this because it is not recommended practice except
if ( $tok eq '\\' ) {
+ # retain backslash unless it hides the end token
+ $quoted_string .= $tok
+ unless $$rtokens[ $i + 1 ] eq $end_tok;
$quote_pos++;
last if ( $i >= $max_token_index );
$tok = $$rtokens[ ++$i ];
-
}
}
my $old_pos = $quote_pos;
if ( $quote_pos > 0 ) {
+ $quoted_string .=
+ substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
+
$quote_depth--;
if ( $quote_depth == 0 ) {
last;
}
}
+ else {
+ $quoted_string .= substr( $tok, $old_pos );
+ }
}
}
- # loop for case of a non-alphanumeric quote delimiter..
+ ########################################################################
+ # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
+ ########################################################################
else {
while ( $i < $max_token_index ) {
$quote_depth++;
}
elsif ( $tok eq '\\' ) {
- $i++;
+
+ # retain backslash unless it hides the beginning or end token
+ $tok = $$rtokens[ ++$i ];
+ $quoted_string .= '\\'
+ unless ( $tok eq $end_tok || $tok eq $beginning_tok );
}
+ $quoted_string .= $tok;
}
}
if ( $i > $max_token_index ) { $i = $max_token_index }
- return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth );
+ return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
+ $quoted_string );
+}
+
+sub indicate_error {
+ my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
+ interrupt_logfile();
+ warning($msg);
+ write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
+ resume_logfile();
+}
+
+sub write_error_indicator_pair {
+ my ( $line_number, $input_line, $pos, $carrat ) = @_;
+ my ( $offset, $numbered_line, $underline ) =
+ make_numbered_line( $line_number, $input_line, $pos );
+ $underline = write_on_underline( $underline, $pos - $offset, $carrat );
+ warning( $numbered_line . "\n" );
+ $underline =~ s/\s*$//;
+ warning( $underline . "\n" );
+}
+
+sub make_numbered_line {
+
+ # Given an input line, its line number, and a character position of
+ # interest, create a string not longer than 80 characters of the form
+ # $lineno: sub_string
+ # such that the sub_string of $str contains the position of interest
+ #
+ # Here is an example of what we want, in this case we add trailing
+ # '...' because the line is long.
+ #
+ # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
+ #
+ # Here is another example, this time in which we used leading '...'
+ # because of excessive length:
+ #
+ # 2: ... er of the World Wide Web Consortium's
+ #
+ # input parameters are:
+ # $lineno = line number
+ # $str = the text of the line
+ # $pos = position of interest (the error) : 0 = first character
+ #
+ # We return :
+ # - $offset = an offset which corrects the position in case we only
+ # display part of a line, such that $pos-$offset is the effective
+ # position from the start of the displayed line.
+ # - $numbered_line = the numbered line as above,
+ # - $underline = a blank 'underline' which is all spaces with the same
+ # number of characters as the numbered line.
+
+ my ( $lineno, $str, $pos ) = @_;
+ my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
+ my $excess = length($str) - $offset - 68;
+ my $numc = ( $excess > 0 ) ? 68 : undef;
+
+ if ( defined($numc) ) {
+ if ( $offset == 0 ) {
+ $str = substr( $str, $offset, $numc - 4 ) . " ...";
+ }
+ else {
+ $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
+ }
+ }
+ else {
+
+ if ( $offset == 0 ) {
+ }
+ else {
+ $str = "... " . substr( $str, $offset + 4 );
+ }
+ }
+
+ my $numbered_line = sprintf( "%d: ", $lineno );
+ $offset -= length($numbered_line);
+ $numbered_line .= $str;
+ my $underline = " " x length($numbered_line);
+ return ( $offset, $numbered_line, $underline );
+}
+
+sub write_on_underline {
+
+ # The "underline" is a string that shows where an error is; it starts
+ # out as a string of blanks with the same length as the numbered line of
+ # code above it, and we have to add marking to show where an error is.
+ # In the example below, we want to write the string '--^' just below
+ # the line of bad code:
+ #
+ # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
+ # ---^
+ # We are given the current underline string, plus a position and a
+ # string to write on it.
+ #
+ # In the above example, there will be 2 calls to do this:
+ # First call: $pos=19, pos_chr=^
+ # Second call: $pos=16, pos_chr=---
+ #
+ # This is a trivial thing to do with substr, but there is some
+ # checking to do.
+
+ my ( $underline, $pos, $pos_chr ) = @_;
+
+ # check for error..shouldn't happen
+ unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
+ return $underline;
+ }
+ my $excess = length($pos_chr) + $pos - length($underline);
+ if ( $excess > 0 ) {
+ $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
+ }
+ substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
+ return ($underline);
+}
+
+sub pre_tokenize {
+
+ # Break a string, $str, into a sequence of preliminary tokens. We
+ # are interested in these types of tokens:
+ # words (type='w'), example: 'max_tokens_wanted'
+ # digits (type = 'd'), example: '0755'
+ # whitespace (type = 'b'), example: ' '
+ # any other single character (i.e. punct; type = the character itself).
+ # We cannot do better than this yet because we might be in a quoted
+ # string or pattern. Caller sets $max_tokens_wanted to 0 to get all
+ # tokens.
+ my ( $str, $max_tokens_wanted ) = @_;
+
+ # we return references to these 3 arrays:
+ my @tokens = (); # array of the tokens themselves
+ my @token_map = (0); # string position of start of each token
+ my @type = (); # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
+
+ do {
+
+ # whitespace
+ if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
+
+ # numbers
+ # note that this must come before words!
+ elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
+
+ # words
+ elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
+
+ # single-character punctuation
+ elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
+
+ # that's all..
+ else {
+ return ( \@tokens, \@token_map, \@type );
+ }
+
+ push @tokens, $1;
+ push @token_map, pos($str);
+
+ } while ( --$max_tokens_wanted != 0 );
+
+ return ( \@tokens, \@token_map, \@type );
+}
+
+sub show_tokens {
+
+ # this is an old debug routine
+ my ( $rtokens, $rtoken_map ) = @_;
+ my $num = scalar(@$rtokens);
+ my $i;
+
+ for ( $i = 0 ; $i < $num ; $i++ ) {
+ my $len = length( $$rtokens[$i] );
+ print "$i:$len:$$rtoken_map[$i]:$$rtokens[$i]:\n";
+ }
}
sub matching_end_token {
}
}
+sub dump_token_types {
+ my $class = shift;
+ my $fh = shift;
+
+ # This should be the latest list of token types in use
+ # adding NEW_TOKENS: add a comment here
+ print $fh <<'END_OF_LIST';
+
+Here is a list of the token types currently used for lines of type 'CODE'.
+For the following tokens, the "type" of a token is just the token itself.
+
+.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
+( ) <= >= == =~ !~ != ++ -- /= x=
+... **= <<= >>= &&= ||= //= <=>
+, + - / * | % ! x ~ = \ ? : . < > ^ &
+
+The following additional token types are defined:
+
+ type meaning
+ b blank (white space)
+ { indent: opening structural curly brace or square bracket or paren
+ (code block, anonymous hash reference, or anonymous array reference)
+ } outdent: right structural curly brace or square bracket or paren
+ [ left non-structural square bracket (enclosing an array index)
+ ] right non-structural square bracket
+ ( left non-structural paren (all but a list right of an =)
+ ) right non-structural parena
+ L left non-structural curly brace (enclosing a key)
+ R right non-structural curly brace
+ ; terminal semicolon
+ f indicates a semicolon in a "for" statement
+ h here_doc operator <<
+ # a comment
+ Q indicates a quote or pattern
+ q indicates a qw quote block
+ k a perl keyword
+ C user-defined constant or constant function (with void prototype = ())
+ U user-defined function taking parameters
+ G user-defined function taking block parameter (like grep/map/eval)
+ M (unused, but reserved for subroutine definition name)
+ P (unused, but -html uses it to label pod text)
+ t type indicater such as %,$,@,*,&,sub
+ w bare word (perhaps a subroutine call)
+ i identifier of some type (with leading %, $, @, *, &, sub, -> )
+ n a number
+ v a v-string
+ F a file test operator (like -e)
+ Y File handle
+ Z identifier in indirect object slot: may be file handle, object
+ J LABEL: code block label
+ j LABEL after next, last, redo, goto
+ p unary +
+ m unary -
+ pp pre-increment operator ++
+ mm pre-decrement operator --
+ A : used as attribute separator
+
+ Here are the '_line_type' codes used internally:
+ SYSTEM - system-specific code before hash-bang line
+ CODE - line of perl code (including comments)
+ POD_START - line starting pod, such as '=head'
+ POD - pod documentation text
+ POD_END - last line of pod section, '=cut'
+ HERE - text of here-document
+ HERE_END - last line of here-doc (target word)
+ FORMAT - format section
+ FORMAT_END - last line of format section, '.'
+ DATA_START - __DATA__ line
+ DATA - unidentified text following __DATA__
+ END_START - __END__ line
+ END - unidentified text following __END__
+ ERROR - we are in big trouble, probably not a perl script
+END_OF_LIST
+}
+
BEGIN {
# These names are used in error messages
@opening_brace_names = qw# '{' '[' '(' '?' #;
@closing_brace_names = qw# '}' ']' ')' ':' #;
+ ## TESTING: added ~~
my @digraphs = qw(
.. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
- <= >= == =~ !~ != ++ -- /= x=
+ <= >= == =~ !~ != ++ -- /= x= ~~
);
@is_digraph{@digraphs} = (1) x scalar(@digraphs);
@is_block_operator{@_} = (1) x scalar(@_);
# these functions allow an identifier in the indirect object slot
- @_ = qw( print printf sort exec system );
+ @_ = qw( print printf sort exec system say);
@is_indirect_object_taker{@_} = (1) x scalar(@_);
# These tokens may precede a code block
given
when
err
+ say
);
- # patched above for SWITCH/CASE
+ # patched above for SWITCH/CASE given/when err say
+ # 'err' is a fairly safe addition.
+ # TODO: 'default' still needed if appropriate
+ # 'use feature' seen, but perltidy works ok without it.
+ # Concerned that 'default' could break code.
push( @Keywords, @value_requestor );
# These are treated the same but are not keywords:
# these token TYPES expect trailing operator but not a term
# note: ++ and -- are post-increment and decrement, 'C' = constant
- my @operator_requestor_types = qw( ++ -- C );
+ my @operator_requestor_types = qw( ++ -- C <> q );
@expecting_operator_types{@operator_requestor_types} =
(1) x scalar(@operator_requestor_types);
my @value_requestor_type = qw#
L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
**= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
- <= >= == != => \ > < % * / ? & | ** <=>
- f F pp mm Y p m U J G
+ <= >= == != => \ > < % * / ? & | ** <=> ~~
+ f F pp mm Y p m U J G j >> << ^ t
#;
push( @value_requestor_type, ',' )
; # (perl doesn't like a ',' in a qw block)
@expecting_term_types{@value_requestor_type} =
(1) x scalar(@value_requestor_type);
+ # Note: the following valid token types are not assigned here to
+ # hashes requesting to be followed by values or terms, but are
+ # instead currently hard-coded into sub operator_expected:
+ # ) -> :: Q R Z ] b h i k n v w } #
+
# For simple syntax checking, it is nice to have a list of operators which
# will really be unhappy if not followed by a term. This includes most
# of the above...
=head1 VERSION
-This man page documents Perl::Tidy version 20060614.
+This man page documents Perl::Tidy version 20060719.
=head1 AUTHOR