lib/Perl/Tidy/Tokenizer.pm

   1 #####################################################################
   2 #
   3 # The Perl::Tidy::Tokenizer package is essentially a filter which
   4 # reads lines of perl source code from a source object and provides
   5 # corresponding tokenized lines through its get_line() method.  Lines
   6 # flow from the source_object to the caller like this:
   7 #
   8 # source_object --> LineBuffer_object --> Tokenizer -->  calling routine
   9 #   get_line()         get_line()           get_line()     line_of_tokens
  10 #
  11 # The source object can be any object with a get_line() method which
  12 # supplies one line (a character string) perl call.
  13 # The LineBuffer object is created by the Tokenizer.
  14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
  15 # containing one tokenized line for each call to its get_line() method.
  16 #
  17 # WARNING: This is not a real class.  Only one tokenizer my be used.
  18 #
  19 ########################################################################
  20
  21 package Perl::Tidy::Tokenizer;
  22 use strict;
  23 use warnings;
  24 use English qw( -no_match_vars );
  25
  26 our $VERSION = '20221112';
  27
  28 use Perl::Tidy::LineBuffer;
  29 use Carp;
  30
  31 use constant DEVEL_MODE   => 0;
  32 use constant EMPTY_STRING => q{};
  33 use constant SPACE        => q{ };
  34
  35 # Decimal values of some ascii characters for quick checks
  36 use constant ORD_TAB           => 9;
  37 use constant ORD_SPACE         => 32;
  38 use constant ORD_PRINTABLE_MIN => 33;
  39 use constant ORD_PRINTABLE_MAX => 126;
  40
  41 # PACKAGE VARIABLES for processing an entire FILE.
  42 # These must be package variables because most may get localized during
  43 # processing.  Most are initialized in sub prepare_for_a_new_file.
  44 use vars qw{
  45   $tokenizer_self
  46
  47   $last_nonblank_token
  48   $last_nonblank_type
  49   $last_nonblank_block_type
  50   $statement_type
  51   $in_attribute_list
  52   $current_package
  53   $context
  54
  55   %is_constant
  56   %is_user_function
  57   %user_function_prototype
  58   %is_block_function
  59   %is_block_list_function
  60   %saw_function_definition
  61   %saw_use_module
  62
  63   $brace_depth
  64   $paren_depth
  65   $square_bracket_depth
  66
  67   @current_depth
  68   @total_depth
  69   $total_depth
  70   $next_sequence_number
  71   @nesting_sequence_number
  72   @current_sequence_number
  73   @paren_type
  74   @paren_semicolon_count
  75   @paren_structural_type
  76   @brace_type
  77   @brace_structural_type
  78   @brace_context
  79   @brace_package
  80   @square_bracket_type
  81   @square_bracket_structural_type
  82   @depth_array
  83   @nested_ternary_flag
  84   @nested_statement_type
  85   @starting_line_of_current_depth
  86 };
  87
  88 # GLOBAL CONSTANTS for routines in this package,
  89 # Initialized in a BEGIN block.
  90 use vars qw{
  91   %is_indirect_object_taker
  92   %is_block_operator
  93   %expecting_operator_token
  94   %expecting_operator_types
  95   %expecting_term_types
  96   %expecting_term_token
  97   %is_digraph
  98   %can_start_digraph
  99   %is_file_test_operator
 100   %is_trigraph
 101   %is_tetragraph
 102   %is_valid_token_type
 103   %is_keyword
 104   %is_code_block_token
 105   %is_sort_map_grep_eval_do
 106   %is_sort_map_grep
 107   %is_grep_alias
 108   %really_want_term
 109   @opening_brace_names
 110   @closing_brace_names
 111   %is_keyword_taking_list
 112   %is_keyword_taking_optional_arg
 113   %is_keyword_rejecting_slash_as_pattern_delimiter
 114   %is_keyword_rejecting_question_as_pattern_delimiter
 115   %is_q_qq_qx_qr_s_y_tr_m
 116   %is_q_qq_qw_qx_qr_s_y_tr_m
 117   %is_sub
 118   %is_package
 119   %is_comma_question_colon
 120   %is_if_elsif_unless
 121   %is_if_elsif_unless_case_when
 122   %other_line_endings
 123   %is_END_DATA_format_sub
 124   %is_semicolon_or_t
 125   $code_skipping_pattern_begin
 126   $code_skipping_pattern_end
 127 };
 128
 129 # GLOBAL VARIABLES which are constant after being configured by user-supplied
 130 # parameters.  They remain constant as a file is being processed.
 131 my (
 132
 133     $rOpts_code_skipping,
 134     $code_skipping_pattern_begin,
 135     $code_skipping_pattern_end,
 136 );
 137
 138 # possible values of operator_expected()
 139 use constant TERM     => -1;
 140 use constant UNKNOWN  => 0;
 141 use constant OPERATOR => 1;
 142
 143 # possible values of context
 144 use constant SCALAR_CONTEXT  => -1;
 145 use constant UNKNOWN_CONTEXT => 0;
 146 use constant LIST_CONTEXT    => 1;
 147
 148 # Maximum number of little messages; probably need not be changed.
 149 use constant MAX_NAG_MESSAGES => 6;
 150
 151 BEGIN {
 152
 153     # Array index names for $self.
 154     # Do not combine with other BEGIN blocks (c101).
 155     my $i = 0;
 156     use constant {
 157         _rhere_target_list_                  => $i++,
 158         _in_here_doc_                        => $i++,
 159         _here_doc_target_                    => $i++,
 160         _here_quote_character_               => $i++,
 161         _in_data_                            => $i++,
 162         _in_end_                             => $i++,
 163         _in_format_                          => $i++,
 164         _in_error_                           => $i++,
 165         _in_pod_                             => $i++,
 166         _in_skipped_                         => $i++,
 167         _in_attribute_list_                  => $i++,
 168         _in_quote_                           => $i++,
 169         _quote_target_                       => $i++,
 170         _line_start_quote_                   => $i++,
 171         _starting_level_                     => $i++,
 172         _know_starting_level_                => $i++,
 173         _tabsize_                            => $i++,
 174         _indent_columns_                     => $i++,
 175         _look_for_hash_bang_                 => $i++,
 176         _trim_qw_                            => $i++,
 177         _continuation_indentation_           => $i++,
 178         _outdent_labels_                     => $i++,
 179         _last_line_number_                   => $i++,
 180         _saw_perl_dash_P_                    => $i++,
 181         _saw_perl_dash_w_                    => $i++,
 182         _saw_use_strict_                     => $i++,
 183         _saw_v_string_                       => $i++,
 184         _hit_bug_                            => $i++,
 185         _look_for_autoloader_                => $i++,
 186         _look_for_selfloader_                => $i++,
 187         _saw_autoloader_                     => $i++,
 188         _saw_selfloader_                     => $i++,
 189         _saw_hash_bang_                      => $i++,
 190         _saw_end_                            => $i++,
 191         _saw_data_                           => $i++,
 192         _saw_negative_indentation_           => $i++,
 193         _started_tokenizing_                 => $i++,
 194         _line_buffer_object_                 => $i++,
 195         _debugger_object_                    => $i++,
 196         _diagnostics_object_                 => $i++,
 197         _logger_object_                      => $i++,
 198         _unexpected_error_count_             => $i++,
 199         _started_looking_for_here_target_at_ => $i++,
 200         _nearly_matched_here_target_at_      => $i++,
 201         _line_of_text_                       => $i++,
 202         _rlower_case_labels_at_              => $i++,
 203         _extended_syntax_                    => $i++,
 204         _maximum_level_                      => $i++,
 205         _true_brace_error_count_             => $i++,
 206         _rOpts_maximum_level_errors_         => $i++,
 207         _rOpts_maximum_unexpected_errors_    => $i++,
 208         _rOpts_logfile_                      => $i++,
 209         _rOpts_                              => $i++,
 210     };
 211 }
 212
 213 {    ## closure for subs to count instances
 214
 215     # methods to count instances
 216     my $_count = 0;
 217     sub get_count        { return $_count; }
 218     sub _increment_count { return ++$_count }
 219     sub _decrement_count { return --$_count }
 220 }
 221
 222 sub DESTROY {
 223     my $self = shift;
 224     $self->_decrement_count();
 225     return;
 226 }
 227
 228 sub AUTOLOAD {
 229
 230     # Catch any undefined sub calls so that we are sure to get
 231     # some diagnostic information.  This sub should never be called
 232     # except for a programming error.
 233     our $AUTOLOAD;
 234     return if ( $AUTOLOAD =~ /\bDESTROY$/ );
 235     my ( $pkg, $fname, $lno ) = caller();
 236     my $my_package = __PACKAGE__;
 237     print STDERR <<EOM;
 238 ======================================================================
 239 Error detected in package '$my_package', version $VERSION
 240 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
 241 Called from package: '$pkg'
 242 Called from File '$fname'  at line '$lno'
 243 This error is probably due to a recent programming change
 244 ======================================================================
 245 EOM
 246     exit 1;
 247 } ## end sub AUTOLOAD
 248
 249 sub Die {
 250     my ($msg) = @_;
 251     Perl::Tidy::Die($msg);
 252     croak "unexpected return from Perl::Tidy::Die";
 253 }
 254
 255 sub Fault {
 256     my ($msg) = @_;
 257
 258     # This routine is called for errors that really should not occur
 259     # except if there has been a bug introduced by a recent program change.
 260     # Please add comments at calls to Fault to explain why the call
 261     # should not occur, and where to look to fix it.
 262     my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
 263     my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
 264     my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
 265     my $input_stream_name = get_input_stream_name();
 266
 267     Die(<<EOM);
 268 ==============================================================================
 269 While operating on input stream with name: '$input_stream_name'
 270 A fault was detected at line $line0 of sub '$subroutine1'
 271 in file '$filename1'
 272 which was called from line $line1 of sub '$subroutine2'
 273 Message: '$msg'
 274 This is probably an error introduced by a recent programming change.
 275 Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
 276 ==============================================================================
 277 EOM
 278
 279     # We shouldn't get here, but this return is to keep Perl-Critic from
 280     # complaining.
 281     return;
 282 } ## end sub Fault
 283
 284 sub bad_pattern {
 285
 286     # See if a pattern will compile. We have to use a string eval here,
 287     # but it should be safe because the pattern has been constructed
 288     # by this program.
 289     my ($pattern) = @_;
 290     my $ok = eval "'##'=~/$pattern/";
 291     return !defined($ok) || $EVAL_ERROR;
 292 }
 293
 294 sub make_code_skipping_pattern {
 295     my ( $rOpts, $opt_name, $default ) = @_;
 296     my $param = $rOpts->{$opt_name};
 297     unless ($param) { $param = $default }
 298     $param =~ s/^\s*//;    # allow leading spaces to be like format-skipping
 299     if ( $param !~ /^#/ ) {
 300         Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
 301     }
 302     my $pattern = '^\s*' . $param . '\b';
 303     if ( bad_pattern($pattern) ) {
 304         Die(
 305 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
 306         );
 307     }
 308     return $pattern;
 309 } ## end sub make_code_skipping_pattern
 310
 311 sub check_options {
 312
 313     # Check Tokenizer parameters
 314     my $rOpts = shift;
 315
 316     %is_sub = ();
 317     $is_sub{'sub'} = 1;
 318
 319     %is_END_DATA_format_sub = (
 320         '__END__'  => 1,
 321         '__DATA__' => 1,
 322         'format'   => 1,
 323         'sub'      => 1,
 324     );
 325
 326     # Install any aliases to 'sub'
 327     if ( $rOpts->{'sub-alias-list'} ) {
 328
 329         # Note that any 'sub-alias-list' has been preprocessed to
 330         # be a trimmed, space-separated list which includes 'sub'
 331         # for example, it might be 'sub method fun'
 332         my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
 333         foreach my $word (@sub_alias_list) {
 334             $is_sub{$word}                 = 1;
 335             $is_END_DATA_format_sub{$word} = 1;
 336         }
 337     }
 338
 339     %is_grep_alias = ();
 340     if ( $rOpts->{'grep-alias-list'} ) {
 341
 342         # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
 343         # space-separated list
 344         my @q = split /\s+/, $rOpts->{'grep-alias-list'};
 345         @{is_grep_alias}{@q} = (1) x scalar(@q);
 346     }
 347
 348     $rOpts_code_skipping = $rOpts->{'code-skipping'};
 349     $code_skipping_pattern_begin =
 350       make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
 351     $code_skipping_pattern_end =
 352       make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
 353     return;
 354 } ## end sub check_options
 355
 356 sub new {
 357
 358     my ( $class, @args ) = @_;
 359
 360     # Note: 'tabs' and 'indent_columns' are temporary and should be
 361     # removed asap
 362     my %defaults = (
 363         source_object        => undef,
 364         debugger_object      => undef,
 365         diagnostics_object   => undef,
 366         logger_object        => undef,
 367         starting_level       => undef,
 368         indent_columns       => 4,
 369         tabsize              => 8,
 370         look_for_hash_bang   => 0,
 371         trim_qw              => 1,
 372         look_for_autoloader  => 1,
 373         look_for_selfloader  => 1,
 374         starting_line_number => 1,
 375         extended_syntax      => 0,
 376         rOpts                => {},
 377     );
 378     my %args = ( %defaults, @args );
 379
 380     # we are given an object with a get_line() method to supply source lines
 381     my $source_object = $args{source_object};
 382     my $rOpts         = $args{rOpts};
 383
 384     # we create another object with a get_line() and peek_ahead() method
 385     my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
 386
 387     # Tokenizer state data is as follows:
 388     # _rhere_target_list_    reference to list of here-doc targets
 389     # _here_doc_target_      the target string for a here document
 390     # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
 391     #                        to determine if interpolation is done
 392     # _quote_target_         character we seek if chasing a quote
 393     # _line_start_quote_     line where we started looking for a long quote
 394     # _in_here_doc_          flag indicating if we are in a here-doc
 395     # _in_pod_               flag set if we are in pod documentation
 396     # _in_skipped_           flag set if we are in a skipped section
 397     # _in_error_             flag set if we saw severe error (binary in script)
 398     # _in_data_              flag set if we are in __DATA__ section
 399     # _in_end_               flag set if we are in __END__ section
 400     # _in_format_            flag set if we are in a format description
 401     # _in_attribute_list_    flag telling if we are looking for attributes
 402     # _in_quote_             flag telling if we are chasing a quote
 403     # _starting_level_       indentation level of first line
 404     # _line_buffer_object_   object with get_line() method to supply source code
 405     # _diagnostics_object_   place to write debugging information
 406     # _unexpected_error_count_ error count used to limit output
 407     # _lower_case_labels_at_ line numbers where lower case labels seen
 408     # _hit_bug_              program bug detected
 409
 410     my $self = [];
 411     $self->[_rhere_target_list_]        = [];
 412     $self->[_in_here_doc_]              = 0;
 413     $self->[_here_doc_target_]          = EMPTY_STRING;
 414     $self->[_here_quote_character_]     = EMPTY_STRING;
 415     $self->[_in_data_]                  = 0;
 416     $self->[_in_end_]                   = 0;
 417     $self->[_in_format_]                = 0;
 418     $self->[_in_error_]                 = 0;
 419     $self->[_in_pod_]                   = 0;
 420     $self->[_in_skipped_]               = 0;
 421     $self->[_in_attribute_list_]        = 0;
 422     $self->[_in_quote_]                 = 0;
 423     $self->[_quote_target_]             = EMPTY_STRING;
 424     $self->[_line_start_quote_]         = -1;
 425     $self->[_starting_level_]           = $args{starting_level};
 426     $self->[_know_starting_level_]      = defined( $args{starting_level} );
 427     $self->[_tabsize_]                  = $args{tabsize};
 428     $self->[_indent_columns_]           = $args{indent_columns};
 429     $self->[_look_for_hash_bang_]       = $args{look_for_hash_bang};
 430     $self->[_trim_qw_]                  = $args{trim_qw};
 431     $self->[_continuation_indentation_] = $args{continuation_indentation};
 432     $self->[_outdent_labels_]           = $args{outdent_labels};
 433     $self->[_last_line_number_]         = $args{starting_line_number} - 1;
 434     $self->[_saw_perl_dash_P_]          = 0;
 435     $self->[_saw_perl_dash_w_]          = 0;
 436     $self->[_saw_use_strict_]           = 0;
 437     $self->[_saw_v_string_]             = 0;
 438     $self->[_hit_bug_]                  = 0;
 439     $self->[_look_for_autoloader_]      = $args{look_for_autoloader};
 440     $self->[_look_for_selfloader_]      = $args{look_for_selfloader};
 441     $self->[_saw_autoloader_]           = 0;
 442     $self->[_saw_selfloader_]           = 0;
 443     $self->[_saw_hash_bang_]            = 0;
 444     $self->[_saw_end_]                  = 0;
 445     $self->[_saw_data_]                 = 0;
 446     $self->[_saw_negative_indentation_] = 0;
 447     $self->[_started_tokenizing_]       = 0;
 448     $self->[_line_buffer_object_]       = $line_buffer_object;
 449     $self->[_debugger_object_]          = $args{debugger_object};
 450     $self->[_diagnostics_object_]       = $args{diagnostics_object};
 451     $self->[_logger_object_]            = $args{logger_object};
 452     $self->[_unexpected_error_count_]   = 0;
 453     $self->[_started_looking_for_here_target_at_] = 0;
 454     $self->[_nearly_matched_here_target_at_]      = undef;
 455     $self->[_line_of_text_]                       = EMPTY_STRING;
 456     $self->[_rlower_case_labels_at_]              = undef;
 457     $self->[_extended_syntax_]                    = $args{extended_syntax};
 458     $self->[_maximum_level_]                      = 0;
 459     $self->[_true_brace_error_count_]             = 0;
 460     $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
 461     $self->[_rOpts_maximum_unexpected_errors_] =
 462       $rOpts->{'maximum-unexpected-errors'};
 463     $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
 464     $self->[_rOpts_]         = $rOpts;
 465
 466     # These vars are used for guessing indentation and must be positive
 467     $self->[_tabsize_]        = 8 if ( !$self->[_tabsize_] );
 468     $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
 469
 470     bless $self, $class;
 471
 472     $tokenizer_self = $self;
 473
 474     prepare_for_a_new_file();
 475     $self->find_starting_indentation_level();
 476
 477     # This is not a full class yet, so die if an attempt is made to
 478     # create more than one object.
 479
 480     if ( _increment_count() > 1 ) {
 481         confess
 482 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
 483     }
 484
 485     return $self;
 486
 487 } ## end sub new
 488
 489 # interface to Perl::Tidy::Logger routines
 490 sub warning {
 491     my $msg           = shift;
 492     my $logger_object = $tokenizer_self->[_logger_object_];
 493     if ($logger_object) {
 494         $logger_object->warning($msg);
 495     }
 496     return;
 497 }
 498
 499 sub get_input_stream_name {
 500     my $input_stream_name = EMPTY_STRING;
 501     my $logger_object     = $tokenizer_self->[_logger_object_];
 502     if ($logger_object) {
 503         $input_stream_name = $logger_object->get_input_stream_name();
 504     }
 505     return $input_stream_name;
 506 }
 507
 508 sub complain {
 509     my $msg           = shift;
 510     my $logger_object = $tokenizer_self->[_logger_object_];
 511     if ($logger_object) {
 512         my $input_line_number = $tokenizer_self->[_last_line_number_] + 1;
 513         $msg = "Line $input_line_number: $msg";
 514         $logger_object->complain($msg);
 515     }
 516     return;
 517 } ## end sub complain
 518
 519 sub write_logfile_entry {
 520     my $msg           = shift;
 521     my $logger_object = $tokenizer_self->[_logger_object_];
 522     if ($logger_object) {
 523         $logger_object->write_logfile_entry($msg);
 524     }
 525     return;
 526 }
 527
 528 sub interrupt_logfile {
 529     my $logger_object = $tokenizer_self->[_logger_object_];
 530     if ($logger_object) {
 531         $logger_object->interrupt_logfile();
 532     }
 533     return;
 534 }
 535
 536 sub resume_logfile {
 537     my $logger_object = $tokenizer_self->[_logger_object_];
 538     if ($logger_object) {
 539         $logger_object->resume_logfile();
 540     }
 541     return;
 542 }
 543
 544 sub increment_brace_error {
 545     my $logger_object = $tokenizer_self->[_logger_object_];
 546     if ($logger_object) {
 547         $logger_object->increment_brace_error();
 548     }
 549     return;
 550 }
 551
 552 sub report_definite_bug {
 553     $tokenizer_self->[_hit_bug_] = 1;
 554     my $logger_object = $tokenizer_self->[_logger_object_];
 555     if ($logger_object) {
 556         $logger_object->report_definite_bug();
 557     }
 558     return;
 559 }
 560
 561 sub brace_warning {
 562     my $msg           = shift;
 563     my $logger_object = $tokenizer_self->[_logger_object_];
 564     if ($logger_object) {
 565         $logger_object->brace_warning($msg);
 566     }
 567     return;
 568 }
 569
 570 sub get_saw_brace_error {
 571     my $logger_object = $tokenizer_self->[_logger_object_];
 572     if ($logger_object) {
 573         return $logger_object->get_saw_brace_error();
 574     }
 575     else {
 576         return 0;
 577     }
 578 }
 579
 580 sub get_unexpected_error_count {
 581     my ($self) = @_;
 582     return $self->[_unexpected_error_count_];
 583 }
 584
 585 # interface to Perl::Tidy::Diagnostics routines
 586 sub write_diagnostics {
 587     my $msg = shift;
 588     if ( $tokenizer_self->[_diagnostics_object_] ) {
 589         $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
 590     }
 591     return;
 592 }
 593
 594 sub get_maximum_level {
 595     return $tokenizer_self->[_maximum_level_];
 596 }
 597
 598 sub report_tokenization_errors {
 599
 600     my ($self) = @_;
 601
 602     # Report any tokenization errors and return a flag '$severe_error'.
 603     # Set $severe_error = 1 if the tokenization errors are so severe that
 604     # the formatter should not attempt to format the file. Instead, it will
 605     # just output the file verbatim.
 606
 607     # set severe error flag if tokenizer has encountered file reading problems
 608     # (i.e. unexpected binary characters)
 609     my $severe_error = $self->[_in_error_];
 610
 611     my $maxle = $self->[_rOpts_maximum_level_errors_];
 612     my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
 613     $maxle = 1 unless defined($maxle);
 614     $maxue = 0 unless defined($maxue);
 615
 616     my $level = get_indentation_level();
 617     if ( $level != $tokenizer_self->[_starting_level_] ) {
 618         warning("final indentation level: $level\n");
 619         my $level_diff = $tokenizer_self->[_starting_level_] - $level;
 620         if ( $level_diff < 0 ) { $level_diff = -$level_diff }
 621
 622         # Set severe error flag if the level error is greater than 1.
 623         # The formatter can function for any level error but it is probably
 624         # best not to attempt formatting for a high level error.
 625         if ( $maxle >= 0 && $level_diff > $maxle ) {
 626             $severe_error = 1;
 627             warning(<<EOM);
 628 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
 629 EOM
 630         }
 631     }
 632
 633     check_final_nesting_depths();
 634
 635     # Likewise, large numbers of brace errors usually indicate non-perl
 636     # scripts, so set the severe error flag at a low number.  This is similar
 637     # to the level check, but different because braces may balance but be
 638     # incorrectly interlaced.
 639     if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
 640         $severe_error = 1;
 641     }
 642
 643     if ( $tokenizer_self->[_look_for_hash_bang_]
 644         && !$tokenizer_self->[_saw_hash_bang_] )
 645     {
 646         warning(
 647             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
 648     }
 649
 650     if ( $tokenizer_self->[_in_format_] ) {
 651         warning("hit EOF while in format description\n");
 652     }
 653
 654     if ( $tokenizer_self->[_in_skipped_] ) {
 655         write_logfile_entry(
 656             "hit EOF while in lines skipped with --code-skipping\n");
 657     }
 658
 659     if ( $tokenizer_self->[_in_pod_] ) {
 660
 661         # Just write log entry if this is after __END__ or __DATA__
 662         # because this happens to often, and it is not likely to be
 663         # a parsing error.
 664         if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
 665             write_logfile_entry(
 666 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 667             );
 668         }
 669
 670         else {
 671             complain(
 672 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 673             );
 674         }
 675
 676     }
 677
 678     if ( $tokenizer_self->[_in_here_doc_] ) {
 679         $severe_error = 1;
 680         my $here_doc_target = $tokenizer_self->[_here_doc_target_];
 681         my $started_looking_for_here_target_at =
 682           $tokenizer_self->[_started_looking_for_here_target_at_];
 683         if ($here_doc_target) {
 684             warning(
 685 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
 686             );
 687         }
 688         else {
 689             warning(<<EOM);
 690 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
 691   (Perl will match to the end of file but this may not be intended).
 692 EOM
 693         }
 694         my $nearly_matched_here_target_at =
 695           $tokenizer_self->[_nearly_matched_here_target_at_];
 696         if ($nearly_matched_here_target_at) {
 697             warning(
 698 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
 699             );
 700         }
 701     }
 702
 703     # Something is seriously wrong if we ended inside a quote
 704     if ( $tokenizer_self->[_in_quote_] ) {
 705         $severe_error = 1;
 706         my $line_start_quote = $tokenizer_self->[_line_start_quote_];
 707         my $quote_target     = $tokenizer_self->[_quote_target_];
 708         my $what =
 709           ( $tokenizer_self->[_in_attribute_list_] )
 710           ? "attribute list"
 711           : "quote/pattern";
 712         warning(
 713 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
 714         );
 715     }
 716
 717     if ( $tokenizer_self->[_hit_bug_] ) {
 718         $severe_error = 1;
 719     }
 720
 721     # Multiple "unexpected" type tokenization errors usually indicate parsing
 722     # non-perl scripts, or that something is seriously wrong, so we should
 723     # avoid formatting them.  This can happen for example if we run perltidy on
 724     # a shell script or an html file.  But unfortunately this check can
 725     # interfere with some extended syntaxes, such as RPerl, so it has to be off
 726     # by default.
 727     my $ue_count = $tokenizer_self->[_unexpected_error_count_];
 728     if ( $maxue > 0 && $ue_count > $maxue ) {
 729         warning(<<EOM);
 730 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
 731 EOM
 732         $severe_error = 1;
 733     }
 734
 735     unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
 736         if ( $] < 5.006 ) {
 737             write_logfile_entry("Suggest including '-w parameter'\n");
 738         }
 739         else {
 740             write_logfile_entry("Suggest including 'use warnings;'\n");
 741         }
 742     }
 743
 744     if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
 745         write_logfile_entry("Use of -P parameter for defines is discouraged\n");
 746     }
 747
 748     unless ( $tokenizer_self->[_saw_use_strict_] ) {
 749         write_logfile_entry("Suggest including 'use strict;'\n");
 750     }
 751
 752     # it is suggested that labels have at least one upper case character
 753     # for legibility and to avoid code breakage as new keywords are introduced
 754     if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
 755         my @lower_case_labels_at =
 756           @{ $tokenizer_self->[_rlower_case_labels_at_] };
 757         write_logfile_entry(
 758             "Suggest using upper case characters in label(s)\n");
 759         local $LIST_SEPARATOR = ')(';
 760         write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
 761     }
 762     return $severe_error;
 763 } ## end sub report_tokenization_errors
 764
 765 sub report_v_string {
 766
 767     # warn if this version can't handle v-strings
 768     my $tok = shift;
 769     unless ( $tokenizer_self->[_saw_v_string_] ) {
 770         $tokenizer_self->[_saw_v_string_] =
 771           $tokenizer_self->[_last_line_number_];
 772     }
 773     if ( $] < 5.006 ) {
 774         warning(
 775 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
 776         );
 777     }
 778     return;
 779 } ## end sub report_v_string
 780
 781 sub is_valid_token_type {
 782     my ($type) = @_;
 783     return $is_valid_token_type{$type};
 784 }
 785
 786 sub get_input_line_number {
 787     return $tokenizer_self->[_last_line_number_];
 788 }
 789
 790 sub log_numbered_msg {
 791     my ( $self, $msg ) = @_;
 792
 793     # write input line number + message to logfile
 794     my $input_line_number = $self->[_last_line_number_];
 795     write_logfile_entry("Line $input_line_number: $msg");
 796     return;
 797 }
 798
 799 # returns the next tokenized line
 800 sub get_line {
 801
 802     my $self = shift;
 803
 804     # USES GLOBAL VARIABLES:
 805     #   $brace_depth, $square_bracket_depth, $paren_depth
 806
 807     my $input_line = $self->[_line_buffer_object_]->get_line();
 808     $self->[_line_of_text_] = $input_line;
 809
 810     return unless ($input_line);
 811
 812     my $input_line_number = ++$self->[_last_line_number_];
 813
 814     # Find and remove what characters terminate this line, including any
 815     # control r
 816     my $input_line_separator = EMPTY_STRING;
 817     if ( chomp($input_line) ) {
 818         $input_line_separator = $INPUT_RECORD_SEPARATOR;
 819     }
 820
 821     # The first test here very significantly speeds things up, but be sure to
 822     # keep the regex and hash %other_line_endings the same.
 823     if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
 824         if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
 825             $input_line_separator = $2 . $input_line_separator;
 826         }
 827     }
 828
 829     # for backwards compatibility we keep the line text terminated with
 830     # a newline character
 831     $input_line .= "\n";
 832     $self->[_line_of_text_] = $input_line;
 833
 834     # create a data structure describing this line which will be
 835     # returned to the caller.
 836
 837     # _line_type codes are:
 838     #   SYSTEM         - system-specific code before hash-bang line
 839     #   CODE           - line of perl code (including comments)
 840     #   POD_START      - line starting pod, such as '=head'
 841     #   POD            - pod documentation text
 842     #   POD_END        - last line of pod section, '=cut'
 843     #   HERE           - text of here-document
 844     #   HERE_END       - last line of here-doc (target word)
 845     #   FORMAT         - format section
 846     #   FORMAT_END     - last line of format section, '.'
 847     #   SKIP           - code skipping section
 848     #   SKIP_END       - last line of code skipping section, '#>>V'
 849     #   DATA_START     - __DATA__ line
 850     #   DATA           - unidentified text following __DATA__
 851     #   END_START      - __END__ line
 852     #   END            - unidentified text following __END__
 853     #   ERROR          - we are in big trouble, probably not a perl script
 854
 855     # Other variables:
 856     #   _curly_brace_depth     - depth of curly braces at start of line
 857     #   _square_bracket_depth  - depth of square brackets at start of line
 858     #   _paren_depth           - depth of parens at start of line
 859     #   _starting_in_quote     - this line continues a multi-line quote
 860     #                            (so don't trim leading blanks!)
 861     #   _ending_in_quote       - this line ends in a multi-line quote
 862     #                            (so don't trim trailing blanks!)
 863     my $line_of_tokens = {
 864         _line_type                 => 'EOF',
 865         _line_text                 => $input_line,
 866         _line_number               => $input_line_number,
 867         _guessed_indentation_level => 0,
 868         _curly_brace_depth         => $brace_depth,
 869         _square_bracket_depth      => $square_bracket_depth,
 870         _paren_depth               => $paren_depth,
 871         _quote_character           => EMPTY_STRING,
 872 ## Skip these needless initializations for efficiency:
 873 ##      _rtoken_type               => undef,
 874 ##      _rtokens                   => undef,
 875 ##      _rlevels                   => undef,
 876 ##      _rblock_type               => undef,
 877 ##      _rtype_sequence            => undef,
 878 ##      _rci_levels                => undef,
 879 ##      _starting_in_quote         => 0,
 880 ##      _ending_in_quote           => 0,
 881     };
 882
 883     # must print line unchanged if we are in a here document
 884     if ( $self->[_in_here_doc_] ) {
 885
 886         $line_of_tokens->{_line_type} = 'HERE';
 887         my $here_doc_target      = $self->[_here_doc_target_];
 888         my $here_quote_character = $self->[_here_quote_character_];
 889         my $candidate_target     = $input_line;
 890         chomp $candidate_target;
 891
 892         # Handle <<~ targets, which are indicated here by a leading space on
 893         # the here quote character
 894         if ( $here_quote_character =~ /^\s/ ) {
 895             $candidate_target =~ s/^\s*//;
 896         }
 897         if ( $candidate_target eq $here_doc_target ) {
 898             $self->[_nearly_matched_here_target_at_] = undef;
 899             $line_of_tokens->{_line_type} = 'HERE_END';
 900             $self->log_numbered_msg("Exiting HERE document $here_doc_target\n");
 901
 902             my $rhere_target_list = $self->[_rhere_target_list_];
 903             if ( @{$rhere_target_list} ) {  # there can be multiple here targets
 904                 ( $here_doc_target, $here_quote_character ) =
 905                   @{ shift @{$rhere_target_list} };
 906                 $self->[_here_doc_target_]      = $here_doc_target;
 907                 $self->[_here_quote_character_] = $here_quote_character;
 908                 $self->log_numbered_msg(
 909                     "Entering HERE document $here_doc_target\n");
 910                 $self->[_nearly_matched_here_target_at_] = undef;
 911                 $self->[_started_looking_for_here_target_at_] =
 912                   $input_line_number;
 913             }
 914             else {
 915                 $self->[_in_here_doc_]          = 0;
 916                 $self->[_here_doc_target_]      = EMPTY_STRING;
 917                 $self->[_here_quote_character_] = EMPTY_STRING;
 918             }
 919         }
 920
 921         # check for error of extra whitespace
 922         # note for PERL6: leading whitespace is allowed
 923         else {
 924             $candidate_target =~ s/\s*$//;
 925             $candidate_target =~ s/^\s*//;
 926             if ( $candidate_target eq $here_doc_target ) {
 927                 $self->[_nearly_matched_here_target_at_] = $input_line_number;
 928             }
 929         }
 930         return $line_of_tokens;
 931     }
 932
 933     # Print line unchanged if we are in a format section
 934     elsif ( $self->[_in_format_] ) {
 935
 936         if ( $input_line =~ /^\.[\s#]*$/ ) {
 937
 938             # Decrement format depth count at a '.' after a 'format'
 939             $self->[_in_format_]--;
 940
 941             # This is the end when count reaches 0
 942             if ( !$self->[_in_format_] ) {
 943                 $self->log_numbered_msg("Exiting format section\n");
 944                 $line_of_tokens->{_line_type} = 'FORMAT_END';
 945             }
 946         }
 947         else {
 948             $line_of_tokens->{_line_type} = 'FORMAT';
 949             if ( $input_line =~ /^\s*format\s+\w+/ ) {
 950
 951                 # Increment format depth count at a 'format' within a 'format'
 952                 # This is a simple way to handle nested formats (issue c019).
 953                 $self->[_in_format_]++;
 954             }
 955         }
 956         return $line_of_tokens;
 957     }
 958
 959     # must print line unchanged if we are in pod documentation
 960     elsif ( $self->[_in_pod_] ) {
 961
 962         $line_of_tokens->{_line_type} = 'POD';
 963         if ( $input_line =~ /^=cut/ ) {
 964             $line_of_tokens->{_line_type} = 'POD_END';
 965             $self->log_numbered_msg("Exiting POD section\n");
 966             $self->[_in_pod_] = 0;
 967         }
 968         if ( $input_line =~ /^\#\!.*perl\b/ && !$self->[_in_end_] ) {
 969             warning(
 970                 "Hash-bang in pod can cause older versions of perl to fail! \n"
 971             );
 972         }
 973
 974         return $line_of_tokens;
 975     }
 976
 977     # print line unchanged if in skipped section
 978     elsif ( $self->[_in_skipped_] ) {
 979
 980         $line_of_tokens->{_line_type} = 'SKIP';
 981         if ( $input_line =~ /$code_skipping_pattern_end/ ) {
 982             $line_of_tokens->{_line_type} = 'SKIP_END';
 983             $self->log_numbered_msg("Exiting code-skipping section\n");
 984             $self->[_in_skipped_] = 0;
 985         }
 986         return $line_of_tokens;
 987     }
 988
 989     # must print line unchanged if we have seen a severe error (i.e., we
 990     # are seeing illegal tokens and cannot continue.  Syntax errors do
 991     # not pass this route).  Calling routine can decide what to do, but
 992     # the default can be to just pass all lines as if they were after __END__
 993     elsif ( $self->[_in_error_] ) {
 994         $line_of_tokens->{_line_type} = 'ERROR';
 995         return $line_of_tokens;
 996     }
 997
 998     # print line unchanged if we are __DATA__ section
 999     elsif ( $self->[_in_data_] ) {
1000
1001         # ...but look for POD
1002         # Note that the _in_data and _in_end flags remain set
1003         # so that we return to that state after seeing the
1004         # end of a pod section
1005         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1006             $line_of_tokens->{_line_type} = 'POD_START';
1007             $self->log_numbered_msg("Entering POD section\n");
1008             $self->[_in_pod_] = 1;
1009             return $line_of_tokens;
1010         }
1011         else {
1012             $line_of_tokens->{_line_type} = 'DATA';
1013             return $line_of_tokens;
1014         }
1015     }
1016
1017     # print line unchanged if we are in __END__ section
1018     elsif ( $self->[_in_end_] ) {
1019
1020         # ...but look for POD
1021         # Note that the _in_data and _in_end flags remain set
1022         # so that we return to that state after seeing the
1023         # end of a pod section
1024         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1025             $line_of_tokens->{_line_type} = 'POD_START';
1026             $self->log_numbered_msg("Entering POD section\n");
1027             $self->[_in_pod_] = 1;
1028             return $line_of_tokens;
1029         }
1030         else {
1031             $line_of_tokens->{_line_type} = 'END';
1032             return $line_of_tokens;
1033         }
1034     }
1035
1036     # check for a hash-bang line if we haven't seen one
1037     if ( !$self->[_saw_hash_bang_] ) {
1038         if ( $input_line =~ /^\#\!.*perl\b/ ) {
1039             $self->[_saw_hash_bang_] = $input_line_number;
1040
1041             # check for -w and -P flags
1042             if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1043                 $self->[_saw_perl_dash_P_] = 1;
1044             }
1045
1046             if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1047                 $self->[_saw_perl_dash_w_] = 1;
1048             }
1049
1050             if (
1051                 $input_line_number > 1
1052
1053                 # leave any hash bang in a BEGIN block alone
1054                 # i.e. see 'debugger-duck_type.t'
1055                 && !(
1056                        $last_nonblank_block_type
1057                     && $last_nonblank_block_type eq 'BEGIN'
1058                 )
1059                 && !$self->[_look_for_hash_bang_]
1060
1061                 # Try to avoid giving a false alarm at a simple comment.
1062                 # These look like valid hash-bang lines:
1063
1064                 #!/usr/bin/perl -w
1065                 #!   /usr/bin/perl -w
1066                 #!c:\perl\bin\perl.exe
1067
1068                 # These are comments:
1069                 #! I love perl
1070                 #!  sunos does not yet provide a /usr/bin/perl
1071
1072                 # Comments typically have multiple spaces, which suggests
1073                 # the filter
1074                 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1075               )
1076             {
1077
1078                 # this is helpful for VMS systems; we may have accidentally
1079                 # tokenized some DCL commands
1080                 if ( $self->[_started_tokenizing_] ) {
1081                     warning(
1082 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1083                     );
1084                 }
1085                 else {
1086                     complain("Useless hash-bang after line 1\n");
1087                 }
1088             }
1089
1090             # Report the leading hash-bang as a system line
1091             # This will prevent -dac from deleting it
1092             else {
1093                 $line_of_tokens->{_line_type} = 'SYSTEM';
1094                 return $line_of_tokens;
1095             }
1096         }
1097     }
1098
1099     # wait for a hash-bang before parsing if the user invoked us with -x
1100     if ( $self->[_look_for_hash_bang_]
1101         && !$self->[_saw_hash_bang_] )
1102     {
1103         $line_of_tokens->{_line_type} = 'SYSTEM';
1104         return $line_of_tokens;
1105     }
1106
1107     # a first line of the form ': #' will be marked as SYSTEM
1108     # since lines of this form may be used by tcsh
1109     if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1110         $line_of_tokens->{_line_type} = 'SYSTEM';
1111         return $line_of_tokens;
1112     }
1113
1114     # now we know that it is ok to tokenize the line...
1115     # the line tokenizer will modify any of these private variables:
1116     #        _rhere_target_list_
1117     #        _in_data_
1118     #        _in_end_
1119     #        _in_format_
1120     #        _in_error_
1121     #        _in_skipped_
1122     #        _in_pod_
1123     #        _in_quote_
1124     my $ending_in_quote_last = $self->[_in_quote_];
1125     $self->tokenize_this_line($line_of_tokens);
1126
1127     # Now finish defining the return structure and return it
1128     $line_of_tokens->{_ending_in_quote} = $self->[_in_quote_];
1129
1130     # handle severe error (binary data in script)
1131     if ( $self->[_in_error_] ) {
1132         $self->[_in_quote_] = 0;    # to avoid any more messages
1133         warning("Giving up after error\n");
1134         $line_of_tokens->{_line_type} = 'ERROR';
1135         reset_indentation_level(0);    # avoid error messages
1136         return $line_of_tokens;
1137     }
1138
1139     # handle start of pod documentation
1140     if ( $self->[_in_pod_] ) {
1141
1142         # This gets tricky..above a __DATA__ or __END__ section, perl
1143         # accepts '=cut' as the start of pod section. But afterwards,
1144         # only pod utilities see it and they may ignore an =cut without
1145         # leading =head.  In any case, this isn't good.
1146         if ( $input_line =~ /^=cut\b/ ) {
1147             if ( $self->[_saw_data_] || $self->[_saw_end_] ) {
1148                 complain("=cut while not in pod ignored\n");
1149                 $self->[_in_pod_] = 0;
1150                 $line_of_tokens->{_line_type} = 'POD_END';
1151             }
1152             else {
1153                 $line_of_tokens->{_line_type} = 'POD_START';
1154                 warning(
1155 "=cut starts a pod section .. this can fool pod utilities.\n"
1156                 ) unless (DEVEL_MODE);
1157                 $self->log_numbered_msg("Entering POD section\n");
1158             }
1159         }
1160
1161         else {
1162             $line_of_tokens->{_line_type} = 'POD_START';
1163             $self->log_numbered_msg("Entering POD section\n");
1164         }
1165
1166         return $line_of_tokens;
1167     }
1168
1169     # handle start of skipped section
1170     if ( $self->[_in_skipped_] ) {
1171
1172         $line_of_tokens->{_line_type} = 'SKIP';
1173         $self->log_numbered_msg("Entering code-skipping section\n");
1174         return $line_of_tokens;
1175     }
1176
1177     # see if this line contains here doc targets
1178     my $rhere_target_list = $self->[_rhere_target_list_];
1179     if ( @{$rhere_target_list} ) {
1180
1181         my ( $here_doc_target, $here_quote_character ) =
1182           @{ shift @{$rhere_target_list} };
1183         $self->[_in_here_doc_]          = 1;
1184         $self->[_here_doc_target_]      = $here_doc_target;
1185         $self->[_here_quote_character_] = $here_quote_character;
1186         $self->log_numbered_msg("Entering HERE document $here_doc_target\n");
1187         $self->[_started_looking_for_here_target_at_] = $input_line_number;
1188     }
1189
1190     # NOTE: __END__ and __DATA__ statements are written unformatted
1191     # because they can theoretically contain additional characters
1192     # which are not tokenized (and cannot be read with <DATA> either!).
1193     if ( $self->[_in_data_] ) {
1194         $line_of_tokens->{_line_type} = 'DATA_START';
1195         $self->log_numbered_msg("Starting __DATA__ section\n");
1196         $self->[_saw_data_] = 1;
1197
1198         # keep parsing after __DATA__ if use SelfLoader was seen
1199         if ( $self->[_saw_selfloader_] ) {
1200             $self->[_in_data_] = 0;
1201             $self->log_numbered_msg(
1202                 "SelfLoader seen, continuing; -nlsl deactivates\n");
1203         }
1204
1205         return $line_of_tokens;
1206     }
1207
1208     elsif ( $self->[_in_end_] ) {
1209         $line_of_tokens->{_line_type} = 'END_START';
1210         $self->log_numbered_msg("Starting __END__ section\n");
1211         $self->[_saw_end_] = 1;
1212
1213         # keep parsing after __END__ if use AutoLoader was seen
1214         if ( $self->[_saw_autoloader_] ) {
1215             $self->[_in_end_] = 0;
1216             $self->log_numbered_msg(
1217                 "AutoLoader seen, continuing; -nlal deactivates\n");
1218         }
1219         return $line_of_tokens;
1220     }
1221
1222     # now, finally, we know that this line is type 'CODE'
1223     $line_of_tokens->{_line_type} = 'CODE';
1224
1225     # remember if we have seen any real code
1226     if (  !$self->[_started_tokenizing_]
1227         && $input_line !~ /^\s*$/
1228         && $input_line !~ /^\s*#/ )
1229     {
1230         $self->[_started_tokenizing_] = 1;
1231     }
1232
1233     if ( $self->[_debugger_object_] ) {
1234         $self->[_debugger_object_]->write_debug_entry($line_of_tokens);
1235     }
1236
1237     # Note: if keyword 'format' occurs in this line code, it is still CODE
1238     # (keyword 'format' need not start a line)
1239     if ( $self->[_in_format_] ) {
1240         $self->log_numbered_msg("Entering format section\n");
1241     }
1242
1243     if ( $self->[_in_quote_]
1244         and ( $self->[_line_start_quote_] < 0 ) )
1245     {
1246
1247         #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1248         if ( ( my $quote_target = $self->[_quote_target_] ) !~ /^\s*$/ ) {
1249             $self->[_line_start_quote_] = $input_line_number;
1250             $self->log_numbered_msg(
1251                 "Start multi-line quote or pattern ending in $quote_target\n");
1252         }
1253     }
1254     elsif ( ( $self->[_line_start_quote_] >= 0 )
1255         && !$self->[_in_quote_] )
1256     {
1257         $self->[_line_start_quote_] = -1;
1258         $self->log_numbered_msg("End of multi-line quote or pattern\n");
1259     }
1260
1261     # we are returning a line of CODE
1262     return $line_of_tokens;
1263 } ## end sub get_line
1264
1265 sub find_starting_indentation_level {
1266
1267     # We need to find the indentation level of the first line of the
1268     # script being formatted.  Often it will be zero for an entire file,
1269     # but if we are formatting a local block of code (within an editor for
1270     # example) it may not be zero.  The user may specify this with the
1271     # -sil=n parameter but normally doesn't so we have to guess.
1272     #
1273     my ($self) = @_;
1274     my $starting_level = 0;
1275
1276     # use value if given as parameter
1277     if ( $self->[_know_starting_level_] ) {
1278         $starting_level = $self->[_starting_level_];
1279     }
1280
1281     # if we know there is a hash_bang line, the level must be zero
1282     elsif ( $self->[_look_for_hash_bang_] ) {
1283         $self->[_know_starting_level_] = 1;
1284     }
1285
1286     # otherwise figure it out from the input file
1287     else {
1288         my $line;
1289         my $i = 0;
1290
1291         # keep looking at lines until we find a hash bang or piece of code
1292         my $msg = EMPTY_STRING;
1293         while ( $line = $self->[_line_buffer_object_]->peek_ahead( $i++ ) ) {
1294
1295             # if first line is #! then assume starting level is zero
1296             if ( $i == 1 && $line =~ /^\#\!/ ) {
1297                 $starting_level = 0;
1298                 last;
1299             }
1300             next if ( $line =~ /^\s*#/ );    # skip past comments
1301             next if ( $line =~ /^\s*$/ );    # skip past blank lines
1302             $starting_level = guess_old_indentation_level($line);
1303             last;
1304         }
1305         $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1306         write_logfile_entry("$msg");
1307     }
1308     $self->[_starting_level_] = $starting_level;
1309     reset_indentation_level($starting_level);
1310     return;
1311 } ## end sub find_starting_indentation_level
1312
1313 sub guess_old_indentation_level {
1314     my ($line) = @_;
1315
1316     # Guess the indentation level of an input line.
1317     #
1318     # For the first line of code this result will define the starting
1319     # indentation level.  It will mainly be non-zero when perltidy is applied
1320     # within an editor to a local block of code.
1321     #
1322     # This is an impossible task in general because we can't know what tabs
1323     # meant for the old script and how many spaces were used for one
1324     # indentation level in the given input script.  For example it may have
1325     # been previously formatted with -i=7 -et=3.  But we can at least try to
1326     # make sure that perltidy guesses correctly if it is applied repeatedly to
1327     # a block of code within an editor, so that the block stays at the same
1328     # level when perltidy is applied repeatedly.
1329     #
1330     # USES GLOBAL VARIABLES: $tokenizer_self
1331     my $level = 0;
1332
1333     # find leading tabs, spaces, and any statement label
1334     my $spaces = 0;
1335     if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1336
1337         # If there are leading tabs, we use the tab scheme for this run, if
1338         # any, so that the code will remain stable when editing.
1339         if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1340
1341         if ($2) { $spaces += length($2) }
1342
1343         # correct for outdented labels
1344         if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1345             $spaces += $tokenizer_self->[_continuation_indentation_];
1346         }
1347     }
1348
1349     # compute indentation using the value of -i for this run.
1350     # If -i=0 is used for this run (which is possible) it doesn't matter
1351     # what we do here but we'll guess that the old run used 4 spaces per level.
1352     my $indent_columns = $tokenizer_self->[_indent_columns_];
1353     $indent_columns = 4 if ( !$indent_columns );
1354     $level          = int( $spaces / $indent_columns );
1355     return ($level);
1356 } ## end sub guess_old_indentation_level
1357
1358 # This is a currently unused debug routine
1359 sub dump_functions {
1360
1361     my $fh = *STDOUT;
1362     foreach my $pkg ( keys %is_user_function ) {
1363         $fh->print("\nnon-constant subs in package $pkg\n");
1364
1365         foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1366             my $msg = EMPTY_STRING;
1367             if ( $is_block_list_function{$pkg}{$sub} ) {
1368                 $msg = 'block_list';
1369             }
1370
1371             if ( $is_block_function{$pkg}{$sub} ) {
1372                 $msg = 'block';
1373             }
1374             $fh->print("$sub $msg\n");
1375         }
1376     }
1377
1378     foreach my $pkg ( keys %is_constant ) {
1379         $fh->print("\nconstants and constant subs in package $pkg\n");
1380
1381         foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1382             $fh->print("$sub\n");
1383         }
1384     }
1385     return;
1386 } ## end sub dump_functions
1387
1388 sub prepare_for_a_new_file {
1389
1390     # previous tokens needed to determine what to expect next
1391     $last_nonblank_token      = ';';    # the only possible starting state which
1392     $last_nonblank_type       = ';';    # will make a leading brace a code block
1393     $last_nonblank_block_type = EMPTY_STRING;
1394
1395     # scalars for remembering statement types across multiple lines
1396     $statement_type    = EMPTY_STRING;    # '' or 'use' or 'sub..' or 'case..'
1397     $in_attribute_list = 0;
1398
1399     # scalars for remembering where we are in the file
1400     $current_package = "main";
1401     $context         = UNKNOWN_CONTEXT;
1402
1403     # hashes used to remember function information
1404     %is_constant             = ();        # user-defined constants
1405     %is_user_function        = ();        # user-defined functions
1406     %user_function_prototype = ();        # their prototypes
1407     %is_block_function       = ();
1408     %is_block_list_function  = ();
1409     %saw_function_definition = ();
1410     %saw_use_module          = ();
1411
1412     # variables used to track depths of various containers
1413     # and report nesting errors
1414     $paren_depth             = 0;
1415     $brace_depth             = 0;
1416     $square_bracket_depth    = 0;
1417     @current_depth           = (0) x scalar @closing_brace_names;
1418     $total_depth             = 0;
1419     @total_depth             = ();
1420     @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1421     @current_sequence_number = ();
1422     $next_sequence_number    = 2;    # The value 1 is reserved for SEQ_ROOT
1423
1424     @paren_type                     = ();
1425     @paren_semicolon_count          = ();
1426     @paren_structural_type          = ();
1427     @brace_type                     = ();
1428     @brace_structural_type          = ();
1429     @brace_context                  = ();
1430     @brace_package                  = ();
1431     @square_bracket_type            = ();
1432     @square_bracket_structural_type = ();
1433     @depth_array                    = ();
1434     @nested_ternary_flag            = ();
1435     @nested_statement_type          = ();
1436     @starting_line_of_current_depth = ();
1437
1438     $paren_type[$paren_depth]            = EMPTY_STRING;
1439     $paren_semicolon_count[$paren_depth] = 0;
1440     $paren_structural_type[$brace_depth] = EMPTY_STRING;
1441     $brace_type[$brace_depth] = ';';    # identify opening brace as code block
1442     $brace_structural_type[$brace_depth]                   = EMPTY_STRING;
1443     $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
1444     $brace_package[$paren_depth]                           = $current_package;
1445     $square_bracket_type[$square_bracket_depth]            = EMPTY_STRING;
1446     $square_bracket_structural_type[$square_bracket_depth] = EMPTY_STRING;
1447
1448     initialize_tokenizer_state();
1449     return;
1450 } ## end sub prepare_for_a_new_file
1451
1452 {    ## closure for sub tokenize_this_line
1453
1454     use constant BRACE          => 0;
1455     use constant SQUARE_BRACKET => 1;
1456     use constant PAREN          => 2;
1457     use constant QUESTION_COLON => 3;
1458
1459     # TV1: scalars for processing one LINE.
1460     # Re-initialized on each entry to sub tokenize_this_line.
1461     my (
1462         $block_type,        $container_type,    $expecting,
1463         $i,                 $i_tok,             $input_line,
1464         $input_line_number, $last_nonblank_i,   $max_token_index,
1465         $next_tok,          $next_type,         $peeked_ahead,
1466         $prototype,         $rhere_target_list, $rtoken_map,
1467         $rtoken_type,       $rtokens,           $tok,
1468         $type,              $type_sequence,     $indent_flag,
1469     );
1470
1471     # TV2: refs to ARRAYS for processing one LINE
1472     # Re-initialized on each call.
1473     my $routput_token_list     = [];    # stack of output token indexes
1474     my $routput_token_type     = [];    # token types
1475     my $routput_block_type     = [];    # types of code block
1476     my $routput_container_type = [];    # paren types, such as if, elsif, ..
1477     my $routput_type_sequence  = [];    # nesting sequential number
1478     my $routput_indent_flag    = [];    #
1479
1480     # TV3: SCALARS for quote variables.  These are initialized with a
1481     # subroutine call and continually updated as lines are processed.
1482     my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1483         $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1484
1485     # TV4: SCALARS for multi-line identifiers and
1486     # statements. These are initialized with a subroutine call
1487     # and continually updated as lines are processed.
1488     my ( $id_scan_state, $identifier, $want_paren );
1489
1490     # TV5: SCALARS for tracking indentation level.
1491     # Initialized once and continually updated as lines are
1492     # processed.
1493     my (
1494         $nesting_token_string,      $nesting_type_string,
1495         $nesting_block_string,      $nesting_block_flag,
1496         $nesting_list_string,       $nesting_list_flag,
1497         $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1498         $in_statement_continuation, $level_in_tokenizer,
1499         $slevel_in_tokenizer,       $rslevel_stack,
1500     );
1501
1502     # TV6: SCALARS for remembering several previous
1503     # tokens. Initialized once and continually updated as
1504     # lines are processed.
1505     my (
1506         $last_nonblank_container_type,     $last_nonblank_type_sequence,
1507         $last_last_nonblank_token,         $last_last_nonblank_type,
1508         $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
1509         $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1510     );
1511
1512     # ----------------------------------------------------------------
1513     # beginning of tokenizer variable access and manipulation routines
1514     # ----------------------------------------------------------------
1515
1516     sub initialize_tokenizer_state {
1517
1518         # TV1: initialized on each call
1519         # TV2: initialized on each call
1520         # TV3:
1521         $in_quote                = 0;
1522         $quote_type              = 'Q';
1523         $quote_character         = EMPTY_STRING;
1524         $quote_pos               = 0;
1525         $quote_depth             = 0;
1526         $quoted_string_1         = EMPTY_STRING;
1527         $quoted_string_2         = EMPTY_STRING;
1528         $allowed_quote_modifiers = EMPTY_STRING;
1529
1530         # TV4:
1531         $id_scan_state = EMPTY_STRING;
1532         $identifier    = EMPTY_STRING;
1533         $want_paren    = EMPTY_STRING;
1534
1535         # TV5:
1536         $nesting_token_string   = EMPTY_STRING;
1537         $nesting_type_string    = EMPTY_STRING;
1538         $nesting_block_string   = '1';            # initially in a block
1539         $nesting_block_flag     = 1;
1540         $nesting_list_string    = '0';            # initially not in a list
1541         $nesting_list_flag      = 0;              # initially not in a list
1542         $ci_string_in_tokenizer = EMPTY_STRING;
1543         $continuation_string_in_tokenizer = "0";
1544         $in_statement_continuation        = 0;
1545         $level_in_tokenizer               = 0;
1546         $slevel_in_tokenizer              = 0;
1547         $rslevel_stack                    = [];
1548
1549         # TV6:
1550         $last_nonblank_container_type      = EMPTY_STRING;
1551         $last_nonblank_type_sequence       = EMPTY_STRING;
1552         $last_last_nonblank_token          = ';';
1553         $last_last_nonblank_type           = ';';
1554         $last_last_nonblank_block_type     = EMPTY_STRING;
1555         $last_last_nonblank_container_type = EMPTY_STRING;
1556         $last_last_nonblank_type_sequence  = EMPTY_STRING;
1557         $last_nonblank_prototype           = EMPTY_STRING;
1558         return;
1559     } ## end sub initialize_tokenizer_state
1560
1561     sub save_tokenizer_state {
1562
1563         my $rTV1 = [
1564             $block_type,        $container_type,    $expecting,
1565             $i,                 $i_tok,             $input_line,
1566             $input_line_number, $last_nonblank_i,   $max_token_index,
1567             $next_tok,          $next_type,         $peeked_ahead,
1568             $prototype,         $rhere_target_list, $rtoken_map,
1569             $rtoken_type,       $rtokens,           $tok,
1570             $type,              $type_sequence,     $indent_flag,
1571         ];
1572
1573         my $rTV2 = [
1574             $routput_token_list,    $routput_token_type,
1575             $routput_block_type,    $routput_container_type,
1576             $routput_type_sequence, $routput_indent_flag,
1577         ];
1578
1579         my $rTV3 = [
1580             $in_quote,        $quote_type,
1581             $quote_character, $quote_pos,
1582             $quote_depth,     $quoted_string_1,
1583             $quoted_string_2, $allowed_quote_modifiers,
1584         ];
1585
1586         my $rTV4 = [ $id_scan_state, $identifier, $want_paren ];
1587
1588         my $rTV5 = [
1589             $nesting_token_string,      $nesting_type_string,
1590             $nesting_block_string,      $nesting_block_flag,
1591             $nesting_list_string,       $nesting_list_flag,
1592             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1593             $in_statement_continuation, $level_in_tokenizer,
1594             $slevel_in_tokenizer,       $rslevel_stack,
1595         ];
1596
1597         my $rTV6 = [
1598             $last_nonblank_container_type,
1599             $last_nonblank_type_sequence,
1600             $last_last_nonblank_token,
1601             $last_last_nonblank_type,
1602             $last_last_nonblank_block_type,
1603             $last_last_nonblank_container_type,
1604             $last_last_nonblank_type_sequence,
1605             $last_nonblank_prototype,
1606         ];
1607         return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1608     } ## end sub save_tokenizer_state
1609
1610     sub restore_tokenizer_state {
1611         my ($rstate) = @_;
1612         my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1613         (
1614             $block_type,        $container_type,    $expecting,
1615             $i,                 $i_tok,             $input_line,
1616             $input_line_number, $last_nonblank_i,   $max_token_index,
1617             $next_tok,          $next_type,         $peeked_ahead,
1618             $prototype,         $rhere_target_list, $rtoken_map,
1619             $rtoken_type,       $rtokens,           $tok,
1620             $type,              $type_sequence,     $indent_flag,
1621         ) = @{$rTV1};
1622
1623         (
1624             $routput_token_list,    $routput_token_type,
1625             $routput_block_type,    $routput_container_type,
1626             $routput_type_sequence, $routput_indent_flag,
1627         ) = @{$rTV2};
1628
1629         (
1630             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1631             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1632         ) = @{$rTV3};
1633
1634         ( $id_scan_state, $identifier, $want_paren ) = @{$rTV4};
1635
1636         (
1637             $nesting_token_string,      $nesting_type_string,
1638             $nesting_block_string,      $nesting_block_flag,
1639             $nesting_list_string,       $nesting_list_flag,
1640             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1641             $in_statement_continuation, $level_in_tokenizer,
1642             $slevel_in_tokenizer,       $rslevel_stack,
1643         ) = @{$rTV5};
1644
1645         (
1646             $last_nonblank_container_type,
1647             $last_nonblank_type_sequence,
1648             $last_last_nonblank_token,
1649             $last_last_nonblank_type,
1650             $last_last_nonblank_block_type,
1651             $last_last_nonblank_container_type,
1652             $last_last_nonblank_type_sequence,
1653             $last_nonblank_prototype,
1654         ) = @{$rTV6};
1655         return;
1656     } ## end sub restore_tokenizer_state
1657
1658     sub split_pretoken {
1659
1660         my ($numc) = @_;
1661
1662      # Split the leading $numc characters from the current token (at index=$i)
1663      # which is pre-type 'w' and insert the remainder back into the pretoken
1664      # stream with appropriate settings.  Since we are splitting a pre-type 'w',
1665      # there are three cases, depending on if the remainder starts with a digit:
1666      # Case 1: remainder is type 'd', all digits
1667      # Case 2: remainder is type 'd' and type 'w': digits and other characters
1668      # Case 3: remainder is type 'w'
1669
1670         # Examples, for $numc=1:
1671         #   $tok    => $tok_0 $tok_1 $tok_2
1672         #   'x10'   => 'x'    '10'                # case 1
1673         #   'x10if' => 'x'    '10'   'if'         # case 2
1674         #   '0ne    => 'O'            'ne'        # case 3
1675
1676         # where:
1677         #   $tok_1 is a possible string of digits (pre-type 'd')
1678         #   $tok_2 is a possible word (pre-type 'w')
1679
1680         # return 1 if successful
1681         # return undef if error (shouldn't happen)
1682
1683         # Calling routine should update '$type' and '$tok' if successful.
1684
1685         my $pretoken = $rtokens->[$i];
1686         if (   $pretoken
1687             && length($pretoken) > $numc
1688             && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1689         {
1690
1691             # Split $tok into up to 3 tokens:
1692             my $tok_0 = substr( $pretoken, 0, $numc );
1693             my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1694             my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1695
1696             my $len_0 = length($tok_0);
1697             my $len_1 = length($tok_1);
1698             my $len_2 = length($tok_2);
1699
1700             my $pre_type_0 = 'w';
1701             my $pre_type_1 = 'd';
1702             my $pre_type_2 = 'w';
1703
1704             my $pos_0 = $rtoken_map->[$i];
1705             my $pos_1 = $pos_0 + $len_0;
1706             my $pos_2 = $pos_1 + $len_1;
1707
1708             my $isplice = $i + 1;
1709
1710             # Splice in any digits
1711             if ($len_1) {
1712                 splice @{$rtoken_map},  $isplice, 0, $pos_1;
1713                 splice @{$rtokens},     $isplice, 0, $tok_1;
1714                 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1715                 $max_token_index++;
1716                 $isplice++;
1717             }
1718
1719             # Splice in any trailing word
1720             if ($len_2) {
1721                 splice @{$rtoken_map},  $isplice, 0, $pos_2;
1722                 splice @{$rtokens},     $isplice, 0, $tok_2;
1723                 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1724                 $max_token_index++;
1725             }
1726
1727             $rtokens->[$i] = $tok_0;
1728             return 1;
1729         }
1730         else {
1731
1732             # Shouldn't get here
1733             if (DEVEL_MODE) {
1734                 Fault(<<EOM);
1735 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1736 EOM
1737             }
1738         }
1739         return;
1740     } ## end sub split_pretoken
1741
1742     sub get_indentation_level {
1743         return $level_in_tokenizer;
1744     }
1745
1746     sub reset_indentation_level {
1747         $level_in_tokenizer = $slevel_in_tokenizer = shift;
1748         push @{$rslevel_stack}, $slevel_in_tokenizer;
1749         return;
1750     }
1751
1752     sub peeked_ahead {
1753         my $flag = shift;
1754         $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1755         return $peeked_ahead;
1756     }
1757
1758     # ------------------------------------------------------------
1759     # end of tokenizer variable access and manipulation routines
1760     # ------------------------------------------------------------
1761
1762     #------------------------------
1763     # beginning of tokenizer hashes
1764     #------------------------------
1765
1766     my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1767
1768     # These block types terminate statements and do not need a trailing
1769     # semicolon
1770     # patched for SWITCH/CASE/
1771     my %is_zero_continuation_block_type;
1772     my @q;
1773     @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1774       if elsif else unless while until for foreach switch case given when);
1775     @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1776
1777     my %is_logical_container;
1778     @q = qw(if elsif unless while and or err not && !  || for foreach);
1779     @is_logical_container{@q} = (1) x scalar(@q);
1780
1781     my %is_binary_type;
1782     @q = qw(|| &&);
1783     @is_binary_type{@q} = (1) x scalar(@q);
1784
1785     my %is_binary_keyword;
1786     @q = qw(and or err eq ne cmp);
1787     @is_binary_keyword{@q} = (1) x scalar(@q);
1788
1789     # 'L' is token for opening { at hash key
1790     my %is_opening_type;
1791     @q = qw< L { ( [ >;
1792     @is_opening_type{@q} = (1) x scalar(@q);
1793
1794     # 'R' is token for closing } at hash key
1795     my %is_closing_type;
1796     @q = qw< R } ) ] >;
1797     @is_closing_type{@q} = (1) x scalar(@q);
1798
1799     my %is_redo_last_next_goto;
1800     @q = qw(redo last next goto);
1801     @is_redo_last_next_goto{@q} = (1) x scalar(@q);
1802
1803     my %is_use_require;
1804     @q = qw(use require);
1805     @is_use_require{@q} = (1) x scalar(@q);
1806
1807     # This hash holds the array index in $tokenizer_self for these keywords:
1808     # Fix for issue c035: removed 'format' from this hash
1809     my %is_END_DATA = (
1810         '__END__'  => _in_end_,
1811         '__DATA__' => _in_data_,
1812     );
1813
1814     my %is_list_end_type;
1815     @q = qw( ; { } );
1816     push @q, ',';
1817     @is_list_end_type{@q} = (1) x scalar(@q);
1818
1819     # original ref: camel 3 p 147,
1820     # but perl may accept undocumented flags
1821     # perl 5.10 adds 'p' (preserve)
1822     # Perl version 5.22 added 'n'
1823     # From http://perldoc.perl.org/perlop.html we have
1824     # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
1825     # s/PATTERN/REPLACEMENT/msixpodualngcer
1826     # y/SEARCHLIST/REPLACEMENTLIST/cdsr
1827     # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
1828     # qr/STRING/msixpodualn
1829     my %quote_modifiers = (
1830         's'  => '[msixpodualngcer]',
1831         'y'  => '[cdsr]',
1832         'tr' => '[cdsr]',
1833         'm'  => '[msixpodualngc]',
1834         'qr' => '[msixpodualn]',
1835         'q'  => EMPTY_STRING,
1836         'qq' => EMPTY_STRING,
1837         'qw' => EMPTY_STRING,
1838         'qx' => EMPTY_STRING,
1839     );
1840
1841     # table showing how many quoted things to look for after quote operator..
1842     # s, y, tr have 2 (pattern and replacement)
1843     # others have 1 (pattern only)
1844     my %quote_items = (
1845         's'  => 2,
1846         'y'  => 2,
1847         'tr' => 2,
1848         'm'  => 1,
1849         'qr' => 1,
1850         'q'  => 1,
1851         'qq' => 1,
1852         'qw' => 1,
1853         'qx' => 1,
1854     );
1855
1856     my %is_for_foreach;
1857     @q = qw(for foreach);
1858     @is_for_foreach{@q} = (1) x scalar(@q);
1859
1860     my %is_my_our_state;
1861     @q = qw(my our state);
1862     @is_my_our_state{@q} = (1) x scalar(@q);
1863
1864     # These keywords may introduce blocks after parenthesized expressions,
1865     # in the form:
1866     # keyword ( .... ) { BLOCK }
1867     # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1868     my %is_blocktype_with_paren;
1869     @q =
1870       qw(if elsif unless while until for foreach switch case given when catch);
1871     @is_blocktype_with_paren{@q} = (1) x scalar(@q);
1872
1873     my %is_case_default;
1874     @q = qw(case default);
1875     @is_case_default{@q} = (1) x scalar(@q);
1876
1877     #------------------------
1878     # end of tokenizer hashes
1879     #------------------------
1880
1881     # ------------------------------------------------------------
1882     # beginning of various scanner interface routines
1883     # ------------------------------------------------------------
1884     sub scan_replacement_text {
1885
1886         # check for here-docs in replacement text invoked by
1887         # a substitution operator with executable modifier 'e'.
1888         #
1889         # given:
1890         #  $replacement_text
1891         # return:
1892         #  $rht = reference to any here-doc targets
1893         my ($replacement_text) = @_;
1894
1895         # quick check
1896         return unless ( $replacement_text =~ /<</ );
1897
1898         write_logfile_entry("scanning replacement text for here-doc targets\n");
1899
1900         # save the logger object for error messages
1901         my $logger_object = $tokenizer_self->[_logger_object_];
1902
1903         # localize all package variables
1904         local (
1905             $tokenizer_self,                 $last_nonblank_token,
1906             $last_nonblank_type,             $last_nonblank_block_type,
1907             $statement_type,                 $in_attribute_list,
1908             $current_package,                $context,
1909             %is_constant,                    %is_user_function,
1910             %user_function_prototype,        %is_block_function,
1911             %is_block_list_function,         %saw_function_definition,
1912             $brace_depth,                    $paren_depth,
1913             $square_bracket_depth,           @current_depth,
1914             @total_depth,                    $total_depth,
1915             @nesting_sequence_number,        @current_sequence_number,
1916             @paren_type,                     @paren_semicolon_count,
1917             @paren_structural_type,          @brace_type,
1918             @brace_structural_type,          @brace_context,
1919             @brace_package,                  @square_bracket_type,
1920             @square_bracket_structural_type, @depth_array,
1921             @starting_line_of_current_depth, @nested_ternary_flag,
1922             @nested_statement_type,          $next_sequence_number,
1923         );
1924
1925         # save all lexical variables
1926         my $rstate = save_tokenizer_state();
1927         _decrement_count();    # avoid error check for multiple tokenizers
1928
1929         # make a new tokenizer
1930         my $rOpts         = {};
1931         my $source_object = Perl::Tidy::LineSource->new(
1932             input_file => \$replacement_text,
1933             rOpts      => $rOpts,
1934         );
1935         my $tokenizer = Perl::Tidy::Tokenizer->new(
1936             source_object        => $source_object,
1937             logger_object        => $logger_object,
1938             starting_line_number => $input_line_number,
1939         );
1940
1941         # scan the replacement text
1942         1 while ( $tokenizer->get_line() );
1943
1944         # remove any here doc targets
1945         my $rht = undef;
1946         if ( $tokenizer_self->[_in_here_doc_] ) {
1947             $rht = [];
1948             push @{$rht},
1949               [
1950                 $tokenizer_self->[_here_doc_target_],
1951                 $tokenizer_self->[_here_quote_character_]
1952               ];
1953             if ( $tokenizer_self->[_rhere_target_list_] ) {
1954                 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1955                 $tokenizer_self->[_rhere_target_list_] = undef;
1956             }
1957             $tokenizer_self->[_in_here_doc_] = undef;
1958         }
1959
1960         # now its safe to report errors
1961         my $severe_error = $tokenizer->report_tokenization_errors();
1962
1963         # TODO: Could propagate a severe error up
1964
1965         # restore all tokenizer lexical variables
1966         restore_tokenizer_state($rstate);
1967
1968         # return the here doc targets
1969         return $rht;
1970     } ## end sub scan_replacement_text
1971
1972     sub scan_bare_identifier {
1973         ( $i, $tok, $type, $prototype ) =
1974           scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1975             $rtoken_map, $max_token_index );
1976         return;
1977     }
1978
1979     sub scan_identifier {
1980         (
1981             $i, $tok, $type, $id_scan_state, $identifier,
1982             my $split_pretoken_flag
1983           )
1984           = scan_complex_identifier( $i, $id_scan_state, $identifier, $rtokens,
1985             $max_token_index, $expecting, $paren_type[$paren_depth] );
1986
1987         # Check for signal to fix a special variable adjacent to a keyword,
1988         # such as '$^One$0'.
1989         if ($split_pretoken_flag) {
1990
1991             # Try to fix it by splitting the pretoken
1992             if (   $i > 0
1993                 && $rtokens->[ $i - 1 ] eq '^'
1994                 && split_pretoken(1) )
1995             {
1996                 $identifier = substr( $identifier, 0, 3 );
1997                 $tok        = $identifier;
1998             }
1999             else {
2000
2001                 # This shouldn't happen ...
2002                 my $var    = substr( $tok, 0, 3 );
2003                 my $excess = substr( $tok, 3 );
2004                 interrupt_logfile();
2005                 warning(<<EOM);
2006 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2007 A space may be needed after '$var'.
2008 EOM
2009                 resume_logfile();
2010             }
2011         }
2012         return;
2013     } ## end sub scan_identifier
2014
2015     use constant VERIFY_FASTSCAN => 0;
2016     my %fast_scan_context;
2017
2018     BEGIN {
2019         %fast_scan_context = (
2020             '$' => SCALAR_CONTEXT,
2021             '*' => SCALAR_CONTEXT,
2022             '@' => LIST_CONTEXT,
2023             '%' => LIST_CONTEXT,
2024             '&' => UNKNOWN_CONTEXT,
2025         );
2026     }
2027
2028     sub scan_simple_identifier {
2029
2030         # This is a wrapper for sub scan_identifier. It does a fast preliminary
2031         # scan for certain common identifiers:
2032         #   '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2033         # If it does not find one of these, or this is a restart, it calls the
2034         # original scanner directly.
2035
2036         # This gives the same results as the full scanner in about 1/4 the
2037         # total runtime for a typical input stream.
2038
2039         # Notation:
2040         #     $var * 2
2041         #     ^^   ^
2042         #     ||  |
2043         #     ||  ---- $i_next [= next nonblank pretoken ]
2044         #     |----$i_plus_1 [= a bareword ]
2045         #     ---$i_begin [= a sigil]
2046
2047         my $i_begin   = $i;
2048         my $tok_begin = $tok;
2049         my $i_plus_1  = $i + 1;
2050         my $fast_scan_type;
2051
2052         #-------------------------------------------------------
2053         # Do full scan for anything following a pointer, such as
2054         #      $cref->&*;    # a postderef
2055         #-------------------------------------------------------
2056         if ( $last_nonblank_token eq '->' ) {
2057
2058         }
2059
2060         #------------------------------
2061         # quick scan with leading sigil
2062         #------------------------------
2063         elsif ( !$id_scan_state
2064             && $i_plus_1 <= $max_token_index
2065             && $fast_scan_context{$tok} )
2066         {
2067             $context = $fast_scan_context{$tok};
2068
2069             # look for $var, @var, ...
2070             if ( $rtoken_type->[$i_plus_1] eq 'w' ) {
2071                 my $pretype_next = EMPTY_STRING;
2072                 if ( $i_plus_1 < $max_token_index ) {
2073                     my $i_next = $i_plus_1 + 1;
2074                     if (   $rtoken_type->[$i_next] eq 'b'
2075                         && $i_next < $max_token_index )
2076                     {
2077                         $i_next += 1;
2078                     }
2079                     $pretype_next = $rtoken_type->[$i_next];
2080                 }
2081                 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2082
2083                     # Found type 'i' like '$var', '@var', or '%var'
2084                     $identifier     = $tok . $rtokens->[$i_plus_1];
2085                     $tok            = $identifier;
2086                     $type           = 'i';
2087                     $i              = $i_plus_1;
2088                     $fast_scan_type = $type;
2089                 }
2090             }
2091
2092             # Look for @{ or %{  .
2093             # But we must let the full scanner handle things ${ because it may
2094             # keep going to get a complete identifier like '${#}'  .
2095             elsif (
2096                 $rtoken_type->[$i_plus_1] eq '{'
2097                 && (   $tok_begin eq '@'
2098                     || $tok_begin eq '%' )
2099               )
2100             {
2101
2102                 $identifier     = $tok;
2103                 $type           = 't';
2104                 $fast_scan_type = $type;
2105             }
2106         }
2107
2108         #---------------------------
2109         # Quick scan with leading ->
2110         # Look for ->[ and ->{
2111         #---------------------------
2112         elsif (
2113                $tok eq '->'
2114             && $i < $max_token_index
2115             && (   $rtokens->[$i_plus_1] eq '{'
2116                 || $rtokens->[$i_plus_1] eq '[' )
2117           )
2118         {
2119             $type           = $tok;
2120             $fast_scan_type = $type;
2121             $identifier     = $tok;
2122             $context        = UNKNOWN_CONTEXT;
2123         }
2124
2125         #--------------------------------------
2126         # Verify correctness during development
2127         #--------------------------------------
2128         if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2129
2130             # We will call the full method
2131             my $identifier_simple = $identifier;
2132             my $tok_simple        = $tok;
2133             my $i_simple          = $i;
2134             my $context_simple    = $context;
2135
2136             $tok = $tok_begin;
2137             $i   = $i_begin;
2138             scan_identifier();
2139
2140             if (   $tok ne $tok_simple
2141                 || $type ne $fast_scan_type
2142                 || $i != $i_simple
2143                 || $identifier ne $identifier_simple
2144                 || $id_scan_state
2145                 || $context ne $context_simple )
2146             {
2147                 print STDERR <<EOM;
2148 scan_simple_identifier differs from scan_identifier:
2149 simple:  i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2150 full:    i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2151 EOM
2152             }
2153         }
2154
2155         #-------------------------------------------------
2156         # call full scanner if fast method did not succeed
2157         #-------------------------------------------------
2158         if ( !$fast_scan_type ) {
2159             scan_identifier();
2160         }
2161         return;
2162     } ## end sub scan_simple_identifier
2163
2164     sub scan_id {
2165         ( $i, $tok, $type, $id_scan_state ) =
2166           scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2167             $id_scan_state, $max_token_index );
2168         return;
2169     }
2170
2171     sub scan_number {
2172         my $number;
2173         ( $i, $type, $number ) =
2174           scan_number_do( $input_line, $i, $rtoken_map, $type,
2175             $max_token_index );
2176         return $number;
2177     }
2178
2179     use constant VERIFY_FASTNUM => 0;
2180
2181     sub scan_number_fast {
2182
2183         # This is a wrapper for sub scan_number. It does a fast preliminary
2184         # scan for a simple integer.  It calls the original scan_number if it
2185         # does not find one.
2186
2187         my $i_begin   = $i;
2188         my $tok_begin = $tok;
2189         my $number;
2190
2191         #---------------------------------
2192         # Quick check for (signed) integer
2193         #---------------------------------
2194
2195         # This will be the string of digits:
2196         my $i_d   = $i;
2197         my $tok_d = $tok;
2198         my $typ_d = $rtoken_type->[$i_d];
2199
2200         # check for signed integer
2201         my $sign = EMPTY_STRING;
2202         if (   $typ_d ne 'd'
2203             && ( $typ_d eq '+' || $typ_d eq '-' )
2204             && $i_d < $max_token_index )
2205         {
2206             $sign = $tok_d;
2207             $i_d++;
2208             $tok_d = $rtokens->[$i_d];
2209             $typ_d = $rtoken_type->[$i_d];
2210         }
2211
2212         # Handle integers
2213         if (
2214             $typ_d eq 'd'
2215             && (
2216                 $i_d == $max_token_index
2217                 || (   $i_d < $max_token_index
2218                     && $rtoken_type->[ $i_d + 1 ] ne '.'
2219                     && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2220             )
2221           )
2222         {
2223             # Let let full scanner handle multi-digit integers beginning with
2224             # '0' because there could be error messages.  For example, '009' is
2225             # not a valid number.
2226
2227             if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2228                 $number = $sign . $tok_d;
2229                 $type   = 'n';
2230                 $i      = $i_d;
2231             }
2232         }
2233
2234         #--------------------------------------
2235         # Verify correctness during development
2236         #--------------------------------------
2237         if ( VERIFY_FASTNUM && defined($number) ) {
2238
2239             # We will call the full method
2240             my $type_simple   = $type;
2241             my $i_simple      = $i;
2242             my $number_simple = $number;
2243
2244             $tok    = $tok_begin;
2245             $i      = $i_begin;
2246             $number = scan_number();
2247
2248             if (   $type ne $type_simple
2249                 || ( $i != $i_simple && $i <= $max_token_index )
2250                 || $number ne $number_simple )
2251             {
2252                 print STDERR <<EOM;
2253 scan_number_fast differs from scan_number:
2254 simple:  i=$i_simple, type=$type_simple, number=$number_simple
2255 full:  i=$i, type=$type, number=$number
2256 EOM
2257             }
2258         }
2259
2260         #----------------------------------------
2261         # call full scanner if may not be integer
2262         #----------------------------------------
2263         if ( !defined($number) ) {
2264             $number = scan_number();
2265         }
2266         return $number;
2267     } ## end sub scan_number_fast
2268
2269     # a sub to warn if token found where term expected
2270     sub error_if_expecting_TERM {
2271         if ( $expecting == TERM ) {
2272             if ( $really_want_term{$last_nonblank_type} ) {
2273                 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2274                     $rtoken_map, $rtoken_type, $input_line );
2275                 return 1;
2276             }
2277         }
2278         return;
2279     } ## end sub error_if_expecting_TERM
2280
2281     # a sub to warn if token found where operator expected
2282     sub error_if_expecting_OPERATOR {
2283         my $thing = shift;
2284         if ( $expecting == OPERATOR ) {
2285             if ( !defined($thing) ) { $thing = $tok }
2286             report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2287                 $rtoken_map, $rtoken_type, $input_line );
2288             if ( $i_tok == 0 ) {
2289                 interrupt_logfile();
2290                 warning("Missing ';' or ',' above?\n");
2291                 resume_logfile();
2292             }
2293             return 1;
2294         }
2295         return;
2296     } ## end sub error_if_expecting_OPERATOR
2297
2298     # ------------------------------------------------------------
2299     # end scanner interfaces
2300     # ------------------------------------------------------------
2301
2302     #------------------
2303     # Tokenization subs
2304     #------------------
2305     sub do_GREATER_THAN_SIGN {
2306
2307         # '>'
2308         error_if_expecting_TERM()
2309           if ( $expecting == TERM );
2310         return;
2311     }
2312
2313     sub do_VERTICAL_LINE {
2314
2315         # '|'
2316         error_if_expecting_TERM()
2317           if ( $expecting == TERM );
2318         return;
2319     }
2320
2321     sub do_DOLLAR_SIGN {
2322
2323         # '$'
2324         # start looking for a scalar
2325         error_if_expecting_OPERATOR("Scalar")
2326           if ( $expecting == OPERATOR );
2327         scan_simple_identifier();
2328
2329         if ( $identifier eq '$^W' ) {
2330             $tokenizer_self->[_saw_perl_dash_w_] = 1;
2331         }
2332
2333         # Check for identifier in indirect object slot
2334         # (vorboard.pl, sort.t).  Something like:
2335         #   /^(print|printf|sort|exec|system)$/
2336         if (
2337                $is_indirect_object_taker{$last_nonblank_token}
2338             && $last_nonblank_type eq 'k'
2339             || ( ( $last_nonblank_token eq '(' )
2340                 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2341             || (   $last_nonblank_type eq 'w'
2342                 || $last_nonblank_type eq 'U' )    # possible object
2343           )
2344         {
2345
2346             # An identifier followed by '->' is not indirect object;
2347             # fixes b1175, b1176
2348             my ( $next_nonblank_type, $i_next ) =
2349               find_next_noncomment_type( $i, $rtokens, $max_token_index );
2350             $type = 'Z' if ( $next_nonblank_type ne '->' );
2351         }
2352         return;
2353     } ## end sub do_DOLLAR_SIGN
2354
2355     sub do_LEFT_PARENTHESIS {
2356
2357         # '('
2358         ++$paren_depth;
2359         $paren_semicolon_count[$paren_depth] = 0;
2360         if ($want_paren) {
2361             $container_type = $want_paren;
2362             $want_paren     = EMPTY_STRING;
2363         }
2364         elsif ( $statement_type =~ /^sub\b/ ) {
2365             $container_type = $statement_type;
2366         }
2367         else {
2368             $container_type = $last_nonblank_token;
2369
2370             # We can check for a syntax error here of unexpected '(',
2371             # but this is going to get messy...
2372             if (
2373                 $expecting == OPERATOR
2374
2375                 # Be sure this is not a method call of the form
2376                 # &method(...), $method->(..), &{method}(...),
2377                 # $ref[2](list) is ok & short for $ref[2]->(list)
2378                 # NOTE: at present, braces in something like &{ xxx }
2379                 # are not marked as a block, we might have a method call.
2380                 # Added ')' to fix case c017, something like ()()()
2381                 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2382               )
2383             {
2384
2385                 # ref: camel 3 p 703.
2386                 if ( $last_last_nonblank_token eq 'do' ) {
2387                     complain(
2388 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2389                     );
2390                 }
2391                 else {
2392
2393                     # if this is an empty list, (), then it is not an
2394                     # error; for example, we might have a constant pi and
2395                     # invoke it with pi() or just pi;
2396                     my ( $next_nonblank_token, $i_next ) =
2397                       find_next_nonblank_token( $i, $rtokens,
2398                         $max_token_index );
2399
2400                     # Patch for c029: give up error check if
2401                     # a side comment follows
2402                     if (   $next_nonblank_token ne ')'
2403                         && $next_nonblank_token ne '#' )
2404                     {
2405                         my $hint;
2406
2407                         error_if_expecting_OPERATOR('(');
2408
2409                         if ( $last_nonblank_type eq 'C' ) {
2410                             $hint =
2411                               "$last_nonblank_token has a void prototype\n";
2412                         }
2413                         elsif ( $last_nonblank_type eq 'i' ) {
2414                             if (   $i_tok > 0
2415                                 && $last_nonblank_token =~ /^\$/ )
2416                             {
2417                                 $hint =
2418                                   "Do you mean '$last_nonblank_token->(' ?\n";
2419                             }
2420                         }
2421                         if ($hint) {
2422                             interrupt_logfile();
2423                             warning($hint);
2424                             resume_logfile();
2425                         }
2426                     } ## end if ( $next_nonblank_token...
2427                 } ## end else [ if ( $last_last_nonblank_token...
2428             } ## end if ( $expecting == OPERATOR...
2429         }
2430
2431         # Do not update container type at ') ('; fix for git #105.  This will
2432         # propagate the container type onward so that any subsequent brace gets
2433         # correctly marked.  I have implemented this as a general rule, which
2434         # should be safe, but if necessary it could be restricted to certain
2435         # container statement types such as 'for'.
2436         $paren_type[$paren_depth] = $container_type
2437           if ( $last_nonblank_token ne ')' );
2438
2439         ( $type_sequence, $indent_flag ) =
2440           increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2441
2442         # propagate types down through nested parens
2443         # for example: the second paren in 'if ((' would be structural
2444         # since the first is.
2445
2446         if ( $last_nonblank_token eq '(' ) {
2447             $type = $last_nonblank_type;
2448         }
2449
2450         #     We exclude parens as structural after a ',' because it
2451         #     causes subtle problems with continuation indentation for
2452         #     something like this, where the first 'or' will not get
2453         #     indented.
2454         #
2455         #         assert(
2456         #             __LINE__,
2457         #             ( not defined $check )
2458         #               or ref $check
2459         #               or $check eq "new"
2460         #               or $check eq "old",
2461         #         );
2462         #
2463         #     Likewise, we exclude parens where a statement can start
2464         #     because of problems with continuation indentation, like
2465         #     these:
2466         #
2467         #         ($firstline =~ /^#\!.*perl/)
2468         #         and (print $File::Find::name, "\n")
2469         #           and (return 1);
2470         #
2471         #         (ref($usage_fref) =~ /CODE/)
2472         #         ? &$usage_fref
2473         #           : (&blast_usage, &blast_params, &blast_general_params);
2474
2475         else {
2476             $type = '{';
2477         }
2478
2479         if ( $last_nonblank_type eq ')' ) {
2480             warning(
2481                 "Syntax error? found token '$last_nonblank_type' then '('\n");
2482         }
2483         $paren_structural_type[$paren_depth] = $type;
2484         return;
2485
2486     } ## end sub do_LEFT_PARENTHESIS
2487
2488     sub do_RIGHT_PARENTHESIS {
2489
2490         # ')'
2491         ( $type_sequence, $indent_flag ) =
2492           decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2493
2494         if ( $paren_structural_type[$paren_depth] eq '{' ) {
2495             $type = '}';
2496         }
2497
2498         $container_type = $paren_type[$paren_depth];
2499
2500         # restore statement type as 'sub' at closing paren of a signature
2501         # so that a subsequent ':' is identified as an attribute
2502         if ( $container_type =~ /^sub\b/ ) {
2503             $statement_type = $container_type;
2504         }
2505
2506         #    /^(for|foreach)$/
2507         if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2508             my $num_sc = $paren_semicolon_count[$paren_depth];
2509             if ( $num_sc > 0 && $num_sc != 2 ) {
2510                 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2511             }
2512         }
2513
2514         if ( $paren_depth > 0 ) { $paren_depth-- }
2515         return;
2516     } ## end sub do_RIGHT_PARENTHESIS
2517
2518     sub do_COMMA {
2519
2520         # ','
2521         if ( $last_nonblank_type eq ',' ) {
2522             complain("Repeated ','s \n");
2523         }
2524
2525         # Note that we have to check both token and type here because a
2526         # comma following a qw list can have last token='(' but type = 'q'
2527         elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2528             warning("Unexpected leading ',' after a '('\n");
2529         }
2530
2531         # patch for operator_expected: note if we are in the list (use.t)
2532         if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2533         return;
2534
2535     } ## end sub do_COMMA
2536
2537     sub do_SEMICOLON {
2538
2539         # ';'
2540         $context        = UNKNOWN_CONTEXT;
2541         $statement_type = EMPTY_STRING;
2542         $want_paren     = EMPTY_STRING;
2543
2544         #    /^(for|foreach)$/
2545         if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2546         {    # mark ; in for loop
2547
2548             # Be careful: we do not want a semicolon such as the
2549             # following to be included:
2550             #
2551             #    for (sort {strcoll($a,$b);} keys %investments) {
2552
2553             if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2554                 && $square_bracket_depth ==
2555                 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2556             {
2557
2558                 $type = 'f';
2559                 $paren_semicolon_count[$paren_depth]++;
2560             }
2561         }
2562         return;
2563     } ## end sub do_SEMICOLON
2564
2565     sub do_QUOTATION_MARK {
2566
2567         # '"'
2568         error_if_expecting_OPERATOR("String")
2569           if ( $expecting == OPERATOR );
2570         $in_quote                = 1;
2571         $type                    = 'Q';
2572         $allowed_quote_modifiers = EMPTY_STRING;
2573         return;
2574     } ## end sub do_QUOTATION_MARK
2575
2576     sub do_APOSTROPHE {
2577
2578         # "'"
2579         error_if_expecting_OPERATOR("String")
2580           if ( $expecting == OPERATOR );
2581         $in_quote                = 1;
2582         $type                    = 'Q';
2583         $allowed_quote_modifiers = EMPTY_STRING;
2584         return;
2585     } ## end sub do_APOSTROPHE
2586
2587     sub do_BACKTICK {
2588
2589         # '`'
2590         error_if_expecting_OPERATOR("String")
2591           if ( $expecting == OPERATOR );
2592         $in_quote                = 1;
2593         $type                    = 'Q';
2594         $allowed_quote_modifiers = EMPTY_STRING;
2595         return;
2596     } ## end sub do_BACKTICK
2597
2598     sub do_SLASH {
2599
2600         # '/'
2601         my $is_pattern;
2602
2603         # a pattern cannot follow certain keywords which take optional
2604         # arguments, like 'shift' and 'pop'. See also '?'.
2605         if (
2606             $last_nonblank_type eq 'k'
2607             && $is_keyword_rejecting_slash_as_pattern_delimiter{
2608                 $last_nonblank_token}
2609           )
2610         {
2611             $is_pattern = 0;
2612         }
2613         elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
2614             my $msg;
2615             ( $is_pattern, $msg ) =
2616               guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2617                 $max_token_index );
2618
2619             if ($msg) {
2620                 write_diagnostics("DIVIDE:$msg\n");
2621                 write_logfile_entry($msg);
2622             }
2623         }
2624         else { $is_pattern = ( $expecting == TERM ) }
2625
2626         if ($is_pattern) {
2627             $in_quote                = 1;
2628             $type                    = 'Q';
2629             $allowed_quote_modifiers = '[msixpodualngc]';
2630         }
2631         else {    # not a pattern; check for a /= token
2632
2633             if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
2634                 $i++;
2635                 $tok  = '/=';
2636                 $type = $tok;
2637             }
2638
2639            #DEBUG - collecting info on what tokens follow a divide
2640            # for development of guessing algorithm
2641            #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2642            #    #write_diagnostics( "DIVIDE? $input_line\n" );
2643            #}
2644         }
2645         return;
2646     } ## end sub do_SLASH
2647
2648     sub do_LEFT_CURLY_BRACKET {
2649
2650         # '{'
2651         # if we just saw a ')', we will label this block with
2652         # its type.  We need to do this to allow sub
2653         # code_block_type to determine if this brace starts a
2654         # code block or anonymous hash.  (The type of a paren
2655         # pair is the preceding token, such as 'if', 'else',
2656         # etc).
2657         $container_type = EMPTY_STRING;
2658
2659         # ATTRS: for a '{' following an attribute list, reset
2660         # things to look like we just saw the sub name
2661         if ( $statement_type =~ /^sub\b/ ) {
2662             $last_nonblank_token = $statement_type;
2663             $last_nonblank_type  = 'i';
2664             $statement_type      = EMPTY_STRING;
2665         }
2666
2667         # patch for SWITCH/CASE: hide these keywords from an immediately
2668         # following opening brace
2669         elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2670             && $statement_type eq $last_nonblank_token )
2671         {
2672             $last_nonblank_token = ";";
2673         }
2674
2675         elsif ( $last_nonblank_token eq ')' ) {
2676             $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2677
2678             # defensive move in case of a nesting error (pbug.t)
2679             # in which this ')' had no previous '('
2680             # this nesting error will have been caught
2681             if ( !defined($last_nonblank_token) ) {
2682                 $last_nonblank_token = 'if';
2683             }
2684
2685             # check for syntax error here;
2686             unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2687                 if ( $tokenizer_self->[_extended_syntax_] ) {
2688
2689                     # we append a trailing () to mark this as an unknown
2690                     # block type.  This allows perltidy to format some
2691                     # common extensions of perl syntax.
2692                     # This is used by sub code_block_type
2693                     $last_nonblank_token .= '()';
2694                 }
2695                 else {
2696                     my $list =
2697                       join( SPACE, sort keys %is_blocktype_with_paren );
2698                     warning(
2699 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2700                     );
2701                 }
2702             }
2703         }
2704
2705         # patch for paren-less for/foreach glitch, part 2.
2706         # see note below under 'qw'
2707         elsif ($last_nonblank_token eq 'qw'
2708             && $is_for_foreach{$want_paren} )
2709         {
2710             $last_nonblank_token = $want_paren;
2711             if ( $last_last_nonblank_token eq $want_paren ) {
2712                 warning(
2713 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2714                 );
2715
2716             }
2717             $want_paren = EMPTY_STRING;
2718         }
2719
2720         # now identify which of the three possible types of
2721         # curly braces we have: hash index container, anonymous
2722         # hash reference, or code block.
2723
2724         # non-structural (hash index) curly brace pair
2725         # get marked 'L' and 'R'
2726         if ( is_non_structural_brace() ) {
2727             $type = 'L';
2728
2729             # patch for SWITCH/CASE:
2730             # allow paren-less identifier after 'when'
2731             # if the brace is preceded by a space
2732             if (   $statement_type eq 'when'
2733                 && $last_nonblank_type eq 'i'
2734                 && $last_last_nonblank_type eq 'k'
2735                 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2736             {
2737                 $type       = '{';
2738                 $block_type = $statement_type;
2739             }
2740         }
2741
2742         # code and anonymous hash have the same type, '{', but are
2743         # distinguished by 'block_type',
2744         # which will be blank for an anonymous hash
2745         else {
2746
2747             $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2748                 $max_token_index );
2749
2750             # patch to promote bareword type to function taking block
2751             if (   $block_type
2752                 && $last_nonblank_type eq 'w'
2753                 && $last_nonblank_i >= 0 )
2754             {
2755                 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2756                     $routput_token_type->[$last_nonblank_i] =
2757                       $is_grep_alias{$block_type} ? 'k' : 'G';
2758                 }
2759             }
2760
2761             # patch for SWITCH/CASE: if we find a stray opening block brace
2762             # where we might accept a 'case' or 'when' block, then take it
2763             if (   $statement_type eq 'case'
2764                 || $statement_type eq 'when' )
2765             {
2766                 if ( !$block_type || $block_type eq '}' ) {
2767                     $block_type = $statement_type;
2768                 }
2769             }
2770         }
2771
2772         $brace_type[ ++$brace_depth ]        = $block_type;
2773         $brace_package[$brace_depth]         = $current_package;
2774         $brace_structural_type[$brace_depth] = $type;
2775         $brace_context[$brace_depth]         = $context;
2776         ( $type_sequence, $indent_flag ) =
2777           increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2778         return;
2779     } ## end sub do_LEFT_CURLY_BRACKET
2780
2781     sub do_RIGHT_CURLY_BRACKET {
2782
2783         # '}'
2784         $block_type = $brace_type[$brace_depth];
2785         if ($block_type) { $statement_type = EMPTY_STRING }
2786         if ( defined( $brace_package[$brace_depth] ) ) {
2787             $current_package = $brace_package[$brace_depth];
2788         }
2789
2790         # can happen on brace error (caught elsewhere)
2791         else {
2792         }
2793         ( $type_sequence, $indent_flag ) =
2794           decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2795
2796         if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2797             $type = 'R';
2798         }
2799
2800         # propagate type information for 'do' and 'eval' blocks, and also
2801         # for smartmatch operator.  This is necessary to enable us to know
2802         # if an operator or term is expected next.
2803         if ( $is_block_operator{$block_type} ) {
2804             $tok = $block_type;
2805         }
2806
2807         $context = $brace_context[$brace_depth];
2808         if ( $brace_depth > 0 ) { $brace_depth--; }
2809         return;
2810     } ## end sub do_RIGHT_CURLY_BRACKET
2811
2812     sub do_AMPERSAND {
2813
2814         # '&' = maybe sub call? start looking
2815         # We have to check for sub call unless we are sure we
2816         # are expecting an operator.  This example from s2p
2817         # got mistaken as a q operator in an early version:
2818         #   print BODY &q(<<'EOT');
2819         if ( $expecting != OPERATOR ) {
2820
2821             # But only look for a sub call if we are expecting a term or
2822             # if there is no existing space after the &.
2823             # For example we probably don't want & as sub call here:
2824             #    Fcntl::S_IRUSR & $mode;
2825             if ( $expecting == TERM || $next_type ne 'b' ) {
2826                 scan_simple_identifier();
2827             }
2828         }
2829         else {
2830         }
2831         return;
2832     } ## end sub do_AMPERSAND
2833
2834     sub do_LESS_THAN_SIGN {
2835
2836         # '<' - angle operator or less than?
2837         if ( $expecting != OPERATOR ) {
2838             ( $i, $type ) =
2839               find_angle_operator_termination( $input_line, $i, $rtoken_map,
2840                 $expecting, $max_token_index );
2841
2842             ##  This message is not very helpful and quite confusing if the above
2843             ##  routine decided not to write a message with the line number.
2844             ##  if ( $type eq '<' && $expecting == TERM ) {
2845             ##      error_if_expecting_TERM();
2846             ##      interrupt_logfile();
2847             ##      warning("Unterminated <> operator?\n");
2848             ##      resume_logfile();
2849             ##  }
2850
2851         }
2852         else {
2853         }
2854         return;
2855     } ## end sub do_LESS_THAN_SIGN
2856
2857     sub do_QUESTION_MARK {
2858
2859         # '?' = conditional or starting pattern?
2860         my $is_pattern;
2861
2862         # Patch for rt #126965
2863         # a pattern cannot follow certain keywords which take optional
2864         # arguments, like 'shift' and 'pop'. See also '/'.
2865         if (
2866             $last_nonblank_type eq 'k'
2867             && $is_keyword_rejecting_question_as_pattern_delimiter{
2868                 $last_nonblank_token}
2869           )
2870         {
2871             $is_pattern = 0;
2872         }
2873
2874         # patch for RT#131288, user constant function without prototype
2875         # last type is 'U' followed by ?.
2876         elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2877             $is_pattern = 0;
2878         }
2879         elsif ( $expecting == UNKNOWN ) {
2880
2881             # In older versions of Perl, a bare ? can be a pattern
2882             # delimiter.  In perl version 5.22 this was
2883             # dropped, but we have to support it in order to format
2884             # older programs. See:
2885             ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2886             # For example, the following line worked
2887             # at one time:
2888             #      ?(.*)? && (print $1,"\n");
2889             # In current versions it would have to be written with slashes:
2890             #      /(.*)/ && (print $1,"\n");
2891             my $msg;
2892             ( $is_pattern, $msg ) =
2893               guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2894                 $max_token_index );
2895
2896             if ($msg) { write_logfile_entry($msg) }
2897         }
2898         else { $is_pattern = ( $expecting == TERM ) }
2899
2900         if ($is_pattern) {
2901             $in_quote                = 1;
2902             $type                    = 'Q';
2903             $allowed_quote_modifiers = '[msixpodualngc]';
2904         }
2905         else {
2906             ( $type_sequence, $indent_flag ) =
2907               increase_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
2908         }
2909         return;
2910     } ## end sub do_QUESTION_MARK
2911
2912     sub do_STAR {
2913
2914         # '*' = typeglob, or multiply?
2915         if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2916             if (   $next_type ne 'b'
2917                 && $next_type ne '('
2918                 && $next_type ne '#' )    # Fix c036
2919             {
2920                 $expecting = TERM;
2921             }
2922         }
2923         if ( $expecting == TERM ) {
2924             scan_simple_identifier();
2925         }
2926         else {
2927
2928             if ( $rtokens->[ $i + 1 ] eq '=' ) {
2929                 $tok  = '*=';
2930                 $type = $tok;
2931                 $i++;
2932             }
2933             elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2934                 $tok  = '**';
2935                 $type = $tok;
2936                 $i++;
2937                 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2938                     $tok  = '**=';
2939                     $type = $tok;
2940                     $i++;
2941                 }
2942             }
2943         }
2944         return;
2945     } ## end sub do_STAR
2946
2947     sub do_DOT {
2948
2949         # '.' =  what kind of . ?
2950         if ( $expecting != OPERATOR ) {
2951             scan_number();
2952             if ( $type eq '.' ) {
2953                 error_if_expecting_TERM()
2954                   if ( $expecting == TERM );
2955             }
2956         }
2957         else {
2958         }
2959         return;
2960     } ## end sub do_DOT
2961
2962     sub do_COLON {
2963
2964         # ':' = label, ternary, attribute, ?
2965
2966         # if this is the first nonblank character, call it a label
2967         # since perl seems to just swallow it
2968         if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2969             $type = 'J';
2970         }
2971
2972         # ATTRS: check for a ':' which introduces an attribute list
2973         # either after a 'sub' keyword or within a paren list
2974         elsif ( $statement_type =~ /^sub\b/ ) {
2975             $type              = 'A';
2976             $in_attribute_list = 1;
2977         }
2978
2979         # Within a signature, unless we are in a ternary.  For example,
2980         # from 't/filter_example.t':
2981         #    method foo4 ( $class: $bar ) { $class->bar($bar) }
2982         elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2983             && !is_balanced_closing_container(QUESTION_COLON) )
2984         {
2985             $type              = 'A';
2986             $in_attribute_list = 1;
2987         }
2988
2989         # check for scalar attribute, such as
2990         # my $foo : shared = 1;
2991         elsif ($is_my_our_state{$statement_type}
2992             && $current_depth[QUESTION_COLON] == 0 )
2993         {
2994             $type              = 'A';
2995             $in_attribute_list = 1;
2996         }
2997
2998         # Look for Switch::Plain syntax if an error would otherwise occur
2999         # here. Note that we do not need to check if the extended syntax
3000         # flag is set because otherwise an error would occur, and we would
3001         # then have to output a message telling the user to set the
3002         # extended syntax flag to avoid the error.
3003         #  case 1: {
3004         #  default: {
3005         #  default:
3006         # Note that the line 'default:' will be parsed as a label elsewhere.
3007         elsif ( $is_case_default{$statement_type}
3008             && !is_balanced_closing_container(QUESTION_COLON) )
3009         {
3010             # mark it as a perltidy label type
3011             $type = 'J';
3012         }
3013
3014         # otherwise, it should be part of a ?/: operator
3015         else {
3016             ( $type_sequence, $indent_flag ) =
3017               decrease_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3018             if ( $last_nonblank_token eq '?' ) {
3019                 warning("Syntax error near ? :\n");
3020             }
3021         }
3022         return;
3023     } ## end sub do_COLON
3024
3025     sub do_PLUS_SIGN {
3026
3027         # '+' = what kind of plus?
3028         if ( $expecting == TERM ) {
3029             my $number = scan_number_fast();
3030
3031             # unary plus is safest assumption if not a number
3032             if ( !defined($number) ) { $type = 'p'; }
3033         }
3034         elsif ( $expecting == OPERATOR ) {
3035         }
3036         else {
3037             if ( $next_type eq 'w' ) { $type = 'p' }
3038         }
3039         return;
3040     } ## end sub do_PLUS_SIGN
3041
3042     sub do_AT_SIGN {
3043
3044         # '@' = sigil for array?
3045         error_if_expecting_OPERATOR("Array")
3046           if ( $expecting == OPERATOR );
3047         scan_simple_identifier();
3048         return;
3049     }
3050
3051     sub do_PERCENT_SIGN {
3052
3053         # '%' = hash or modulo?
3054         # first guess is hash if no following blank or paren
3055         if ( $expecting == UNKNOWN ) {
3056             if ( $next_type ne 'b' && $next_type ne '(' ) {
3057                 $expecting = TERM;
3058             }
3059         }
3060         if ( $expecting == TERM ) {
3061             scan_simple_identifier();
3062         }
3063         return;
3064     } ## end sub do_PERCENT_SIGN
3065
3066     sub do_LEFT_SQUARE_BRACKET {
3067
3068         # '['
3069         $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token;
3070         ( $type_sequence, $indent_flag ) =
3071           increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3072
3073         # It may seem odd, but structural square brackets have
3074         # type '{' and '}'.  This simplifies the indentation logic.
3075         if ( !is_non_structural_brace() ) {
3076             $type = '{';
3077         }
3078         $square_bracket_structural_type[$square_bracket_depth] = $type;
3079         return;
3080     } ## end sub do_LEFT_SQUARE_BRACKET
3081
3082     sub do_RIGHT_SQUARE_BRACKET {
3083
3084         # ']'
3085         ( $type_sequence, $indent_flag ) =
3086           decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3087
3088         if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) {
3089             $type = '}';
3090         }
3091
3092         # propagate type information for smartmatch operator.  This is
3093         # necessary to enable us to know if an operator or term is expected
3094         # next.
3095         if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
3096             $tok = $square_bracket_type[$square_bracket_depth];
3097         }
3098
3099         if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
3100         return;
3101     } ## end sub do_RIGHT_SQUARE_BRACKET
3102
3103     sub do_MINUS_SIGN {
3104
3105         # '-' = what kind of minus?
3106         if ( ( $expecting != OPERATOR )
3107             && $is_file_test_operator{$next_tok} )
3108         {
3109             my ( $next_nonblank_token, $i_next ) =
3110               find_next_nonblank_token( $i + 1, $rtokens, $max_token_index );
3111
3112             # check for a quoted word like "-w=>xx";
3113             # it is sufficient to just check for a following '='
3114             if ( $next_nonblank_token eq '=' ) {
3115                 $type = 'm';
3116             }
3117             else {
3118                 $i++;
3119                 $tok .= $next_tok;
3120                 $type = 'F';
3121             }
3122         }
3123         elsif ( $expecting == TERM ) {
3124             my $number = scan_number_fast();
3125
3126             # maybe part of bareword token? unary is safest
3127             if ( !defined($number) ) { $type = 'm'; }
3128
3129         }
3130         elsif ( $expecting == OPERATOR ) {
3131         }
3132         else {
3133
3134             if ( $next_type eq 'w' ) {
3135                 $type = 'm';
3136             }
3137         }
3138         return;
3139     } ## end sub do_MINUS_SIGN
3140
3141     sub do_CARAT_SIGN {
3142
3143         # '^'
3144         # check for special variables like ${^WARNING_BITS}
3145         if ( $expecting == TERM ) {
3146
3147             if (   $last_nonblank_token eq '{'
3148                 && ( $next_tok !~ /^\d/ )
3149                 && ( $next_tok =~ /^\w/ ) )
3150             {
3151
3152                 if ( $next_tok eq 'W' ) {
3153                     $tokenizer_self->[_saw_perl_dash_w_] = 1;
3154                 }
3155                 $tok  = $tok . $next_tok;
3156                 $i    = $i + 1;
3157                 $type = 'w';
3158
3159                 # Optional coding to try to catch syntax errors. This can
3160                 # be removed if it ever causes incorrect warning messages.
3161                 # The '{^' should be preceded by either by a type or '$#'
3162                 # Examples:
3163                 #   $#{^CAPTURE}       ok
3164                 #   *${^LAST_FH}{NAME} ok
3165                 #   @{^HOWDY}          ok
3166                 #   $hash{^HOWDY}      error
3167
3168                 # Note that a type sigil '$' may be tokenized as 'Z'
3169                 # after something like 'print', so allow type 'Z'
3170                 if (   $last_last_nonblank_type ne 't'
3171                     && $last_last_nonblank_type ne 'Z'
3172                     && $last_last_nonblank_token ne '$#' )
3173                 {
3174                     warning("Possible syntax error near '{^'\n");
3175                 }
3176             }
3177
3178             else {
3179                 unless ( error_if_expecting_TERM() ) {
3180
3181                     # Something like this is valid but strange:
3182                     # undef ^I;
3183                     complain("The '^' seems unusual here\n");
3184                 }
3185             }
3186         }
3187         return;
3188     } ## end sub do_CARAT_SIGN
3189
3190     sub do_DOUBLE_COLON {
3191
3192         #  '::' = probably a sub call
3193         scan_bare_identifier();
3194         return;
3195     }
3196
3197     sub do_LEFT_SHIFT {
3198
3199         # '<<' = maybe a here-doc?
3200
3201 ##      This check removed because it could be a deprecated here-doc with
3202 ##      no specified target.  See example in log 16 Sep 2020.
3203 ##            return
3204 ##              unless ( $i < $max_token_index )
3205 ##              ;          # here-doc not possible if end of line
3206
3207         if ( $expecting != OPERATOR ) {
3208             my ( $found_target, $here_doc_target, $here_quote_character,
3209                 $saw_error );
3210             (
3211                 $found_target, $here_doc_target, $here_quote_character, $i,
3212                 $saw_error
3213               )
3214               = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3215                 $max_token_index );
3216
3217             if ($found_target) {
3218                 push @{$rhere_target_list},
3219                   [ $here_doc_target, $here_quote_character ];
3220                 $type = 'h';
3221                 if ( length($here_doc_target) > 80 ) {
3222                     my $truncated = substr( $here_doc_target, 0, 80 );
3223                     complain("Long here-target: '$truncated' ...\n");
3224                 }
3225                 elsif ( !$here_doc_target ) {
3226                     warning(
3227                         'Use of bare << to mean <<"" is deprecated' . "\n" )
3228                       unless ($here_quote_character);
3229                 }
3230                 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3231                     complain(
3232                         "Unconventional here-target: '$here_doc_target'\n");
3233                 }
3234             }
3235             elsif ( $expecting == TERM ) {
3236                 unless ($saw_error) {
3237
3238                     # shouldn't happen..arriving here implies an error in
3239                     # the logic in sub 'find_here_doc'
3240                     if (DEVEL_MODE) {
3241                         Fault(<<EOM);
3242 Program bug; didn't find here doc target
3243 EOM
3244                     }
3245                     warning(
3246                         "Possible program error: didn't find here doc target\n"
3247                     );
3248                     report_definite_bug();
3249                 }
3250             }
3251         }
3252         else {
3253         }
3254         return;
3255     } ## end sub do_LEFT_SHIFT
3256
3257     sub do_NEW_HERE_DOC {
3258
3259         # '<<~' = a here-doc, new type added in v26
3260         return
3261           unless ( $i < $max_token_index )
3262           ;    # here-doc not possible if end of line
3263         if ( $expecting != OPERATOR ) {
3264             my ( $found_target, $here_doc_target, $here_quote_character,
3265                 $saw_error );
3266             (
3267                 $found_target, $here_doc_target, $here_quote_character, $i,
3268                 $saw_error
3269               )
3270               = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3271                 $max_token_index );
3272
3273             if ($found_target) {
3274
3275                 if ( length($here_doc_target) > 80 ) {
3276                     my $truncated = substr( $here_doc_target, 0, 80 );
3277                     complain("Long here-target: '$truncated' ...\n");
3278                 }
3279                 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3280                     complain(
3281                         "Unconventional here-target: '$here_doc_target'\n");
3282                 }
3283
3284                 # Note that we put a leading space on the here quote
3285                 # character indicate that it may be preceded by spaces
3286                 $here_quote_character = SPACE . $here_quote_character;
3287                 push @{$rhere_target_list},
3288                   [ $here_doc_target, $here_quote_character ];
3289                 $type = 'h';
3290             }
3291             elsif ( $expecting == TERM ) {
3292                 unless ($saw_error) {
3293
3294                     # shouldn't happen..arriving here implies an error in
3295                     # the logic in sub 'find_here_doc'
3296                     if (DEVEL_MODE) {
3297                         Fault(<<EOM);
3298 Program bug; didn't find here doc target
3299 EOM
3300                     }
3301                     warning(
3302                         "Possible program error: didn't find here doc target\n"
3303                     );
3304                     report_definite_bug();
3305                 }
3306             }
3307         }
3308         else {
3309             error_if_expecting_OPERATOR();
3310         }
3311         return;
3312     } ## end sub do_NEW_HERE_DOC
3313
3314     sub do_POINTER {
3315
3316         #  '->'
3317         return;
3318     } ## end sub do_POINTER
3319
3320     sub do_PLUS_PLUS {
3321
3322         # '++'
3323         # type = 'pp' for pre-increment, '++' for post-increment
3324         if    ( $expecting == TERM ) { $type = 'pp' }
3325         elsif ( $expecting == UNKNOWN ) {
3326
3327             my ( $next_nonblank_token, $i_next ) =
3328               find_next_nonblank_token( $i, $rtokens, $max_token_index );
3329
3330             # Fix for c042: look past a side comment
3331             if ( $next_nonblank_token eq '#' ) {
3332                 ( $next_nonblank_token, $i_next ) =
3333                   find_next_nonblank_token( $max_token_index,
3334                     $rtokens, $max_token_index );
3335             }
3336
3337             if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3338         }
3339         return;
3340     } ## end sub do_PLUS_PLUS
3341
3342     sub do_FAT_COMMA {
3343
3344         # '=>'
3345         if ( $last_nonblank_type eq $tok ) {
3346             complain("Repeated '=>'s \n");
3347         }
3348
3349         # patch for operator_expected: note if we are in the list (use.t)
3350         # TODO: make version numbers a new token type
3351         if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3352         return;
3353     } ## end sub do_FAT_COMMA
3354
3355     sub do_MINUS_MINUS {
3356
3357         # '--'
3358         # type = 'mm' for pre-decrement, '--' for post-decrement
3359
3360         if    ( $expecting == TERM ) { $type = 'mm' }
3361         elsif ( $expecting == UNKNOWN ) {
3362             my ( $next_nonblank_token, $i_next ) =
3363               find_next_nonblank_token( $i, $rtokens, $max_token_index );
3364
3365             # Fix for c042: look past a side comment
3366             if ( $next_nonblank_token eq '#' ) {
3367                 ( $next_nonblank_token, $i_next ) =
3368                   find_next_nonblank_token( $max_token_index,
3369                     $rtokens, $max_token_index );
3370             }
3371
3372             if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3373         }
3374         return;
3375     } ## end sub do_MINUS_MINUS
3376
3377     sub do_LOGICAL_AND {
3378
3379         # '&&'
3380         error_if_expecting_TERM()
3381           if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
3382         return;
3383     }
3384
3385     sub do_LOGICAL_OR {
3386
3387         # '||'
3388         error_if_expecting_TERM()
3389           if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
3390         return;
3391     }
3392
3393     sub do_SLASH_SLASH {
3394
3395         # '//'
3396         error_if_expecting_TERM()
3397           if ( $expecting == TERM );
3398         return;
3399     }
3400
3401     sub do_DIGITS {
3402
3403         # 'd' = string of digits
3404         error_if_expecting_OPERATOR("Number")
3405           if ( $expecting == OPERATOR );
3406
3407         my $number = scan_number_fast();
3408         if ( !defined($number) ) {
3409
3410             # shouldn't happen - we should always get a number
3411             if (DEVEL_MODE) {
3412                 Fault(<<EOM);
3413 non-number beginning with digit--program bug
3414 EOM
3415             }
3416             warning(
3417                 "Unexpected error condition: non-number beginning with digit\n"
3418             );
3419             report_definite_bug();
3420         }
3421         return;
3422     } ## end sub do_DIGITS
3423
3424     sub do_ATTRIBUTE_LIST {
3425
3426         my ($next_nonblank_token) = @_;
3427
3428         # Called at a bareword encountered while in an attribute list
3429         # returns 'is_attribute':
3430         #    true if attribute found
3431         #    false if an attribute (continue parsing bareword)
3432
3433         # treat bare word followed by open paren like qw(
3434         if ( $next_nonblank_token eq '(' ) {
3435
3436             # For something like:
3437             #     : prototype($$)
3438             # we should let do_scan_sub see it so that it can see
3439             # the prototype.  All other attributes get parsed as a
3440             # quoted string.
3441             if ( $tok eq 'prototype' ) {
3442                 $id_scan_state = 'prototype';
3443
3444                 # start just after the word 'prototype'
3445                 my $i_beg = $i + 1;
3446                 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3447                     {
3448                         input_line      => $input_line,
3449                         i               => $i,
3450                         i_beg           => $i_beg,
3451                         tok             => $tok,
3452                         type            => $type,
3453                         rtokens         => $rtokens,
3454                         rtoken_map      => $rtoken_map,
3455                         id_scan_state   => $id_scan_state,
3456                         max_token_index => $max_token_index,
3457                     }
3458                 );
3459
3460                 # If successful, mark as type 'q' to be consistent
3461                 # with other attributes.  Type 'w' would also work.
3462                 if ( $i > $i_beg ) {
3463                     $type = 'q';
3464                     return 1;
3465                 }
3466
3467                 # If not successful, continue and parse as a quote.
3468             }
3469
3470             # All other attribute lists must be parsed as quotes
3471             # (see 'signatures.t' for good examples)
3472             $in_quote                = $quote_items{'q'};
3473             $allowed_quote_modifiers = $quote_modifiers{'q'};
3474             $type                    = 'q';
3475             $quote_type              = 'q';
3476             return 1;
3477         }
3478
3479         # handle bareword not followed by open paren
3480         else {
3481             $type = 'w';
3482             return 1;
3483         }
3484
3485         # attribute not found
3486         return;
3487     } ## end sub do_ATTRIBUTE_LIST
3488
3489     sub do_QUOTED_BAREWORD {
3490
3491         # find type of a bareword followed by a '=>'
3492         if ( $is_constant{$current_package}{$tok} ) {
3493             $type = 'C';
3494         }
3495         elsif ( $is_user_function{$current_package}{$tok} ) {
3496             $type      = 'U';
3497             $prototype = $user_function_prototype{$current_package}{$tok};
3498         }
3499         elsif ( $tok =~ /^v\d+$/ ) {
3500             $type = 'v';
3501             report_v_string($tok);
3502         }
3503         else {
3504
3505             # Bareword followed by a fat comma - see 'git18.in'
3506             # If tok is something like 'x17' then it could
3507             # actually be operator x followed by number 17.
3508             # For example, here:
3509             #     123x17 => [ 792, 1224 ],
3510             # (a key of 123 repeated 17 times, perhaps not
3511             # what was intended). We will mark x17 as type
3512             # 'n' and it will be split. If the previous token
3513             # was also a bareword then it is not very clear is
3514             # going on.  In this case we will not be sure that
3515             # an operator is expected, so we just mark it as a
3516             # bareword.  Perl is a little murky in what it does
3517             # with stuff like this, and its behavior can change
3518             # over time.  Something like
3519             #    a x18 => [792, 1224], will compile as
3520             # a key with 18 a's.  But something like
3521             #    push @array, a x18;
3522             # is a syntax error.
3523             if (
3524                    $expecting == OPERATOR
3525                 && substr( $tok, 0, 1 ) eq 'x'
3526                 && ( length($tok) == 1
3527                     || substr( $tok, 1, 1 ) =~ /^\d/ )
3528               )
3529             {
3530                 $type = 'n';
3531                 if ( split_pretoken(1) ) {
3532                     $type = 'x';
3533                     $tok  = 'x';
3534                 }
3535             }
3536             else {
3537
3538                 # git #18
3539                 $type = 'w';
3540                 error_if_expecting_OPERATOR();
3541             }
3542         }
3543         return;
3544     } ## end sub do_QUOTED_BAREWORD
3545
3546     sub do_X_OPERATOR {
3547
3548         if ( $tok eq 'x' ) {
3549             if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
3550                 $tok  = 'x=';
3551                 $type = $tok;
3552                 $i++;
3553             }
3554             else {
3555                 $type = 'x';
3556             }
3557         }
3558         else {
3559
3560             # Split a pretoken like 'x10' into 'x' and '10'.
3561             # Note: In previous versions of perltidy it was marked
3562             # as a number, $type = 'n', and fixed downstream by the
3563             # Formatter.
3564             $type = 'n';
3565             if ( split_pretoken(1) ) {
3566                 $type = 'x';
3567                 $tok  = 'x';
3568             }
3569         }
3570         return;
3571     } ## end sub do_X_OPERATOR
3572
3573     sub do_USE_CONSTANT {
3574         scan_bare_identifier();
3575         my ( $next_nonblank_tok2, $i_next2 ) =
3576           find_next_nonblank_token( $i, $rtokens, $max_token_index );
3577
3578         if ($next_nonblank_tok2) {
3579
3580             if ( $is_keyword{$next_nonblank_tok2} ) {
3581
3582                 # Assume qw is used as a quote and okay, as in:
3583                 #  use constant qw{ DEBUG 0 };
3584                 # Not worth trying to parse for just a warning
3585
3586                 # NOTE: This warning is deactivated because recent
3587                 # versions of perl do not complain here, but
3588                 # the coding is retained for reference.
3589                 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
3590                     warning(
3591 "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
3592                     );
3593                 }
3594             }
3595
3596             else {
3597                 $is_constant{$current_package}{$next_nonblank_tok2} = 1;
3598             }
3599         }
3600         return;
3601     } ## end sub do_USE_CONSTANT
3602
3603     sub do_KEYWORD {
3604
3605         # found a keyword - set any associated flags
3606         $type = 'k';
3607
3608         # Since for and foreach may not be followed immediately
3609         # by an opening paren, we have to remember which keyword
3610         # is associated with the next '('
3611         if ( $is_for_foreach{$tok} ) {
3612             if ( new_statement_ok() ) {
3613                 $want_paren = $tok;
3614             }
3615         }
3616
3617         # recognize 'use' statements, which are special
3618         elsif ( $is_use_require{$tok} ) {
3619             $statement_type = $tok;
3620             error_if_expecting_OPERATOR()
3621               if ( $expecting == OPERATOR );
3622         }
3623
3624         # remember my and our to check for trailing ": shared"
3625         elsif ( $is_my_our_state{$tok} ) {
3626             $statement_type = $tok;
3627         }
3628
3629         # Check for misplaced 'elsif' and 'else', but allow isolated
3630         # else or elsif blocks to be formatted.  This is indicated
3631         # by a last noblank token of ';'
3632         elsif ( $tok eq 'elsif' ) {
3633             if (
3634                 $last_nonblank_token ne ';'
3635
3636                 ## !~ /^(if|elsif|unless)$/
3637                 && !$is_if_elsif_unless{$last_nonblank_block_type}
3638               )
3639             {
3640                 warning(
3641                     "expecting '$tok' to follow one of 'if|elsif|unless'\n");
3642             }
3643         }
3644         elsif ( $tok eq 'else' ) {
3645
3646             # patched for SWITCH/CASE
3647             if (
3648                 $last_nonblank_token ne ';'
3649
3650                 ## !~ /^(if|elsif|unless|case|when)$/
3651                 && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
3652
3653                 # patch to avoid an unwanted error message for
3654                 # the case of a parenless 'case' (RT 105484):
3655                 # switch ( 1 ) { case x { 2 } else { } }
3656                 ## !~ /^(if|elsif|unless|case|when)$/
3657                 && !$is_if_elsif_unless_case_when{$statement_type}
3658               )
3659             {
3660                 warning(
3661 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3662                 );
3663             }
3664         }
3665
3666         # patch for SWITCH/CASE if 'case' and 'when are
3667         # treated as keywords.  Also 'default' for Switch::Plain
3668         elsif ($tok eq 'when'
3669             || $tok eq 'case'
3670             || $tok eq 'default' )
3671         {
3672             $statement_type = $tok;    # next '{' is block
3673         }
3674
3675         # feature 'err' was removed in Perl 5.10.  So mark this as
3676         # a bareword unless an operator is expected (see c158).
3677         elsif ( $tok eq 'err' ) {
3678             if ( $expecting != OPERATOR ) { $type = 'w' }
3679         }
3680
3681         return;
3682     } ## end sub do_KEYWORD
3683
3684     sub do_QUOTE_OPERATOR {
3685
3686         if ( $expecting == OPERATOR ) {
3687
3688             # Be careful not to call an error for a qw quote
3689             # where a parenthesized list is allowed.  For example,
3690             # it could also be a for/foreach construct such as
3691             #
3692             #    foreach my $key qw\Uno Due Tres Quadro\ {
3693             #        print "Set $key\n";
3694             #    }
3695             #
3696
3697             # Or it could be a function call.
3698             # NOTE: Braces in something like &{ xxx } are not
3699             # marked as a block, we might have a method call.
3700             # &method(...), $method->(..), &{method}(...),
3701             # $ref[2](list) is ok & short for $ref[2]->(list)
3702             #
3703             # See notes in 'sub code_block_type' and
3704             # 'sub is_non_structural_brace'
3705
3706             unless (
3707                 $tok eq 'qw'
3708                 && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3709                     || $is_for_foreach{$want_paren} )
3710               )
3711             {
3712                 error_if_expecting_OPERATOR();
3713             }
3714         }
3715         $in_quote                = $quote_items{$tok};
3716         $allowed_quote_modifiers = $quote_modifiers{$tok};
3717
3718         # All quote types are 'Q' except possibly qw quotes.
3719         # qw quotes are special in that they may generally be trimmed
3720         # of leading and trailing whitespace.  So they are given a
3721         # separate type, 'q', unless requested otherwise.
3722         $type =
3723           ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3724           ? 'q'
3725           : 'Q';
3726         $quote_type = $type;
3727         return;
3728     } ## end sub do_QUOTE_OPERATOR
3729
3730     sub do_UNKNOWN_BAREWORD {
3731
3732         my ($next_nonblank_token) = @_;
3733
3734         scan_bare_identifier();
3735
3736         if (   $statement_type eq 'use'
3737             && $last_nonblank_token eq 'use' )
3738         {
3739             $saw_use_module{$current_package}->{$tok} = 1;
3740         }
3741
3742         if ( $type eq 'w' ) {
3743
3744             if ( $expecting == OPERATOR ) {
3745
3746                 # Patch to avoid error message for RPerl overloaded
3747                 # operator functions: use overload
3748                 #    '+' => \&sse_add,
3749                 #    '-' => \&sse_sub,
3750                 #    '*' => \&sse_mul,
3751                 #    '/' => \&sse_div;
3752                 # TODO: this could eventually be generalized
3753                 if (   $saw_use_module{$current_package}->{'RPerl'}
3754                     && $tok =~ /^sse_(mul|div|add|sub)$/ )
3755                 {
3756
3757                 }
3758
3759                 # Fix part 1 for git #63 in which a comment falls
3760                 # between an -> and the following word.  An
3761                 # alternate fix would be to change operator_expected
3762                 # to return an UNKNOWN for this type.
3763                 elsif ( $last_nonblank_type eq '->' ) {
3764
3765                 }
3766
3767                 # don't complain about possible indirect object
3768                 # notation.
3769                 # For example:
3770                 #   package main;
3771                 #   sub new($) { ... }
3772                 #   $b = new A::;  # calls A::new
3773                 #   $c = new A;    # same thing but suspicious
3774                 # This will call A::new but we have a 'new' in
3775                 # main:: which looks like a constant.
3776                 #
3777                 elsif ( $last_nonblank_type eq 'C' ) {
3778                     if ( $tok !~ /::$/ ) {
3779                         complain(<<EOM);
3780 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3781        Maybe indirectet object notation?
3782 EOM
3783                     }
3784                 }
3785                 else {
3786                     error_if_expecting_OPERATOR("bareword");
3787                 }
3788             }
3789
3790             # mark bare words immediately followed by a paren as
3791             # functions
3792             $next_tok = $rtokens->[ $i + 1 ];
3793             if ( $next_tok eq '(' ) {
3794
3795                 # Patch for issue c151, where we are processing a snippet and
3796                 # have not seen that SPACE is a constant.  In this case 'x' is
3797                 # probably an operator. The only disadvantage with an incorrect
3798                 # guess is that the space after it may be incorrect. For example
3799                 #   $str .= SPACE x ( 16 - length($str) ); See also b1410.
3800                 if ( $tok eq 'x' && $last_nonblank_type eq 'w' ) { $type = 'x' }
3801
3802                 # Fix part 2 for git #63.  Leave type as 'w' to keep
3803                 # the type the same as if the -> were not separated
3804                 elsif ( $last_nonblank_type ne '->' ) { $type = 'U' }
3805
3806             }
3807
3808             # underscore after file test operator is file handle
3809             if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3810                 $type = 'Z';
3811             }
3812
3813             # patch for SWITCH/CASE if 'case' and 'when are
3814             # not treated as keywords:
3815             if (
3816                 ( $tok eq 'case' && $brace_type[$brace_depth] eq 'switch' )
3817                 || (   $tok eq 'when'
3818                     && $brace_type[$brace_depth] eq 'given' )
3819               )
3820             {
3821                 $statement_type = $tok;    # next '{' is block
3822                 $type           = 'k';     # for keyword syntax coloring
3823             }
3824             if ( $next_nonblank_token eq '(' ) {
3825
3826                 # patch for SWITCH/CASE if switch and given not keywords
3827                 # Switch is not a perl 5 keyword, but we will gamble
3828                 # and mark switch followed by paren as a keyword.  This
3829                 # is only necessary to get html syntax coloring nice,
3830                 # and does not commit this as being a switch/case.
3831                 if ( $tok eq 'switch' || $tok eq 'given' ) {
3832                     $type = 'k';    # for keyword syntax coloring
3833                 }
3834
3835                 # mark 'x' as operator for something like this (see b1410)
3836                 #  my $line = join( LD_X, map { LD_H x ( $_ + 2 ) } @$widths );
3837                 elsif ( $tok eq 'x' && $last_nonblank_type eq 'w' ) {
3838                     $type = 'x';
3839                 }
3840             }
3841         }
3842         return;
3843     } ## end sub do_UNKNOWN_BAREWORD
3844
3845     sub sub_attribute_ok_here {
3846
3847         my ( $tok_kw, $next_nonblank_token, $i_next ) = @_;
3848
3849         # Decide if 'sub :' can be the start of a sub attribute list.
3850         # We will decide based on if the colon is followed by a
3851         # bareword which is not a keyword.
3852         # Changed inext+1 to inext to fixed case b1190.
3853         my $sub_attribute_ok_here;
3854         if (   $is_sub{$tok_kw}
3855             && $expecting != OPERATOR
3856             && $next_nonblank_token eq ':' )
3857         {
3858             my ( $nn_nonblank_token, $i_nn ) =
3859               find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
3860             $sub_attribute_ok_here =
3861                  $nn_nonblank_token =~ /^\w/
3862               && $nn_nonblank_token !~ /^\d/
3863               && !$is_keyword{$nn_nonblank_token};
3864         }
3865         return $sub_attribute_ok_here;
3866     } ## end sub sub_attribute_ok_here
3867
3868     sub do_BAREWORD {
3869
3870         my ($is_END_or_DATA) = @_;
3871
3872         # handle a bareword token:
3873         # returns
3874         #    true if this token ends the current line
3875         #    false otherwise
3876
3877         my ( $next_nonblank_token, $i_next ) =
3878           find_next_nonblank_token( $i, $rtokens, $max_token_index );
3879
3880         # a bare word immediately followed by :: is not a keyword;
3881         # use $tok_kw when testing for keywords to avoid a mistake
3882         my $tok_kw = $tok;
3883         if (   $rtokens->[ $i + 1 ] eq ':'
3884             && $rtokens->[ $i + 2 ] eq ':' )
3885         {
3886             $tok_kw .= '::';
3887         }
3888
3889         if ($in_attribute_list) {
3890             my $is_attribute = do_ATTRIBUTE_LIST($next_nonblank_token);
3891             return if ($is_attribute);
3892         }
3893
3894         #----------------------------------------
3895         # Starting final if-elsif- chain of tests
3896         #----------------------------------------
3897
3898         # This is the return flag:
3899         #   true => this is the last token on the line
3900         #   false => keep tokenizing the line
3901         my $is_last;
3902
3903         # The following blocks of code must update these vars:
3904         # $type - the final token type, must always be set
3905
3906         # In addition, if additional pretokens are added:
3907         # $tok  - the final token
3908         # $i    - the index of the last pretoken
3909
3910         # They may also need to check and set various flags
3911
3912         # Scan a bare word following a -> as an identifier; it could
3913         # have a long package name.  Fixes c037, c041.
3914         if ( $last_nonblank_token eq '->' ) {
3915             scan_bare_identifier();
3916
3917             # a bareward after '->' gets type 'i'
3918             $type = 'i';
3919         }
3920
3921         # Quote a word followed by => operator
3922         # unless the word __END__ or __DATA__ and the only word on
3923         # the line.
3924         elsif ( !$is_END_or_DATA
3925             && $next_nonblank_token eq '='
3926             && $rtokens->[ $i_next + 1 ] eq '>' )
3927         {
3928             do_QUOTED_BAREWORD();
3929         }
3930
3931         # quote a bare word within braces..like xxx->{s}; note that we
3932         # must be sure this is not a structural brace, to avoid
3933         # mistaking {s} in the following for a quoted bare word:
3934         #     for(@[){s}bla}BLA}
3935         # Also treat q in something like var{-q} as a bare word, not
3936         # a quote operator
3937         elsif (
3938             $next_nonblank_token eq '}'
3939             && (
3940                 $last_nonblank_type eq 'L'
3941                 || (   $last_nonblank_type eq 'm'
3942                     && $last_last_nonblank_type eq 'L' )
3943             )
3944           )
3945         {
3946             $type = 'w';
3947         }
3948
3949         # handle operator x (now we know it isn't $x=)
3950         elsif (
3951                $expecting == OPERATOR
3952             && substr( $tok, 0, 1 ) eq 'x'
3953             && ( length($tok) == 1
3954                 || substr( $tok, 1, 1 ) =~ /^\d/ )
3955           )
3956         {
3957             do_X_OPERATOR();
3958         }
3959         elsif ( $tok_kw eq 'CORE::' ) {
3960             $type = $tok = $tok_kw;
3961             $i += 2;
3962         }
3963         elsif ( ( $tok eq 'strict' )
3964             and ( $last_nonblank_token eq 'use' ) )
3965         {
3966             $tokenizer_self->[_saw_use_strict_] = 1;
3967             scan_bare_identifier();
3968         }
3969
3970         elsif ( ( $tok eq 'warnings' )
3971             and ( $last_nonblank_token eq 'use' ) )
3972         {
3973             $tokenizer_self->[_saw_perl_dash_w_] = 1;
3974
3975             # scan as identifier, so that we pick up something like:
3976             # use warnings::register
3977             scan_bare_identifier();
3978         }
3979
3980         elsif (
3981                $tok eq 'AutoLoader'
3982             && $tokenizer_self->[_look_for_autoloader_]
3983             && (
3984                 $last_nonblank_token eq 'use'
3985
3986                 # these regexes are from AutoSplit.pm, which we want
3987                 # to mimic
3988                 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3989                 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3990             )
3991           )
3992         {
3993             write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3994             $tokenizer_self->[_saw_autoloader_]      = 1;
3995             $tokenizer_self->[_look_for_autoloader_] = 0;
3996             scan_bare_identifier();
3997         }
3998
3999         elsif (
4000                $tok eq 'SelfLoader'
4001             && $tokenizer_self->[_look_for_selfloader_]
4002             && (   $last_nonblank_token eq 'use'
4003                 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4004                 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4005           )
4006         {
4007             write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4008             $tokenizer_self->[_saw_selfloader_]      = 1;
4009             $tokenizer_self->[_look_for_selfloader_] = 0;
4010             scan_bare_identifier();
4011         }
4012
4013         elsif ( ( $tok eq 'constant' )
4014             and ( $last_nonblank_token eq 'use' ) )
4015         {
4016             do_USE_CONSTANT();
4017         }
4018
4019         # various quote operators
4020         elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4021             do_QUOTE_OPERATOR();
4022         }
4023
4024         # check for a statement label
4025         elsif (
4026                ( $next_nonblank_token eq ':' )
4027             && ( $rtokens->[ $i_next + 1 ] ne ':' )
4028             && ( $i_next <= $max_token_index )    # colon on same line
4029
4030             # like 'sub : lvalue' ?
4031             && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
4032             && label_ok()
4033           )
4034         {
4035             if ( $tok !~ /[A-Z]/ ) {
4036                 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4037                   $input_line_number;
4038             }
4039             $type = 'J';
4040             $tok .= ':';
4041             $i = $i_next;
4042         }
4043
4044         #      'sub' or alias
4045         elsif ( $is_sub{$tok_kw} ) {
4046             error_if_expecting_OPERATOR()
4047               if ( $expecting == OPERATOR );
4048             initialize_subname();
4049             scan_id();
4050         }
4051
4052         #      'package'
4053         elsif ( $is_package{$tok_kw} ) {
4054             error_if_expecting_OPERATOR()
4055               if ( $expecting == OPERATOR );
4056             scan_id();
4057         }
4058
4059         # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4060         # more restrictive. Require a new statement to be ok here.
4061         elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4062             $type = ';';    # make tokenizer look for TERM next
4063             $tokenizer_self->[_in_format_] = 1;
4064             $is_last = 1;                          ## is last token on this line
4065         }
4066
4067         # Note on token types for format, __DATA__, __END__:
4068         # It simplifies things to give these type ';', so that when we
4069         # start rescanning we will be expecting a token of type TERM.
4070         # We will switch to type 'k' before outputting the tokens.
4071         elsif ( $is_END_DATA{$tok_kw} ) {
4072             $type = ';';    # make tokenizer look for TERM next
4073
4074             # Remember that we are in one of these three sections
4075             $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4076             $is_last = 1;    ## is last token on this line
4077         }
4078
4079         elsif ( $is_keyword{$tok_kw} ) {
4080             do_KEYWORD();
4081         }
4082
4083         # check for inline label following
4084         #         /^(redo|last|next|goto)$/
4085         elsif (( $last_nonblank_type eq 'k' )
4086             && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4087         {
4088             $type = 'j';
4089         }
4090
4091         # something else --
4092         else {
4093             do_UNKNOWN_BAREWORD($next_nonblank_token);
4094         }
4095
4096         return $is_last;
4097
4098     } ## end sub do_BAREWORD
4099
4100     sub do_FOLLOW_QUOTE {
4101
4102         # Continue following a quote on a new line
4103         $type = $quote_type;
4104
4105         unless ( @{$routput_token_list} ) {    # initialize if continuation line
4106             push( @{$routput_token_list}, $i );
4107             $routput_token_type->[$i] = $type;
4108
4109         }
4110
4111         # scan for the end of the quote or pattern
4112         (
4113             $i,
4114             $in_quote,
4115             $quote_character,
4116             $quote_pos,
4117             $quote_depth,
4118             $quoted_string_1,
4119             $quoted_string_2,
4120
4121         ) = do_quote(
4122
4123             $i,
4124             $in_quote,
4125             $quote_character,
4126             $quote_pos,
4127             $quote_depth,
4128             $quoted_string_1,
4129             $quoted_string_2,
4130             $rtokens,
4131             $rtoken_map,
4132             $max_token_index,
4133
4134         );
4135
4136         # all done if we didn't find it
4137         if ($in_quote) { return }
4138
4139         # save pattern and replacement text for rescanning
4140         my $qs1 = $quoted_string_1;
4141
4142         # re-initialize for next search
4143         $quote_character = EMPTY_STRING;
4144         $quote_pos       = 0;
4145         $quote_type      = 'Q';
4146         $quoted_string_1 = EMPTY_STRING;
4147         $quoted_string_2 = EMPTY_STRING;
4148         if ( ++$i > $max_token_index ) { return }
4149
4150         # look for any modifiers
4151         if ($allowed_quote_modifiers) {
4152
4153             # check for exact quote modifiers
4154             if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4155                 my $str = $rtokens->[$i];
4156                 my $saw_modifier_e;
4157                 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4158                     my $pos  = pos($str);
4159                     my $char = substr( $str, $pos - 1, 1 );
4160                     $saw_modifier_e ||= ( $char eq 'e' );
4161                 }
4162
4163                 # For an 'e' quote modifier we must scan the replacement
4164                 # text for here-doc targets...
4165                 # but if the modifier starts a new line we can skip
4166                 # this because either the here doc will be fully
4167                 # contained in the replacement text (so we can
4168                 # ignore it) or Perl will not find it.
4169                 # See test 'here2.in'.
4170                 if ( $saw_modifier_e && $i_tok >= 0 ) {
4171
4172                     my $rht = scan_replacement_text($qs1);
4173
4174                     # Change type from 'Q' to 'h' for quotes with
4175                     # here-doc targets so that the formatter (see sub
4176                     # process_line_of_CODE) will not make any line
4177                     # breaks after this point.
4178                     if ($rht) {
4179                         push @{$rhere_target_list}, @{$rht};
4180                         $type = 'h';
4181                         if ( $i_tok < 0 ) {
4182                             my $ilast = $routput_token_list->[-1];
4183                             $routput_token_type->[$ilast] = $type;
4184                         }
4185                     }
4186                 }
4187
4188                 if ( defined( pos($str) ) ) {
4189
4190                     # matched
4191                     if ( pos($str) == length($str) ) {
4192                         if ( ++$i > $max_token_index ) { return }
4193                     }
4194
4195                     # Looks like a joined quote modifier
4196                     # and keyword, maybe something like
4197                     # s/xxx/yyy/gefor @k=...
4198                     # Example is "galgen.pl".  Would have to split
4199                     # the word and insert a new token in the
4200                     # pre-token list.  This is so rare that I haven't
4201                     # done it.  Will just issue a warning citation.
4202
4203                     # This error might also be triggered if my quote
4204                     # modifier characters are incomplete
4205                     else {
4206                         warning(<<EOM);
4207
4208 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4209 Please put a space between quote modifiers and trailing keywords.
4210 EOM
4211
4212                         # print "token $rtokens->[$i]\n";
4213                         # my $num = length($str) - pos($str);
4214                         # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4215                         # print "continuing with new token $rtokens->[$i]\n";
4216
4217                         # skipping past this token does least damage
4218                         if ( ++$i > $max_token_index ) { return }
4219                     }
4220                 }
4221                 else {
4222
4223                     # example file: rokicki4.pl
4224                     # This error might also be triggered if my quote
4225                     # modifier characters are incomplete
4226                     write_logfile_entry(
4227                         "Note: found word $str at quote modifier location\n");
4228                 }
4229             }
4230
4231             # re-initialize
4232             $allowed_quote_modifiers = EMPTY_STRING;
4233         }
4234         return;
4235     } ## end sub do_FOLLOW_QUOTE
4236
4237     # ------------------------------------------------------------
4238     # begin hash of code for handling most token types
4239     # ------------------------------------------------------------
4240     my $tokenization_code = {
4241
4242         '>'   => \&do_GREATER_THAN_SIGN,
4243         '|'   => \&do_VERTICAL_LINE,
4244         '$'   => \&do_DOLLAR_SIGN,
4245         '('   => \&do_LEFT_PARENTHESIS,
4246         ')'   => \&do_RIGHT_PARENTHESIS,
4247         ','   => \&do_COMMA,
4248         ';'   => \&do_SEMICOLON,
4249         '"'   => \&do_QUOTATION_MARK,
4250         "'"   => \&do_APOSTROPHE,
4251         '`'   => \&do_BACKTICK,
4252         '/'   => \&do_SLASH,
4253         '{'   => \&do_LEFT_CURLY_BRACKET,
4254         '}'   => \&do_RIGHT_CURLY_BRACKET,
4255         '&'   => \&do_AMPERSAND,
4256         '<'   => \&do_LESS_THAN_SIGN,
4257         '?'   => \&do_QUESTION_MARK,
4258         '*'   => \&do_STAR,
4259         '.'   => \&do_DOT,
4260         ':'   => \&do_COLON,
4261         '+'   => \&do_PLUS_SIGN,
4262         '@'   => \&do_AT_SIGN,
4263         '%'   => \&do_PERCENT_SIGN,
4264         '['   => \&do_LEFT_SQUARE_BRACKET,
4265         ']'   => \&do_RIGHT_SQUARE_BRACKET,
4266         '-'   => \&do_MINUS_SIGN,
4267         '^'   => \&do_CARAT_SIGN,
4268         '::'  => \&do_DOUBLE_COLON,
4269         '<<'  => \&do_LEFT_SHIFT,
4270         '<<~' => \&do_NEW_HERE_DOC,
4271         '->'  => \&do_POINTER,
4272         '++'  => \&do_PLUS_PLUS,
4273         '=>'  => \&do_FAT_COMMA,
4274         '--'  => \&do_MINUS_MINUS,
4275         '&&'  => \&do_LOGICAL_AND,
4276         '||'  => \&do_LOGICAL_OR,
4277         '//'  => \&do_SLASH_SLASH,
4278
4279         # No special code for these types yet, but syntax checks
4280         # could be added.
4281         ##  '!'   => undef,
4282         ##  '!='  => undef,
4283         ##  '!~'  => undef,
4284         ##  '%='  => undef,
4285         ##  '&&=' => undef,
4286         ##  '&='  => undef,
4287         ##  '+='  => undef,
4288         ##  '-='  => undef,
4289         ##  '..'  => undef,
4290         ##  '..'  => undef,
4291         ##  '...' => undef,
4292         ##  '.='  => undef,
4293         ##  '<<=' => undef,
4294         ##  '<='  => undef,
4295         ##  '<=>' => undef,
4296         ##  '<>'  => undef,
4297         ##  '='   => undef,
4298         ##  '=='  => undef,
4299         ##  '=~'  => undef,
4300         ##  '>='  => undef,
4301         ##  '>>'  => undef,
4302         ##  '>>=' => undef,
4303         ##  '\\'  => undef,
4304         ##  '^='  => undef,
4305         ##  '|='  => undef,
4306         ##  '||=' => undef,
4307         ##  '//=' => undef,
4308         ##  '~'   => undef,
4309         ##  '~~'  => undef,
4310         ##  '!~~' => undef,
4311
4312     };
4313
4314     # ------------------------------------------------------------
4315     # end hash of code for handling individual token types
4316     # ------------------------------------------------------------
4317
4318     use constant DEBUG_TOKENIZE => 0;
4319
4320     sub tokenize_this_line {
4321
4322   # This routine breaks a line of perl code into tokens which are of use in
4323   # indentation and reformatting.  One of my goals has been to define tokens
4324   # such that a newline may be inserted between any pair of tokens without
4325   # changing or invalidating the program. This version comes close to this,
4326   # although there are necessarily a few exceptions which must be caught by
4327   # the formatter.  Many of these involve the treatment of bare words.
4328   #
4329   # The tokens and their types are returned in arrays.  See previous
4330   # routine for their names.
4331   #
4332   # See also the array "valid_token_types" in the BEGIN section for an
4333   # up-to-date list.
4334   #
4335   # To simplify things, token types are either a single character, or they
4336   # are identical to the tokens themselves.
4337   #
4338   # As a debugging aid, the -D flag creates a file containing a side-by-side
4339   # comparison of the input string and its tokenization for each line of a file.
4340   # This is an invaluable debugging aid.
4341   #
4342   # In addition to tokens, and some associated quantities, the tokenizer
4343   # also returns flags indication any special line types.  These include
4344   # quotes, here_docs, formats.
4345   #
4346   # -----------------------------------------------------------------------
4347   #
4348   # How to add NEW_TOKENS:
4349   #
4350   # New token types will undoubtedly be needed in the future both to keep up
4351   # with changes in perl and to help adapt the tokenizer to other applications.
4352   #
4353   # Here are some notes on the minimal steps.  I wrote these notes while
4354   # adding the 'v' token type for v-strings, which are things like version
4355   # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
4356   # can use your editor to search for the string "NEW_TOKENS" to find the
4357   # appropriate sections to change):
4358   #
4359   # *. Try to talk somebody else into doing it!  If not, ..
4360   #
4361   # *. Make a backup of your current version in case things don't work out!
4362   #
4363   # *. Think of a new, unused character for the token type, and add to
4364   # the array @valid_token_types in the BEGIN section of this package.
4365   # For example, I used 'v' for v-strings.
4366   #
4367   # *. Implement coding to recognize the $type of the token in this routine.
4368   # This is the hardest part, and is best done by imitating or modifying
4369   # some of the existing coding.  For example, to recognize v-strings, I
4370   # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4371   # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4372   #
4373   # *. Update sub operator_expected.  This update is critically important but
4374   # the coding is trivial.  Look at the comments in that routine for help.
4375   # For v-strings, which should behave like numbers, I just added 'v' to the
4376   # regex used to handle numbers and strings (types 'n' and 'Q').
4377   #
4378   # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4379   # Perl::Tidy::Formatter for breaking lines around this token type.  You can
4380   # skip this step and take the default at first, then adjust later to get
4381   # desired results.  For adding type 'v', I looked at sub bond_strength and
4382   # saw that number type 'n' was using default strengths, so I didn't do
4383   # anything.  I may tune it up someday if I don't like the way line
4384   # breaks with v-strings look.
4385   #
4386   # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4387   # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
4388   # and saw that type 'n' used spaces on both sides, so I just added 'v'
4389   # to the array @spaces_both_sides.
4390   #
4391   # *. Update HtmlWriter package so that users can colorize the token as
4392   # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
4393   # that package.  For v-strings, I initially chose to use a default color
4394   # equal to the default for numbers, but it might be nice to change that
4395   # eventually.
4396   #
4397   # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4398   #
4399   # *. Run lots and lots of debug tests.  Start with special files designed
4400   # to test the new token type.  Run with the -D flag to create a .DEBUG
4401   # file which shows the tokenization.  When these work ok, test as many old
4402   # scripts as possible.  Start with all of the '.t' files in the 'test'
4403   # directory of the distribution file.  Compare .tdy output with previous
4404   # version and updated version to see the differences.  Then include as
4405   # many more files as possible. My own technique has been to collect a huge
4406   # number of perl scripts (thousands!) into one directory and run perltidy
4407   # *, then run diff between the output of the previous version and the
4408   # current version.
4409   #
4410   # *. For another example, search for the smartmatch operator '~~'
4411   # with your editor to see where updates were made for it.
4412   #
4413   # -----------------------------------------------------------------------
4414
4415         my ( $self, $line_of_tokens ) = @_;
4416         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4417
4418         # Extract line number for use in error messages
4419         $input_line_number = $line_of_tokens->{_line_number};
4420
4421         # Check for pod documentation
4422         if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4423             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4424         {
4425
4426             # Must not be in multi-line quote
4427             # and must not be in an equation
4428             if ( !$in_quote
4429                 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4430             {
4431                 $self->[_in_pod_] = 1;
4432                 return;
4433             }
4434         }
4435
4436         $input_line = $untrimmed_input_line;
4437
4438         chomp $input_line;
4439
4440         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4441         # This will be used below to avoid quoting a bare word followed by
4442         # a fat comma.
4443         my $is_END_or_DATA;
4444
4445         # Reinitialize the multi-line quote flag
4446         if ( $in_quote && $quote_type eq 'Q' ) {
4447             $line_of_tokens->{_starting_in_quote} = 1;
4448         }
4449         else {
4450             $line_of_tokens->{_starting_in_quote} = 0;
4451
4452             # Trim start of this line unless we are continuing a quoted line.
4453             # Do not trim end because we might end in a quote (test: deken4.pl)
4454             # Perl::Tidy::Formatter will delete needless trailing blanks
4455             $input_line =~ s/^(\s+)//;
4456
4457             # Calculate a guessed level for nonblank lines to avoid calls to
4458             #    sub guess_old_indentation_level()
4459             if ( length($input_line) && $1 ) {
4460                 my $leading_spaces = $1;
4461                 my $spaces         = length($leading_spaces);
4462
4463                 # handle leading tabs
4464                 if ( ord( substr( $leading_spaces, 0, 1 ) ) == ORD_TAB
4465                     && $leading_spaces =~ /^(\t+)/ )
4466                 {
4467                     my $tabsize = $self->[_tabsize_];
4468                     $spaces += length($1) * ( $tabsize - 1 );
4469                 }
4470
4471                 my $indent_columns = $self->[_indent_columns_];
4472                 $line_of_tokens->{_guessed_indentation_level} =
4473                   int( $spaces / $indent_columns );
4474             }
4475
4476             $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4477               && $input_line =~ /^__(END|DATA)__\s*$/;
4478         }
4479
4480         # Optimize for a full-line comment.
4481         if ( !$in_quote ) {
4482             if ( substr( $input_line, 0, 1 ) eq '#' ) {
4483
4484                 # and check for skipped section
4485                 if (   $rOpts_code_skipping
4486                     && $input_line =~ /$code_skipping_pattern_begin/ )
4487                 {
4488                     $self->[_in_skipped_] = 1;
4489                     return;
4490                 }
4491
4492                 # Optional fast processing of a block comment
4493                 my $ci_string_sum =
4494                   ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4495                 my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4496                 $line_of_tokens->{_line_type}        = 'CODE';
4497                 $line_of_tokens->{_rtokens}          = [$input_line];
4498                 $line_of_tokens->{_rtoken_type}      = ['#'];
4499                 $line_of_tokens->{_rlevels}          = [$level_in_tokenizer];
4500                 $line_of_tokens->{_rci_levels}       = [$ci_string_i];
4501                 $line_of_tokens->{_rblock_type}      = [EMPTY_STRING];
4502                 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4503                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4504                 return;
4505             }
4506
4507             # Optimize handling of a blank line
4508             if ( !length($input_line) ) {
4509                 $line_of_tokens->{_line_type}        = 'CODE';
4510                 $line_of_tokens->{_rtokens}          = [];
4511                 $line_of_tokens->{_rtoken_type}      = [];
4512                 $line_of_tokens->{_rlevels}          = [];
4513                 $line_of_tokens->{_rci_levels}       = [];
4514                 $line_of_tokens->{_rblock_type}      = [];
4515                 $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4516                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4517                 return;
4518             }
4519         }
4520
4521         # update the copy of the line for use in error messages
4522         # This must be exactly what we give the pre_tokenizer
4523         $self->[_line_of_text_] = $input_line;
4524
4525         # re-initialize for the main loop
4526         $routput_token_list     = [];    # stack of output token indexes
4527         $routput_token_type     = [];    # token types
4528         $routput_block_type     = [];    # types of code block
4529         $routput_container_type = [];    # paren types, such as if, elsif, ..
4530         $routput_type_sequence  = [];    # nesting sequential number
4531
4532         $rhere_target_list = [];
4533
4534         $tok             = $last_nonblank_token;
4535         $type            = $last_nonblank_type;
4536         $prototype       = $last_nonblank_prototype;
4537         $last_nonblank_i = -1;
4538         $block_type      = $last_nonblank_block_type;
4539         $container_type  = $last_nonblank_container_type;
4540         $type_sequence   = $last_nonblank_type_sequence;
4541         $indent_flag     = 0;
4542         $peeked_ahead    = 0;
4543
4544         $self->tokenizer_main_loop($is_END_or_DATA);
4545
4546         #-----------------------------------------------
4547         # all done tokenizing this line ...
4548         # now prepare the final list of tokens and types
4549         #-----------------------------------------------
4550
4551         $self->tokenizer_wrapup_line($line_of_tokens);
4552
4553         return;
4554     } ## end sub tokenize_this_line
4555
4556     sub tokenizer_main_loop {
4557
4558         my ( $self, $is_END_or_DATA ) = @_;
4559
4560         #---------------------------------
4561         # Break one input line into tokens
4562         #---------------------------------
4563
4564         # Input parameter:
4565         #   $is_END_or_DATA is true for a __END__ or __DATA__ line
4566
4567         # start by breaking the line into pre-tokens
4568         my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
4569         ( $rtokens, $rtoken_map, $rtoken_type ) =
4570           pre_tokenize( $input_line, $max_tokens_wanted );
4571
4572         $max_token_index = scalar( @{$rtokens} ) - 1;
4573         push( @{$rtokens}, SPACE, SPACE, SPACE )
4574           ;                        # extra whitespace simplifies logic
4575         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
4576         push( @{$rtoken_type}, 'b', 'b', 'b' );
4577
4578         # initialize for main loop
4579         if (0) { #<<< this is not necessary
4580         foreach my $ii ( 0 .. $max_token_index + 3 ) {
4581             $routput_token_type->[$ii]     = EMPTY_STRING;
4582             $routput_block_type->[$ii]     = EMPTY_STRING;
4583             $routput_container_type->[$ii] = EMPTY_STRING;
4584             $routput_type_sequence->[$ii]  = EMPTY_STRING;
4585             $routput_indent_flag->[$ii]    = 0;
4586         }
4587         }
4588
4589         $i     = -1;
4590         $i_tok = -1;
4591
4592         #-----------------------------
4593         # begin main tokenization loop
4594         #-----------------------------
4595
4596         # we are looking at each pre-token of one line and combining them
4597         # into tokens
4598         while ( ++$i <= $max_token_index ) {
4599
4600             # continue looking for the end of a quote
4601             if ($in_quote) {
4602                 do_FOLLOW_QUOTE();
4603                 last if ( $in_quote || $i > $max_token_index );
4604             }
4605
4606             if ( $type ne 'b' && $tok ne 'CORE::' ) {
4607
4608                 # try to catch some common errors
4609                 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
4610
4611                     if ( $last_nonblank_token eq 'eq' ) {
4612                         complain("Should 'eq' be '==' here ?\n");
4613                     }
4614                     elsif ( $last_nonblank_token eq 'ne' ) {
4615                         complain("Should 'ne' be '!=' here ?\n");
4616                     }
4617                 }
4618
4619                 # fix c090, only rotate vars if a new token will be stored
4620                 if ( $i_tok >= 0 ) {
4621                     $last_last_nonblank_token      = $last_nonblank_token;
4622                     $last_last_nonblank_type       = $last_nonblank_type;
4623                     $last_last_nonblank_block_type = $last_nonblank_block_type;
4624                     $last_last_nonblank_container_type =
4625                       $last_nonblank_container_type;
4626                     $last_last_nonblank_type_sequence =
4627                       $last_nonblank_type_sequence;
4628
4629                     # Fix part #3 for git82: propagate type 'Z' though L-R pair
4630                     unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
4631                         $last_nonblank_token = $tok;
4632                         $last_nonblank_type  = $type;
4633                     }
4634                     $last_nonblank_prototype      = $prototype;
4635                     $last_nonblank_block_type     = $block_type;
4636                     $last_nonblank_container_type = $container_type;
4637                     $last_nonblank_type_sequence  = $type_sequence;
4638                     $last_nonblank_i              = $i_tok;
4639                 }
4640
4641                 # Patch for c030: Fix things in case a '->' got separated from
4642                 # the subsequent identifier by a side comment.  We need the
4643                 # last_nonblank_token to have a leading -> to avoid triggering
4644                 # an operator expected error message at the next '('. See also
4645                 # fix for git #63.
4646                 if ( $last_last_nonblank_token eq '->' ) {
4647                     if (   $last_nonblank_type eq 'w'
4648                         || $last_nonblank_type eq 'i' )
4649                     {
4650                         $last_nonblank_token = '->' . $last_nonblank_token;
4651                         $last_nonblank_type  = 'i';
4652                     }
4653                 }
4654             }
4655
4656             # store previous token type
4657             if ( $i_tok >= 0 ) {
4658                 $routput_token_type->[$i_tok]     = $type;
4659                 $routput_block_type->[$i_tok]     = $block_type;
4660                 $routput_container_type->[$i_tok] = $container_type;
4661                 $routput_type_sequence->[$i_tok]  = $type_sequence;
4662                 $routput_indent_flag->[$i_tok]    = $indent_flag;
4663             }
4664
4665             # get the next pre-token and type
4666             # $tok and $type will be modified to make the output token
4667             my $pre_tok  = $tok  = $rtokens->[$i];      # get the next pre-token
4668             my $pre_type = $type = $rtoken_type->[$i];  # and type
4669
4670             # remember the starting index of this token; we will be updating $i
4671             $i_tok = $i;
4672
4673             # re-initialize various flags for the next output token
4674             $block_type     &&= EMPTY_STRING;
4675             $container_type &&= EMPTY_STRING;
4676             $type_sequence  &&= EMPTY_STRING;
4677             $indent_flag    &&= 0;
4678             $prototype      &&= EMPTY_STRING;
4679
4680             # this pre-token will start an output token
4681             push( @{$routput_token_list}, $i_tok );
4682
4683             #--------------------------
4684             # handle a whitespace token
4685             #--------------------------
4686             next if ( $pre_type eq 'b' );
4687
4688             #-----------------
4689             # handle a comment
4690             #-----------------
4691             last if ( $pre_type eq '#' );
4692
4693             # continue gathering identifier if necessary
4694             if ($id_scan_state) {
4695
4696                 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
4697                     scan_id();
4698                 }
4699                 else {
4700                     scan_identifier();
4701                 }
4702
4703                 if ($id_scan_state) {
4704
4705                     # Still scanning ...
4706                     # Check for side comment between sub and prototype (c061)
4707
4708                     # done if nothing left to scan on this line
4709                     last if ( $i > $max_token_index );
4710
4711                     my ( $next_nonblank_token, $i_next ) =
4712                       find_next_nonblank_token_on_this_line( $i, $rtokens,
4713                         $max_token_index );
4714
4715                     # done if it was just some trailing space
4716                     last if ( $i_next > $max_token_index );
4717
4718                     # something remains on the line ... must be a side comment
4719                     next;
4720                 }
4721
4722                 next if ( ( $i > 0 ) || $type );
4723
4724                 # didn't find any token; start over
4725                 $type = $pre_type;
4726                 $tok  = $pre_tok;
4727             }
4728
4729             my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : SPACE;
4730             my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
4731
4732             #-----------------------------------------------------------
4733             # Combine pre-tokens into digraphs and trigraphs if possible
4734             #-----------------------------------------------------------
4735
4736             # See if we can make a digraph...
4737             # The following tokens are excluded and handled specially:
4738             # '/=' is excluded because the / might start a pattern.
4739             # 'x=' is excluded since it might be $x=, with $ on previous line
4740             # '**' and *= might be typeglobs of punctuation variables
4741             # I have allowed tokens starting with <, such as <=,
4742             # because I don't think these could be valid angle operators.
4743             # test file: storrs4.pl
4744             if (   $can_start_digraph{$tok}
4745                 && $i < $max_token_index
4746                 && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
4747             {
4748
4749                 my $combine_ok = 1;
4750                 my $test_tok   = $tok . $rtokens->[ $i + 1 ];
4751
4752                 # check for special cases which cannot be combined
4753
4754                 # '//' must be defined_or operator if an operator is expected.
4755                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
4756                 # could be migrated here for clarity
4757
4758                 # Patch for RT#102371, misparsing a // in the following snippet:
4759                 #     state $b //= ccc();
4760                 # The solution is to always accept the digraph (or trigraph)
4761                 # after type 'Z' (possible file handle).  The reason is that
4762                 # sub operator_expected gives TERM expected here, which is
4763                 # wrong in this case.
4764                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
4765
4766                     # note that here $tok = '/' and the next tok and type is '/'
4767                     $expecting = operator_expected( [ $prev_type, $tok, '/' ] );
4768
4769                     # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
4770                     $combine_ok = 0 if ( $expecting == TERM );
4771                 }
4772
4773                 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
4774                 # Accept the digraphs '**' only after type 'Z'
4775                 # Otherwise postpone the decision.
4776                 if ( $test_tok eq '**' ) {
4777                     if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
4778                 }
4779
4780                 if (
4781
4782                     # still ok to combine?
4783                     $combine_ok
4784
4785                     && ( $test_tok ne '/=' )    # might be pattern
4786                     && ( $test_tok ne 'x=' )    # might be $x
4787                     && ( $test_tok ne '*=' )    # typeglob?
4788
4789                     # Moved above as part of fix for
4790                     # RT #114359: Missparsing of "print $x ** 0.5;
4791                     # && ( $test_tok ne '**' )    # typeglob?
4792                   )
4793                 {
4794                     $tok = $test_tok;
4795                     $i++;
4796
4797                     # Now try to assemble trigraphs.  Note that all possible
4798                     # perl trigraphs can be constructed by appending a character
4799                     # to a digraph.
4800                     $test_tok = $tok . $rtokens->[ $i + 1 ];
4801
4802                     if ( $is_trigraph{$test_tok} ) {
4803                         $tok = $test_tok;
4804                         $i++;
4805                     }
4806
4807                     # The only current tetragraph is the double diamond operator
4808                     # and its first three characters are not a trigraph, so
4809                     # we do can do a special test for it
4810                     elsif ( $test_tok eq '<<>' ) {
4811                         $test_tok .= $rtokens->[ $i + 2 ];
4812                         if ( $is_tetragraph{$test_tok} ) {
4813                             $tok = $test_tok;
4814                             $i += 2;
4815                         }
4816                     }
4817                 }
4818             }
4819
4820             $type      = $tok;
4821             $next_tok  = $rtokens->[ $i + 1 ];
4822             $next_type = $rtoken_type->[ $i + 1 ];
4823
4824             DEBUG_TOKENIZE && do {
4825                 local $LIST_SEPARATOR = ')(';
4826                 my @debug_list = (
4827                     $last_nonblank_token,      $tok,
4828                     $next_tok,                 $brace_depth,
4829                     $brace_type[$brace_depth], $paren_depth,
4830                     $paren_type[$paren_depth],
4831                 );
4832                 print STDOUT "TOKENIZE:(@debug_list)\n";
4833             };
4834
4835             # Turn off attribute list on first non-blank, non-bareword.
4836             # Added '#' to fix c038 (later moved above).
4837             if ( $in_attribute_list && $pre_type ne 'w' ) {
4838                 $in_attribute_list = 0;
4839             }
4840
4841             #--------------------------------------------------------
4842             # We have the next token, $tok.
4843             # Now we have to examine this token and decide what it is
4844             # and define its $type
4845             #
4846             # section 1: bare words
4847             #--------------------------------------------------------
4848
4849             if ( $pre_type eq 'w' ) {
4850                 $expecting =
4851                   operator_expected( [ $prev_type, $tok, $next_type ] );
4852                 my $is_last = do_BAREWORD($is_END_or_DATA);
4853                 last if ($is_last);
4854             }
4855
4856             #-----------------------------
4857             # section 2: strings of digits
4858             #-----------------------------
4859             elsif ( $pre_type eq 'd' ) {
4860                 $expecting =
4861                   operator_expected( [ $prev_type, $tok, $next_type ] );
4862                 do_DIGITS();
4863             }
4864
4865             #----------------------------
4866             # section 3: all other tokens
4867             #----------------------------
4868             else {
4869                 my $code = $tokenization_code->{$tok};
4870                 if ($code) {
4871                     $expecting =
4872                       operator_expected( [ $prev_type, $tok, $next_type ] );
4873                     $code->();
4874                     redo if $in_quote;
4875                 }
4876             }
4877         }
4878
4879         # -----------------------------
4880         # end of main tokenization loop
4881         # -----------------------------
4882
4883         # Store the final token
4884         if ( $i_tok >= 0 ) {
4885             $routput_token_type->[$i_tok]     = $type;
4886             $routput_block_type->[$i_tok]     = $block_type;
4887             $routput_container_type->[$i_tok] = $container_type;
4888             $routput_type_sequence->[$i_tok]  = $type_sequence;
4889             $routput_indent_flag->[$i_tok]    = $indent_flag;
4890         }
4891
4892         # Remember last nonblank values
4893         if ( $type ne 'b' && $type ne '#' ) {
4894             $last_last_nonblank_token          = $last_nonblank_token;
4895             $last_last_nonblank_type           = $last_nonblank_type;
4896             $last_last_nonblank_block_type     = $last_nonblank_block_type;
4897             $last_last_nonblank_container_type = $last_nonblank_container_type;
4898             $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
4899             $last_nonblank_token               = $tok;
4900             $last_nonblank_type                = $type;
4901             $last_nonblank_block_type          = $block_type;
4902             $last_nonblank_container_type      = $container_type;
4903             $last_nonblank_type_sequence       = $type_sequence;
4904             $last_nonblank_prototype           = $prototype;
4905         }
4906
4907         # reset indentation level if necessary at a sub or package
4908         # in an attempt to recover from a nesting error
4909         if ( $level_in_tokenizer < 0 ) {
4910             if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4911                 reset_indentation_level(0);
4912                 brace_warning("resetting level to 0 at $1 $2\n");
4913             }
4914         }
4915
4916         $self->[_in_attribute_list_] = $in_attribute_list;
4917         $self->[_in_quote_]          = $in_quote;
4918         $self->[_quote_target_] =
4919           $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
4920         $self->[_rhere_target_list_] = $rhere_target_list;
4921
4922         return;
4923     } ## end sub tokenizer_main_loop
4924
4925     sub tokenizer_wrapup_line {
4926         my ( $self, $line_of_tokens ) = @_;
4927
4928         #---------------------------------------------------------
4929         # Package a line of tokens for shipping back to the caller
4930         #---------------------------------------------------------
4931
4932         # Most of the remaining work involves defining the two indentation
4933         # parameters that the formatter needs for each token:
4934         # - $level    = structural indentation level and
4935         # - $ci_level = continuation indentation level
4936
4937         # The method for setting the indentation level is straightforward.
4938         # But the method used to define the continuation indentation is
4939         # complicated because it has evolved over a long time by trial and
4940         # error. It could undoubtedly be simplified but it works okay as is.
4941
4942         # Here is a brief description of how indentation is computed.
4943         # Perl::Tidy computes indentation as the sum of 2 terms:
4944         #
4945         # (1) structural indentation, such as if/else/elsif blocks
4946         # (2) continuation indentation, such as long parameter call lists.
4947         #
4948         # These are occasionally called primary and secondary indentation.
4949         #
4950         # Structural indentation is introduced by tokens of type '{',
4951         # although the actual tokens might be '{', '(', or '['.  Structural
4952         # indentation is of two types: BLOCK and non-BLOCK.  Default
4953         # structural indentation is 4 characters if the standard indentation
4954         # scheme is used.
4955         #
4956         # Continuation indentation is introduced whenever a line at BLOCK
4957         # level is broken before its termination.  Default continuation
4958         # indentation is 2 characters in the standard indentation scheme.
4959         #
4960         # Both types of indentation may be nested arbitrarily deep and
4961         # interlaced.  The distinction between the two is somewhat arbitrary.
4962         #
4963         # For each token, we will define two variables which would apply if
4964         # the current statement were broken just before that token, so that
4965         # that token started a new line:
4966         #
4967         # $level = the structural indentation level,
4968         # $ci_level = the continuation indentation level
4969         #
4970         # The total indentation will be $level * (4 spaces) + $ci_level * (2
4971         # spaces), assuming defaults.  However, in some special cases it is
4972         # customary to modify $ci_level from this strict value.
4973         #
4974         # The total structural indentation is easy to compute by adding and
4975         # subtracting 1 from a saved value as types '{' and '}' are seen.
4976         # The running value of this variable is $level_in_tokenizer.
4977         #
4978         # The total continuation is much more difficult to compute, and
4979         # requires several variables.  These variables are:
4980         #
4981         # $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4982         #   each indentation level, if there are intervening open secondary
4983         #   structures just prior to that level.
4984         # $continuation_string_in_tokenizer = a string of 1's and 0's
4985         #   indicating if the last token at that level is "continued", meaning
4986         #   that it is not the first token of an expression.
4987         # $nesting_block_string = a string of 1's and 0's indicating, for each
4988         #   indentation level, if the level is of type BLOCK or not.
4989         # $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
4990         # $nesting_list_string = a string of 1's and 0's indicating, for each
4991         #   indentation level, if it is appropriate for list formatting.
4992         #   If so, continuation indentation is used to indent long list items.
4993         # $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
4994         # @{$rslevel_stack} = a stack of total nesting depths at each
4995         #   structural indentation level, where "total nesting depth" means
4996         #   the nesting depth that would occur if every nesting token
4997         #   -- '{', '[', #   and '(' -- , regardless of context, is used to
4998         #   compute a nesting depth.
4999
5000         # Notes on the Continuation Indentation
5001         #
5002         # There is a sort of chicken-and-egg problem with continuation
5003         # indentation.  The formatter can't make decisions on line breaks
5004         # without knowing what 'ci' will be at arbitrary locations.
5005         #
5006         # But a problem with setting the continuation indentation (ci) here
5007         # in the tokenizer is that we do not know where line breaks will
5008         # actually be.  As a result, we don't know if we should propagate
5009         # continuation indentation to higher levels of structure.
5010         #
5011         # For nesting of only structural indentation, we never need to do
5012         # this.  For example, in a long if statement, like this
5013         #
5014         #   if ( !$output_block_type[$i]
5015         #     && ($in_statement_continuation) )
5016         #   {           <--outdented
5017         #       do_something();
5018         #   }
5019         #
5020         # the second line has ci but we do normally give the lines within
5021         # the BLOCK any ci.  This would be true if we had blocks nested
5022         # arbitrarily deeply.
5023         #
5024         # But consider something like this, where we have created a break
5025         # after an opening paren on line 1, and the paren is not (currently)
5026         # a structural indentation token:
5027         #
5028         # my $file = $menubar->Menubutton(
5029         #   qw/-text File -underline 0 -menuitems/ => [
5030         #       [
5031         #           Cascade    => '~View',
5032         #           -menuitems => [
5033         #           ...
5034         #
5035         # The second line has ci, so it would seem reasonable to propagate
5036         # it down, giving the third line 1 ci + 1 indentation.  This
5037         # suggests the following rule, which is currently used to
5038         # propagating ci down: if there are any non-structural opening
5039         # parens (or brackets, or braces), before an opening structural
5040         # brace, then ci is propagated down, and otherwise
5041         # not.  The variable $intervening_secondary_structure contains this
5042         # information for the current token, and the string
5043         # "$ci_string_in_tokenizer" is a stack of previous values of this
5044         # variable.
5045
5046         my @token_type    = ();    # stack of output token types
5047         my @block_type    = ();    # stack of output code block types
5048         my @type_sequence = ();    # stack of output type sequence numbers
5049         my @tokens        = ();    # output tokens
5050         my @levels        = ();    # structural brace levels of output tokens
5051         my @ci_string = ();  # string needed to compute continuation indentation
5052
5053         # Count the number of '1's in the string (previously sub ones_count)
5054         my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5055
5056         $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5057
5058         my ( $ci_string_i, $level_i );
5059
5060         #-----------------
5061         # Loop over tokens
5062         #-----------------
5063         my $rtoken_map_im;
5064         foreach my $i ( @{$routput_token_list} ) {
5065
5066             my $type_i = $routput_token_type->[$i];
5067             $level_i = $level_in_tokenizer;
5068
5069             # Quick handling of indentation levels for blanks and comments
5070             if ( $type_i eq 'b' || $type_i eq '#' ) {
5071                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5072             }
5073
5074             # All other types
5075             else {
5076
5077                 # $tok_i is the PRE-token.  It only equals the token for symbols
5078                 my $tok_i = $rtokens->[$i];
5079
5080                 # Check for an invalid token type..
5081                 # This can happen by running perltidy on non-scripts although
5082                 # it could also be bug introduced by programming change.  Perl
5083                 # silently accepts a 032 (^Z) and takes it as the end
5084                 if ( !$is_valid_token_type{$type_i} ) {
5085                     my $val = ord($type_i);
5086                     warning(
5087 "unexpected character decimal $val ($type_i) in script\n"
5088                     );
5089                     $self->[_in_error_] = 1;
5090                 }
5091
5092                 # $ternary_indentation_flag indicates that we need a change
5093                 # in level at a nested ternary, as follows
5094                 #     1 => at a nested ternary ?
5095                 #    -1 => at a nested ternary :
5096                 #     0 => otherwise
5097                 my $ternary_indentation_flag = $routput_indent_flag->[$i];
5098
5099                 #-------------------------------------------
5100                 # Section 1: handle a level-increasing token
5101                 #-------------------------------------------
5102                 # set primary indentation levels based on structural braces
5103                 # Note: these are set so that the leading braces have a HIGHER
5104                 # level than their CONTENTS, which is convenient for indentation
5105                 # Also, define continuation indentation for each token.
5106                 if (   $type_i eq '{'
5107                     || $type_i eq 'L'
5108                     || $ternary_indentation_flag > 0 )
5109                 {
5110
5111                     # if the difference between total nesting levels is not 1,
5112                     # there are intervening non-structural nesting types between
5113                     # this '{' and the previous unclosed '{'
5114                     my $intervening_secondary_structure = 0;
5115                     if ( @{$rslevel_stack} ) {
5116                         $intervening_secondary_structure =
5117                           $slevel_in_tokenizer - $rslevel_stack->[-1];
5118                     }
5119
5120                     # save the current states
5121                     push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
5122                     $level_in_tokenizer++;
5123
5124                     if ( $level_in_tokenizer > $self->[_maximum_level_] ) {
5125                         $self->[_maximum_level_] = $level_in_tokenizer;
5126                     }
5127
5128                     if ($ternary_indentation_flag) {
5129
5130                         # break BEFORE '?' in a nested ternary
5131                         if ( $type_i eq '?' ) {
5132                             $level_i = $level_in_tokenizer;
5133                         }
5134
5135                         $nesting_block_string .= "$nesting_block_flag";
5136                     } ## end if ($ternary_indentation_flag)
5137                     else {
5138
5139                         if ( $routput_block_type->[$i] ) {
5140                             $nesting_block_flag = 1;
5141                             $nesting_block_string .= '1';
5142                         }
5143                         else {
5144                             $nesting_block_flag = 0;
5145                             $nesting_block_string .= '0';
5146                         }
5147                     }
5148
5149                     # we will use continuation indentation within containers
5150                     # which are not blocks and not logical expressions
5151                     my $bit = 0;
5152                     if ( !$routput_block_type->[$i] ) {
5153
5154                         # propagate flag down at nested open parens
5155                         if ( $routput_container_type->[$i] eq '(' ) {
5156                             $bit = 1 if $nesting_list_flag;
5157                         }
5158
5159                   # use list continuation if not a logical grouping
5160                   # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5161                         else {
5162                             $bit = 1
5163                               unless
5164                               $is_logical_container{ $routput_container_type
5165                                   ->[$i] };
5166                         }
5167                     }
5168                     $nesting_list_string .= $bit;
5169                     $nesting_list_flag = $bit;
5170
5171                     $ci_string_in_tokenizer .=
5172                       ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5173                     $ci_string_sum =
5174                       ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5175                     $continuation_string_in_tokenizer .=
5176                       ( $in_statement_continuation > 0 ) ? '1' : '0';
5177
5178                     #  Sometimes we want to give an opening brace
5179                     #  continuation indentation, and sometimes not.  For code
5180                     #  blocks, we don't do it, so that the leading '{' gets
5181                     #  outdented, like this:
5182                     #
5183                     #   if ( !$output_block_type[$i]
5184                     #     && ($in_statement_continuation) )
5185                     #   {           <--outdented
5186                     #
5187                     #  For other types, we will give them continuation
5188                     #  indentation.  For example, here is how a list looks
5189                     #  with the opening paren indented:
5190                     #
5191                     #  @LoL =
5192                     #    ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5193                     #      [ "homer", "marge", "bart" ], );
5194                     #
5195                     #  This looks best when 'ci' is one-half of the
5196                     #  indentation  (i.e., 2 and 4)
5197
5198                     my $total_ci = $ci_string_sum;
5199                     if (
5200                         !$routput_block_type->[$i]    # patch: skip for BLOCK
5201                         && ($in_statement_continuation)
5202                         && !( $ternary_indentation_flag && $type_i eq ':' )
5203                       )
5204                     {
5205                         $total_ci += $in_statement_continuation
5206                           unless (
5207                             substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5208                     }
5209
5210                     $ci_string_i               = $total_ci;
5211                     $in_statement_continuation = 0;
5212                 } ## end if ( $type_i eq '{' ||...})
5213
5214                 #-------------------------------------------
5215                 # Section 2: handle a level-decreasing token
5216                 #-------------------------------------------
5217                 elsif ($type_i eq '}'
5218                     || $type_i eq 'R'
5219                     || $ternary_indentation_flag < 0 )
5220                 {
5221
5222                     # only a nesting error in the script would prevent
5223                     # popping here
5224                     if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
5225
5226                     $level_i = --$level_in_tokenizer;
5227
5228                     if ( $level_in_tokenizer < 0 ) {
5229                         unless ( $self->[_saw_negative_indentation_] ) {
5230                             $self->[_saw_negative_indentation_] = 1;
5231                             warning("Starting negative indentation\n");
5232                         }
5233                     }
5234
5235                     # restore previous level values
5236                     if ( length($nesting_block_string) > 1 )
5237                     {    # true for valid script
5238                         chop $nesting_block_string;
5239                         $nesting_block_flag =
5240                           substr( $nesting_block_string, -1 ) eq '1';
5241                         chop $nesting_list_string;
5242                         $nesting_list_flag =
5243                           substr( $nesting_list_string, -1 ) eq '1';
5244
5245                         chop $ci_string_in_tokenizer;
5246                         $ci_string_sum =
5247                           ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5248
5249                         $in_statement_continuation =
5250                           chop $continuation_string_in_tokenizer;
5251
5252                         # zero continuation flag at terminal BLOCK '}' which
5253                         # ends a statement.
5254                         my $block_type_i = $routput_block_type->[$i];
5255                         if ($block_type_i) {
5256
5257                             # ...These include non-anonymous subs
5258                             # note: could be sub ::abc { or sub 'abc
5259                             if ( substr( $block_type_i, 0, 3 ) eq 'sub'
5260                                 && $block_type_i =~ m/^sub\s*/gc )
5261                             {
5262
5263                                 # note: older versions of perl require the /gc
5264                                 # modifier here or else the \G does not work.
5265                                 $in_statement_continuation = 0
5266                                   if ( $block_type_i =~ /\G('|::|\w)/gc );
5267                             }
5268
5269                             # ...and include all block types except user subs
5270                             # with block prototypes and these:
5271                             # (sort|grep|map|do|eval)
5272                             elsif (
5273                                 $is_zero_continuation_block_type{$block_type_i}
5274                               )
5275                             {
5276                                 $in_statement_continuation = 0;
5277                             }
5278
5279                             # ..but these are not terminal types:
5280                             #     /^(sort|grep|map|do|eval)$/ )
5281                             elsif ($is_sort_map_grep_eval_do{$block_type_i}
5282                                 || $is_grep_alias{$block_type_i} )
5283                             {
5284                             }
5285
5286                             # ..and a block introduced by a label
5287                             # /^\w+\s*:$/gc ) {
5288                             elsif ( $block_type_i =~ /:$/ ) {
5289                                 $in_statement_continuation = 0;
5290                             }
5291
5292                             # user function with block prototype
5293                             else {
5294                                 $in_statement_continuation = 0;
5295                             }
5296                         } ## end if ($block_type_i)
5297
5298                         # If we are in a list, then
5299                         # we must set continuation indentation at the closing
5300                         # paren of something like this (paren after $check):
5301                         #     assert(
5302                         #         __LINE__,
5303                         #         ( not defined $check )
5304                         #           or ref $check
5305                         #           or $check eq "new"
5306                         #           or $check eq "old",
5307                         #     );
5308                         elsif ( $tok_i eq ')' ) {
5309                             $in_statement_continuation = 1
5310                               if (
5311                                 $is_list_end_type{
5312                                     $routput_container_type->[$i]
5313                                 }
5314                               );
5315                             ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5316                         }
5317                     } ## end if ( length($nesting_block_string...))
5318
5319                     $ci_string_i = $ci_string_sum + $in_statement_continuation;
5320                 } ## end elsif ( $type_i eq '}' ||...{)
5321
5322                 #-----------------------------------------
5323                 # Section 3: handle a constant level token
5324                 #-----------------------------------------
5325                 else {
5326
5327                     # zero the continuation indentation at certain tokens so
5328                     # that they will be at the same level as its container.  For
5329                     # commas, this simplifies the -lp indentation logic, which
5330                     # counts commas.  For ?: it makes them stand out.
5331                     if (
5332                         $nesting_list_flag
5333                         ##      $type_i =~ /^[,\?\:]$/
5334                         && $is_comma_question_colon{$type_i}
5335                       )
5336                     {
5337                         $in_statement_continuation = 0;
5338                     }
5339
5340                     # Be sure binary operators get continuation indentation.
5341                     # Note: the check on $nesting_block_flag is only needed
5342                     # to add ci to binary operators following a 'try' block,
5343                     # or similar extended syntax block operator (see c158).
5344                     if (
5345                            !$in_statement_continuation
5346                         && ( $nesting_block_flag || $nesting_list_flag )
5347                         && (   $type_i eq 'k' && $is_binary_keyword{$tok_i}
5348                             || $is_binary_type{$type_i} )
5349                       )
5350                     {
5351                         $in_statement_continuation = 1;
5352                     }
5353
5354                     # continuation indentation is sum of any open ci from
5355                     # previous levels plus the current level
5356                     $ci_string_i = $ci_string_sum + $in_statement_continuation;
5357
5358                     # update continuation flag ...
5359
5360                     # if we are in a BLOCK
5361                     if ($nesting_block_flag) {
5362
5363                         # the next token after a ';' and label starts a new stmt
5364                         if ( $type_i eq ';' || $type_i eq 'J' ) {
5365                             $in_statement_continuation = 0;
5366                         }
5367
5368                         # otherwise, we are continuing the current statement
5369                         else {
5370                             $in_statement_continuation = 1;
5371                         }
5372                     }
5373
5374                     # if we are not in a BLOCK..
5375                     else {
5376
5377                         # do not use continuation indentation if not list
5378                         # environment (could be within if/elsif clause)
5379                         if ( !$nesting_list_flag ) {
5380                             $in_statement_continuation = 0;
5381                         }
5382
5383                         # otherwise, the token after a ',' starts a new term
5384
5385                         # Patch FOR RT#99961; no continuation after a ';'
5386                         # This is needed because perltidy currently marks
5387                         # a block preceded by a type character like % or @
5388                         # as a non block, to simplify formatting. But these
5389                         # are actually blocks and can have semicolons.
5390                         # See code_block_type() and is_non_structural_brace().
5391                         elsif ( $type_i eq ',' || $type_i eq ';' ) {
5392                             $in_statement_continuation = 0;
5393                         }
5394
5395                         # otherwise, we are continuing the current term
5396                         else {
5397                             $in_statement_continuation = 1;
5398                         }
5399                     } ## end else [ if ($nesting_block_flag)]
5400
5401                 } ## end else [ if ( $type_i eq '{' ||...})]
5402
5403                 #-------------------------------------------
5404                 # Section 4: operations common to all levels
5405                 #-------------------------------------------
5406
5407                 # set secondary nesting levels based on all containment token
5408                 # types Note: these are set so that the nesting depth is the
5409                 # depth of the PREVIOUS TOKEN, which is convenient for setting
5410                 # the strength of token bonds
5411
5412                 #    /^[L\{\(\[]$/
5413                 if ( $is_opening_type{$type_i} ) {
5414                     $slevel_in_tokenizer++;
5415                     $nesting_token_string .= $tok_i;
5416                     $nesting_type_string  .= $type_i;
5417                 }
5418
5419                 #       /^[R\}\)\]]$/
5420                 elsif ( $is_closing_type{$type_i} ) {
5421                     $slevel_in_tokenizer--;
5422                     my $char = chop $nesting_token_string;
5423
5424                     if ( $char ne $matching_start_token{$tok_i} ) {
5425                         $nesting_token_string .= $char . $tok_i;
5426                         $nesting_type_string  .= $type_i;
5427                     }
5428                     else {
5429                         chop $nesting_type_string;
5430                     }
5431                 }
5432
5433                 # apply token type patch:
5434                 # - output anonymous 'sub' as keyword (type 'k')
5435                 # - output __END__, __DATA__, and format as type 'k' instead
5436                 #   of ';' to make html colors correct, etc.
5437                 # The following hash tests are equivalent to these older tests:
5438                 #   if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5439                 #   if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5440                 if (   $is_END_DATA_format_sub{$tok_i}
5441                     && $is_semicolon_or_t{$type_i} )
5442                 {
5443                     $type_i = 'k';
5444                 }
5445             } ## end else [ if ( $type_i eq 'b' ||...)]
5446
5447             #--------------------------------
5448             # Store the values for this token
5449             #--------------------------------
5450             push( @ci_string,     $ci_string_i );
5451             push( @levels,        $level_i );
5452             push( @block_type,    $routput_block_type->[$i] );
5453             push( @type_sequence, $routput_type_sequence->[$i] );
5454             push( @token_type,    $type_i );
5455
5456             # Form and store the PREVIOUS token
5457             if ( defined($rtoken_map_im) ) {
5458                 my $numc =
5459                   $rtoken_map->[$i] - $rtoken_map_im;    # how many characters
5460
5461                 if ( $numc > 0 ) {
5462                     push( @tokens,
5463                         substr( $input_line, $rtoken_map_im, $numc ) );
5464                 }
5465                 else {
5466
5467                     # Should not happen unless @{$rtoken_map} is corrupted
5468                     DEVEL_MODE
5469                       && Fault(
5470                         "number of characters is '$numc' but should be >0\n");
5471                 }
5472             }
5473
5474             # or grab some values for the leading token (needed for log output)
5475             else {
5476                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5477             }
5478
5479             $rtoken_map_im = $rtoken_map->[$i];
5480         } ## end foreach my $i ( @{$routput_token_list...})
5481
5482         #------------------------
5483         # End loop to over tokens
5484         #------------------------
5485
5486         # Form and store the final token of this line
5487         if ( defined($rtoken_map_im) ) {
5488             my $numc = length($input_line) - $rtoken_map_im;
5489             if ( $numc > 0 ) {
5490                 push( @tokens, substr( $input_line, $rtoken_map_im, $numc ) );
5491             }
5492             else {
5493
5494                 # Should not happen unless @{$rtoken_map} is corrupted
5495                 DEVEL_MODE
5496                   && Fault(
5497                     "Number of Characters is '$numc' but should be >0\n");
5498             }
5499         }
5500
5501         #----------------------------------------------------------
5502         # Wrap up this line of tokens for shipping to the Formatter
5503         #----------------------------------------------------------
5504         $line_of_tokens->{_rtoken_type}    = \@token_type;
5505         $line_of_tokens->{_rtokens}        = \@tokens;
5506         $line_of_tokens->{_rblock_type}    = \@block_type;
5507         $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5508         $line_of_tokens->{_rlevels}        = \@levels;
5509         $line_of_tokens->{_rci_levels}     = \@ci_string;
5510
5511         return;
5512     } ## end sub tokenizer_wrapup_line
5513 } ## end tokenize_this_line
5514
5515 #######################################################################
5516 # Tokenizer routines which assist in identifying token types
5517 #######################################################################
5518
5519 # hash lookup table of operator expected values
5520 my %op_expected_table;
5521
5522 # exceptions to perl's weird parsing rules after type 'Z'
5523 my %is_weird_parsing_rule_exception;
5524
5525 my %is_paren_dollar;
5526
5527 my %is_n_v;
5528
5529 BEGIN {
5530
5531     # Always expecting TERM following these types:
5532     # note: this is identical to '@value_requestor_type' defined later.
5533     my @q = qw(
5534       ; ! + x & ?  F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5535       || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5536       &= // >> ~. &. |. ^.
5537       ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5538     );
5539     push @q, ',';
5540     push @q, '(';     # for completeness, not currently a token type
5541     push @q, '->';    # was previously in UNKNOWN
5542     @{op_expected_table}{@q} = (TERM) x scalar(@q);
5543
5544     # Always UNKNOWN following these types;
5545     # previously had '->' in this list for c030
5546     @q = qw( w );
5547     @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5548
5549     # Always expecting OPERATOR ...
5550     # 'n' and 'v' are currently excluded because they might be VERSION numbers
5551     # 'i' is currently excluded because it might be a package
5552     # 'q' is currently excluded because it might be a prototype
5553     # Fix for c030: removed '->' from this list:
5554     @q = qw( -- C h R ++ ] Q <> );    ## n v q i );
5555     push @q, ')';
5556     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5557
5558     # Fix for git #62: added '*' and '%'
5559     @q = qw( < ? * % );
5560     @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5561
5562     @q = qw<) $>;
5563     @{is_paren_dollar}{@q} = (1) x scalar(@q);
5564
5565     @q = qw( n v );
5566     @{is_n_v}{@q} = (1) x scalar(@q);
5567
5568 }
5569
5570 use constant DEBUG_OPERATOR_EXPECTED => 0;
5571
5572 sub operator_expected {
5573
5574     # Returns a parameter indicating what types of tokens can occur next
5575
5576     # Call format:
5577     #    $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5578     # where
5579     #    $prev_type is the type of the previous token (blank or not)
5580     #    $tok is the current token
5581     #    $next_type is the type of the next token (blank or not)
5582
5583     # Many perl symbols have two or more meanings.  For example, '<<'
5584     # can be a shift operator or a here-doc operator.  The
5585     # interpretation of these symbols depends on the current state of
5586     # the tokenizer, which may either be expecting a term or an
5587     # operator.  For this example, a << would be a shift if an OPERATOR
5588     # is expected, and a here-doc if a TERM is expected.  This routine
5589     # is called to make this decision for any current token.  It returns
5590     # one of three possible values:
5591     #
5592     #     OPERATOR - operator expected (or at least, not a term)
5593     #     UNKNOWN  - can't tell
5594     #     TERM     - a term is expected (or at least, not an operator)
5595     #
5596     # The decision is based on what has been seen so far.  This
5597     # information is stored in the "$last_nonblank_type" and
5598     # "$last_nonblank_token" variables.  For example, if the
5599     # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5600     # if $last_nonblank_type is 'n' (numeric), we are expecting an
5601     # OPERATOR.
5602     #
5603     # If a UNKNOWN is returned, the calling routine must guess. A major
5604     # goal of this tokenizer is to minimize the possibility of returning
5605     # UNKNOWN, because a wrong guess can spoil the formatting of a
5606     # script.
5607     #
5608     # Adding NEW_TOKENS: it is critically important that this routine be
5609     # updated to allow it to determine if an operator or term is to be
5610     # expected after the new token.  Doing this simply involves adding
5611     # the new token character to one of the regexes in this routine or
5612     # to one of the hash lists
5613     # that it uses, which are initialized in the BEGIN section.
5614     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5615     # $statement_type
5616
5617     # When possible, token types should be selected such that we can determine
5618     # the 'operator_expected' value by a simple hash lookup.  If there are
5619     # exceptions, that is an indication that a new type is needed.
5620
5621     my ($rarg) = @_;
5622
5623     #-------------
5624     # Table lookup
5625     #-------------
5626
5627     # Many types are can be obtained by a table lookup given the previous type.
5628     # This typically handles half or more of the calls.
5629     my $op_expected = $op_expected_table{$last_nonblank_type};
5630     if ( defined($op_expected) ) {
5631         DEBUG_OPERATOR_EXPECTED
5632           && print STDOUT
5633 "OPERATOR_EXPECTED: Table Lookup; returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5634         return $op_expected;
5635     }
5636
5637     #---------------------
5638     # Handle special cases
5639     #---------------------
5640
5641     $op_expected = UNKNOWN;
5642     my ( $prev_type, $tok, $next_type ) = @{$rarg};
5643
5644     # Types 'k', '}' and 'Z' depend on context
5645     # Types 'i', 'n', 'v', 'q' currently also temporarily depend on context.
5646
5647     # identifier...
5648     if ( $last_nonblank_type eq 'i' ) {
5649         $op_expected = OPERATOR;
5650
5651         # TODO: it would be cleaner to make this a special type
5652         # expecting VERSION or {} after package NAMESPACE;
5653         # maybe mark these words as type 'Y'?
5654         if (   substr( $last_nonblank_token, 0, 7 ) eq 'package'
5655             && $statement_type      =~ /^package\b/
5656             && $last_nonblank_token =~ /^package\b/ )
5657         {
5658             $op_expected = TERM;
5659         }
5660     }
5661
5662     # keyword...
5663     elsif ( $last_nonblank_type eq 'k' ) {
5664         $op_expected = TERM;
5665         if ( $expecting_operator_token{$last_nonblank_token} ) {
5666             $op_expected = OPERATOR;
5667         }
5668         elsif ( $expecting_term_token{$last_nonblank_token} ) {
5669
5670             # Exceptions from TERM:
5671
5672             # // may follow perl functions which may be unary operators
5673             # see test file dor.t (defined or);
5674             if (
5675                    $tok eq '/'
5676                 && $next_type eq '/'
5677                 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5678                     $last_nonblank_token}
5679               )
5680             {
5681                 $op_expected = OPERATOR;
5682             }
5683
5684             # Patch to allow a ? following 'split' to be a deprecated pattern
5685             # delimiter.  This patch is coordinated with the omission of split
5686             # from the list
5687             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5688             # will force perltidy to guess.
5689             elsif ($tok eq '?'
5690                 && $last_nonblank_token eq 'split' )
5691             {
5692                 $op_expected = UNKNOWN;
5693             }
5694         }
5695     } ## end type 'k'
5696
5697     # closing container token...
5698
5699     # Note that the actual token for type '}' may also be a ')'.
5700
5701     # Also note that $last_nonblank_token is not the token corresponding to
5702     # $last_nonblank_type when the type is a closing container.  In that
5703     # case it is the token before the corresponding opening container token.
5704     # So for example, for this snippet
5705     #       $a = do { BLOCK } / 2;
5706     # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5707
5708     elsif ( $last_nonblank_type eq '}' ) {
5709         $op_expected = UNKNOWN;
5710
5711         # handle something after 'do' and 'eval'
5712         if ( $is_block_operator{$last_nonblank_token} ) {
5713
5714             # something like $a = do { BLOCK } / 2;
5715             $op_expected = OPERATOR;    # block mode following }
5716         }
5717
5718         #       $last_nonblank_token =~ /^(\)|\$|\-\>)/
5719         elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5720             || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5721         {
5722             $op_expected = OPERATOR;
5723             if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5724         }
5725
5726         # Check for smartmatch operator before preceding brace or square
5727         # bracket.  For example, at the ? after the ] in the following
5728         # expressions we are expecting an operator:
5729         #
5730         # qr/3/ ~~ ['1234'] ? 1 : 0;
5731         # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5732         elsif ( $last_nonblank_token eq '~~' ) {
5733             $op_expected = OPERATOR;
5734         }
5735
5736         # A right brace here indicates the end of a simple block.  All
5737         # non-structural right braces have type 'R' all braces associated with
5738         # block operator keywords have been given those keywords as
5739         # "last_nonblank_token" and caught above.  (This statement is order
5740         # dependent, and must come after checking $last_nonblank_token).
5741         else {
5742
5743             # patch for dor.t (defined or).
5744             if (   $tok eq '/'
5745                 && $next_type eq '/'
5746                 && $last_nonblank_token eq ']' )
5747             {
5748                 $op_expected = OPERATOR;
5749             }
5750
5751             # Patch for RT #116344: misparse a ternary operator after an
5752             # anonymous hash, like this:
5753             #   return ref {} ? 1 : 0;
5754             # The right brace should really be marked type 'R' in this case,
5755             # and it is safest to return an UNKNOWN here. Expecting a TERM will
5756             # cause the '?' to always be interpreted as a pattern delimiter
5757             # rather than introducing a ternary operator.
5758             elsif ( $tok eq '?' ) {
5759                 $op_expected = UNKNOWN;
5760             }
5761             else {
5762                 $op_expected = TERM;
5763             }
5764         }
5765     } ## end type '}'
5766
5767     # number or v-string...
5768     # An exception is for VERSION numbers a 'use' statement. It has the format
5769     #     use Module VERSION LIST
5770     # We could avoid this exception by writing a special sub to parse 'use'
5771     # statements and perhaps mark these numbers with a new type V (for VERSION)
5772     ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5773     elsif ( $is_n_v{$last_nonblank_type} ) {
5774         $op_expected = OPERATOR;
5775         if ( $statement_type eq 'use' ) {
5776             $op_expected = UNKNOWN;
5777         }
5778     }
5779
5780     # quote...
5781     # TODO: labeled prototype words would better be given type 'A' or maybe
5782     # 'J'; not 'q'; or maybe mark as type 'Y'?
5783     elsif ( $last_nonblank_type eq 'q' ) {
5784         $op_expected = OPERATOR;
5785         if ( $last_nonblank_token eq 'prototype' ) {
5786             $op_expected = TERM;
5787         }
5788     }
5789
5790     # file handle or similar
5791     elsif ( $last_nonblank_type eq 'Z' ) {
5792
5793         $op_expected = UNKNOWN;
5794
5795         # angle.t
5796         if ( $last_nonblank_token =~ /^\w/ ) {
5797             $op_expected = UNKNOWN;
5798         }
5799
5800         # Exception to weird parsing rules for 'x(' ... see case b1205:
5801         # In something like 'print $vv x(...' the x is an operator;
5802         # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
5803         # otherwise x follows the weird parsing rules.
5804         elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
5805             $op_expected = OPERATOR;
5806         }
5807
5808         # The 'weird parsing rules' of next section do not work for '<' and '?'
5809         # It is best to mark them as unknown.  Test case:
5810         #  print $fh <DATA>;
5811         elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5812             $op_expected = UNKNOWN;
5813         }
5814
5815         # For possible file handle like "$a", Perl uses weird parsing rules.
5816         # For example:
5817         # print $a/2,"/hi";   - division
5818         # print $a / 2,"/hi"; - division
5819         # print $a/ 2,"/hi";  - division
5820         # print $a /2,"/hi";  - pattern (and error)!
5821         # Some examples where this logic works okay, for '&','*','+':
5822         #    print $fh &xsi_protos(@mods);
5823         #    my $x = new $CompressClass *FH;
5824         #    print $OUT +( $count % 15 ? ", " : "\n\t" );
5825         elsif ($prev_type eq 'b'
5826             && $next_type ne 'b' )
5827         {
5828             $op_expected = TERM;
5829         }
5830
5831         # Note that '?' and '<' have been moved above
5832         # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5833         elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5834
5835             # Do not complain in 'use' statements, which have special syntax.
5836             # For example, from RT#130344:
5837             #   use lib $FindBin::Bin . '/lib';
5838             if ( $statement_type ne 'use' ) {
5839                 complain(
5840 "operator in possible indirect object location not recommended\n"
5841                 );
5842             }
5843             $op_expected = OPERATOR;
5844         }
5845     }
5846
5847     # anything else...
5848     else {
5849         $op_expected = UNKNOWN;
5850     }
5851
5852     DEBUG_OPERATOR_EXPECTED
5853       && print STDOUT
5854 "OPERATOR_EXPECTED: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5855
5856     return $op_expected;
5857
5858 } ## end sub operator_expected
5859
5860 sub new_statement_ok {
5861
5862     # return true if the current token can start a new statement
5863     # USES GLOBAL VARIABLES: $last_nonblank_type
5864
5865     return label_ok()    # a label would be ok here
5866
5867       || $last_nonblank_type eq 'J';    # or we follow a label
5868
5869 } ## end sub new_statement_ok
5870
5871 sub label_ok {
5872
5873     # Decide if a bare word followed by a colon here is a label
5874     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5875     # $brace_depth, @brace_type
5876
5877     # if it follows an opening or closing code block curly brace..
5878     if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5879         && $last_nonblank_type eq $last_nonblank_token )
5880     {
5881
5882         # it is a label if and only if the curly encloses a code block
5883         return $brace_type[$brace_depth];
5884     }
5885
5886     # otherwise, it is a label if and only if it follows a ';' (real or fake)
5887     # or another label
5888     else {
5889         return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5890     }
5891 } ## end sub label_ok
5892
5893 sub code_block_type {
5894
5895     # Decide if this is a block of code, and its type.
5896     # Must be called only when $type = $token = '{'
5897     # The problem is to distinguish between the start of a block of code
5898     # and the start of an anonymous hash reference
5899     # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5900     # to indicate the type of code block.  (For example, 'last_nonblank_token'
5901     # might be 'if' for an if block, 'else' for an else block, etc).
5902     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5903     # $last_nonblank_block_type, $brace_depth, @brace_type
5904
5905     # handle case of multiple '{'s
5906
5907 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5908
5909     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5910     if (   $last_nonblank_token eq '{'
5911         && $last_nonblank_type eq $last_nonblank_token )
5912     {
5913
5914         # opening brace where a statement may appear is probably
5915         # a code block but might be and anonymous hash reference
5916         if ( $brace_type[$brace_depth] ) {
5917             return decide_if_code_block( $i, $rtokens, $rtoken_type,
5918                 $max_token_index );
5919         }
5920
5921         # cannot start a code block within an anonymous hash
5922         else {
5923             return EMPTY_STRING;
5924         }
5925     }
5926
5927     elsif ( $last_nonblank_token eq ';' ) {
5928
5929         # an opening brace where a statement may appear is probably
5930         # a code block but might be and anonymous hash reference
5931         return decide_if_code_block( $i, $rtokens, $rtoken_type,
5932             $max_token_index );
5933     }
5934
5935     # handle case of '}{'
5936     elsif ($last_nonblank_token eq '}'
5937         && $last_nonblank_type eq $last_nonblank_token )
5938     {
5939
5940         # a } { situation ...
5941         # could be hash reference after code block..(blktype1.t)
5942         if ($last_nonblank_block_type) {
5943             return decide_if_code_block( $i, $rtokens, $rtoken_type,
5944                 $max_token_index );
5945         }
5946
5947         # must be a block if it follows a closing hash reference
5948         else {
5949             return $last_nonblank_token;
5950         }
5951     }
5952
5953     #--------------------------------------------------------------
5954     # NOTE: braces after type characters start code blocks, but for
5955     # simplicity these are not identified as such.  See also
5956     # sub is_non_structural_brace.
5957     #--------------------------------------------------------------
5958
5959 ##    elsif ( $last_nonblank_type eq 't' ) {
5960 ##       return $last_nonblank_token;
5961 ##    }
5962
5963     # brace after label:
5964     elsif ( $last_nonblank_type eq 'J' ) {
5965         return $last_nonblank_token;
5966     }
5967
5968 # otherwise, look at previous token.  This must be a code block if
5969 # it follows any of these:
5970 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
5971     elsif ($is_code_block_token{$last_nonblank_token}
5972         || $is_grep_alias{$last_nonblank_token} )
5973     {
5974
5975         # Bug Patch: Note that the opening brace after the 'if' in the following
5976         # snippet is an anonymous hash ref and not a code block!
5977         #   print 'hi' if { x => 1, }->{x};
5978         # We can identify this situation because the last nonblank type
5979         # will be a keyword (instead of a closing paren)
5980         if (
5981             $last_nonblank_type eq 'k'
5982             && (   $last_nonblank_token eq 'if'
5983                 || $last_nonblank_token eq 'unless' )
5984           )
5985         {
5986             return EMPTY_STRING;
5987         }
5988         else {
5989             return $last_nonblank_token;
5990         }
5991     }
5992
5993     # or a sub or package BLOCK
5994     elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
5995         && $last_nonblank_token =~ /^(sub|package)\b/ )
5996     {
5997         return $last_nonblank_token;
5998     }
5999
6000     # or a sub alias
6001     elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6002         && ( $is_sub{$last_nonblank_token} ) )
6003     {
6004         return 'sub';
6005     }
6006
6007     elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6008         return $statement_type;
6009     }
6010
6011     # user-defined subs with block parameters (like grep/map/eval)
6012     elsif ( $last_nonblank_type eq 'G' ) {
6013         return $last_nonblank_token;
6014     }
6015
6016     # check bareword
6017     elsif ( $last_nonblank_type eq 'w' ) {
6018
6019         # check for syntax 'use MODULE LIST'
6020         # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6021         return EMPTY_STRING if ( $statement_type eq 'use' );
6022
6023         return decide_if_code_block( $i, $rtokens, $rtoken_type,
6024             $max_token_index );
6025     }
6026
6027     # Patch for bug # RT #94338 reported by Daniel Trizen
6028     # for-loop in a parenthesized block-map triggering an error message:
6029     #    map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6030     # Check for a code block within a parenthesized function call
6031     elsif ( $last_nonblank_token eq '(' ) {
6032         my $paren_type = $paren_type[$paren_depth];
6033
6034         #                   /^(map|grep|sort)$/
6035         if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6036
6037             # We will mark this as a code block but use type 't' instead
6038             # of the name of the containing function.  This will allow for
6039             # correct parsing but will usually produce better formatting.
6040             # Braces with block type 't' are not broken open automatically
6041             # in the formatter as are other code block types, and this usually
6042             # works best.
6043             return 't';    # (Not $paren_type)
6044         }
6045         else {
6046             return EMPTY_STRING;
6047         }
6048     }
6049
6050     # handle unknown syntax ') {'
6051     # we previously appended a '()' to mark this case
6052     elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6053         return $last_nonblank_token;
6054     }
6055
6056     # anything else must be anonymous hash reference
6057     else {
6058         return EMPTY_STRING;
6059     }
6060 } ## end sub code_block_type
6061
6062 sub decide_if_code_block {
6063
6064     # USES GLOBAL VARIABLES: $last_nonblank_token
6065     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6066
6067     my ( $next_nonblank_token, $i_next ) =
6068       find_next_nonblank_token( $i, $rtokens, $max_token_index );
6069
6070     # we are at a '{' where a statement may appear.
6071     # We must decide if this brace starts an anonymous hash or a code
6072     # block.
6073     # return "" if anonymous hash, and $last_nonblank_token otherwise
6074
6075     # initialize to be code BLOCK
6076     my $code_block_type = $last_nonblank_token;
6077
6078     # Check for the common case of an empty anonymous hash reference:
6079     # Maybe something like sub { { } }
6080     if ( $next_nonblank_token eq '}' ) {
6081         $code_block_type = EMPTY_STRING;
6082     }
6083
6084     else {
6085
6086         # To guess if this '{' is an anonymous hash reference, look ahead
6087         # and test as follows:
6088         #
6089         # it is a hash reference if next come:
6090         #   - a string or digit followed by a comma or =>
6091         #   - bareword followed by =>
6092         # otherwise it is a code block
6093         #
6094         # Examples of anonymous hash ref:
6095         # {'aa',};
6096         # {1,2}
6097         #
6098         # Examples of code blocks:
6099         # {1; print "hello\n", 1;}
6100         # {$a,1};
6101
6102         # We are only going to look ahead one more (nonblank/comment) line.
6103         # Strange formatting could cause a bad guess, but that's unlikely.
6104         my @pre_types;
6105         my @pre_tokens;
6106
6107         # Ignore the rest of this line if it is a side comment
6108         if ( $next_nonblank_token ne '#' ) {
6109             @pre_types  = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
6110             @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
6111         }
6112         my ( $rpre_tokens, $rpre_types ) =
6113           peek_ahead_for_n_nonblank_pre_tokens(20);    # 20 is arbitrary but
6114                                                        # generous, and prevents
6115                                                        # wasting lots of
6116                                                        # time in mangled files
6117         if ( defined($rpre_types) && @{$rpre_types} ) {
6118             push @pre_types,  @{$rpre_types};
6119             push @pre_tokens, @{$rpre_tokens};
6120         }
6121
6122         # put a sentinel token to simplify stopping the search
6123         push @pre_types, '}';
6124         push @pre_types, '}';
6125
6126         my $jbeg = 0;
6127         $jbeg = 1 if $pre_types[0] eq 'b';
6128
6129         # first look for one of these
6130         #  - bareword
6131         #  - bareword with leading -
6132         #  - digit
6133         #  - quoted string
6134         my $j = $jbeg;
6135         if ( $pre_types[$j] =~ /^[\'\"]/ ) {
6136
6137             # find the closing quote; don't worry about escapes
6138             my $quote_mark = $pre_types[$j];
6139             foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6140                 if ( $pre_types[$k] eq $quote_mark ) {
6141                     $j = $k + 1;
6142                     my $next = $pre_types[$j];
6143                     last;
6144                 }
6145             }
6146         }
6147         elsif ( $pre_types[$j] eq 'd' ) {
6148             $j++;
6149         }
6150         elsif ( $pre_types[$j] eq 'w' ) {
6151             $j++;
6152         }
6153         elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6154             $j++;
6155         }
6156         if ( $j > $jbeg ) {
6157
6158             $j++ if $pre_types[$j] eq 'b';
6159
6160             # Patched for RT #95708
6161             if (
6162
6163                 # it is a comma which is not a pattern delimiter except for qw
6164                 (
6165                     $pre_types[$j] eq ','
6166                     ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6167                     && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6168                 )
6169
6170                 # or a =>
6171                 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6172               )
6173             {
6174                 $code_block_type = EMPTY_STRING;
6175             }
6176         }
6177
6178         if ($code_block_type) {
6179
6180             # Patch for cases b1085 b1128: It is uncertain if this is a block.
6181             # If this brace follows a bareword, then append a space as a signal
6182             # to the formatter that this may not be a block brace.  To find the
6183             # corresponding code in Formatter.pm search for 'b1085'.
6184             $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6185         }
6186     }
6187
6188     return $code_block_type;
6189 } ## end sub decide_if_code_block
6190
6191 sub report_unexpected {
6192
6193     # report unexpected token type and show where it is
6194     # USES GLOBAL VARIABLES: $tokenizer_self
6195     my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6196         $rpretoken_type, $input_line )
6197       = @_;
6198
6199     if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6200         my $msg = "found $found where $expecting expected";
6201         my $pos = $rpretoken_map->[$i_tok];
6202         interrupt_logfile();
6203         my $input_line_number = $tokenizer_self->[_last_line_number_];
6204         my ( $offset, $numbered_line, $underline ) =
6205           make_numbered_line( $input_line_number, $input_line, $pos );
6206         $underline = write_on_underline( $underline, $pos - $offset, '^' );
6207
6208         my $trailer = EMPTY_STRING;
6209         if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6210             my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6211             my $num;
6212             if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6213                 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6214             }
6215             else {
6216                 $num = $pos - $pos_prev;
6217             }
6218             if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
6219
6220             $underline =
6221               write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6222             $trailer = " (previous token underlined)";
6223         }
6224         $underline =~ s/\s+$//;
6225         warning( $numbered_line . "\n" );
6226         warning( $underline . "\n" );
6227         warning( $msg . $trailer . "\n" );
6228         resume_logfile();
6229     }
6230     return;
6231 } ## end sub report_unexpected
6232
6233 my %is_sigil_or_paren;
6234 my %is_R_closing_sb;
6235
6236 BEGIN {
6237
6238     my @q = qw< $ & % * @ ) >;
6239     @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6240
6241     @q = qw(R ]);
6242     @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6243 }
6244
6245 sub is_non_structural_brace {
6246
6247     # Decide if a brace or bracket is structural or non-structural
6248     # by looking at the previous token and type
6249     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6250
6251     # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6252     # Tentatively deactivated because it caused the wrong operator expectation
6253     # for this code:
6254     #      $user = @vars[1] / 100;
6255     # Must update sub operator_expected before re-implementing.
6256     # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6257     #    return 0;
6258     # }
6259
6260     #--------------------------------------------------------------
6261     # NOTE: braces after type characters start code blocks, but for
6262     # simplicity these are not identified as such.  See also
6263     # sub code_block_type
6264     #--------------------------------------------------------------
6265
6266     ##if ($last_nonblank_type eq 't') {return 0}
6267
6268     # otherwise, it is non-structural if it is decorated
6269     # by type information.
6270     # For example, the '{' here is non-structural:   ${xxx}
6271     # Removed '::' to fix c074
6272     ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6273     return (
6274         ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6275         $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6276           || substr( $last_nonblank_token, 0, 2 ) eq '->'
6277
6278           # or if we follow a hash or array closing curly brace or bracket
6279           # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6280           # because the first '}' would have been given type 'R'
6281           ##|| $last_nonblank_type =~ /^([R\]])$/
6282           || $is_R_closing_sb{$last_nonblank_type}
6283     );
6284 } ## end sub is_non_structural_brace
6285
6286 #######################################################################
6287 # Tokenizer routines for tracking container nesting depths
6288 #######################################################################
6289
6290 # The following routines keep track of nesting depths of the nesting
6291 # types, ( [ { and ?.  This is necessary for determining the indentation
6292 # level, and also for debugging programs.  Not only do they keep track of
6293 # nesting depths of the individual brace types, but they check that each
6294 # of the other brace types is balanced within matching pairs.  For
6295 # example, if the program sees this sequence:
6296 #
6297 #         {  ( ( ) }
6298 #
6299 # then it can determine that there is an extra left paren somewhere
6300 # between the { and the }.  And so on with every other possible
6301 # combination of outer and inner brace types.  For another
6302 # example:
6303 #
6304 #         ( [ ..... ]  ] )
6305 #
6306 # which has an extra ] within the parens.
6307 #
6308 # The brace types have indexes 0 .. 3 which are indexes into
6309 # the matrices.
6310 #
6311 # The pair ? : are treated as just another nesting type, with ? acting
6312 # as the opening brace and : acting as the closing brace.
6313 #
6314 # The matrix
6315 #
6316 #         $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6317 #
6318 # saves the nesting depth of brace type $b (where $b is either of the other
6319 # nesting types) when brace type $a enters a new depth.  When this depth
6320 # decreases, a check is made that the current depth of brace types $b is
6321 # unchanged, or otherwise there must have been an error.  This can
6322 # be very useful for localizing errors, particularly when perl runs to
6323 # the end of a large file (such as this one) and announces that there
6324 # is a problem somewhere.
6325 #
6326 # A numerical sequence number is maintained for every nesting type,
6327 # so that each matching pair can be uniquely identified in a simple
6328 # way.
6329
6330 sub increase_nesting_depth {
6331     my ( $aa, $pos ) = @_;
6332
6333     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6334     # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6335     # $statement_type
6336     $current_depth[$aa]++;
6337     $total_depth++;
6338     $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6339     my $input_line_number = $tokenizer_self->[_last_line_number_];
6340     my $input_line        = $tokenizer_self->[_line_of_text_];
6341
6342     # Sequence numbers increment by number of items.  This keeps
6343     # a unique set of numbers but still allows the relative location
6344     # of any type to be determined.
6345
6346     # make a new unique sequence number
6347     my $seqno = $next_sequence_number++;
6348
6349     $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6350
6351     $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6352       [ $input_line_number, $input_line, $pos ];
6353
6354     for my $bb ( 0 .. @closing_brace_names - 1 ) {
6355         next if ( $bb == $aa );
6356         $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6357     }
6358
6359     # set a flag for indenting a nested ternary statement
6360     my $indent = 0;
6361     if ( $aa == QUESTION_COLON ) {
6362         $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6363         if ( $current_depth[$aa] > 1 ) {
6364             if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6365                 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6366                 if ( $pdepth == $total_depth - 1 ) {
6367                     $indent = 1;
6368                     $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6369                 }
6370             }
6371         }
6372     }
6373
6374     # Fix part #1 for git82: save last token type for propagation of type 'Z'
6375     $nested_statement_type[$aa][ $current_depth[$aa] ] =
6376       [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6377     $statement_type = EMPTY_STRING;
6378     return ( $seqno, $indent );
6379 } ## end sub increase_nesting_depth
6380
6381 sub is_balanced_closing_container {
6382
6383     # Return true if a closing container can go here without error
6384     # Return false if not
6385     my ($aa) = @_;
6386
6387     # cannot close if there was no opening
6388     return unless ( $current_depth[$aa] > 0 );
6389
6390     # check that any other brace types $bb contained within would be balanced
6391     for my $bb ( 0 .. @closing_brace_names - 1 ) {
6392         next if ( $bb == $aa );
6393         return
6394           unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6395             $current_depth[$bb] );
6396     }
6397
6398     # OK, everything will be balanced
6399     return 1;
6400 } ## end sub is_balanced_closing_container
6401
6402 sub decrease_nesting_depth {
6403
6404     my ( $aa, $pos ) = @_;
6405
6406     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6407     # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6408     # $statement_type
6409     my $seqno             = 0;
6410     my $input_line_number = $tokenizer_self->[_last_line_number_];
6411     my $input_line        = $tokenizer_self->[_line_of_text_];
6412
6413     my $outdent = 0;
6414     $total_depth--;
6415     if ( $current_depth[$aa] > 0 ) {
6416
6417         # set a flag for un-indenting after seeing a nested ternary statement
6418         $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6419         if ( $aa == QUESTION_COLON ) {
6420             $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6421         }
6422
6423         # Fix part #2 for git82: use saved type for propagation of type 'Z'
6424         # through type L-R braces.  Perl seems to allow ${bareword}
6425         # as an indirect object, but nothing much more complex than that.
6426         ( $statement_type, my $saved_type, my $saved_token ) =
6427           @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6428         if (   $aa == BRACE
6429             && $saved_type eq 'Z'
6430             && $last_nonblank_type eq 'w'
6431             && $brace_structural_type[$brace_depth] eq 'L' )
6432         {
6433             $last_nonblank_type = $saved_type;
6434         }
6435
6436         # check that any brace types $bb contained within are balanced
6437         for my $bb ( 0 .. @closing_brace_names - 1 ) {
6438             next if ( $bb == $aa );
6439
6440             unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6441                 $current_depth[$bb] )
6442             {
6443                 my $diff =
6444                   $current_depth[$bb] -
6445                   $depth_array[$aa][$bb][ $current_depth[$aa] ];
6446
6447                 # don't whine too many times
6448                 my $saw_brace_error = get_saw_brace_error();
6449                 if (
6450                     $saw_brace_error <= MAX_NAG_MESSAGES
6451
6452                     # if too many closing types have occurred, we probably
6453                     # already caught this error
6454                     && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6455                   )
6456                 {
6457                     interrupt_logfile();
6458                     my $rsl =
6459                       $starting_line_of_current_depth[$aa]
6460                       [ $current_depth[$aa] ];
6461                     my $sl  = $rsl->[0];
6462                     my $rel = [ $input_line_number, $input_line, $pos ];
6463                     my $el  = $rel->[0];
6464                     my ($ess);
6465
6466                     if ( $diff == 1 || $diff == -1 ) {
6467                         $ess = EMPTY_STRING;
6468                     }
6469                     else {
6470                         $ess = 's';
6471                     }
6472                     my $bname =
6473                       ( $diff > 0 )
6474                       ? $opening_brace_names[$bb]
6475                       : $closing_brace_names[$bb];
6476                     write_error_indicator_pair( @{$rsl}, '^' );
6477                     my $msg = <<"EOM";
6478 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6479 EOM
6480
6481                     if ( $diff > 0 ) {
6482                         my $rml =
6483                           $starting_line_of_current_depth[$bb]
6484                           [ $current_depth[$bb] ];
6485                         my $ml = $rml->[0];
6486                         $msg .=
6487 "    The most recent un-matched $bname is on line $ml\n";
6488                         write_error_indicator_pair( @{$rml}, '^' );
6489                     }
6490                     write_error_indicator_pair( @{$rel}, '^' );
6491                     warning($msg);
6492                     resume_logfile();
6493                 }
6494                 increment_brace_error();
6495             }
6496         }
6497         $current_depth[$aa]--;
6498     }
6499     else {
6500
6501         my $saw_brace_error = get_saw_brace_error();
6502         if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6503             my $msg = <<"EOM";
6504 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6505 EOM
6506             indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6507         }
6508         increment_brace_error();
6509
6510         # keep track of errors in braces alone (ignoring ternary nesting errors)
6511         $tokenizer_self->[_true_brace_error_count_]++
6512           if ( $closing_brace_names[$aa] ne "':'" );
6513     }
6514     return ( $seqno, $outdent );
6515 } ## end sub decrease_nesting_depth
6516
6517 sub check_final_nesting_depths {
6518
6519     # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6520
6521     for my $aa ( 0 .. @closing_brace_names - 1 ) {
6522
6523         if ( $current_depth[$aa] ) {
6524             my $rsl =
6525               $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6526             my $sl  = $rsl->[0];
6527             my $msg = <<"EOM";
6528 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6529 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6530 EOM
6531             indicate_error( $msg, @{$rsl}, '^' );
6532             increment_brace_error();
6533         }
6534     }
6535     return;
6536 } ## end sub check_final_nesting_depths
6537
6538 #######################################################################
6539 # Tokenizer routines for looking ahead in input stream
6540 #######################################################################
6541
6542 sub peek_ahead_for_n_nonblank_pre_tokens {
6543
6544     # returns next n pretokens if they exist
6545     # returns undef's if hits eof without seeing any pretokens
6546     # USES GLOBAL VARIABLES: $tokenizer_self
6547     my $max_pretokens = shift;
6548     my $line;
6549     my $i = 0;
6550     my ( $rpre_tokens, $rmap, $rpre_types );
6551
6552     while ( $line =
6553         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6554     {
6555         $line =~ s/^\s*//;                 # trim leading blanks
6556         next if ( length($line) <= 0 );    # skip blank
6557         next if ( $line =~ /^#/ );         # skip comment
6558         ( $rpre_tokens, $rmap, $rpre_types ) =
6559           pre_tokenize( $line, $max_pretokens );
6560         last;
6561     }
6562     return ( $rpre_tokens, $rpre_types );
6563 } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6564
6565 # look ahead for next non-blank, non-comment line of code
6566 sub peek_ahead_for_nonblank_token {
6567
6568     # USES GLOBAL VARIABLES: $tokenizer_self
6569     my ( $rtokens, $max_token_index ) = @_;
6570     my $line;
6571     my $i = 0;
6572
6573     while ( $line =
6574         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6575     {
6576         $line =~ s/^\s*//;                 # trim leading blanks
6577         next if ( length($line) <= 0 );    # skip blank
6578         next if ( $line =~ /^#/ );         # skip comment
6579
6580         # Updated from 2 to 3 to get trigraphs, added for case b1175
6581         my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6582         my $j = $max_token_index + 1;
6583
6584         foreach my $tok ( @{$rtok} ) {
6585             last if ( $tok =~ "\n" );
6586             $rtokens->[ ++$j ] = $tok;
6587         }
6588         last;
6589     }
6590     return;
6591 } ## end sub peek_ahead_for_nonblank_token
6592
6593 #######################################################################
6594 # Tokenizer guessing routines for ambiguous situations
6595 #######################################################################
6596
6597 sub guess_if_pattern_or_conditional {
6598
6599     # this routine is called when we have encountered a ? following an
6600     # unknown bareword, and we must decide if it starts a pattern or not
6601     # input parameters:
6602     #   $i - token index of the ? starting possible pattern
6603     # output parameters:
6604     #   $is_pattern = 0 if probably not pattern,  =1 if probably a pattern
6605     #   msg = a warning or diagnostic message
6606     # USES GLOBAL VARIABLES: $last_nonblank_token
6607
6608     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6609     my $is_pattern = 0;
6610     my $msg        = "guessing that ? after $last_nonblank_token starts a ";
6611
6612     if ( $i >= $max_token_index ) {
6613         $msg .= "conditional (no end to pattern found on the line)\n";
6614     }
6615     else {
6616         my $ibeg = $i;
6617         $i = $ibeg + 1;
6618         my $next_token = $rtokens->[$i];    # first token after ?
6619
6620         # look for a possible ending ? on this line..
6621         my $in_quote        = 1;
6622         my $quote_depth     = 0;
6623         my $quote_character = EMPTY_STRING;
6624         my $quote_pos       = 0;
6625         my $quoted_string;
6626         (
6627
6628             $i,
6629             $in_quote,
6630             $quote_character,
6631             $quote_pos,
6632             $quote_depth,
6633             $quoted_string,
6634
6635         ) = follow_quoted_string(
6636
6637             $ibeg,
6638             $in_quote,
6639             $rtokens,
6640             $quote_character,
6641             $quote_pos,
6642             $quote_depth,
6643             $max_token_index,
6644
6645         );
6646
6647         if ($in_quote) {
6648
6649             # we didn't find an ending ? on this line,
6650             # so we bias towards conditional
6651             $is_pattern = 0;
6652             $msg .= "conditional (no ending ? on this line)\n";
6653
6654             # we found an ending ?, so we bias towards a pattern
6655         }
6656         else {
6657
6658             # Watch out for an ending ? in quotes, like this
6659             #    my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6660             my $s_quote = 0;
6661             my $d_quote = 0;
6662             my $colons  = 0;
6663             foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6664                 my $tok = $rtokens->[$ii];
6665                 if ( $tok eq ":" ) { $colons++ }
6666                 if ( $tok eq "'" ) { $s_quote++ }
6667                 if ( $tok eq '"' ) { $d_quote++ }
6668             }
6669             if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6670                 $is_pattern = 0;
6671                 $msg .= "found ending ? but unbalanced quote chars\n";
6672             }
6673             elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6674                 $is_pattern = 1;
6675                 $msg .= "pattern (found ending ? and pattern expected)\n";
6676             }
6677             else {
6678                 $msg .= "pattern (uncertain, but found ending ?)\n";
6679             }
6680         }
6681     }
6682     return ( $is_pattern, $msg );
6683 } ## end sub guess_if_pattern_or_conditional
6684
6685 my %is_known_constant;
6686 my %is_known_function;
6687
6688 BEGIN {
6689
6690     # Constants like 'pi' in Trig.pm are common
6691     my @q = qw(pi pi2 pi4 pip2 pip4);
6692     @{is_known_constant}{@q} = (1) x scalar(@q);
6693
6694     # parenless calls of 'ok' are common
6695     @q = qw( ok );
6696     @{is_known_function}{@q} = (1) x scalar(@q);
6697 }
6698
6699 sub guess_if_pattern_or_division {
6700
6701     # this routine is called when we have encountered a / following an
6702     # unknown bareword, and we must decide if it starts a pattern or is a
6703     # division
6704     # input parameters:
6705     #   $i - token index of the / starting possible pattern
6706     # output parameters:
6707     #   $is_pattern = 0 if probably division,  =1 if probably a pattern
6708     #   msg = a warning or diagnostic message
6709     # USES GLOBAL VARIABLES: $last_nonblank_token
6710     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6711     my $is_pattern = 0;
6712     my $msg        = "guessing that / after $last_nonblank_token starts a ";
6713
6714     if ( $i >= $max_token_index ) {
6715         $msg .= "division (no end to pattern found on the line)\n";
6716     }
6717     else {
6718         my $ibeg = $i;
6719         my $divide_possible =
6720           is_possible_numerator( $i, $rtokens, $max_token_index );
6721
6722         if ( $divide_possible < 0 ) {
6723             $msg        = "pattern (division not possible here)\n";
6724             $is_pattern = 1;
6725             return ( $is_pattern, $msg );
6726         }
6727
6728         $i = $ibeg + 1;
6729         my $next_token = $rtokens->[$i];    # first token after slash
6730
6731         # One of the things we can look at is the spacing around the slash.
6732         # There # are four possible spacings around the first slash:
6733         #
6734         #     return pi/two;#/;     -/-
6735         #     return pi/ two;#/;    -/+
6736         #     return pi / two;#/;   +/+
6737         #     return pi /two;#/;    +/-   <-- possible pattern
6738         #
6739         # Spacing rule: a space before the slash but not after the slash
6740         # usually indicates a pattern.  We can use this to break ties.
6741
6742         my $is_pattern_by_spacing =
6743           ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6744
6745         # look for a possible ending / on this line..
6746         my $in_quote        = 1;
6747         my $quote_depth     = 0;
6748         my $quote_character = EMPTY_STRING;
6749         my $quote_pos       = 0;
6750         my $quoted_string;
6751         (
6752             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6753             $quoted_string
6754           )
6755           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6756             $quote_pos, $quote_depth, $max_token_index );
6757
6758         if ($in_quote) {
6759
6760             # we didn't find an ending / on this line, so we bias towards
6761             # division
6762             if ( $divide_possible >= 0 ) {
6763                 $is_pattern = 0;
6764                 $msg .= "division (no ending / on this line)\n";
6765             }
6766             else {
6767
6768                 # assuming a multi-line pattern ... this is risky, but division
6769                 # does not seem possible.  If this fails, it would either be due
6770                 # to a syntax error in the code, or the division_expected logic
6771                 # needs to be fixed.
6772                 $msg        = "multi-line pattern (division not possible)\n";
6773                 $is_pattern = 1;
6774             }
6775         }
6776
6777         # we found an ending /, so we bias slightly towards a pattern
6778         else {
6779
6780             my $pattern_expected =
6781               pattern_expected( $i, $rtokens, $max_token_index );
6782
6783             if ( $pattern_expected >= 0 ) {
6784
6785                 # pattern looks possible...
6786                 if ( $divide_possible >= 0 ) {
6787
6788                     # Both pattern and divide can work here...
6789
6790                     # Increase weight of divide if a pure number follows
6791                     $divide_possible += $next_token =~ /^\d+$/;
6792
6793                     # Check for known constants in the numerator, like 'pi'
6794                     if ( $is_known_constant{$last_nonblank_token} ) {
6795                         $msg .=
6796 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6797                         $is_pattern = 0;
6798                     }
6799
6800                     # A very common bare word in pattern expressions is 'ok'
6801                     elsif ( $is_known_function{$last_nonblank_token} ) {
6802                         $msg .=
6803 "pattern (division works too but saw '$last_nonblank_token')\n";
6804                         $is_pattern = 1;
6805                     }
6806
6807                     # If one rule is more definite, use it
6808                     elsif ( $divide_possible > $pattern_expected ) {
6809                         $msg .=
6810                           "division (more likely based on following tokens)\n";
6811                         $is_pattern = 0;
6812                     }
6813
6814                     # otherwise, use the spacing rule
6815                     elsif ($is_pattern_by_spacing) {
6816                         $msg .=
6817 "pattern (guess on spacing, but division possible too)\n";
6818                         $is_pattern = 1;
6819                     }
6820                     else {
6821                         $msg .=
6822 "division (guess on spacing, but pattern is possible too)\n";
6823                         $is_pattern = 0;
6824                     }
6825                 }
6826
6827                 # divide_possible < 0 means divide can not work here
6828                 else {
6829                     $is_pattern = 1;
6830                     $msg .= "pattern (division not possible)\n";
6831                 }
6832             }
6833
6834             # pattern does not look possible...
6835             else {
6836
6837                 if ( $divide_possible >= 0 ) {
6838                     $is_pattern = 0;
6839                     $msg .= "division (pattern not possible)\n";
6840                 }
6841
6842                 # Neither pattern nor divide look possible...go by spacing
6843                 else {
6844                     if ($is_pattern_by_spacing) {
6845                         $msg .= "pattern (guess on spacing)\n";
6846                         $is_pattern = 1;
6847                     }
6848                     else {
6849                         $msg .= "division (guess on spacing)\n";
6850                         $is_pattern = 0;
6851                     }
6852                 }
6853             }
6854         }
6855     }
6856     return ( $is_pattern, $msg );
6857 } ## end sub guess_if_pattern_or_division
6858
6859 # try to resolve here-doc vs. shift by looking ahead for
6860 # non-code or the end token (currently only looks for end token)
6861 # returns 1 if it is probably a here doc, 0 if not
6862 sub guess_if_here_doc {
6863
6864     # This is how many lines we will search for a target as part of the
6865     # guessing strategy.  It is a constant because there is probably
6866     # little reason to change it.
6867     # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6868     # %is_constant,
6869     my $HERE_DOC_WINDOW = 40;
6870
6871     my $next_token        = shift;
6872     my $here_doc_expected = 0;
6873     my $line;
6874     my $k   = 0;
6875     my $msg = "checking <<";
6876
6877     while ( $line =
6878         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6879     {
6880         chomp $line;
6881
6882         if ( $line =~ /^$next_token$/ ) {
6883             $msg .= " -- found target $next_token ahead $k lines\n";
6884             $here_doc_expected = 1;    # got it
6885             last;
6886         }
6887         last if ( $k >= $HERE_DOC_WINDOW );
6888     }
6889
6890     unless ($here_doc_expected) {
6891
6892         if ( !defined($line) ) {
6893             $here_doc_expected = -1;    # hit eof without seeing target
6894             $msg .= " -- must be shift; target $next_token not in file\n";
6895
6896         }
6897         else {                          # still unsure..taking a wild guess
6898
6899             if ( !$is_constant{$current_package}{$next_token} ) {
6900                 $here_doc_expected = 1;
6901                 $msg .=
6902                   " -- guessing it's a here-doc ($next_token not a constant)\n";
6903             }
6904             else {
6905                 $msg .=
6906                   " -- guessing it's a shift ($next_token is a constant)\n";
6907             }
6908         }
6909     }
6910     write_logfile_entry($msg);
6911     return $here_doc_expected;
6912 } ## end sub guess_if_here_doc
6913
6914 #######################################################################
6915 # Tokenizer Routines for scanning identifiers and related items
6916 #######################################################################
6917
6918 sub scan_bare_identifier_do {
6919
6920     # this routine is called to scan a token starting with an alphanumeric
6921     # variable or package separator, :: or '.
6922     # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6923     # $last_nonblank_type,@paren_type, $paren_depth
6924
6925     my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
6926         $max_token_index )
6927       = @_;
6928     my $i_begin = $i;
6929     my $package = undef;
6930
6931     my $i_beg = $i;
6932
6933     # we have to back up one pretoken at a :: since each : is one pretoken
6934     if ( $tok eq '::' ) { $i_beg-- }
6935     if ( $tok eq '->' ) { $i_beg-- }
6936     my $pos_beg = $rtoken_map->[$i_beg];
6937     pos($input_line) = $pos_beg;
6938
6939     #  Examples:
6940     #   A::B::C
6941     #   A::
6942     #   ::A
6943     #   A'B
6944     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
6945
6946         my $pos  = pos($input_line);
6947         my $numc = $pos - $pos_beg;
6948         $tok = substr( $input_line, $pos_beg, $numc );
6949
6950         # type 'w' includes anything without leading type info
6951         # ($,%,@,*) including something like abc::def::ghi
6952         $type = 'w';
6953
6954         my $sub_name = EMPTY_STRING;
6955         if ( defined($2) ) { $sub_name = $2; }
6956         if ( defined($1) ) {
6957             $package = $1;
6958
6959             # patch: don't allow isolated package name which just ends
6960             # in the old style package separator (single quote).  Example:
6961             #   use CGI':all';
6962             if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
6963                 $pos--;
6964             }
6965
6966             $package =~ s/\'/::/g;
6967             if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6968             $package =~ s/::$//;
6969         }
6970         else {
6971             $package = $current_package;
6972
6973             # patched for c043, part 1: keyword does not follow '->'
6974             if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
6975                 $type = 'k';
6976             }
6977         }
6978
6979         # if it is a bareword..  patched for c043, part 2: not following '->'
6980         if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
6981
6982             # check for v-string with leading 'v' type character
6983             # (This seems to have precedence over filehandle, type 'Y')
6984             if ( $tok =~ /^v\d[_\d]*$/ ) {
6985
6986                 # we only have the first part - something like 'v101' -
6987                 # look for more
6988                 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
6989                     $pos  = pos($input_line);
6990                     $numc = $pos - $pos_beg;
6991                     $tok  = substr( $input_line, $pos_beg, $numc );
6992                 }
6993                 $type = 'v';
6994
6995                 # warn if this version can't handle v-strings
6996                 report_v_string($tok);
6997             }
6998
6999             elsif ( $is_constant{$package}{$sub_name} ) {
7000                 $type = 'C';
7001             }
7002
7003             # bareword after sort has implied empty prototype; for example:
7004             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7005             # This has priority over whatever the user has specified.
7006             elsif ($last_nonblank_token eq 'sort'
7007                 && $last_nonblank_type eq 'k' )
7008             {
7009                 $type = 'Z';
7010             }
7011
7012             # Note: strangely, perl does not seem to really let you create
7013             # functions which act like eval and do, in the sense that eval
7014             # and do may have operators following the final }, but any operators
7015             # that you create with prototype (&) apparently do not allow
7016             # trailing operators, only terms.  This seems strange.
7017             # If this ever changes, here is the update
7018             # to make perltidy behave accordingly:
7019
7020             # elsif ( $is_block_function{$package}{$tok} ) {
7021             #    $tok='eval'; # patch to do braces like eval  - doesn't work
7022             #    $type = 'k';
7023             #}
7024             # TODO: This could become a separate type to allow for different
7025             # future behavior:
7026             elsif ( $is_block_function{$package}{$sub_name} ) {
7027                 $type = 'G';
7028             }
7029             elsif ( $is_block_list_function{$package}{$sub_name} ) {
7030                 $type = 'G';
7031             }
7032             elsif ( $is_user_function{$package}{$sub_name} ) {
7033                 $type      = 'U';
7034                 $prototype = $user_function_prototype{$package}{$sub_name};
7035             }
7036
7037             # check for indirect object
7038             elsif (
7039
7040                 # added 2001-03-27: must not be followed immediately by '('
7041                 # see fhandle.t
7042                 ( $input_line !~ m/\G\(/gc )
7043
7044                 # and
7045                 && (
7046
7047                     # preceded by keyword like 'print', 'printf' and friends
7048                     $is_indirect_object_taker{$last_nonblank_token}
7049
7050                     # or preceded by something like 'print(' or 'printf('
7051                     || (
7052                         ( $last_nonblank_token eq '(' )
7053                         && $is_indirect_object_taker{ $paren_type[$paren_depth]
7054                         }
7055
7056                     )
7057                 )
7058               )
7059             {
7060
7061                 # may not be indirect object unless followed by a space;
7062                 # updated 2021-01-16 to consider newline to be a space.
7063                 # updated for case b990 to look for either ';' or space
7064                 if ( pos($input_line) == length($input_line)
7065                     || $input_line =~ m/\G[;\s]/gc )
7066                 {
7067                     $type = 'Y';
7068
7069                     # Abandon Hope ...
7070                     # Perl's indirect object notation is a very bad
7071                     # thing and can cause subtle bugs, especially for
7072                     # beginning programmers.  And I haven't even been
7073                     # able to figure out a sane warning scheme which
7074                     # doesn't get in the way of good scripts.
7075
7076                     # Complain if a filehandle has any lower case
7077                     # letters.  This is suggested good practice.
7078                     # Use 'sub_name' because something like
7079                     # main::MYHANDLE is ok for filehandle
7080                     if ( $sub_name =~ /[a-z]/ ) {
7081
7082                         # could be bug caused by older perltidy if
7083                         # followed by '('
7084                         if ( $input_line =~ m/\G\s*\(/gc ) {
7085                             complain(
7086 "Caution: unknown word '$tok' in indirect object slot\n"
7087                             );
7088                         }
7089                     }
7090                 }
7091
7092                 # bareword not followed by a space -- may not be filehandle
7093                 # (may be function call defined in a 'use' statement)
7094                 else {
7095                     $type = 'Z';
7096                 }
7097             }
7098         }
7099
7100         # Now we must convert back from character position
7101         # to pre_token index.
7102         # I don't think an error flag can occur here ..but who knows
7103         my $error;
7104         ( $i, $error ) =
7105           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7106         if ($error) {
7107             warning("scan_bare_identifier: Possibly invalid tokenization\n");
7108         }
7109     }
7110
7111     # no match but line not blank - could be syntax error
7112     # perl will take '::' alone without complaint
7113     else {
7114         $type = 'w';
7115
7116         # change this warning to log message if it becomes annoying
7117         warning("didn't find identifier after leading ::\n");
7118     }
7119     return ( $i, $tok, $type, $prototype );
7120 } ## end sub scan_bare_identifier_do
7121
7122 sub scan_id_do {
7123
7124 # This is the new scanner and will eventually replace scan_identifier.
7125 # Only type 'sub' and 'package' are implemented.
7126 # Token types $ * % @ & -> are not yet implemented.
7127 #
7128 # Scan identifier following a type token.
7129 # The type of call depends on $id_scan_state: $id_scan_state = ''
7130 # for starting call, in which case $tok must be the token defining
7131 # the type.
7132 #
7133 # If the type token is the last nonblank token on the line, a value
7134 # of $id_scan_state = $tok is returned, indicating that further
7135 # calls must be made to get the identifier.  If the type token is
7136 # not the last nonblank token on the line, the identifier is
7137 # scanned and handled and a value of '' is returned.
7138 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
7139 # $statement_type, $tokenizer_self
7140
7141     my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7142         $max_token_index )
7143       = @_;
7144     use constant DEBUG_NSCAN => 0;
7145     my $type = EMPTY_STRING;
7146     my ( $i_beg, $pos_beg );
7147
7148     #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7149     #my ($a,$b,$c) = caller;
7150     #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7151
7152     # on re-entry, start scanning at first token on the line
7153     if ($id_scan_state) {
7154         $i_beg = $i;
7155         $type  = EMPTY_STRING;
7156     }
7157
7158     # on initial entry, start scanning just after type token
7159     else {
7160         $i_beg         = $i + 1;
7161         $id_scan_state = $tok;
7162         $type          = 't';
7163     }
7164
7165     # find $i_beg = index of next nonblank token,
7166     # and handle empty lines
7167     my $blank_line          = 0;
7168     my $next_nonblank_token = $rtokens->[$i_beg];
7169     if ( $i_beg > $max_token_index ) {
7170         $blank_line = 1;
7171     }
7172     else {
7173
7174         # only a '#' immediately after a '$' is not a comment
7175         if ( $next_nonblank_token eq '#' ) {
7176             unless ( $tok eq '$' ) {
7177                 $blank_line = 1;
7178             }
7179         }
7180
7181         if ( $next_nonblank_token =~ /^\s/ ) {
7182             ( $next_nonblank_token, $i_beg ) =
7183               find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7184                 $max_token_index );
7185             if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7186                 $blank_line = 1;
7187             }
7188         }
7189     }
7190
7191     # handle non-blank line; identifier, if any, must follow
7192     unless ($blank_line) {
7193
7194         if ( $is_sub{$id_scan_state} ) {
7195             ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
7196                 {
7197                     input_line      => $input_line,
7198                     i               => $i,
7199                     i_beg           => $i_beg,
7200                     tok             => $tok,
7201                     type            => $type,
7202                     rtokens         => $rtokens,
7203                     rtoken_map      => $rtoken_map,
7204                     id_scan_state   => $id_scan_state,
7205                     max_token_index => $max_token_index,
7206                 }
7207             );
7208         }
7209
7210         elsif ( $is_package{$id_scan_state} ) {
7211             ( $i, $tok, $type ) =
7212               do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
7213                 $rtoken_map, $max_token_index );
7214             $id_scan_state = EMPTY_STRING;
7215         }
7216
7217         else {
7218             warning("invalid token in scan_id: $tok\n");
7219             $id_scan_state = EMPTY_STRING;
7220         }
7221     }
7222
7223     if ( $id_scan_state && ( !defined($type) || !$type ) ) {
7224
7225         # shouldn't happen:
7226         if (DEVEL_MODE) {
7227             Fault(<<EOM);
7228 Program bug in scan_id: undefined type but scan_state=$id_scan_state
7229 EOM
7230         }
7231         warning(
7232 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7233         );
7234         report_definite_bug();
7235     }
7236
7237     DEBUG_NSCAN && do {
7238         print STDOUT
7239           "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7240     };
7241     return ( $i, $tok, $type, $id_scan_state );
7242 } ## end sub scan_id_do
7243
7244 sub check_prototype {
7245     my ( $proto, $package, $subname ) = @_;
7246     return unless ( defined($package) && defined($subname) );
7247     if ( defined($proto) ) {
7248         $proto =~ s/^\s*\(\s*//;
7249         $proto =~ s/\s*\)$//;
7250         if ($proto) {
7251             $is_user_function{$package}{$subname}        = 1;
7252             $user_function_prototype{$package}{$subname} = "($proto)";
7253
7254             # prototypes containing '&' must be treated specially..
7255             if ( $proto =~ /\&/ ) {
7256
7257                 # right curly braces of prototypes ending in
7258                 # '&' may be followed by an operator
7259                 if ( $proto =~ /\&$/ ) {
7260                     $is_block_function{$package}{$subname} = 1;
7261                 }
7262
7263                 # right curly braces of prototypes NOT ending in
7264                 # '&' may NOT be followed by an operator
7265                 elsif ( $proto !~ /\&$/ ) {
7266                     $is_block_list_function{$package}{$subname} = 1;
7267                 }
7268             }
7269         }
7270         else {
7271             $is_constant{$package}{$subname} = 1;
7272         }
7273     }
7274     else {
7275         $is_user_function{$package}{$subname} = 1;
7276     }
7277     return;
7278 } ## end sub check_prototype
7279
7280 sub do_scan_package {
7281
7282     # do_scan_package parses a package name
7283     # it is called with $i_beg equal to the index of the first nonblank
7284     # token following a 'package' token.
7285     # USES GLOBAL VARIABLES: $current_package,
7286
7287     # package NAMESPACE
7288     # package NAMESPACE VERSION
7289     # package NAMESPACE BLOCK
7290     # package NAMESPACE VERSION BLOCK
7291     #
7292     # If VERSION is provided, package sets the $VERSION variable in the given
7293     # namespace to a version object with the VERSION provided. VERSION must be
7294     # a "strict" style version number as defined by the version module: a
7295     # positive decimal number (integer or decimal-fraction) without
7296     # exponentiation or else a dotted-decimal v-string with a leading 'v'
7297     # character and at least three components.
7298     # reference http://perldoc.perl.org/functions/package.html
7299
7300     my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7301         $max_token_index )
7302       = @_;
7303     my $package = undef;
7304     my $pos_beg = $rtoken_map->[$i_beg];
7305     pos($input_line) = $pos_beg;
7306
7307     # handle non-blank line; package name, if any, must follow
7308     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7309         $package = $1;
7310         $package = ( defined($1) && $1 ) ? $1 : 'main';
7311         $package =~ s/\'/::/g;
7312         if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7313         $package =~ s/::$//;
7314         my $pos  = pos($input_line);
7315         my $numc = $pos - $pos_beg;
7316         $tok  = 'package ' . substr( $input_line, $pos_beg, $numc );
7317         $type = 'i';
7318
7319         # Now we must convert back from character position
7320         # to pre_token index.
7321         # I don't think an error flag can occur here ..but ?
7322         my $error;
7323         ( $i, $error ) =
7324           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7325         if ($error) { warning("Possibly invalid package\n") }
7326         $current_package = $package;
7327
7328         # we should now have package NAMESPACE
7329         # now expecting VERSION, BLOCK, or ; to follow ...
7330         # package NAMESPACE VERSION
7331         # package NAMESPACE BLOCK
7332         # package NAMESPACE VERSION BLOCK
7333         my ( $next_nonblank_token, $i_next ) =
7334           find_next_nonblank_token( $i, $rtokens, $max_token_index );
7335
7336         # check that something recognizable follows, but do not parse.
7337         # A VERSION number will be parsed later as a number or v-string in the
7338         # normal way.  What is important is to set the statement type if
7339         # everything looks okay so that the operator_expected() routine
7340         # knows that the number is in a package statement.
7341         # Examples of valid primitive tokens that might follow are:
7342         #  1235  . ; { } v3  v
7343         # FIX: added a '#' since a side comment may also follow
7344         if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
7345             $statement_type = $tok;
7346         }
7347         else {
7348             warning(
7349                 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7350             );
7351         }
7352     }
7353
7354     # no match but line not blank --
7355     # could be a label with name package, like package:  , for example.
7356     else {
7357         $type = 'k';
7358     }
7359
7360     return ( $i, $tok, $type );
7361 } ## end sub do_scan_package
7362
7363 my %is_special_variable_char;
7364
7365 BEGIN {
7366
7367     # These are the only characters which can (currently) form special
7368     # variables, like $^W: (issue c066).
7369     my @q =
7370       qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7371     @{is_special_variable_char}{@q} = (1) x scalar(@q);
7372 }
7373
7374 {    ## begin closure for sub scan_complex_identifier
7375
7376     use constant DEBUG_SCAN_ID => 0;
7377
7378     # These are the possible states for this scanner:
7379     my $scan_state_SIGIL     = '$';
7380     my $scan_state_ALPHA     = 'A';
7381     my $scan_state_COLON     = ':';
7382     my $scan_state_LPAREN    = '(';
7383     my $scan_state_RPAREN    = ')';
7384     my $scan_state_AMPERSAND = '&';
7385     my $scan_state_SPLIT     = '^';
7386
7387     # Only these non-blank states may be returned to caller:
7388     my %is_returnable_scan_state = (
7389         $scan_state_SIGIL     => 1,
7390         $scan_state_AMPERSAND => 1,
7391     );
7392
7393     # USES GLOBAL VARIABLES:
7394     #    $context, $last_nonblank_token, $last_nonblank_type
7395
7396     #-----------
7397     # call args:
7398     #-----------
7399     my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7400         $expecting, $container_type );
7401
7402     #-------------------------------------------
7403     # my variables, re-initialized on each call:
7404     #-------------------------------------------
7405     my $i_begin;                # starting index $i
7406     my $type;                   # returned identifier type
7407     my $tok_begin;              # starting token
7408     my $tok;                    # returned token
7409     my $id_scan_state_begin;    # starting scan state
7410     my $identifier_begin;       # starting identifier
7411     my $i_save;                 # a last good index, in case of error
7412     my $message;                # hold error message for log file
7413     my $tok_is_blank;
7414     my $last_tok_is_blank;
7415     my $in_prototype_or_signature;
7416     my $saw_alpha;
7417     my $saw_type;
7418     my $allow_tick;
7419
7420     sub initialize_my_scan_id_vars {
7421
7422         # Initialize all 'my' vars on entry
7423         $i_begin   = $i;
7424         $type      = EMPTY_STRING;
7425         $tok_begin = $rtokens->[$i_begin];
7426         $tok       = $tok_begin;
7427         if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7428         $id_scan_state_begin = $id_scan_state;
7429         $identifier_begin    = $identifier;
7430         $i_save              = undef;
7431
7432         $message           = EMPTY_STRING;
7433         $tok_is_blank      = undef;          # a flag to speed things up
7434         $last_tok_is_blank = undef;
7435
7436         $in_prototype_or_signature =
7437           $container_type && $container_type =~ /^sub\b/;
7438
7439         # these flags will be used to help figure out the type:
7440         $saw_alpha = undef;
7441         $saw_type  = undef;
7442
7443         # allow old package separator (') except in 'use' statement
7444         $allow_tick = ( $last_nonblank_token ne 'use' );
7445         return;
7446     } ## end sub initialize_my_scan_id_vars
7447
7448     #----------------------------------
7449     # Routines for handling scan states
7450     #----------------------------------
7451     sub do_id_scan_state_dollar {
7452
7453         # We saw a sigil, now looking to start a variable name
7454         if ( $tok eq '$' ) {
7455
7456             $identifier .= $tok;
7457
7458             # we've got a punctuation variable if end of line (punct.t)
7459             if ( $i == $max_token_index ) {
7460                 $type          = 'i';
7461                 $id_scan_state = EMPTY_STRING;
7462             }
7463         }
7464         elsif ( $tok =~ /^\w/ ) {    # alphanumeric ..
7465             $saw_alpha     = 1;
7466             $id_scan_state = $scan_state_COLON;    # now need ::
7467             $identifier .= $tok;
7468         }
7469         elsif ( $tok eq '::' ) {
7470             $id_scan_state = $scan_state_ALPHA;
7471             $identifier .= $tok;
7472         }
7473
7474         # POSTDEFREF ->@ ->% ->& ->*
7475         elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7476             $identifier .= $tok;
7477         }
7478         elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
7479             $saw_alpha     = 1;
7480             $id_scan_state = $scan_state_COLON;    # now need ::
7481             $identifier .= $tok;
7482
7483             # Perl will accept leading digits in identifiers,
7484             # although they may not always produce useful results.
7485             # Something like $main::0 is ok.  But this also works:
7486             #
7487             #  sub howdy::123::bubba{ print "bubba $54321!\n" }
7488             #  howdy::123::bubba();
7489             #
7490         }
7491         elsif ( $tok eq '#' ) {
7492
7493             my $is_punct_var = $identifier eq '$$';
7494
7495             # side comment or identifier?
7496             if (
7497
7498                 # A '#' starts a comment if it follows a space. For example,
7499                 # the following is equivalent to $ans=40.
7500                 #   my $ #
7501                 #     ans = 40;
7502                 !$last_tok_is_blank
7503
7504                 # a # inside a prototype or signature can only start a
7505                 # comment
7506                 && !$in_prototype_or_signature
7507
7508                 # these are valid punctuation vars: *# %# @# $#
7509                 # May also be '$#array' or POSTDEFREF ->$#
7510                 && (   $identifier =~ /^[\%\@\$\*]$/
7511                     || $identifier =~ /\$$/ )
7512
7513                 # but a '#' after '$$' is a side comment; see c147
7514                 && !$is_punct_var
7515
7516               )
7517             {
7518                 $identifier .= $tok;    # keep same state, a $ could follow
7519             }
7520             else {
7521
7522                 # otherwise it is a side comment
7523                 if    ( $identifier eq '->' )                 { }
7524                 elsif ($is_punct_var)                         { $type = 'i' }
7525                 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7526                 else                                          { $type = 'i' }
7527                 $i             = $i_save;
7528                 $id_scan_state = EMPTY_STRING;
7529             }
7530         }
7531
7532         elsif ( $tok eq '{' ) {
7533
7534             # check for something like ${#} or ${?}, where ? is a special char
7535             if (
7536                 (
7537                        $identifier eq '$'
7538                     || $identifier eq '@'
7539                     || $identifier eq '$#'
7540                 )
7541                 && $i + 2 <= $max_token_index
7542                 && $rtokens->[ $i + 2 ] eq '}'
7543                 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7544               )
7545             {
7546                 my $next2 = $rtokens->[ $i + 2 ];
7547                 my $next1 = $rtokens->[ $i + 1 ];
7548                 $identifier .= $tok . $next1 . $next2;
7549                 $i += 2;
7550                 $id_scan_state = EMPTY_STRING;
7551             }
7552             else {
7553
7554                 # skip something like ${xxx} or ->{
7555                 $id_scan_state = EMPTY_STRING;
7556
7557                 # if this is the first token of a line, any tokens for this
7558                 # identifier have already been accumulated
7559                 if ( $identifier eq '$' || $i == 0 ) {
7560                     $identifier = EMPTY_STRING;
7561                 }
7562                 $i = $i_save;
7563             }
7564         }
7565
7566         # space ok after leading $ % * & @
7567         elsif ( $tok =~ /^\s*$/ ) {
7568
7569             $tok_is_blank = 1;
7570
7571             # note: an id with a leading '&' does not actually come this way
7572             if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7573
7574                 if ( length($identifier) > 1 ) {
7575                     $id_scan_state = EMPTY_STRING;
7576                     $i             = $i_save;
7577                     $type          = 'i';    # probably punctuation variable
7578                 }
7579                 else {
7580
7581                     # fix c139: trim line-ending type 't'
7582                     if ( $i == $max_token_index ) {
7583                         $i    = $i_save;
7584                         $type = 't';
7585                     }
7586
7587                     # spaces after $'s are common, and space after @
7588                     # is harmless, so only complain about space
7589                     # after other type characters. Space after $ and
7590                     # @ will be removed in formatting.  Report space
7591                     # after % and * because they might indicate a
7592                     # parsing error.  In other words '% ' might be a
7593                     # modulo operator.  Delete this warning if it
7594                     # gets annoying.
7595                     elsif ( $identifier !~ /^[\@\$]$/ ) {
7596                         $message =
7597                           "Space in identifier, following $identifier\n";
7598                     }
7599                     else {
7600                         ## ok: silently accept space after '$' and '@' sigils
7601                     }
7602                 }
7603             }
7604
7605             elsif ( $identifier eq '->' ) {
7606
7607                 # space after '->' is ok except at line end ..
7608                 # so trim line-ending in type '->' (fixes c139)
7609                 if ( $i == $max_token_index ) {
7610                     $i    = $i_save;
7611                     $type = '->';
7612                 }
7613             }
7614
7615             # stop at space after something other than -> or sigil
7616             # Example of what can arrive here:
7617             #   eval { $MyClass->$$ };
7618             else {
7619                 $id_scan_state = EMPTY_STRING;
7620                 $i             = $i_save;
7621                 $type          = 'i';
7622             }
7623         }
7624         elsif ( $tok eq '^' ) {
7625
7626             # check for some special variables like $^ $^W
7627             if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7628                 $identifier .= $tok;
7629                 $type = 'i';
7630
7631                 # There may be one more character, not a space, after the ^
7632                 my $next1 = $rtokens->[ $i + 1 ];
7633                 my $chr   = substr( $next1, 0, 1 );
7634                 if ( $is_special_variable_char{$chr} ) {
7635
7636                     # It is something like $^W
7637                     # Test case (c066) : $^Oeq'linux'
7638                     $i++;
7639                     $identifier .= $next1;
7640
7641                     # If pretoken $next1 is more than one character long,
7642                     # set a flag indicating that it needs to be split.
7643                     $id_scan_state =
7644                       ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7645                 }
7646                 else {
7647
7648                     # it is just $^
7649                     # Simple test case (c065): '$aa=$^if($bb)';
7650                     $id_scan_state = EMPTY_STRING;
7651                 }
7652             }
7653             else {
7654                 $id_scan_state = EMPTY_STRING;
7655                 $i             = $i_save;
7656             }
7657         }
7658         else {    # something else
7659
7660             if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7661
7662                 # We might be in an extrusion of
7663                 #     sub foo2 ( $first, $, $third ) {
7664                 # looking at a line starting with a comma, like
7665                 #   $
7666                 #   ,
7667                 # in this case the comma ends the signature variable
7668                 # '$' which will have been previously marked type 't'
7669                 # rather than 'i'.
7670                 if ( $i == $i_begin ) {
7671                     $identifier = EMPTY_STRING;
7672                     $type       = EMPTY_STRING;
7673                 }
7674
7675                 # at a # we have to mark as type 't' because more may
7676                 # follow, otherwise, in a signature we can let '$' be an
7677                 # identifier here for better formatting.
7678                 # See 'mangle4.in' for a test case.
7679                 else {
7680                     $type = 'i';
7681                     if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
7682                         $type = 't';
7683                     }
7684                     $i = $i_save;
7685                 }
7686                 $id_scan_state = EMPTY_STRING;
7687             }
7688
7689             # check for various punctuation variables
7690             elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
7691                 $identifier .= $tok;
7692             }
7693
7694             # POSTDEFREF: Postfix reference ->$* ->%*  ->@* ->** ->&* ->$#*
7695             elsif ($tok eq '*'
7696                 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7697             {
7698                 $identifier .= $tok;
7699             }
7700
7701             elsif ( $identifier eq '$#' ) {
7702
7703                 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7704
7705                 # perl seems to allow just these: $#: $#- $#+
7706                 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7707                     $type = 'i';
7708                     $identifier .= $tok;
7709                 }
7710                 else {
7711                     $i = $i_save;
7712                     write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7713                 }
7714             }
7715             elsif ( $identifier eq '$$' ) {
7716
7717                 # perl does not allow references to punctuation
7718                 # variables without braces.  For example, this
7719                 # won't work:
7720                 #  $:=\4;
7721                 #  $a = $$:;
7722                 # You would have to use
7723                 #  $a = ${$:};
7724
7725                 # '$$' alone is punctuation variable for PID
7726                 $i = $i_save;
7727                 if   ( $tok eq '{' ) { $type = 't' }
7728                 else                 { $type = 'i' }
7729             }
7730             elsif ( $identifier eq '->' ) {
7731                 $i = $i_save;
7732             }
7733             else {
7734                 $i = $i_save;
7735                 if ( length($identifier) == 1 ) {
7736                     $identifier = EMPTY_STRING;
7737                 }
7738             }
7739             $id_scan_state = EMPTY_STRING;
7740         }
7741         return;
7742     } ## end sub do_id_scan_state_dollar
7743
7744     sub do_id_scan_state_alpha {
7745
7746         # looking for alphanumeric after ::
7747         $tok_is_blank = $tok =~ /^\s*$/;
7748
7749         if ( $tok =~ /^\w/ ) {    # found it
7750             $identifier .= $tok;
7751             $id_scan_state = $scan_state_COLON;    # now need ::
7752             $saw_alpha     = 1;
7753         }
7754         elsif ( $tok eq "'" && $allow_tick ) {
7755             $identifier .= $tok;
7756             $id_scan_state = $scan_state_COLON;    # now need ::
7757             $saw_alpha     = 1;
7758         }
7759         elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7760             $id_scan_state = $scan_state_LPAREN;
7761             $identifier .= $tok;
7762         }
7763         elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7764             $id_scan_state = $scan_state_RPAREN;
7765             $identifier .= $tok;
7766         }
7767         else {
7768             $id_scan_state = EMPTY_STRING;
7769             $i             = $i_save;
7770         }
7771         return;
7772     } ## end sub do_id_scan_state_alpha
7773
7774     sub do_id_scan_state_colon {
7775
7776         # looking for possible :: after alphanumeric
7777
7778         $tok_is_blank = $tok =~ /^\s*$/;
7779
7780         if ( $tok eq '::' ) {    # got it
7781             $identifier .= $tok;
7782             $id_scan_state = $scan_state_ALPHA;    # now require alpha
7783         }
7784         elsif ( $tok =~ /^\w/ ) {    # more alphanumeric is ok here
7785             $identifier .= $tok;
7786             $id_scan_state = $scan_state_COLON;    # now need ::
7787             $saw_alpha     = 1;
7788         }
7789         elsif ( $tok eq "'" && $allow_tick ) {     # tick
7790
7791             if ( $is_keyword{$identifier} ) {
7792                 $id_scan_state = EMPTY_STRING;     # that's all
7793                 $i             = $i_save;
7794             }
7795             else {
7796                 $identifier .= $tok;
7797             }
7798         }
7799         elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7800             $id_scan_state = $scan_state_LPAREN;
7801             $identifier .= $tok;
7802         }
7803         elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7804             $id_scan_state = $scan_state_RPAREN;
7805             $identifier .= $tok;
7806         }
7807         else {
7808             $id_scan_state = EMPTY_STRING;    # that's all
7809             $i             = $i_save;
7810         }
7811         return;
7812     } ## end sub do_id_scan_state_colon
7813
7814     sub do_id_scan_state_left_paren {
7815
7816         # looking for possible '(' of a prototype
7817
7818         if ( $tok eq '(' ) {    # got it
7819             $identifier .= $tok;
7820             $id_scan_state = $scan_state_RPAREN;    # now find the end of it
7821         }
7822         elsif ( $tok =~ /^\s*$/ ) {                 # blank - keep going
7823             $identifier .= $tok;
7824             $tok_is_blank = 1;
7825         }
7826         else {
7827             $id_scan_state = EMPTY_STRING;          # that's all - no prototype
7828             $i             = $i_save;
7829         }
7830         return;
7831     } ## end sub do_id_scan_state_left_paren
7832
7833     sub do_id_scan_state_right_paren {
7834
7835         # looking for a ')' of prototype to close a '('
7836
7837         $tok_is_blank = $tok =~ /^\s*$/;
7838
7839         if ( $tok eq ')' ) {    # got it
7840             $identifier .= $tok;
7841             $id_scan_state = EMPTY_STRING;    # all done
7842         }
7843         elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7844             $identifier .= $tok;
7845         }
7846         else {    # probable error in script, but keep going
7847             warning("Unexpected '$tok' while seeking end of prototype\n");
7848             $identifier .= $tok;
7849         }
7850         return;
7851     } ## end sub do_id_scan_state_right_paren
7852
7853     sub do_id_scan_state_ampersand {
7854
7855         # Starting sub call after seeing an '&'
7856
7857         if ( $tok =~ /^[\$\w]/ ) {    # alphanumeric ..
7858             $id_scan_state = $scan_state_COLON;    # now need ::
7859             $saw_alpha     = 1;
7860             $identifier .= $tok;
7861         }
7862         elsif ( $tok eq "'" && $allow_tick ) {     # alphanumeric ..
7863             $id_scan_state = $scan_state_COLON;    # now need ::
7864             $saw_alpha     = 1;
7865             $identifier .= $tok;
7866         }
7867         elsif ( $tok =~ /^\s*$/ ) {                # allow space
7868             $tok_is_blank = 1;
7869
7870             # fix c139: trim line-ending type 't'
7871             if ( length($identifier) == 1 && $i == $max_token_index ) {
7872                 $i    = $i_save;
7873                 $type = 't';
7874             }
7875         }
7876         elsif ( $tok eq '::' ) {                   # leading ::
7877             $id_scan_state = $scan_state_ALPHA;    # accept alpha next
7878             $identifier .= $tok;
7879         }
7880         elsif ( $tok eq '{' ) {
7881             if ( $identifier eq '&' || $i == 0 ) {
7882                 $identifier = EMPTY_STRING;
7883             }
7884             $i             = $i_save;
7885             $id_scan_state = EMPTY_STRING;
7886         }
7887         elsif ( $tok eq '^' ) {
7888             if ( $identifier eq '&' ) {
7889
7890                 # Special variable (c066)
7891                 $identifier .= $tok;
7892                 $type = '&';
7893
7894                 # There may be one more character, not a space, after the ^
7895                 my $next1 = $rtokens->[ $i + 1 ];
7896                 my $chr   = substr( $next1, 0, 1 );
7897                 if ( $is_special_variable_char{$chr} ) {
7898
7899                     # It is something like &^O
7900                     $i++;
7901                     $identifier .= $next1;
7902
7903                     # If pretoken $next1 is more than one character long,
7904                     # set a flag indicating that it needs to be split.
7905                     $id_scan_state =
7906                       ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7907                 }
7908                 else {
7909
7910                     # it is &^
7911                     $id_scan_state = EMPTY_STRING;
7912                 }
7913             }
7914             else {
7915                 $identifier = EMPTY_STRING;
7916                 $i          = $i_save;
7917             }
7918         }
7919         else {
7920
7921             # punctuation variable?
7922             # testfile: cunningham4.pl
7923             #
7924             # We have to be careful here.  If we are in an unknown state,
7925             # we will reject the punctuation variable.  In the following
7926             # example the '&' is a binary operator but we are in an unknown
7927             # state because there is no sigil on 'Prima', so we don't
7928             # know what it is.  But it is a bad guess that
7929             # '&~' is a function variable.
7930             # $self->{text}->{colorMap}->[
7931             #   Prima::PodView::COLOR_CODE_FOREGROUND
7932             #   & ~tb::COLOR_INDEX ] =
7933             #   $sec->{ColorCode}
7934
7935             # Fix for case c033: a '#' here starts a side comment
7936             if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
7937                 $identifier .= $tok;
7938             }
7939             else {
7940                 $identifier = EMPTY_STRING;
7941                 $i          = $i_save;
7942                 $type       = '&';
7943             }
7944             $id_scan_state = EMPTY_STRING;
7945         }
7946         return;
7947     } ## end sub do_id_scan_state_ampersand
7948
7949     #-------------------
7950     # hash of scanner subs
7951     #-------------------
7952     my $scan_identifier_code = {
7953         $scan_state_SIGIL     => \&do_id_scan_state_dollar,
7954         $scan_state_ALPHA     => \&do_id_scan_state_alpha,
7955         $scan_state_COLON     => \&do_id_scan_state_colon,
7956         $scan_state_LPAREN    => \&do_id_scan_state_left_paren,
7957         $scan_state_RPAREN    => \&do_id_scan_state_right_paren,
7958         $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
7959     };
7960
7961     sub scan_complex_identifier {
7962
7963         # This routine assembles tokens into identifiers.  It maintains a
7964         # scan state, id_scan_state.  It updates id_scan_state based upon
7965         # current id_scan_state and token, and returns an updated
7966         # id_scan_state and the next index after the identifier.
7967
7968         # This routine now serves a a backup for sub scan_simple_identifier
7969         # which handles most identifiers.
7970
7971         (
7972             $i,         $id_scan_state, $identifier, $rtokens, $max_token_index,
7973             $expecting, $container_type
7974         ) = @_;
7975
7976         # return flag telling caller to split the pretoken
7977         my $split_pretoken_flag;
7978
7979         #-------------------
7980         # Initialize my vars
7981         #-------------------
7982
7983         initialize_my_scan_id_vars();
7984
7985         #--------------------------------------------------------
7986         # get started by defining a type and a state if necessary
7987         #--------------------------------------------------------
7988
7989         if ( !$id_scan_state ) {
7990             $context = UNKNOWN_CONTEXT;
7991
7992             # fixup for digraph
7993             if ( $tok eq '>' ) {
7994                 $tok       = '->';
7995                 $tok_begin = $tok;
7996             }
7997             $identifier = $tok;
7998
7999             if ( $last_nonblank_token eq '->' ) {
8000                 $identifier    = '->' . $identifier;
8001                 $id_scan_state = $scan_state_SIGIL;
8002             }
8003             elsif ( $tok eq '$' || $tok eq '*' ) {
8004                 $id_scan_state = $scan_state_SIGIL;
8005                 $context       = SCALAR_CONTEXT;
8006             }
8007             elsif ( $tok eq '%' || $tok eq '@' ) {
8008                 $id_scan_state = $scan_state_SIGIL;
8009                 $context       = LIST_CONTEXT;
8010             }
8011             elsif ( $tok eq '&' ) {
8012                 $id_scan_state = $scan_state_AMPERSAND;
8013             }
8014             elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8015                 $saw_alpha     = 0;    # 'sub' is considered type info here
8016                 $id_scan_state = $scan_state_SIGIL;
8017                 $identifier .=
8018                   SPACE;    # need a space to separate sub from sub name
8019             }
8020             elsif ( $tok eq '::' ) {
8021                 $id_scan_state = $scan_state_ALPHA;
8022             }
8023             elsif ( $tok =~ /^\w/ ) {
8024                 $id_scan_state = $scan_state_COLON;
8025                 $saw_alpha     = 1;
8026             }
8027             elsif ( $tok eq '->' ) {
8028                 $id_scan_state = $scan_state_SIGIL;
8029             }
8030             else {
8031
8032                 # shouldn't happen: bad call parameter
8033                 my $msg =
8034 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
8035                 if (DEVEL_MODE) { Fault($msg) }
8036                 if ( !$tokenizer_self->[_in_error_] ) {
8037                     warning($msg);
8038                     $tokenizer_self->[_in_error_] = 1;
8039                 }
8040                 $id_scan_state = EMPTY_STRING;
8041
8042                 # emergency return
8043                 goto RETURN;
8044             }
8045             $saw_type = !$saw_alpha;
8046         }
8047         else {
8048             $i--;
8049             $saw_alpha = ( $tok =~ /^\w/ );
8050             $saw_type  = ( $tok =~ /([\$\%\@\*\&])/ );
8051
8052             # check for a valid starting state
8053             if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8054                 Fault(<<EOM);
8055 Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8056 EOM
8057             }
8058         }
8059
8060         #------------------------------
8061         # loop to gather the identifier
8062         #------------------------------
8063
8064         $i_save = $i;
8065
8066         while ( $i < $max_token_index && $id_scan_state ) {
8067
8068             # Be sure we have code to handle this state before we proceed
8069             my $code = $scan_identifier_code->{$id_scan_state};
8070             if ( !$code ) {
8071
8072                 if ( $id_scan_state eq $scan_state_SPLIT ) {
8073                     ## OK: this is the signal to exit and split the pretoken
8074                 }
8075
8076                 # unknown state - should not happen
8077                 else {
8078                     if (DEVEL_MODE) {
8079                         Fault(<<EOM);
8080 Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8081 Scan state at sub entry was '$id_scan_state_begin'
8082 EOM
8083                     }
8084                     $id_scan_state = EMPTY_STRING;
8085                     $i             = $i_save;
8086                 }
8087                 last;
8088             }
8089
8090             # Remember the starting index for progress check below
8091             my $i_start_loop = $i;
8092
8093             $last_tok_is_blank = $tok_is_blank;
8094             if   ($tok_is_blank) { $tok_is_blank = undef }
8095             else                 { $i_save       = $i }
8096
8097             $tok = $rtokens->[ ++$i ];
8098
8099             # patch to make digraph :: if necessary
8100             if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8101                 $tok = '::';
8102                 $i++;
8103             }
8104
8105             $code->();
8106
8107             # check for forward progress: a decrease in the index $i
8108             # implies that scanning has finished
8109             last if ( $i <= $i_start_loop );
8110
8111         } ## end of main loop
8112
8113         #-------------
8114         # Check result
8115         #-------------
8116
8117         # Be sure a valid state is returned
8118         if ($id_scan_state) {
8119
8120             if ( !$is_returnable_scan_state{$id_scan_state} ) {
8121
8122                 if ( $id_scan_state eq $scan_state_SPLIT ) {
8123                     $split_pretoken_flag = 1;
8124                 }
8125
8126                 if ( $id_scan_state eq $scan_state_RPAREN ) {
8127                     warning(
8128                         "Hit end of line while seeking ) to end prototype\n");
8129                 }
8130
8131                 $id_scan_state = EMPTY_STRING;
8132             }
8133
8134             # Patch: the deprecated variable $# does not combine with anything
8135             # on the next line.
8136             if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
8137         }
8138
8139         # Be sure the token index is valid
8140         if ( $i < 0 ) { $i = 0 }
8141
8142         # Be sure a token type is defined
8143         if ( !$type ) {
8144
8145             if ($saw_type) {
8146
8147                 if ($saw_alpha) {
8148
8149                   # The type without the -> should be the same as with the -> so
8150                   # that if they get separated we get the same bond strengths,
8151                   # etc.  See b1234
8152                     if (   $identifier =~ /^->/
8153                         && $last_nonblank_type eq 'w'
8154                         && substr( $identifier, 2, 1 ) =~ /^\w/ )
8155                     {
8156                         $type = 'w';
8157                     }
8158                     else { $type = 'i' }
8159                 }
8160                 elsif ( $identifier eq '->' ) {
8161                     $type = '->';
8162                 }
8163                 elsif (
8164                     ( length($identifier) > 1 )
8165
8166                     # In something like '@$=' we have an identifier '@$'
8167                     # In something like '$${' we have type '$$' (and only
8168                     # part of an identifier)
8169                     && !( $identifier =~ /\$$/ && $tok eq '{' )
8170
8171                     ## && ( $identifier !~ /^(sub |package )$/ )
8172                     && $identifier ne 'sub '
8173                     && $identifier ne 'package '
8174                   )
8175                 {
8176                     $type = 'i';
8177                 }
8178                 else { $type = 't' }
8179             }
8180             elsif ($saw_alpha) {
8181
8182                 # type 'w' includes anything without leading type info
8183                 # ($,%,@,*) including something like abc::def::ghi
8184                 $type = 'w';
8185
8186                 # Fix for b1337, if restarting scan after line break between
8187                 # '->' or sigil and identifier name, use type 'i'
8188                 if (   $id_scan_state_begin
8189                     && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8190                 {
8191                     $type = 'i';
8192                 }
8193             }
8194             else {
8195                 $type = EMPTY_STRING;
8196             }    # this can happen on a restart
8197         }
8198
8199         # See if we formed an identifier...
8200         if ($identifier) {
8201             $tok = $identifier;
8202             if ($message) { write_logfile_entry($message) }
8203         }
8204
8205         # did not find an identifier, back  up
8206         else {
8207             $tok = $tok_begin;
8208             $i   = $i_begin;
8209         }
8210
8211       RETURN:
8212
8213         DEBUG_SCAN_ID && do {
8214             my ( $a, $b, $c ) = caller;
8215             print STDOUT
8216 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8217             print STDOUT
8218 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8219         };
8220         return ( $i, $tok, $type, $id_scan_state, $identifier,
8221             $split_pretoken_flag );
8222     } ## end sub scan_complex_identifier
8223 } ## end closure for sub scan_complex_identifier
8224
8225 {    ## closure for sub do_scan_sub
8226
8227     my %warn_if_lexical;
8228
8229     BEGIN {
8230
8231         # lexical subs with these names can cause parsing errors in this version
8232         my @q = qw( m q qq qr qw qx s tr y );
8233         @{warn_if_lexical}{@q} = (1) x scalar(@q);
8234     }
8235
8236     # saved package and subnames in case prototype is on separate line
8237     my ( $package_saved, $subname_saved );
8238
8239     # initialize subname each time a new 'sub' keyword is encountered
8240     sub initialize_subname {
8241         $package_saved = EMPTY_STRING;
8242         $subname_saved = EMPTY_STRING;
8243         return;
8244     }
8245
8246     use constant {
8247         SUB_CALL       => 1,
8248         PAREN_CALL     => 2,
8249         PROTOTYPE_CALL => 3,
8250     };
8251
8252     sub do_scan_sub {
8253
8254         # do_scan_sub parses a sub name and prototype.
8255
8256         # At present there are three basic CALL TYPES which are
8257         # distinguished by the starting value of '$tok':
8258         # 1. $tok='sub', id_scan_state='sub'
8259         #    it is called with $i_beg equal to the index of the first nonblank
8260         #    token following a 'sub' token.
8261         # 2. $tok='(', id_scan_state='sub',
8262         #    it is called with $i_beg equal to the index of a '(' which may
8263         #    start a prototype.
8264         # 3. $tok='prototype', id_scan_state='prototype'
8265         #    it is called with $i_beg equal to the index of a '(' which is
8266         #    preceded by ': prototype' and has $id_scan_state eq 'prototype'
8267
8268         # Examples:
8269
8270         # A single type 1 call will get both the sub and prototype
8271         #   sub foo1 ( $$ ) { }
8272         #   ^
8273
8274         # The subname will be obtained with a 'sub' call
8275         # The prototype on line 2 will be obtained with a '(' call
8276         #   sub foo1
8277         #   ^                    <---call type 1
8278         #     ( $$ ) { }
8279         #     ^                  <---call type 2
8280
8281         # The subname will be obtained with a 'sub' call
8282         # The prototype will be obtained with a 'prototype' call
8283         #   sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8284         #   ^ <---type 1                    ^ <---type 3
8285
8286         # TODO: add future error checks to be sure we have a valid
8287         # sub name.  For example, 'sub &doit' is wrong.  Also, be sure
8288         # a name is given if and only if a non-anonymous sub is
8289         # appropriate.
8290         # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8291         # $in_attribute_list, %saw_function_definition,
8292         # $statement_type
8293
8294         my ($rinput_hash) = @_;
8295
8296         my $input_line      = $rinput_hash->{input_line};
8297         my $i               = $rinput_hash->{i};
8298         my $i_beg           = $rinput_hash->{i_beg};
8299         my $tok             = $rinput_hash->{tok};
8300         my $type            = $rinput_hash->{type};
8301         my $rtokens         = $rinput_hash->{rtokens};
8302         my $rtoken_map      = $rinput_hash->{rtoken_map};
8303         my $id_scan_state   = $rinput_hash->{id_scan_state};
8304         my $max_token_index = $rinput_hash->{max_token_index};
8305
8306         my $i_entry = $i;
8307
8308         # Determine the CALL TYPE
8309         # 1=sub
8310         # 2=(
8311         # 3=prototype
8312         my $call_type =
8313             $tok eq 'prototype' ? PROTOTYPE_CALL
8314           : $tok eq '('         ? PAREN_CALL
8315           :                       SUB_CALL;
8316
8317         $id_scan_state = EMPTY_STRING;  # normally we get everything in one call
8318         my $subname = $subname_saved;
8319         my $package = $package_saved;
8320         my $proto   = undef;
8321         my $attrs   = undef;
8322         my $match;
8323
8324         my $pos_beg = $rtoken_map->[$i_beg];
8325         pos($input_line) = $pos_beg;
8326
8327         # Look for the sub NAME if this is a SUB call
8328         if (
8329                $call_type == SUB_CALL
8330             && $input_line =~ m/\G\s*
8331         ((?:\w*(?:'|::))*)  # package - something that ends in :: or '
8332         (\w+)               # NAME    - required
8333         /gcx
8334           )
8335         {
8336             $match   = 1;
8337             $subname = $2;
8338
8339             my $is_lexical_sub =
8340               $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8341             if ( $is_lexical_sub && $1 ) {
8342                 warning("'my' sub $subname cannot be in package '$1'\n");
8343                 $is_lexical_sub = 0;
8344             }
8345
8346             if ($is_lexical_sub) {
8347
8348                 # lexical subs use the block sequence number as a package name
8349                 my $seqno =
8350                   $current_sequence_number[BRACE][ $current_depth[BRACE] ];
8351                 $seqno   = 1 unless ( defined($seqno) );
8352                 $package = $seqno;
8353                 if ( $warn_if_lexical{$subname} ) {
8354                     warning(
8355 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8356                     );
8357                 }
8358             }
8359             else {
8360                 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8361                 $package =~ s/\'/::/g;
8362                 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
8363                 $package =~ s/::$//;
8364             }
8365
8366             my $pos  = pos($input_line);
8367             my $numc = $pos - $pos_beg;
8368             $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
8369             $type = 'i';
8370
8371             # remember the sub name in case another call is needed to
8372             # get the prototype
8373             $package_saved = $package;
8374             $subname_saved = $subname;
8375         }
8376
8377         # Now look for PROTO ATTRS for all call types
8378         # Look for prototype/attributes which are usually on the same
8379         # line as the sub name but which might be on a separate line.
8380         # For example, we might have an anonymous sub with attributes,
8381         # or a prototype on a separate line from its sub name
8382
8383         # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8384         # does not look like a prototype, we assume it is a SIGNATURE and we
8385         # will stop and let the the standard tokenizer handle it.  In
8386         # particular, we stop if we see any nested parens, braces, or commas.
8387         # Also note, a valid prototype cannot contain any alphabetic character
8388         #  -- see https://perldoc.perl.org/perlsub
8389         # But it appears that an underscore is valid in a prototype, so the
8390         # regex below uses [A-Za-z] rather than \w
8391         # This is the old regex which has been replaced:
8392         # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))?  # PROTO
8393         my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8394         if (
8395             $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))?  # PROTO
8396             (\s*:)?                              # ATTRS leading ':'
8397             /gcx
8398             && ( $1 || $2 )
8399           )
8400         {
8401             $proto = $1;
8402             $attrs = $2;
8403
8404             # Append the prototype to the starting token if it is 'sub' or
8405             # 'prototype'.  This is not necessary but for compatibility with
8406             # previous versions when the -csc flag is used:
8407             if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
8408                 $tok .= $proto;
8409             }
8410
8411             # If we just entered the sub at an opening paren on this call, not
8412             # a following :prototype, label it with the previous token.  This is
8413             # necessary to propagate the sub name to its opening block.
8414             elsif ( $call_type == PAREN_CALL ) {
8415                 $tok = $last_nonblank_token;
8416             }
8417
8418             $match ||= 1;
8419
8420             # Patch part #1 to fixes cases b994 and b1053:
8421             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8422             #    'sub : lvalue { ...'
8423             $type = 'i';
8424             if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
8425         }
8426
8427         if ($match) {
8428
8429             # ATTRS: if there are attributes, back up and let the ':' be
8430             # found later by the scanner.
8431             my $pos = pos($input_line);
8432             if ($attrs) {
8433                 $pos -= length($attrs);
8434             }
8435
8436             my $next_nonblank_token = $tok;
8437
8438             # catch case of line with leading ATTR ':' after anonymous sub
8439             if ( $pos == $pos_beg && $tok eq ':' ) {
8440                 $type              = 'A';
8441                 $in_attribute_list = 1;
8442             }
8443
8444             # Otherwise, if we found a match we must convert back from
8445             # string position to the pre_token index for continued parsing.
8446             else {
8447
8448                 # I don't think an error flag can occur here ..but ?
8449                 my $error;
8450                 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8451                     $max_token_index );
8452                 if ($error) { warning("Possibly invalid sub\n") }
8453
8454                 # Patch part #2 to fixes cases b994 and b1053:
8455                 # Do not let spaces be part of the token of an anonymous sub
8456                 # keyword which we marked as type 'k' above...i.e. for
8457                 # something like:
8458                 #    'sub : lvalue { ...'
8459                 # Back up and let it be parsed as a blank
8460                 if (   $type eq 'k'
8461                     && $attrs
8462                     && $i > $i_entry
8463                     && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8464                 {
8465                     $i--;
8466                 }
8467
8468                 # check for multiple definitions of a sub
8469                 ( $next_nonblank_token, my $i_next ) =
8470                   find_next_nonblank_token_on_this_line( $i, $rtokens,
8471                     $max_token_index );
8472             }
8473
8474             if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8475             {    # skip blank or side comment
8476                 my ( $rpre_tokens, $rpre_types ) =
8477                   peek_ahead_for_n_nonblank_pre_tokens(1);
8478                 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8479                     $next_nonblank_token = $rpre_tokens->[0];
8480                 }
8481                 else {
8482                     $next_nonblank_token = '}';
8483                 }
8484             }
8485
8486             # See what's next...
8487             if ( $next_nonblank_token eq '{' ) {
8488                 if ($subname) {
8489
8490                     # Check for multiple definitions of a sub, but
8491                     # it is ok to have multiple sub BEGIN, etc,
8492                     # so we do not complain if name is all caps
8493                     if (   $saw_function_definition{$subname}{$package}
8494                         && $subname !~ /^[A-Z]+$/ )
8495                     {
8496                         my $lno = $saw_function_definition{$subname}{$package};
8497                         if ( $package =~ /^\d/ ) {
8498                             warning(
8499 "already saw definition of lexical 'sub $subname' at line $lno\n"
8500                             );
8501
8502                         }
8503                         else {
8504                             warning(
8505 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8506                             ) unless (DEVEL_MODE);
8507                         }
8508                     }
8509                     $saw_function_definition{$subname}{$package} =
8510                       $tokenizer_self->[_last_line_number_];
8511                 }
8512             }
8513             elsif ( $next_nonblank_token eq ';' ) {
8514             }
8515             elsif ( $next_nonblank_token eq '}' ) {
8516             }
8517
8518             # ATTRS - if an attribute list follows, remember the name
8519             # of the sub so the next opening brace can be labeled.
8520             # Setting 'statement_type' causes any ':'s to introduce
8521             # attributes.
8522             elsif ( $next_nonblank_token eq ':' ) {
8523                 if ( $call_type == SUB_CALL ) {
8524                     $statement_type =
8525                       substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8526                 }
8527             }
8528
8529             # if we stopped before an open paren ...
8530             elsif ( $next_nonblank_token eq '(' ) {
8531
8532                 # If we DID NOT see this paren above then it must be on the
8533                 # next line so we will set a flag to come back here and see if
8534                 # it is a PROTOTYPE
8535
8536                 # Otherwise, we assume it is a SIGNATURE rather than a
8537                 # PROTOTYPE and let the normal tokenizer handle it as a list
8538                 if ( !$saw_opening_paren ) {
8539                     $id_scan_state = 'sub';    # we must come back to get proto
8540                 }
8541                 if ( $call_type == SUB_CALL ) {
8542                     $statement_type =
8543                       substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8544                 }
8545             }
8546             elsif ($next_nonblank_token) {    # EOF technically ok
8547                 $subname = EMPTY_STRING unless defined($subname);
8548                 warning(
8549 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8550                 );
8551             }
8552             check_prototype( $proto, $package, $subname );
8553         }
8554
8555         # no match to either sub name or prototype, but line not blank
8556         else {
8557
8558         }
8559         return ( $i, $tok, $type, $id_scan_state );
8560     } ## end sub do_scan_sub
8561 }
8562
8563 #########################################################################
8564 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8565 #########################################################################
8566
8567 sub find_next_nonblank_token {
8568     my ( $i, $rtokens, $max_token_index ) = @_;
8569
8570     # Returns the next nonblank token after the token at index $i
8571     # To skip past a side comment, and any subsequent block comments
8572     # and blank lines, call with i=$max_token_index
8573
8574     if ( $i >= $max_token_index ) {
8575         if ( !peeked_ahead() ) {
8576             peeked_ahead(1);
8577             peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8578         }
8579     }
8580
8581     my $next_nonblank_token = $rtokens->[ ++$i ];
8582     return ( SPACE, $i )
8583       unless ( defined($next_nonblank_token) && length($next_nonblank_token) );
8584
8585     # Quick test for nonblank ascii char. Note that we just have to
8586     # examine the first character here.
8587     my $ord = ord( substr( $next_nonblank_token, 0, 1 ) );
8588     if (   $ord >= ORD_PRINTABLE_MIN
8589         && $ord <= ORD_PRINTABLE_MAX )
8590     {
8591         return ( $next_nonblank_token, $i );
8592     }
8593
8594     # Quick test to skip over an ascii space or tab
8595     elsif ( $ord == ORD_SPACE || $ord == ORD_TAB ) {
8596         $next_nonblank_token = $rtokens->[ ++$i ];
8597         return ( SPACE, $i ) unless defined($next_nonblank_token);
8598     }
8599
8600     # Slow test to skip over something else identified as whitespace
8601     elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8602         $next_nonblank_token = $rtokens->[ ++$i ];
8603         return ( SPACE, $i ) unless defined($next_nonblank_token);
8604     }
8605
8606     # We should be at a nonblank now
8607     return ( $next_nonblank_token, $i );
8608 } ## end sub find_next_nonblank_token
8609
8610 sub find_next_noncomment_type {
8611     my ( $i, $rtokens, $max_token_index ) = @_;
8612
8613     # Given the current character position, look ahead past any comments
8614     # and blank lines and return the next token, including digraphs and
8615     # trigraphs.
8616
8617     my ( $next_nonblank_token, $i_next ) =
8618       find_next_nonblank_token( $i, $rtokens, $max_token_index );
8619
8620     # skip past any side comment
8621     if ( $next_nonblank_token eq '#' ) {
8622         ( $next_nonblank_token, $i_next ) =
8623           find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8624     }
8625
8626     # check for a digraph
8627     if (   $next_nonblank_token
8628         && $next_nonblank_token ne SPACE
8629         && defined( $rtokens->[ $i_next + 1 ] ) )
8630     {
8631         my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8632         if ( $is_digraph{$test2} ) {
8633             $next_nonblank_token = $test2;
8634             $i_next              = $i_next + 1;
8635
8636             # check for a trigraph
8637             if ( defined( $rtokens->[ $i_next + 1 ] ) ) {
8638                 my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8639                 if ( $is_trigraph{$test3} ) {
8640                     $next_nonblank_token = $test3;
8641                     $i_next              = $i_next + 1;
8642                 }
8643             }
8644         }
8645     }
8646
8647     return ( $next_nonblank_token, $i_next );
8648 } ## end sub find_next_noncomment_type
8649
8650 sub is_possible_numerator {
8651
8652     # Look at the next non-comment character and decide if it could be a
8653     # numerator.  Return
8654     #   1 - yes
8655     #   0 - can't tell
8656     #  -1 - no
8657
8658     my ( $i, $rtokens, $max_token_index ) = @_;
8659     my $is_possible_numerator = 0;
8660
8661     my $next_token = $rtokens->[ $i + 1 ];
8662     if ( $next_token eq '=' ) { $i++; }    # handle /=
8663     my ( $next_nonblank_token, $i_next ) =
8664       find_next_nonblank_token( $i, $rtokens, $max_token_index );
8665
8666     if ( $next_nonblank_token eq '#' ) {
8667         ( $next_nonblank_token, $i_next ) =
8668           find_next_nonblank_token( $max_token_index, $rtokens,
8669             $max_token_index );
8670     }
8671
8672     if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8673         $is_possible_numerator = 1;
8674     }
8675     elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8676         $is_possible_numerator = 0;
8677     }
8678     else {
8679         $is_possible_numerator = -1;
8680     }
8681
8682     return $is_possible_numerator;
8683 } ## end sub is_possible_numerator
8684
8685 {    ## closure for sub pattern_expected
8686     my %pattern_test;
8687
8688     BEGIN {
8689
8690         # List of tokens which may follow a pattern.  Note that we will not
8691         # have formed digraphs at this point, so we will see '&' instead of
8692         # '&&' and '|' instead of '||'
8693
8694         # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8695         my @q = qw( & && | || ? : + - * and or while if unless);
8696         push @q, ')', '}', ']', '>', ',', ';';
8697         @{pattern_test}{@q} = (1) x scalar(@q);
8698     }
8699
8700     sub pattern_expected {
8701
8702         # This a filter for a possible pattern.
8703         # It looks at the token after a possible pattern and tries to
8704         # determine if that token could end a pattern.
8705         # returns -
8706         #   1 - yes
8707         #   0 - can't tell
8708         #  -1 - no
8709         my ( $i, $rtokens, $max_token_index ) = @_;
8710         my $is_pattern = 0;
8711
8712         my $next_token = $rtokens->[ $i + 1 ];
8713         if ( $next_token =~ /^[msixpodualgc]/ ) {
8714             $i++;
8715         }    # skip possible modifier
8716         my ( $next_nonblank_token, $i_next ) =
8717           find_next_nonblank_token( $i, $rtokens, $max_token_index );
8718
8719         if ( $pattern_test{$next_nonblank_token} ) {
8720             $is_pattern = 1;
8721         }
8722         else {
8723
8724             # Added '#' to fix issue c044
8725             if (   $next_nonblank_token =~ /^\s*$/
8726                 || $next_nonblank_token eq '#' )
8727             {
8728                 $is_pattern = 0;
8729             }
8730             else {
8731                 $is_pattern = -1;
8732             }
8733         }
8734         return $is_pattern;
8735     } ## end sub pattern_expected
8736 }
8737
8738 sub find_next_nonblank_token_on_this_line {
8739     my ( $i, $rtokens, $max_token_index ) = @_;
8740     my $next_nonblank_token;
8741
8742     if ( $i < $max_token_index ) {
8743         $next_nonblank_token = $rtokens->[ ++$i ];
8744
8745         if ( $next_nonblank_token =~ /^\s*$/ ) {
8746
8747             if ( $i < $max_token_index ) {
8748                 $next_nonblank_token = $rtokens->[ ++$i ];
8749             }
8750         }
8751     }
8752     else {
8753         $next_nonblank_token = EMPTY_STRING;
8754     }
8755     return ( $next_nonblank_token, $i );
8756 } ## end sub find_next_nonblank_token_on_this_line
8757
8758 sub find_angle_operator_termination {
8759
8760     # We are looking at a '<' and want to know if it is an angle operator.
8761     # We are to return:
8762     #   $i = pretoken index of ending '>' if found, current $i otherwise
8763     #   $type = 'Q' if found, '>' otherwise
8764     my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
8765     my $i    = $i_beg;
8766     my $type = '<';
8767     pos($input_line) = 1 + $rtoken_map->[$i];
8768
8769     my $filter;
8770
8771     # we just have to find the next '>' if a term is expected
8772     if ( $expecting == TERM ) { $filter = '[\>]' }
8773
8774     # we have to guess if we don't know what is expected
8775     elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
8776
8777     # shouldn't happen - we shouldn't be here if operator is expected
8778     else {
8779         if (DEVEL_MODE) {
8780             Fault(<<EOM);
8781 Bad call to find_angle_operator_termination
8782 EOM
8783         }
8784         return ( $i, $type );
8785     }
8786
8787     # To illustrate what we might be looking at, in case we are
8788     # guessing, here are some examples of valid angle operators
8789     # (or file globs):
8790     #  <tmp_imp/*>
8791     #  <FH>
8792     #  <$fh>
8793     #  <*.c *.h>
8794     #  <_>
8795     #  <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
8796     #  <${PREFIX}*img*.$IMAGE_TYPE>
8797     #  <img*.$IMAGE_TYPE>
8798     #  <Timg*.$IMAGE_TYPE>
8799     #  <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
8800     #
8801     # Here are some examples of lines which do not have angle operators:
8802     #  return unless $self->[2]++ < $#{$self->[1]};
8803     #  < 2  || @$t >
8804     #
8805     # the following line from dlister.pl caused trouble:
8806     #  print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
8807     #
8808     # If the '<' starts an angle operator, it must end on this line and
8809     # it must not have certain characters like ';' and '=' in it.  I use
8810     # this to limit the testing.  This filter should be improved if
8811     # possible.
8812
8813     if ( $input_line =~ /($filter)/g ) {
8814
8815         if ( $1 eq '>' ) {
8816
8817             # We MAY have found an angle operator termination if we get
8818             # here, but we need to do more to be sure we haven't been
8819             # fooled.
8820             my $pos = pos($input_line);
8821
8822             my $pos_beg = $rtoken_map->[$i];
8823             my $str     = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
8824
8825             # Test for '<' after possible filehandle, issue c103
8826             # print $fh <>;          # syntax error
8827             # print $fh <DATA>;      # ok
8828             # print $fh < DATA>;     # syntax error at '>'
8829             # print STDERR < DATA>;  # ok, prints word 'DATA'
8830             # print BLABLA <DATA>;   # ok; does nothing unless BLABLA is defined
8831             if ( $last_nonblank_type eq 'Z' ) {
8832
8833                 # $str includes brackets; something like '<DATA>'
8834                 if (   substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
8835                     && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
8836                 {
8837                     return ( $i, $type );
8838                 }
8839             }
8840
8841             # Reject if the closing '>' follows a '-' as in:
8842             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
8843             if ( $expecting eq UNKNOWN ) {
8844                 my $check = substr( $input_line, $pos - 2, 1 );
8845                 if ( $check eq '-' ) {
8846                     return ( $i, $type );
8847                 }
8848             }
8849
8850             ######################################debug#####
8851             #write_diagnostics( "ANGLE? :$str\n");
8852             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
8853             ######################################debug#####
8854             $type = 'Q';
8855             my $error;
8856             ( $i, $error ) =
8857               inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8858
8859             # It may be possible that a quote ends midway in a pretoken.
8860             # If this happens, it may be necessary to split the pretoken.
8861             if ($error) {
8862                 if (DEVEL_MODE) {
8863                     Fault(<<EOM);
8864 unexpected error condition returned by inverse_pretoken_map
8865 EOM
8866                 }
8867                 warning(
8868                     "Possible tokinization error..please check this line\n");
8869             }
8870
8871             # count blanks on inside of brackets
8872             my $blank_count = 0;
8873             $blank_count++ if ( $str =~ /<\s+/ );
8874             $blank_count++ if ( $str =~ /\s+>/ );
8875
8876             # Now let's see where we stand....
8877             # OK if math op not possible
8878             if ( $expecting == TERM ) {
8879             }
8880
8881             # OK if there are no more than 2 non-blank pre-tokens inside
8882             # (not possible to write 2 token math between < and >)
8883             # This catches most common cases
8884             elsif ( $i <= $i_beg + 3 + $blank_count ) {
8885
8886                 # No longer any need to document this common case
8887                 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
8888             }
8889
8890             # OK if there is some kind of identifier inside
8891             #   print $fh <tvg::INPUT>;
8892             elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
8893                 write_diagnostics("ANGLE (contains identifier): $str\n");
8894             }
8895
8896             # Not sure..
8897             else {
8898
8899                 # Let's try a Brace Test: any braces inside must balance
8900                 my $br = 0;
8901                 while ( $str =~ /\{/g ) { $br++ }
8902                 while ( $str =~ /\}/g ) { $br-- }
8903                 my $sb = 0;
8904                 while ( $str =~ /\[/g ) { $sb++ }
8905                 while ( $str =~ /\]/g ) { $sb-- }
8906                 my $pr = 0;
8907                 while ( $str =~ /\(/g ) { $pr++ }
8908                 while ( $str =~ /\)/g ) { $pr-- }
8909
8910                 # if braces do not balance - not angle operator
8911                 if ( $br || $sb || $pr ) {
8912                     $i    = $i_beg;
8913                     $type = '<';
8914                     write_diagnostics(
8915                         "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
8916                 }
8917
8918                 # we should keep doing more checks here...to be continued
8919                 # Tentatively accepting this as a valid angle operator.
8920                 # There are lots more things that can be checked.
8921                 else {
8922                     write_diagnostics(
8923                         "ANGLE-Guessing yes: $str expecting=$expecting\n");
8924                     write_logfile_entry("Guessing angle operator here: $str\n");
8925                 }
8926             }
8927         }
8928
8929         # didn't find ending >
8930         else {
8931             if ( $expecting == TERM ) {
8932                 warning("No ending > for angle operator\n");
8933             }
8934         }
8935     }
8936     return ( $i, $type );
8937 } ## end sub find_angle_operator_termination
8938
8939 sub scan_number_do {
8940
8941     #  scan a number in any of the formats that Perl accepts
8942     #  Underbars (_) are allowed in decimal numbers.
8943     #  input parameters -
8944     #      $input_line  - the string to scan
8945     #      $i           - pre_token index to start scanning
8946     #    $rtoken_map    - reference to the pre_token map giving starting
8947     #                    character position in $input_line of token $i
8948     #  output parameters -
8949     #    $i            - last pre_token index of the number just scanned
8950     #    number        - the number (characters); or undef if not a number
8951
8952     my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
8953     my $pos_beg = $rtoken_map->[$i];
8954     my $pos;
8955     my $i_begin = $i;
8956     my $number  = undef;
8957     my $type    = $input_type;
8958
8959     my $first_char = substr( $input_line, $pos_beg, 1 );
8960
8961     # Look for bad starting characters; Shouldn't happen..
8962     if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
8963         if (DEVEL_MODE) {
8964             Fault(<<EOM);
8965 Program bug - scan_number given bad first character = '$first_char'
8966 EOM
8967         }
8968         return ( $i, $type, $number );
8969     }
8970
8971     # handle v-string without leading 'v' character ('Two Dot' rule)
8972     # (vstring.t)
8973     # Here is the format prior to including underscores:
8974     ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
8975     pos($input_line) = $pos_beg;
8976     if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
8977         $pos = pos($input_line);
8978         my $numc = $pos - $pos_beg;
8979         $number = substr( $input_line, $pos_beg, $numc );
8980         $type   = 'v';
8981         report_v_string($number);
8982     }
8983
8984     # handle octal, hex, binary
8985     if ( !defined($number) ) {
8986         pos($input_line) = $pos_beg;
8987
8988         # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
8989         # For reference, the format prior to hex floating point is:
8990         #   /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
8991         #             (hex)               (octal)   (binary)
8992         if (
8993             $input_line =~
8994
8995             /\G[+-]?0(                   # leading [signed] 0
8996
8997            # a hex float, i.e. '0x0.b17217f7d1cf78p0'
8998            ([xX][0-9a-fA-F_]*            # X and optional leading digits
8999            (\.([0-9a-fA-F][0-9a-fA-F_]*)?)?   # optional decimal and fraction
9000            [Pp][+-]?[0-9a-fA-F]          # REQUIRED exponent with digit
9001            [0-9a-fA-F_]*)                # optional Additional exponent digits
9002
9003            # or hex integer
9004            |([xX][0-9a-fA-F_]+)
9005
9006            # or octal fraction
9007            |([oO]?[0-7_]+          # string of octal digits
9008            (\.([0-7][0-7_]*)?)?    # optional decimal and fraction
9009            [Pp][+-]?[0-7]          # REQUIRED exponent, no underscore
9010            [0-7_]*)                # Additional exponent digits with underscores
9011
9012            # or octal integer
9013            |([oO]?[0-7_]+)         # string of octal digits
9014
9015            # or a binary float
9016            |([bB][01_]*            # 'b' with string of binary digits
9017            (\.([01][01_]*)?)?      # optional decimal and fraction
9018            [Pp][+-]?[01]           # Required exponent indicator, no underscore
9019            [01_]*)                 # additional exponent bits
9020
9021            # or binary integer
9022            |([bB][01_]+)           # 'b' with string of binary digits
9023
9024            )/gx
9025           )
9026         {
9027             $pos = pos($input_line);
9028             my $numc = $pos - $pos_beg;
9029             $number = substr( $input_line, $pos_beg, $numc );
9030             $type   = 'n';
9031         }
9032     }
9033
9034     # handle decimal
9035     if ( !defined($number) ) {
9036         pos($input_line) = $pos_beg;
9037
9038         if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9039             $pos = pos($input_line);
9040
9041             # watch out for things like 0..40 which would give 0. by this;
9042             if (   ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9043                 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9044             {
9045                 $pos--;
9046             }
9047             my $numc = $pos - $pos_beg;
9048             $number = substr( $input_line, $pos_beg, $numc );
9049             $type   = 'n';
9050         }
9051     }
9052
9053     # filter out non-numbers like e + - . e2  .e3 +e6
9054     # the rule: at least one digit, and any 'e' must be preceded by a digit
9055     if (
9056         $number !~ /\d/    # no digits
9057         || (   $number =~ /^(.*)[eE]/
9058             && $1 !~ /\d/ )    # or no digits before the 'e'
9059       )
9060     {
9061         $number = undef;
9062         $type   = $input_type;
9063         return ( $i, $type, $number );
9064     }
9065
9066     # Found a number; now we must convert back from character position
9067     # to pre_token index. An error here implies user syntax error.
9068     # An example would be an invalid octal number like '009'.
9069     my $error;
9070     ( $i, $error ) =
9071       inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9072     if ($error) { warning("Possibly invalid number\n") }
9073
9074     return ( $i, $type, $number );
9075 } ## end sub scan_number_do
9076
9077 sub inverse_pretoken_map {
9078
9079     # Starting with the current pre_token index $i, scan forward until
9080     # finding the index of the next pre_token whose position is $pos.
9081     my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9082     my $error = 0;
9083
9084     while ( ++$i <= $max_token_index ) {
9085
9086         if ( $pos <= $rtoken_map->[$i] ) {
9087
9088             # Let the calling routine handle errors in which we do not
9089             # land on a pre-token boundary.  It can happen by running
9090             # perltidy on some non-perl scripts, for example.
9091             if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
9092             $i--;
9093             last;
9094         }
9095     }
9096     return ( $i, $error );
9097 } ## end sub inverse_pretoken_map
9098
9099 sub find_here_doc {
9100
9101     # find the target of a here document, if any
9102     # input parameters:
9103     #   $i - token index of the second < of <<
9104     #   ($i must be less than the last token index if this is called)
9105     # output parameters:
9106     #   $found_target = 0 didn't find target; =1 found target
9107     #   HERE_TARGET - the target string (may be empty string)
9108     #   $i - unchanged if not here doc,
9109     #    or index of the last token of the here target
9110     #   $saw_error - flag noting unbalanced quote on here target
9111     my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9112     my $ibeg                 = $i;
9113     my $found_target         = 0;
9114     my $here_doc_target      = EMPTY_STRING;
9115     my $here_quote_character = EMPTY_STRING;
9116     my $saw_error            = 0;
9117     my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9118     $next_token = $rtokens->[ $i + 1 ];
9119
9120     # perl allows a backslash before the target string (heredoc.t)
9121     my $backslash = 0;
9122     if ( $next_token eq '\\' ) {
9123         $backslash  = 1;
9124         $next_token = $rtokens->[ $i + 2 ];
9125     }
9126
9127     ( $next_nonblank_token, $i_next_nonblank ) =
9128       find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9129
9130     if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
9131
9132         my $in_quote    = 1;
9133         my $quote_depth = 0;
9134         my $quote_pos   = 0;
9135         my $quoted_string;
9136
9137         (
9138             $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9139             $quoted_string
9140           )
9141           = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9142             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9143
9144         if ($in_quote) {    # didn't find end of quote, so no target found
9145             $i = $ibeg;
9146             if ( $expecting == TERM ) {
9147                 warning(
9148 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9149                 );
9150                 $saw_error = 1;
9151             }
9152         }
9153         else {              # found ending quote
9154             $found_target = 1;
9155
9156             my $tokj;
9157             foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9158                 $tokj = $rtokens->[$j];
9159
9160                 # we have to remove any backslash before the quote character
9161                 # so that the here-doc-target exactly matches this string
9162                 next
9163                   if ( $tokj eq "\\"
9164                     && $j < $i - 1
9165                     && $rtokens->[ $j + 1 ] eq $here_quote_character );
9166                 $here_doc_target .= $tokj;
9167             }
9168         }
9169     }
9170
9171     elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9172         $found_target = 1;
9173         write_logfile_entry(
9174             "found blank here-target after <<; suggest using \"\"\n");
9175         $i = $ibeg;
9176     }
9177     elsif ( $next_token =~ /^\w/ ) {    # simple bareword or integer after <<
9178
9179         my $here_doc_expected;
9180         if ( $expecting == UNKNOWN ) {
9181             $here_doc_expected = guess_if_here_doc($next_token);
9182         }
9183         else {
9184             $here_doc_expected = 1;
9185         }
9186
9187         if ($here_doc_expected) {
9188             $found_target    = 1;
9189             $here_doc_target = $next_token;
9190             $i               = $ibeg + 1;
9191         }
9192
9193     }
9194     else {
9195
9196         if ( $expecting == TERM ) {
9197             $found_target = 1;
9198             write_logfile_entry("Note: bare here-doc operator <<\n");
9199         }
9200         else {
9201             $i = $ibeg;
9202         }
9203     }
9204
9205     # patch to neglect any prepended backslash
9206     if ( $found_target && $backslash ) { $i++ }
9207
9208     return ( $found_target, $here_doc_target, $here_quote_character, $i,
9209         $saw_error );
9210 } ## end sub find_here_doc
9211
9212 sub do_quote {
9213
9214     # follow (or continue following) quoted string(s)
9215     # $in_quote return code:
9216     #   0 - ok, found end
9217     #   1 - still must find end of quote whose target is $quote_character
9218     #   2 - still looking for end of first of two quotes
9219     #
9220     # Returns updated strings:
9221     #  $quoted_string_1 = quoted string seen while in_quote=1
9222     #  $quoted_string_2 = quoted string seen while in_quote=2
9223     my (
9224
9225         $i,
9226         $in_quote,
9227         $quote_character,
9228         $quote_pos,
9229         $quote_depth,
9230         $quoted_string_1,
9231         $quoted_string_2,
9232         $rtokens,
9233         $rtoken_map,
9234         $max_token_index,
9235
9236     ) = @_;
9237
9238     my $in_quote_starting = $in_quote;
9239
9240     my $quoted_string;
9241     if ( $in_quote == 2 ) {    # two quotes/quoted_string_1s to follow
9242         my $ibeg = $i;
9243         (
9244             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9245             $quoted_string
9246           )
9247           = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
9248             $quote_pos, $quote_depth, $max_token_index );
9249         $quoted_string_2 .= $quoted_string;
9250         if ( $in_quote == 1 ) {
9251             if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
9252             $quote_character = EMPTY_STRING;
9253         }
9254         else {
9255             $quoted_string_2 .= "\n";
9256         }
9257     }
9258
9259     if ( $in_quote == 1 ) {    # one (more) quote to follow
9260         my $ibeg = $i;
9261         (
9262             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9263             $quoted_string
9264           )
9265           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9266             $quote_pos, $quote_depth, $max_token_index );
9267         $quoted_string_1 .= $quoted_string;
9268         if ( $in_quote == 1 ) {
9269             $quoted_string_1 .= "\n";
9270         }
9271     }
9272     return (
9273
9274         $i,
9275         $in_quote,
9276         $quote_character,
9277         $quote_pos,
9278         $quote_depth,
9279         $quoted_string_1,
9280         $quoted_string_2,
9281
9282     );
9283 } ## end sub do_quote
9284
9285 sub follow_quoted_string {
9286
9287     # scan for a specific token, skipping escaped characters
9288     # if the quote character is blank, use the first non-blank character
9289     # input parameters:
9290     #   $rtokens = reference to the array of tokens
9291     #   $i = the token index of the first character to search
9292     #   $in_quote = number of quoted strings being followed
9293     #   $beginning_tok = the starting quote character
9294     #   $quote_pos = index to check next for alphanumeric delimiter
9295     # output parameters:
9296     #   $i = the token index of the ending quote character
9297     #   $in_quote = decremented if found end, unchanged if not
9298     #   $beginning_tok = the starting quote character
9299     #   $quote_pos = index to check next for alphanumeric delimiter
9300     #   $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9301     #   $quoted_string = the text of the quote (without quotation tokens)
9302     my (
9303
9304         $i_beg,
9305         $in_quote,
9306         $rtokens,
9307         $beginning_tok,
9308         $quote_pos,
9309         $quote_depth,
9310         $max_token_index,
9311
9312     ) = @_;
9313
9314     my ( $tok, $end_tok );
9315     my $i             = $i_beg - 1;
9316     my $quoted_string = EMPTY_STRING;
9317
9318     0 && do {
9319         print STDOUT
9320 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9321     };
9322
9323     # get the corresponding end token
9324     if ( $beginning_tok !~ /^\s*$/ ) {
9325         $end_tok = matching_end_token($beginning_tok);
9326     }
9327
9328     # a blank token means we must find and use the first non-blank one
9329     else {
9330         my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9331
9332         while ( $i < $max_token_index ) {
9333             $tok = $rtokens->[ ++$i ];
9334
9335             if ( $tok !~ /^\s*$/ ) {
9336
9337                 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9338                     $i = $max_token_index;
9339                 }
9340                 else {
9341
9342                     if ( length($tok) > 1 ) {
9343                         if ( $quote_pos <= 0 ) { $quote_pos = 1 }
9344                         $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9345                     }
9346                     else {
9347                         $beginning_tok = $tok;
9348                         $quote_pos     = 0;
9349                     }
9350                     $end_tok     = matching_end_token($beginning_tok);
9351                     $quote_depth = 1;
9352                     last;
9353                 }
9354             }
9355             else {
9356                 $allow_quote_comments = 1;
9357             }
9358         }
9359     }
9360
9361     # There are two different loops which search for the ending quote
9362     # character.  In the rare case of an alphanumeric quote delimiter, we
9363     # have to look through alphanumeric tokens character-by-character, since
9364     # the pre-tokenization process combines multiple alphanumeric
9365     # characters, whereas for a non-alphanumeric delimiter, only tokens of
9366     # length 1 can match.
9367
9368     #----------------------------------------------------------------
9369     # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9370     # "quote_pos" is the position the current word to begin searching
9371     #----------------------------------------------------------------
9372     if ( $beginning_tok =~ /\w/ ) {
9373
9374         # Note this because it is not recommended practice except
9375         # for obfuscated perl contests
9376         if ( $in_quote == 1 ) {
9377             write_logfile_entry(
9378                 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9379         }
9380
9381         # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9382         while ( $i <= $max_token_index ) {
9383
9384             if ( $quote_pos == 0 || ( $i < 0 ) ) {
9385                 $tok = $rtokens->[ ++$i ];
9386
9387                 if ( $tok eq '\\' ) {
9388
9389                     # retain backslash unless it hides the end token
9390                     $quoted_string .= $tok
9391                       unless $rtokens->[ $i + 1 ] eq $end_tok;
9392                     $quote_pos++;
9393                     last if ( $i >= $max_token_index );
9394                     $tok = $rtokens->[ ++$i ];
9395                 }
9396             }
9397             my $old_pos = $quote_pos;
9398
9399             unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
9400             {
9401
9402             }
9403             $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9404
9405             if ( $quote_pos > 0 ) {
9406
9407                 $quoted_string .=
9408                   substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9409
9410                 # NOTE: any quote modifiers will be at the end of '$tok'. If we
9411                 # wanted to check them, this is the place to get them.  But
9412                 # this quote form is rarely used in practice, so it isn't
9413                 # worthwhile.
9414
9415                 $quote_depth--;
9416
9417                 if ( $quote_depth == 0 ) {
9418                     $in_quote--;
9419                     last;
9420                 }
9421             }
9422             else {
9423                 if ( $old_pos <= length($tok) ) {
9424                     $quoted_string .= substr( $tok, $old_pos );
9425                 }
9426             }
9427         }
9428     }
9429
9430     #-----------------------------------------------------------------------
9431     # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9432     #-----------------------------------------------------------------------
9433     else {
9434
9435         while ( $i < $max_token_index ) {
9436             $tok = $rtokens->[ ++$i ];
9437
9438             if ( $tok eq $end_tok ) {
9439                 $quote_depth--;
9440
9441                 if ( $quote_depth == 0 ) {
9442                     $in_quote--;
9443                     last;
9444                 }
9445             }
9446             elsif ( $tok eq $beginning_tok ) {
9447                 $quote_depth++;
9448             }
9449             elsif ( $tok eq '\\' ) {
9450
9451                 # retain backslash unless it hides the beginning or end token
9452                 $tok = $rtokens->[ ++$i ];
9453                 $quoted_string .= '\\'
9454                   unless ( $tok eq $end_tok || $tok eq $beginning_tok );
9455             }
9456             $quoted_string .= $tok;
9457         }
9458     }
9459     if ( $i > $max_token_index ) { $i = $max_token_index }
9460     return (
9461
9462         $i,
9463         $in_quote,
9464         $beginning_tok,
9465         $quote_pos,
9466         $quote_depth,
9467         $quoted_string,
9468
9469     );
9470 } ## end sub follow_quoted_string
9471
9472 sub indicate_error {
9473     my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9474     interrupt_logfile();
9475     warning($msg);
9476     write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
9477     resume_logfile();
9478     return;
9479 }
9480
9481 sub write_error_indicator_pair {
9482     my ( $line_number, $input_line, $pos, $carrat ) = @_;
9483     my ( $offset, $numbered_line, $underline ) =
9484       make_numbered_line( $line_number, $input_line, $pos );
9485     $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9486     warning( $numbered_line . "\n" );
9487     $underline =~ s/\s*$//;
9488     warning( $underline . "\n" );
9489     return;
9490 } ## end sub write_error_indicator_pair
9491
9492 sub make_numbered_line {
9493
9494     #  Given an input line, its line number, and a character position of
9495     #  interest, create a string not longer than 80 characters of the form
9496     #     $lineno: sub_string
9497     #  such that the sub_string of $str contains the position of interest
9498     #
9499     #  Here is an example of what we want, in this case we add trailing
9500     #  '...' because the line is long.
9501     #
9502     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9503     #
9504     #  Here is another example, this time in which we used leading '...'
9505     #  because of excessive length:
9506     #
9507     # 2: ... er of the World Wide Web Consortium's
9508     #
9509     #  input parameters are:
9510     #   $lineno = line number
9511     #   $str = the text of the line
9512     #   $pos = position of interest (the error) : 0 = first character
9513     #
9514     #   We return :
9515     #     - $offset = an offset which corrects the position in case we only
9516     #       display part of a line, such that $pos-$offset is the effective
9517     #       position from the start of the displayed line.
9518     #     - $numbered_line = the numbered line as above,
9519     #     - $underline = a blank 'underline' which is all spaces with the same
9520     #       number of characters as the numbered line.
9521
9522     my ( $lineno, $str, $pos ) = @_;
9523     my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9524     my $excess = length($str) - $offset - 68;
9525     my $numc   = ( $excess > 0 ) ? 68 : undef;
9526
9527     if ( defined($numc) ) {
9528         if ( $offset == 0 ) {
9529             $str = substr( $str, $offset, $numc - 4 ) . " ...";
9530         }
9531         else {
9532             $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9533         }
9534     }
9535     else {
9536
9537         if ( $offset == 0 ) {
9538         }
9539         else {
9540             $str = "... " . substr( $str, $offset + 4 );
9541         }
9542     }
9543
9544     my $numbered_line = sprintf( "%d: ", $lineno );
9545     $offset -= length($numbered_line);
9546     $numbered_line .= $str;
9547     my $underline = SPACE x length($numbered_line);
9548     return ( $offset, $numbered_line, $underline );
9549 } ## end sub make_numbered_line
9550
9551 sub write_on_underline {
9552
9553     # The "underline" is a string that shows where an error is; it starts
9554     # out as a string of blanks with the same length as the numbered line of
9555     # code above it, and we have to add marking to show where an error is.
9556     # In the example below, we want to write the string '--^' just below
9557     # the line of bad code:
9558     #
9559     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9560     #                 ---^
9561     # We are given the current underline string, plus a position and a
9562     # string to write on it.
9563     #
9564     # In the above example, there will be 2 calls to do this:
9565     # First call:  $pos=19, pos_chr=^
9566     # Second call: $pos=16, pos_chr=---
9567     #
9568     # This is a trivial thing to do with substr, but there is some
9569     # checking to do.
9570
9571     my ( $underline, $pos, $pos_chr ) = @_;
9572
9573     # check for error..shouldn't happen
9574     unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9575         return $underline;
9576     }
9577     my $excess = length($pos_chr) + $pos - length($underline);
9578     if ( $excess > 0 ) {
9579         $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9580     }
9581     substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9582     return ($underline);
9583 } ## end sub write_on_underline
9584
9585 sub pre_tokenize {
9586
9587     my ( $str, $max_tokens_wanted ) = @_;
9588
9589     # Input parameter:
9590     #  $max_tokens_wanted > 0  to stop on reaching this many tokens.
9591     #                     = 0 means get all tokens
9592
9593     # Break a string, $str, into a sequence of preliminary tokens.  We
9594     # are interested in these types of tokens:
9595     #   words       (type='w'),            example: 'max_tokens_wanted'
9596     #   digits      (type = 'd'),          example: '0755'
9597     #   whitespace  (type = 'b'),          example: '   '
9598     #   any other single character (i.e. punct; type = the character itself).
9599     # We cannot do better than this yet because we might be in a quoted
9600     # string or pattern.  Caller sets $max_tokens_wanted to 0 to get all
9601     # tokens.
9602
9603     # An advantage of doing this pre-tokenization step is that it keeps almost
9604     # all of the regex work highly localized.  A disadvantage is that in some
9605     # very rare instances we will have to go back and split a pre-token.
9606
9607     # Return parameters:
9608     my @tokens    = ();     # array of the tokens themselves
9609     my @token_map = (0);    # string position of start of each token
9610     my @type      = ();     # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9611
9612     do {
9613
9614         # whitespace
9615         if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9616
9617         # numbers
9618         # note that this must come before words!
9619         elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9620
9621         # words
9622         elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9623
9624         # single-character punctuation
9625         elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9626
9627         # that's all..
9628         else {
9629             return ( \@tokens, \@token_map, \@type );
9630         }
9631
9632         push @tokens,    $1;
9633         push @token_map, pos($str);
9634
9635     } while ( --$max_tokens_wanted != 0 );
9636
9637     return ( \@tokens, \@token_map, \@type );
9638 } ## end sub pre_tokenize
9639
9640 sub show_tokens {
9641
9642     # this is an old debug routine
9643     # not called, but saved for reference
9644     my ( $rtokens, $rtoken_map ) = @_;
9645     my $num = scalar( @{$rtokens} );
9646
9647     foreach my $i ( 0 .. $num - 1 ) {
9648         my $len = length( $rtokens->[$i] );
9649         print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9650     }
9651     return;
9652 } ## end sub show_tokens
9653
9654 {    ## closure for sub matching end token
9655     my %matching_end_token;
9656
9657     BEGIN {
9658         %matching_end_token = (
9659             '{' => '}',
9660             '(' => ')',
9661             '[' => ']',
9662             '<' => '>',
9663         );
9664     }
9665
9666     sub matching_end_token {
9667
9668         # return closing character for a pattern
9669         my $beginning_token = shift;
9670         if ( $matching_end_token{$beginning_token} ) {
9671             return $matching_end_token{$beginning_token};
9672         }
9673         return ($beginning_token);
9674     }
9675 }
9676
9677 sub dump_token_types {
9678     my ( $class, $fh ) = @_;
9679
9680     # This should be the latest list of token types in use
9681     # adding NEW_TOKENS: add a comment here
9682     $fh->print(<<'END_OF_LIST');
9683
9684 Here is a list of the token types currently used for lines of type 'CODE'.
9685 For the following tokens, the "type" of a token is just the token itself.
9686
9687 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9688 ( ) <= >= == =~ !~ != ++ -- /= x=
9689 ... **= <<= >>= &&= ||= //= <=>
9690 , + - / * | % ! x ~ = \ ? : . < > ^ &
9691
9692 The following additional token types are defined:
9693
9694  type    meaning
9695     b    blank (white space)
9696     {    indent: opening structural curly brace or square bracket or paren
9697          (code block, anonymous hash reference, or anonymous array reference)
9698     }    outdent: right structural curly brace or square bracket or paren
9699     [    left non-structural square bracket (enclosing an array index)
9700     ]    right non-structural square bracket
9701     (    left non-structural paren (all but a list right of an =)
9702     )    right non-structural paren
9703     L    left non-structural curly brace (enclosing a key)
9704     R    right non-structural curly brace
9705     ;    terminal semicolon
9706     f    indicates a semicolon in a "for" statement
9707     h    here_doc operator <<
9708     #    a comment
9709     Q    indicates a quote or pattern
9710     q    indicates a qw quote block
9711     k    a perl keyword
9712     C    user-defined constant or constant function (with void prototype = ())
9713     U    user-defined function taking parameters
9714     G    user-defined function taking block parameter (like grep/map/eval)
9715     M    (unused, but reserved for subroutine definition name)
9716     P    (unused, but -html uses it to label pod text)
9717     t    type indicater such as %,$,@,*,&,sub
9718     w    bare word (perhaps a subroutine call)
9719     i    identifier of some type (with leading %, $, @, *, &, sub, -> )
9720     n    a number
9721     v    a v-string
9722     F    a file test operator (like -e)
9723     Y    File handle
9724     Z    identifier in indirect object slot: may be file handle, object
9725     J    LABEL:  code block label
9726     j    LABEL after next, last, redo, goto
9727     p    unary +
9728     m    unary -
9729     pp   pre-increment operator ++
9730     mm   pre-decrement operator --
9731     A    : used as attribute separator
9732
9733     Here are the '_line_type' codes used internally:
9734     SYSTEM         - system-specific code before hash-bang line
9735     CODE           - line of perl code (including comments)
9736     POD_START      - line starting pod, such as '=head'
9737     POD            - pod documentation text
9738     POD_END        - last line of pod section, '=cut'
9739     HERE           - text of here-document
9740     HERE_END       - last line of here-doc (target word)
9741     FORMAT         - format section
9742     FORMAT_END     - last line of format section, '.'
9743     SKIP           - code skipping section
9744     SKIP_END       - last line of code skipping section, '#>>V'
9745     DATA_START     - __DATA__ line
9746     DATA           - unidentified text following __DATA__
9747     END_START      - __END__ line
9748     END            - unidentified text following __END__
9749     ERROR          - we are in big trouble, probably not a perl script
9750 END_OF_LIST
9751
9752     return;
9753 } ## end sub dump_token_types
9754
9755 BEGIN {
9756
9757     # These names are used in error messages
9758     @opening_brace_names = qw# '{' '[' '(' '?' #;
9759     @closing_brace_names = qw# '}' ']' ')' ':' #;
9760
9761     my @q;
9762
9763     my @digraphs = qw(
9764       .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
9765       <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
9766     );
9767     @is_digraph{@digraphs} = (1) x scalar(@digraphs);
9768
9769     @q = qw(
9770       . : < > * & | / - = + -  %  ^ !  x ~
9771     );
9772     @can_start_digraph{@q} = (1) x scalar(@q);
9773
9774     my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
9775     @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
9776
9777     my @tetragraphs = qw( <<>> );
9778     @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
9779
9780     # make a hash of all valid token types for self-checking the tokenizer
9781     # (adding NEW_TOKENS : select a new character and add to this list)
9782     my @valid_token_types = qw#
9783       A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
9784       { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
9785       #;
9786     push( @valid_token_types, @digraphs );
9787     push( @valid_token_types, @trigraphs );
9788     push( @valid_token_types, @tetragraphs );
9789     push( @valid_token_types, ( '#', ',', 'CORE::' ) );
9790     @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
9791
9792     # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
9793     my @file_test_operators =
9794       qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
9795     @is_file_test_operator{@file_test_operators} =
9796       (1) x scalar(@file_test_operators);
9797
9798     # these functions have prototypes of the form (&), so when they are
9799     # followed by a block, that block MAY BE followed by an operator.
9800     # Smartmatch operator ~~ may be followed by anonymous hash or array ref
9801     @q = qw( do eval );
9802     @is_block_operator{@q} = (1) x scalar(@q);
9803
9804     # these functions allow an identifier in the indirect object slot
9805     @q = qw( print printf sort exec system say);
9806     @is_indirect_object_taker{@q} = (1) x scalar(@q);
9807
9808     # These tokens may precede a code block
9809     # patched for SWITCH/CASE/CATCH.  Actually these could be removed
9810     # now and we could let the extended-syntax coding handle them.
9811     # Added 'default' for Switch::Plain.
9812     @q =
9813       qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
9814       unless do while until eval for foreach map grep sort
9815       switch case given when default catch try finally);
9816     @is_code_block_token{@q} = (1) x scalar(@q);
9817
9818     # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
9819     # to contrast it with the block types in '%is_zero_continuation_block_type'
9820     @q = qw( sort map grep eval do );
9821     @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
9822
9823     @q = qw( sort map grep );
9824     @is_sort_map_grep{@q} = (1) x scalar(@q);
9825
9826     %is_grep_alias = ();
9827
9828     # I'll build the list of keywords incrementally
9829     my @Keywords = ();
9830
9831     # keywords and tokens after which a value or pattern is expected,
9832     # but not an operator.  In other words, these should consume terms
9833     # to their right, or at least they are not expected to be followed
9834     # immediately by operators.
9835     my @value_requestor = qw(
9836       AUTOLOAD
9837       BEGIN
9838       CHECK
9839       DESTROY
9840       END
9841       EQ
9842       GE
9843       GT
9844       INIT
9845       LE
9846       LT
9847       NE
9848       UNITCHECK
9849       abs
9850       accept
9851       alarm
9852       and
9853       atan2
9854       bind
9855       binmode
9856       bless
9857       break
9858       caller
9859       chdir
9860       chmod
9861       chomp
9862       chop
9863       chown
9864       chr
9865       chroot
9866       close
9867       closedir
9868       cmp
9869       connect
9870       continue
9871       cos
9872       crypt
9873       dbmclose
9874       dbmopen
9875       defined
9876       delete
9877       die
9878       dump
9879       each
9880       else
9881       elsif
9882       eof
9883       eq
9884       evalbytes
9885       exec
9886       exists
9887       exit
9888       exp
9889       fc
9890       fcntl
9891       fileno
9892       flock
9893       for
9894       foreach
9895       formline
9896       ge
9897       getc
9898       getgrgid
9899       getgrnam
9900       gethostbyaddr
9901       gethostbyname
9902       getnetbyaddr
9903       getnetbyname
9904       getpeername
9905       getpgrp
9906       getpriority
9907       getprotobyname
9908       getprotobynumber
9909       getpwnam
9910       getpwuid
9911       getservbyname
9912       getservbyport
9913       getsockname
9914       getsockopt
9915       glob
9916       gmtime
9917       goto
9918       grep
9919       gt
9920       hex
9921       if
9922       index
9923       int
9924       ioctl
9925       join
9926       keys
9927       kill
9928       last
9929       lc
9930       lcfirst
9931       le
9932       length
9933       link
9934       listen
9935       local
9936       localtime
9937       lock
9938       log
9939       lstat
9940       lt
9941       map
9942       mkdir
9943       msgctl
9944       msgget
9945       msgrcv
9946       msgsnd
9947       my
9948       ne
9949       next
9950       no
9951       not
9952       oct
9953       open
9954       opendir
9955       or
9956       ord
9957       our
9958       pack
9959       pipe
9960       pop
9961       pos
9962       print
9963       printf
9964       prototype
9965       push
9966       quotemeta
9967       rand
9968       read
9969       readdir
9970       readlink
9971       readline
9972       readpipe
9973       recv
9974       redo
9975       ref
9976       rename
9977       require
9978       reset
9979       return
9980       reverse
9981       rewinddir
9982       rindex
9983       rmdir
9984       scalar
9985       seek
9986       seekdir
9987       select
9988       semctl
9989       semget
9990       semop
9991       send
9992       sethostent
9993       setnetent
9994       setpgrp
9995       setpriority
9996       setprotoent
9997       setservent
9998       setsockopt
9999       shift
10000       shmctl
10001       shmget
10002       shmread
10003       shmwrite
10004       shutdown
10005       sin
10006       sleep
10007       socket
10008       socketpair
10009       sort
10010       splice
10011       split
10012       sprintf
10013       sqrt
10014       srand
10015       stat
10016       state
10017       study
10018       substr
10019       symlink
10020       syscall
10021       sysopen
10022       sysread
10023       sysseek
10024       system
10025       syswrite
10026       tell
10027       telldir
10028       tie
10029       tied
10030       truncate
10031       uc
10032       ucfirst
10033       umask
10034       undef
10035       unless
10036       unlink
10037       unpack
10038       unshift
10039       untie
10040       until
10041       use
10042       utime
10043       values
10044       vec
10045       waitpid
10046       warn
10047       while
10048       write
10049       xor
10050
10051       switch
10052       case
10053       default
10054       given
10055       when
10056       err
10057       say
10058       isa
10059
10060       catch
10061     );
10062
10063     # patched above for SWITCH/CASE given/when err say
10064     # 'err' is a fairly safe addition.
10065     # Added 'default' for Switch::Plain. Note that we could also have
10066     # a separate set of keywords to include if we see 'use Switch::Plain'
10067     push( @Keywords, @value_requestor );
10068
10069     # These are treated the same but are not keywords:
10070     my @extra_vr = qw(
10071       constant
10072       vars
10073     );
10074     push( @value_requestor, @extra_vr );
10075
10076     @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10077
10078     # this list contains keywords which do not look for arguments,
10079     # so that they might be followed by an operator, or at least
10080     # not a term.
10081     my @operator_requestor = qw(
10082       endgrent
10083       endhostent
10084       endnetent
10085       endprotoent
10086       endpwent
10087       endservent
10088       fork
10089       getgrent
10090       gethostent
10091       getlogin
10092       getnetent
10093       getppid
10094       getprotoent
10095       getpwent
10096       getservent
10097       setgrent
10098       setpwent
10099       time
10100       times
10101       wait
10102       wantarray
10103     );
10104
10105     push( @Keywords, @operator_requestor );
10106
10107     # These are treated the same but are not considered keywords:
10108     my @extra_or = qw(
10109       STDERR
10110       STDIN
10111       STDOUT
10112     );
10113
10114     push( @operator_requestor, @extra_or );
10115
10116     @expecting_operator_token{@operator_requestor} =
10117       (1) x scalar(@operator_requestor);
10118
10119     # these token TYPES expect trailing operator but not a term
10120     # note: ++ and -- are post-increment and decrement, 'C' = constant
10121     my @operator_requestor_types = qw( ++ -- C <> q );
10122     @expecting_operator_types{@operator_requestor_types} =
10123       (1) x scalar(@operator_requestor_types);
10124
10125     # these token TYPES consume values (terms)
10126     # note: pp and mm are pre-increment and decrement
10127     # f=semicolon in for,  F=file test operator
10128     my @value_requestor_type = qw#
10129       L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10130       **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10131       <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10132       f F pp mm Y p m U J G j >> << ^ t
10133       ~. ^. |. &. ^.= |.= &.=
10134       #;
10135     push( @value_requestor_type, ',' )
10136       ;    # (perl doesn't like a ',' in a qw block)
10137     @expecting_term_types{@value_requestor_type} =
10138       (1) x scalar(@value_requestor_type);
10139
10140     # Note: the following valid token types are not assigned here to
10141     # hashes requesting to be followed by values or terms, but are
10142     # instead currently hard-coded into sub operator_expected:
10143     # ) -> :: Q R Z ] b h i k n v w } #
10144
10145     # For simple syntax checking, it is nice to have a list of operators which
10146     # will really be unhappy if not followed by a term.  This includes most
10147     # of the above...
10148     %really_want_term = %expecting_term_types;
10149
10150     # with these exceptions...
10151     delete $really_want_term{'U'}; # user sub, depends on prototype
10152     delete $really_want_term{'F'}; # file test works on $_ if no following term
10153     delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10154                                    # let perl do it
10155     @q = qw(q qq qx qr s y tr m);
10156     @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10157
10158     # Note added 'qw' here
10159     @q = qw(q qq qw qx qr s y tr m);
10160     @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10161
10162     @q = qw(package);
10163     @is_package{@q} = (1) x scalar(@q);
10164
10165     @q = qw( ? : );
10166     push @q, ',';
10167     @is_comma_question_colon{@q} = (1) x scalar(@q);
10168
10169     @q = qw( if elsif unless );
10170     @is_if_elsif_unless{@q} = (1) x scalar(@q);
10171
10172     @q = qw( ; t );
10173     @is_semicolon_or_t{@q} = (1) x scalar(@q);
10174
10175     @q = qw( if elsif unless case when );
10176     @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10177
10178     # Hash of other possible line endings which may occur.
10179     # Keep these coordinated with the regex where this is used.
10180     # Note: chr(13) = chr(015)="\r".
10181     @q = ( chr(13), chr(29), chr(26) );
10182     @other_line_endings{@q} = (1) x scalar(@q);
10183
10184     # These keywords are handled specially in the tokenizer code:
10185     my @special_keywords = qw(
10186       do
10187       eval
10188       format
10189       m
10190       package
10191       q
10192       qq
10193       qr
10194       qw
10195       qx
10196       s
10197       sub
10198       tr
10199       y
10200     );
10201     push( @Keywords, @special_keywords );
10202
10203     # Keywords after which list formatting may be used
10204     # WARNING: do not include |map|grep|eval or perl may die on
10205     # syntax errors (map1.t).
10206     my @keyword_taking_list = qw(
10207       and
10208       chmod
10209       chomp
10210       chop
10211       chown
10212       dbmopen
10213       die
10214       elsif
10215       exec
10216       fcntl
10217       for
10218       foreach
10219       formline
10220       getsockopt
10221       if
10222       index
10223       ioctl
10224       join
10225       kill
10226       local
10227       msgctl
10228       msgrcv
10229       msgsnd
10230       my
10231       open
10232       or
10233       our
10234       pack
10235       print
10236       printf
10237       push
10238       read
10239       readpipe
10240       recv
10241       return
10242       reverse
10243       rindex
10244       seek
10245       select
10246       semctl
10247       semget
10248       send
10249       setpriority
10250       setsockopt
10251       shmctl
10252       shmget
10253       shmread
10254       shmwrite
10255       socket
10256       socketpair
10257       sort
10258       splice
10259       split
10260       sprintf
10261       state
10262       substr
10263       syscall
10264       sysopen
10265       sysread
10266       sysseek
10267       system
10268       syswrite
10269       tie
10270       unless
10271       unlink
10272       unpack
10273       unshift
10274       until
10275       vec
10276       warn
10277       while
10278       given
10279       when
10280     );
10281     @is_keyword_taking_list{@keyword_taking_list} =
10282       (1) x scalar(@keyword_taking_list);
10283
10284     # perl functions which may be unary operators.
10285
10286     # This list is used to decide if a pattern delimited by slashes, /pattern/,
10287     # can follow one of these keywords.
10288     @q = qw(
10289       chomp eof eval fc lc pop shift uc undef
10290     );
10291
10292     @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10293       (1) x scalar(@q);
10294
10295     # These are keywords for which an arg may optionally be omitted.  They are
10296     # currently only used to disambiguate a ? used as a ternary from one used
10297     # as a (deprecated) pattern delimiter.  In the future, they might be used
10298     # to give a warning about ambiguous syntax before a /.
10299     # Note: split has been omitted (see not below).
10300     my @keywords_taking_optional_arg = qw(
10301       abs
10302       alarm
10303       caller
10304       chdir
10305       chomp
10306       chop
10307       chr
10308       chroot
10309       close
10310       cos
10311       defined
10312       die
10313       eof
10314       eval
10315       evalbytes
10316       exit
10317       exp
10318       fc
10319       getc
10320       glob
10321       gmtime
10322       hex
10323       int
10324       last
10325       lc
10326       lcfirst
10327       length
10328       localtime
10329       log
10330       lstat
10331       mkdir
10332       next
10333       oct
10334       ord
10335       pop
10336       pos
10337       print
10338       printf
10339       prototype
10340       quotemeta
10341       rand
10342       readline
10343       readlink
10344       readpipe
10345       redo
10346       ref
10347       require
10348       reset
10349       reverse
10350       rmdir
10351       say
10352       select
10353       shift
10354       sin
10355       sleep
10356       sqrt
10357       srand
10358       stat
10359       study
10360       tell
10361       uc
10362       ucfirst
10363       umask
10364       undef
10365       unlink
10366       warn
10367       write
10368     );
10369     @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10370       (1) x scalar(@keywords_taking_optional_arg);
10371
10372     # This list is used to decide if a pattern delimited by question marks,
10373     # ?pattern?, can follow one of these keywords.  Note that from perl 5.22
10374     # on, a ?pattern? is not recognized, so we can be much more strict than
10375     # with a /pattern/. Note that 'split' is not in this list. In current
10376     # versions of perl a question following split must be a ternary, but
10377     # in older versions it could be a pattern.  The guessing algorithm will
10378     # decide.  We are combining two lists here to simplify the test.
10379     @q = ( @keywords_taking_optional_arg, @operator_requestor );
10380     @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10381       (1) x scalar(@q);
10382
10383     # These are not used in any way yet
10384     #    my @unused_keywords = qw(
10385     #     __FILE__
10386     #     __LINE__
10387     #     __PACKAGE__
10388     #     );
10389
10390     #  The list of keywords was originally extracted from function 'keyword' in
10391     #  perl file toke.c version 5.005.03, using this utility, plus a
10392     #  little editing: (file getkwd.pl):
10393     #  while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10394     #  Add 'get' prefix where necessary, then split into the above lists.
10395     #  This list should be updated as necessary.
10396     #  The list should not contain these special variables:
10397     #  ARGV DATA ENV SIG STDERR STDIN STDOUT
10398     #  __DATA__ __END__
10399
10400     @is_keyword{@Keywords} = (1) x scalar(@Keywords);
10401 }
10402 1;