lib/Perl/Tidy/Tokenizer.pm

   1 #####################################################################
   2 #
   3 # The Perl::Tidy::Tokenizer package is essentially a filter which
   4 # reads lines of perl source code from a source object and provides
   5 # corresponding tokenized lines through its get_line() method.  Lines
   6 # flow from the source_object to the caller like this:
   7 #
   8 # source_object --> LineBuffer_object --> Tokenizer -->  calling routine
   9 #   get_line()         get_line()           get_line()     line_of_tokens
  10 #
  11 # The source object can be any object with a get_line() method which
  12 # supplies one line (a character string) perl call.
  13 # The LineBuffer object is created by the Tokenizer.
  14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
  15 # containing one tokenized line for each call to its get_line() method.
  16 #
  17 # WARNING: This is not a real class.  Only one tokenizer my be used.
  18 #
  19 ########################################################################
  20
  21 package Perl::Tidy::Tokenizer;
  22 use strict;
  23 use warnings;
  24 use English qw( -no_match_vars );
  25
  26 our $VERSION = '20220613';
  27
  28 use constant DEVEL_MODE   => 0;
  29 use constant EMPTY_STRING => q{};
  30 use constant SPACE        => q{ };
  31
  32 use Perl::Tidy::LineBuffer;
  33 use Carp;
  34
  35 # PACKAGE VARIABLES for processing an entire FILE.
  36 # These must be package variables because most may get localized during
  37 # processing.  Most are initialized in sub prepare_for_a_new_file.
  38 use vars qw{
  39   $tokenizer_self
  40
  41   $last_nonblank_token
  42   $last_nonblank_type
  43   $last_nonblank_block_type
  44   $statement_type
  45   $in_attribute_list
  46   $current_package
  47   $context
  48
  49   %is_constant
  50   %is_user_function
  51   %user_function_prototype
  52   %is_block_function
  53   %is_block_list_function
  54   %saw_function_definition
  55   %saw_use_module
  56
  57   $brace_depth
  58   $paren_depth
  59   $square_bracket_depth
  60
  61   @current_depth
  62   @total_depth
  63   $total_depth
  64   $next_sequence_number
  65   @nesting_sequence_number
  66   @current_sequence_number
  67   @paren_type
  68   @paren_semicolon_count
  69   @paren_structural_type
  70   @brace_type
  71   @brace_structural_type
  72   @brace_context
  73   @brace_package
  74   @square_bracket_type
  75   @square_bracket_structural_type
  76   @depth_array
  77   @nested_ternary_flag
  78   @nested_statement_type
  79   @starting_line_of_current_depth
  80 };
  81
  82 # GLOBAL CONSTANTS for routines in this package,
  83 # Initialized in a BEGIN block.
  84 use vars qw{
  85   %is_indirect_object_taker
  86   %is_block_operator
  87   %expecting_operator_token
  88   %expecting_operator_types
  89   %expecting_term_types
  90   %expecting_term_token
  91   %is_digraph
  92   %can_start_digraph
  93   %is_file_test_operator
  94   %is_trigraph
  95   %is_tetragraph
  96   %is_valid_token_type
  97   %is_keyword
  98   %is_code_block_token
  99   %is_sort_map_grep_eval_do
 100   %is_sort_map_grep
 101   %is_grep_alias
 102   %really_want_term
 103   @opening_brace_names
 104   @closing_brace_names
 105   %is_keyword_taking_list
 106   %is_keyword_taking_optional_arg
 107   %is_keyword_rejecting_slash_as_pattern_delimiter
 108   %is_keyword_rejecting_question_as_pattern_delimiter
 109   %is_q_qq_qx_qr_s_y_tr_m
 110   %is_q_qq_qw_qx_qr_s_y_tr_m
 111   %is_sub
 112   %is_package
 113   %is_comma_question_colon
 114   %is_if_elsif_unless
 115   %is_if_elsif_unless_case_when
 116   %other_line_endings
 117   %is_END_DATA_format_sub
 118   %is_semicolon_or_t
 119   $code_skipping_pattern_begin
 120   $code_skipping_pattern_end
 121 };
 122
 123 # GLOBAL VARIABLES which are constant after being configured by user-supplied
 124 # parameters.  They remain constant as a file is being processed.
 125 my (
 126
 127     $rOpts_code_skipping,
 128     $code_skipping_pattern_begin,
 129     $code_skipping_pattern_end,
 130 );
 131
 132 # possible values of operator_expected()
 133 use constant TERM     => -1;
 134 use constant UNKNOWN  => 0;
 135 use constant OPERATOR => 1;
 136
 137 # possible values of context
 138 use constant SCALAR_CONTEXT  => -1;
 139 use constant UNKNOWN_CONTEXT => 0;
 140 use constant LIST_CONTEXT    => 1;
 141
 142 # Maximum number of little messages; probably need not be changed.
 143 use constant MAX_NAG_MESSAGES => 6;
 144
 145 BEGIN {
 146
 147     # Array index names for $self.
 148     # Do not combine with other BEGIN blocks (c101).
 149     my $i = 0;
 150     use constant {
 151         _rhere_target_list_                  => $i++,
 152         _in_here_doc_                        => $i++,
 153         _here_doc_target_                    => $i++,
 154         _here_quote_character_               => $i++,
 155         _in_data_                            => $i++,
 156         _in_end_                             => $i++,
 157         _in_format_                          => $i++,
 158         _in_error_                           => $i++,
 159         _in_pod_                             => $i++,
 160         _in_skipped_                         => $i++,
 161         _in_attribute_list_                  => $i++,
 162         _in_quote_                           => $i++,
 163         _quote_target_                       => $i++,
 164         _line_start_quote_                   => $i++,
 165         _starting_level_                     => $i++,
 166         _know_starting_level_                => $i++,
 167         _tabsize_                            => $i++,
 168         _indent_columns_                     => $i++,
 169         _look_for_hash_bang_                 => $i++,
 170         _trim_qw_                            => $i++,
 171         _continuation_indentation_           => $i++,
 172         _outdent_labels_                     => $i++,
 173         _last_line_number_                   => $i++,
 174         _saw_perl_dash_P_                    => $i++,
 175         _saw_perl_dash_w_                    => $i++,
 176         _saw_use_strict_                     => $i++,
 177         _saw_v_string_                       => $i++,
 178         _hit_bug_                            => $i++,
 179         _look_for_autoloader_                => $i++,
 180         _look_for_selfloader_                => $i++,
 181         _saw_autoloader_                     => $i++,
 182         _saw_selfloader_                     => $i++,
 183         _saw_hash_bang_                      => $i++,
 184         _saw_end_                            => $i++,
 185         _saw_data_                           => $i++,
 186         _saw_negative_indentation_           => $i++,
 187         _started_tokenizing_                 => $i++,
 188         _line_buffer_object_                 => $i++,
 189         _debugger_object_                    => $i++,
 190         _diagnostics_object_                 => $i++,
 191         _logger_object_                      => $i++,
 192         _unexpected_error_count_             => $i++,
 193         _started_looking_for_here_target_at_ => $i++,
 194         _nearly_matched_here_target_at_      => $i++,
 195         _line_of_text_                       => $i++,
 196         _rlower_case_labels_at_              => $i++,
 197         _extended_syntax_                    => $i++,
 198         _maximum_level_                      => $i++,
 199         _true_brace_error_count_             => $i++,
 200         _rOpts_maximum_level_errors_         => $i++,
 201         _rOpts_maximum_unexpected_errors_    => $i++,
 202         _rOpts_logfile_                      => $i++,
 203         _rOpts_                              => $i++,
 204     };
 205 }
 206
 207 {    ## closure for subs to count instances
 208
 209     # methods to count instances
 210     my $_count = 0;
 211     sub get_count        { return $_count; }
 212     sub _increment_count { return ++$_count }
 213     sub _decrement_count { return --$_count }
 214 }
 215
 216 sub DESTROY {
 217     my $self = shift;
 218     $self->_decrement_count();
 219     return;
 220 }
 221
 222 sub AUTOLOAD {
 223
 224     # Catch any undefined sub calls so that we are sure to get
 225     # some diagnostic information.  This sub should never be called
 226     # except for a programming error.
 227     our $AUTOLOAD;
 228     return if ( $AUTOLOAD =~ /\bDESTROY$/ );
 229     my ( $pkg, $fname, $lno ) = caller();
 230     my $my_package = __PACKAGE__;
 231     print STDERR <<EOM;
 232 ======================================================================
 233 Error detected in package '$my_package', version $VERSION
 234 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
 235 Called from package: '$pkg'
 236 Called from File '$fname'  at line '$lno'
 237 This error is probably due to a recent programming change
 238 ======================================================================
 239 EOM
 240     exit 1;
 241 } ## end sub AUTOLOAD
 242
 243 sub Die {
 244     my ($msg) = @_;
 245     Perl::Tidy::Die($msg);
 246     croak "unexpected return from Perl::Tidy::Die";
 247 }
 248
 249 sub Fault {
 250     my ($msg) = @_;
 251
 252     # This routine is called for errors that really should not occur
 253     # except if there has been a bug introduced by a recent program change.
 254     # Please add comments at calls to Fault to explain why the call
 255     # should not occur, and where to look to fix it.
 256     my ( $package0, $filename0, $line0, $subroutine0 ) = caller(0);
 257     my ( $package1, $filename1, $line1, $subroutine1 ) = caller(1);
 258     my ( $package2, $filename2, $line2, $subroutine2 ) = caller(2);
 259     my $input_stream_name = get_input_stream_name();
 260
 261     Die(<<EOM);
 262 ==============================================================================
 263 While operating on input stream with name: '$input_stream_name'
 264 A fault was detected at line $line0 of sub '$subroutine1'
 265 in file '$filename1'
 266 which was called from line $line1 of sub '$subroutine2'
 267 Message: '$msg'
 268 This is probably an error introduced by a recent programming change.
 269 Perl::Tidy::Tokenizer.pm reports VERSION='$VERSION'.
 270 ==============================================================================
 271 EOM
 272
 273     # We shouldn't get here, but this return is to keep Perl-Critic from
 274     # complaining.
 275     return;
 276 } ## end sub Fault
 277
 278 sub bad_pattern {
 279
 280     # See if a pattern will compile. We have to use a string eval here,
 281     # but it should be safe because the pattern has been constructed
 282     # by this program.
 283     my ($pattern) = @_;
 284     eval "'##'=~/$pattern/";
 285     return $EVAL_ERROR;
 286 }
 287
 288 sub make_code_skipping_pattern {
 289     my ( $rOpts, $opt_name, $default ) = @_;
 290     my $param = $rOpts->{$opt_name};
 291     unless ($param) { $param = $default }
 292     $param =~ s/^\s*//;    # allow leading spaces to be like format-skipping
 293     if ( $param !~ /^#/ ) {
 294         Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
 295     }
 296     my $pattern = '^\s*' . $param . '\b';
 297     if ( bad_pattern($pattern) ) {
 298         Die(
 299 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
 300         );
 301     }
 302     return $pattern;
 303 } ## end sub make_code_skipping_pattern
 304
 305 sub check_options {
 306
 307     # Check Tokenizer parameters
 308     my $rOpts = shift;
 309
 310     %is_sub = ();
 311     $is_sub{'sub'} = 1;
 312
 313     %is_END_DATA_format_sub = (
 314         '__END__'  => 1,
 315         '__DATA__' => 1,
 316         'format'   => 1,
 317         'sub'      => 1,
 318     );
 319
 320     # Install any aliases to 'sub'
 321     if ( $rOpts->{'sub-alias-list'} ) {
 322
 323         # Note that any 'sub-alias-list' has been preprocessed to
 324         # be a trimmed, space-separated list which includes 'sub'
 325         # for example, it might be 'sub method fun'
 326         my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
 327         foreach my $word (@sub_alias_list) {
 328             $is_sub{$word}                 = 1;
 329             $is_END_DATA_format_sub{$word} = 1;
 330         }
 331     }
 332
 333     %is_grep_alias = ();
 334     if ( $rOpts->{'grep-alias-list'} ) {
 335
 336         # Note that 'grep-alias-list' has been preprocessed to be a trimmed,
 337         # space-separated list
 338         my @q = split /\s+/, $rOpts->{'grep-alias-list'};
 339         @{is_grep_alias}{@q} = (1) x scalar(@q);
 340     }
 341
 342     $rOpts_code_skipping = $rOpts->{'code-skipping'};
 343     $code_skipping_pattern_begin =
 344       make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
 345     $code_skipping_pattern_end =
 346       make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
 347     return;
 348 } ## end sub check_options
 349
 350 sub new {
 351
 352     my ( $class, @args ) = @_;
 353
 354     # Note: 'tabs' and 'indent_columns' are temporary and should be
 355     # removed asap
 356     my %defaults = (
 357         source_object        => undef,
 358         debugger_object      => undef,
 359         diagnostics_object   => undef,
 360         logger_object        => undef,
 361         starting_level       => undef,
 362         indent_columns       => 4,
 363         tabsize              => 8,
 364         look_for_hash_bang   => 0,
 365         trim_qw              => 1,
 366         look_for_autoloader  => 1,
 367         look_for_selfloader  => 1,
 368         starting_line_number => 1,
 369         extended_syntax      => 0,
 370         rOpts                => {},
 371     );
 372     my %args = ( %defaults, @args );
 373
 374     # we are given an object with a get_line() method to supply source lines
 375     my $source_object = $args{source_object};
 376     my $rOpts         = $args{rOpts};
 377
 378     # we create another object with a get_line() and peek_ahead() method
 379     my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
 380
 381     # Tokenizer state data is as follows:
 382     # _rhere_target_list_    reference to list of here-doc targets
 383     # _here_doc_target_      the target string for a here document
 384     # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
 385     #                        to determine if interpolation is done
 386     # _quote_target_         character we seek if chasing a quote
 387     # _line_start_quote_     line where we started looking for a long quote
 388     # _in_here_doc_          flag indicating if we are in a here-doc
 389     # _in_pod_               flag set if we are in pod documentation
 390     # _in_skipped_           flag set if we are in a skipped section
 391     # _in_error_             flag set if we saw severe error (binary in script)
 392     # _in_data_              flag set if we are in __DATA__ section
 393     # _in_end_               flag set if we are in __END__ section
 394     # _in_format_            flag set if we are in a format description
 395     # _in_attribute_list_    flag telling if we are looking for attributes
 396     # _in_quote_             flag telling if we are chasing a quote
 397     # _starting_level_       indentation level of first line
 398     # _line_buffer_object_   object with get_line() method to supply source code
 399     # _diagnostics_object_   place to write debugging information
 400     # _unexpected_error_count_ error count used to limit output
 401     # _lower_case_labels_at_ line numbers where lower case labels seen
 402     # _hit_bug_              program bug detected
 403
 404     my $self = [];
 405     $self->[_rhere_target_list_]        = [];
 406     $self->[_in_here_doc_]              = 0;
 407     $self->[_here_doc_target_]          = EMPTY_STRING;
 408     $self->[_here_quote_character_]     = EMPTY_STRING;
 409     $self->[_in_data_]                  = 0;
 410     $self->[_in_end_]                   = 0;
 411     $self->[_in_format_]                = 0;
 412     $self->[_in_error_]                 = 0;
 413     $self->[_in_pod_]                   = 0;
 414     $self->[_in_skipped_]               = 0;
 415     $self->[_in_attribute_list_]        = 0;
 416     $self->[_in_quote_]                 = 0;
 417     $self->[_quote_target_]             = EMPTY_STRING;
 418     $self->[_line_start_quote_]         = -1;
 419     $self->[_starting_level_]           = $args{starting_level};
 420     $self->[_know_starting_level_]      = defined( $args{starting_level} );
 421     $self->[_tabsize_]                  = $args{tabsize};
 422     $self->[_indent_columns_]           = $args{indent_columns};
 423     $self->[_look_for_hash_bang_]       = $args{look_for_hash_bang};
 424     $self->[_trim_qw_]                  = $args{trim_qw};
 425     $self->[_continuation_indentation_] = $args{continuation_indentation};
 426     $self->[_outdent_labels_]           = $args{outdent_labels};
 427     $self->[_last_line_number_]         = $args{starting_line_number} - 1;
 428     $self->[_saw_perl_dash_P_]          = 0;
 429     $self->[_saw_perl_dash_w_]          = 0;
 430     $self->[_saw_use_strict_]           = 0;
 431     $self->[_saw_v_string_]             = 0;
 432     $self->[_hit_bug_]                  = 0;
 433     $self->[_look_for_autoloader_]      = $args{look_for_autoloader};
 434     $self->[_look_for_selfloader_]      = $args{look_for_selfloader};
 435     $self->[_saw_autoloader_]           = 0;
 436     $self->[_saw_selfloader_]           = 0;
 437     $self->[_saw_hash_bang_]            = 0;
 438     $self->[_saw_end_]                  = 0;
 439     $self->[_saw_data_]                 = 0;
 440     $self->[_saw_negative_indentation_] = 0;
 441     $self->[_started_tokenizing_]       = 0;
 442     $self->[_line_buffer_object_]       = $line_buffer_object;
 443     $self->[_debugger_object_]          = $args{debugger_object};
 444     $self->[_diagnostics_object_]       = $args{diagnostics_object};
 445     $self->[_logger_object_]            = $args{logger_object};
 446     $self->[_unexpected_error_count_]   = 0;
 447     $self->[_started_looking_for_here_target_at_] = 0;
 448     $self->[_nearly_matched_here_target_at_]      = undef;
 449     $self->[_line_of_text_]                       = EMPTY_STRING;
 450     $self->[_rlower_case_labels_at_]              = undef;
 451     $self->[_extended_syntax_]                    = $args{extended_syntax};
 452     $self->[_maximum_level_]                      = 0;
 453     $self->[_true_brace_error_count_]             = 0;
 454     $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
 455     $self->[_rOpts_maximum_unexpected_errors_] =
 456       $rOpts->{'maximum-unexpected-errors'};
 457     $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
 458     $self->[_rOpts_]         = $rOpts;
 459
 460     # These vars are used for guessing indentation and must be positive
 461     $self->[_tabsize_]        = 8 if ( !$self->[_tabsize_] );
 462     $self->[_indent_columns_] = 4 if ( !$self->[_indent_columns_] );
 463
 464     bless $self, $class;
 465
 466     $tokenizer_self = $self;
 467
 468     prepare_for_a_new_file();
 469     find_starting_indentation_level();
 470
 471     # This is not a full class yet, so die if an attempt is made to
 472     # create more than one object.
 473
 474     if ( _increment_count() > 1 ) {
 475         confess
 476 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
 477     }
 478
 479     return $self;
 480
 481 } ## end sub new
 482
 483 # interface to Perl::Tidy::Logger routines
 484 sub warning {
 485     my $msg           = shift;
 486     my $logger_object = $tokenizer_self->[_logger_object_];
 487     if ($logger_object) {
 488         $logger_object->warning($msg);
 489     }
 490     return;
 491 }
 492
 493 sub get_input_stream_name {
 494     my $input_stream_name = EMPTY_STRING;
 495     my $logger_object     = $tokenizer_self->[_logger_object_];
 496     if ($logger_object) {
 497         $input_stream_name = $logger_object->get_input_stream_name();
 498     }
 499     return $input_stream_name;
 500 }
 501
 502 sub complain {
 503     my $msg           = shift;
 504     my $logger_object = $tokenizer_self->[_logger_object_];
 505     if ($logger_object) {
 506         my $input_line_number = $tokenizer_self->[_last_line_number_] + 1;
 507         $msg = "Line $input_line_number: $msg";
 508         $logger_object->complain($msg);
 509     }
 510     return;
 511 } ## end sub complain
 512
 513 sub write_logfile_entry {
 514     my $msg           = shift;
 515     my $logger_object = $tokenizer_self->[_logger_object_];
 516     if ($logger_object) {
 517         $logger_object->write_logfile_entry($msg);
 518     }
 519     return;
 520 }
 521
 522 sub interrupt_logfile {
 523     my $logger_object = $tokenizer_self->[_logger_object_];
 524     if ($logger_object) {
 525         $logger_object->interrupt_logfile();
 526     }
 527     return;
 528 }
 529
 530 sub resume_logfile {
 531     my $logger_object = $tokenizer_self->[_logger_object_];
 532     if ($logger_object) {
 533         $logger_object->resume_logfile();
 534     }
 535     return;
 536 }
 537
 538 sub increment_brace_error {
 539     my $logger_object = $tokenizer_self->[_logger_object_];
 540     if ($logger_object) {
 541         $logger_object->increment_brace_error();
 542     }
 543     return;
 544 }
 545
 546 sub report_definite_bug {
 547     $tokenizer_self->[_hit_bug_] = 1;
 548     my $logger_object = $tokenizer_self->[_logger_object_];
 549     if ($logger_object) {
 550         $logger_object->report_definite_bug();
 551     }
 552     return;
 553 }
 554
 555 sub brace_warning {
 556     my $msg           = shift;
 557     my $logger_object = $tokenizer_self->[_logger_object_];
 558     if ($logger_object) {
 559         $logger_object->brace_warning($msg);
 560     }
 561     return;
 562 }
 563
 564 sub get_saw_brace_error {
 565     my $logger_object = $tokenizer_self->[_logger_object_];
 566     if ($logger_object) {
 567         return $logger_object->get_saw_brace_error();
 568     }
 569     else {
 570         return 0;
 571     }
 572 }
 573
 574 sub get_unexpected_error_count {
 575     my ($self) = @_;
 576     return $self->[_unexpected_error_count_];
 577 }
 578
 579 # interface to Perl::Tidy::Diagnostics routines
 580 sub write_diagnostics {
 581     my $msg = shift;
 582     if ( $tokenizer_self->[_diagnostics_object_] ) {
 583         $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
 584     }
 585     return;
 586 }
 587
 588 sub get_maximum_level {
 589     return $tokenizer_self->[_maximum_level_];
 590 }
 591
 592 sub report_tokenization_errors {
 593
 594     my ($self) = @_;
 595
 596     # Report any tokenization errors and return a flag '$severe_error'.
 597     # Set $severe_error = 1 if the tokenization errors are so severe that
 598     # the formatter should not attempt to format the file. Instead, it will
 599     # just output the file verbatim.
 600
 601     # set severe error flag if tokenizer has encountered file reading problems
 602     # (i.e. unexpected binary characters)
 603     my $severe_error = $self->[_in_error_];
 604
 605     my $maxle = $self->[_rOpts_maximum_level_errors_];
 606     my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
 607     $maxle = 1 unless defined($maxle);
 608     $maxue = 0 unless defined($maxue);
 609
 610     my $level = get_indentation_level();
 611     if ( $level != $tokenizer_self->[_starting_level_] ) {
 612         warning("final indentation level: $level\n");
 613         my $level_diff = $tokenizer_self->[_starting_level_] - $level;
 614         if ( $level_diff < 0 ) { $level_diff = -$level_diff }
 615
 616         # Set severe error flag if the level error is greater than 1.
 617         # The formatter can function for any level error but it is probably
 618         # best not to attempt formatting for a high level error.
 619         if ( $maxle >= 0 && $level_diff > $maxle ) {
 620             $severe_error = 1;
 621             warning(<<EOM);
 622 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
 623 EOM
 624         }
 625     }
 626
 627     check_final_nesting_depths();
 628
 629     # Likewise, large numbers of brace errors usually indicate non-perl
 630     # scripts, so set the severe error flag at a low number.  This is similar
 631     # to the level check, but different because braces may balance but be
 632     # incorrectly interlaced.
 633     if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
 634         $severe_error = 1;
 635     }
 636
 637     if ( $tokenizer_self->[_look_for_hash_bang_]
 638         && !$tokenizer_self->[_saw_hash_bang_] )
 639     {
 640         warning(
 641             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
 642     }
 643
 644     if ( $tokenizer_self->[_in_format_] ) {
 645         warning("hit EOF while in format description\n");
 646     }
 647
 648     if ( $tokenizer_self->[_in_skipped_] ) {
 649         write_logfile_entry(
 650             "hit EOF while in lines skipped with --code-skipping\n");
 651     }
 652
 653     if ( $tokenizer_self->[_in_pod_] ) {
 654
 655         # Just write log entry if this is after __END__ or __DATA__
 656         # because this happens to often, and it is not likely to be
 657         # a parsing error.
 658         if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
 659             write_logfile_entry(
 660 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 661             );
 662         }
 663
 664         else {
 665             complain(
 666 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 667             );
 668         }
 669
 670     }
 671
 672     if ( $tokenizer_self->[_in_here_doc_] ) {
 673         $severe_error = 1;
 674         my $here_doc_target = $tokenizer_self->[_here_doc_target_];
 675         my $started_looking_for_here_target_at =
 676           $tokenizer_self->[_started_looking_for_here_target_at_];
 677         if ($here_doc_target) {
 678             warning(
 679 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
 680             );
 681         }
 682         else {
 683             warning(<<EOM);
 684 Hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string.
 685   (Perl will match to the end of file but this may not be intended).
 686 EOM
 687         }
 688         my $nearly_matched_here_target_at =
 689           $tokenizer_self->[_nearly_matched_here_target_at_];
 690         if ($nearly_matched_here_target_at) {
 691             warning(
 692 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
 693             );
 694         }
 695     }
 696
 697     # Something is seriously wrong if we ended inside a quote
 698     if ( $tokenizer_self->[_in_quote_] ) {
 699         $severe_error = 1;
 700         my $line_start_quote = $tokenizer_self->[_line_start_quote_];
 701         my $quote_target     = $tokenizer_self->[_quote_target_];
 702         my $what =
 703           ( $tokenizer_self->[_in_attribute_list_] )
 704           ? "attribute list"
 705           : "quote/pattern";
 706         warning(
 707 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
 708         );
 709     }
 710
 711     if ( $tokenizer_self->[_hit_bug_] ) {
 712         $severe_error = 1;
 713     }
 714
 715     # Multiple "unexpected" type tokenization errors usually indicate parsing
 716     # non-perl scripts, or that something is seriously wrong, so we should
 717     # avoid formatting them.  This can happen for example if we run perltidy on
 718     # a shell script or an html file.  But unfortunately this check can
 719     # interfere with some extended syntaxes, such as RPerl, so it has to be off
 720     # by default.
 721     my $ue_count = $tokenizer_self->[_unexpected_error_count_];
 722     if ( $maxue > 0 && $ue_count > $maxue ) {
 723         warning(<<EOM);
 724 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
 725 EOM
 726         $severe_error = 1;
 727     }
 728
 729     unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
 730         if ( $] < 5.006 ) {
 731             write_logfile_entry("Suggest including '-w parameter'\n");
 732         }
 733         else {
 734             write_logfile_entry("Suggest including 'use warnings;'\n");
 735         }
 736     }
 737
 738     if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
 739         write_logfile_entry("Use of -P parameter for defines is discouraged\n");
 740     }
 741
 742     unless ( $tokenizer_self->[_saw_use_strict_] ) {
 743         write_logfile_entry("Suggest including 'use strict;'\n");
 744     }
 745
 746     # it is suggested that labels have at least one upper case character
 747     # for legibility and to avoid code breakage as new keywords are introduced
 748     if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
 749         my @lower_case_labels_at =
 750           @{ $tokenizer_self->[_rlower_case_labels_at_] };
 751         write_logfile_entry(
 752             "Suggest using upper case characters in label(s)\n");
 753         local $LIST_SEPARATOR = ')(';
 754         write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
 755     }
 756     return $severe_error;
 757 } ## end sub report_tokenization_errors
 758
 759 sub report_v_string {
 760
 761     # warn if this version can't handle v-strings
 762     my $tok = shift;
 763     unless ( $tokenizer_self->[_saw_v_string_] ) {
 764         $tokenizer_self->[_saw_v_string_] =
 765           $tokenizer_self->[_last_line_number_];
 766     }
 767     if ( $] < 5.006 ) {
 768         warning(
 769 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
 770         );
 771     }
 772     return;
 773 } ## end sub report_v_string
 774
 775 sub is_valid_token_type {
 776     my ($type) = @_;
 777     return $is_valid_token_type{$type};
 778 }
 779
 780 sub get_input_line_number {
 781     return $tokenizer_self->[_last_line_number_];
 782 }
 783
 784 # returns the next tokenized line
 785 sub get_line {
 786
 787     my $self = shift;
 788
 789     # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
 790     # $square_bracket_depth, $paren_depth
 791
 792     my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
 793     $tokenizer_self->[_line_of_text_] = $input_line;
 794
 795     return unless ($input_line);
 796
 797     my $input_line_number = ++$tokenizer_self->[_last_line_number_];
 798
 799     my $write_logfile_entry = sub {
 800         my ($msg) = @_;
 801         write_logfile_entry("Line $input_line_number: $msg");
 802         return;
 803     };
 804
 805     # Find and remove what characters terminate this line, including any
 806     # control r
 807     my $input_line_separator = EMPTY_STRING;
 808     if ( chomp($input_line) ) {
 809         $input_line_separator = $INPUT_RECORD_SEPARATOR;
 810     }
 811
 812     # The first test here very significantly speeds things up, but be sure to
 813     # keep the regex and hash %other_line_endings the same.
 814     if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
 815         if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
 816             $input_line_separator = $2 . $input_line_separator;
 817         }
 818     }
 819
 820     # for backwards compatibility we keep the line text terminated with
 821     # a newline character
 822     $input_line .= "\n";
 823     $tokenizer_self->[_line_of_text_] = $input_line;    # update
 824
 825     # create a data structure describing this line which will be
 826     # returned to the caller.
 827
 828     # _line_type codes are:
 829     #   SYSTEM         - system-specific code before hash-bang line
 830     #   CODE           - line of perl code (including comments)
 831     #   POD_START      - line starting pod, such as '=head'
 832     #   POD            - pod documentation text
 833     #   POD_END        - last line of pod section, '=cut'
 834     #   HERE           - text of here-document
 835     #   HERE_END       - last line of here-doc (target word)
 836     #   FORMAT         - format section
 837     #   FORMAT_END     - last line of format section, '.'
 838     #   SKIP           - code skipping section
 839     #   SKIP_END       - last line of code skipping section, '#>>V'
 840     #   DATA_START     - __DATA__ line
 841     #   DATA           - unidentified text following __DATA__
 842     #   END_START      - __END__ line
 843     #   END            - unidentified text following __END__
 844     #   ERROR          - we are in big trouble, probably not a perl script
 845
 846     # Other variables:
 847     #   _curly_brace_depth     - depth of curly braces at start of line
 848     #   _square_bracket_depth  - depth of square brackets at start of line
 849     #   _paren_depth           - depth of parens at start of line
 850     #   _starting_in_quote     - this line continues a multi-line quote
 851     #                            (so don't trim leading blanks!)
 852     #   _ending_in_quote       - this line ends in a multi-line quote
 853     #                            (so don't trim trailing blanks!)
 854     my $line_of_tokens = {
 855         _line_type                 => 'EOF',
 856         _line_text                 => $input_line,
 857         _line_number               => $input_line_number,
 858         _guessed_indentation_level => 0,
 859         _curly_brace_depth         => $brace_depth,
 860         _square_bracket_depth      => $square_bracket_depth,
 861         _paren_depth               => $paren_depth,
 862         _quote_character           => EMPTY_STRING,
 863 ##        _rtoken_type               => undef,
 864 ##        _rtokens                   => undef,
 865 ##        _rlevels                   => undef,
 866 ##        _rblock_type               => undef,
 867 ##        _rcontainer_type           => undef,
 868 ##        _rcontainer_environment    => undef,
 869 ##        _rtype_sequence            => undef,
 870 ##        _rnesting_tokens           => undef,
 871 ##        _rci_levels                => undef,
 872 ##        _rnesting_blocks           => undef,
 873 ##        _starting_in_quote         => 0,
 874 ##        _ending_in_quote           => 0,
 875     };
 876
 877     # must print line unchanged if we are in a here document
 878     if ( $tokenizer_self->[_in_here_doc_] ) {
 879
 880         $line_of_tokens->{_line_type} = 'HERE';
 881         my $here_doc_target      = $tokenizer_self->[_here_doc_target_];
 882         my $here_quote_character = $tokenizer_self->[_here_quote_character_];
 883         my $candidate_target     = $input_line;
 884         chomp $candidate_target;
 885
 886         # Handle <<~ targets, which are indicated here by a leading space on
 887         # the here quote character
 888         if ( $here_quote_character =~ /^\s/ ) {
 889             $candidate_target =~ s/^\s*//;
 890         }
 891         if ( $candidate_target eq $here_doc_target ) {
 892             $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
 893             $line_of_tokens->{_line_type} = 'HERE_END';
 894             $write_logfile_entry->("Exiting HERE document $here_doc_target\n");
 895
 896             my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
 897             if ( @{$rhere_target_list} ) {  # there can be multiple here targets
 898                 ( $here_doc_target, $here_quote_character ) =
 899                   @{ shift @{$rhere_target_list} };
 900                 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
 901                 $tokenizer_self->[_here_quote_character_] =
 902                   $here_quote_character;
 903                 $write_logfile_entry->(
 904                     "Entering HERE document $here_doc_target\n");
 905                 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
 906                 $tokenizer_self->[_started_looking_for_here_target_at_] =
 907                   $input_line_number;
 908             }
 909             else {
 910                 $tokenizer_self->[_in_here_doc_]          = 0;
 911                 $tokenizer_self->[_here_doc_target_]      = EMPTY_STRING;
 912                 $tokenizer_self->[_here_quote_character_] = EMPTY_STRING;
 913             }
 914         }
 915
 916         # check for error of extra whitespace
 917         # note for PERL6: leading whitespace is allowed
 918         else {
 919             $candidate_target =~ s/\s*$//;
 920             $candidate_target =~ s/^\s*//;
 921             if ( $candidate_target eq $here_doc_target ) {
 922                 $tokenizer_self->[_nearly_matched_here_target_at_] =
 923                   $input_line_number;
 924             }
 925         }
 926         return $line_of_tokens;
 927     }
 928
 929     # Print line unchanged if we are in a format section
 930     elsif ( $tokenizer_self->[_in_format_] ) {
 931
 932         if ( $input_line =~ /^\.[\s#]*$/ ) {
 933
 934             # Decrement format depth count at a '.' after a 'format'
 935             $tokenizer_self->[_in_format_]--;
 936
 937             # This is the end when count reaches 0
 938             if ( !$tokenizer_self->[_in_format_] ) {
 939                 $write_logfile_entry->("Exiting format section\n");
 940                 $line_of_tokens->{_line_type} = 'FORMAT_END';
 941             }
 942         }
 943         else {
 944             $line_of_tokens->{_line_type} = 'FORMAT';
 945             if ( $input_line =~ /^\s*format\s+\w+/ ) {
 946
 947                 # Increment format depth count at a 'format' within a 'format'
 948                 # This is a simple way to handle nested formats (issue c019).
 949                 $tokenizer_self->[_in_format_]++;
 950             }
 951         }
 952         return $line_of_tokens;
 953     }
 954
 955     # must print line unchanged if we are in pod documentation
 956     elsif ( $tokenizer_self->[_in_pod_] ) {
 957
 958         $line_of_tokens->{_line_type} = 'POD';
 959         if ( $input_line =~ /^=cut/ ) {
 960             $line_of_tokens->{_line_type} = 'POD_END';
 961             $write_logfile_entry->("Exiting POD section\n");
 962             $tokenizer_self->[_in_pod_] = 0;
 963         }
 964         if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
 965             warning(
 966                 "Hash-bang in pod can cause older versions of perl to fail! \n"
 967             );
 968         }
 969
 970         return $line_of_tokens;
 971     }
 972
 973     # print line unchanged if in skipped section
 974     elsif ( $tokenizer_self->[_in_skipped_] ) {
 975
 976         $line_of_tokens->{_line_type} = 'SKIP';
 977         if ( $input_line =~ /$code_skipping_pattern_end/ ) {
 978             $line_of_tokens->{_line_type} = 'SKIP_END';
 979             $write_logfile_entry->("Exiting code-skipping section\n");
 980             $tokenizer_self->[_in_skipped_] = 0;
 981         }
 982         return $line_of_tokens;
 983     }
 984
 985     # must print line unchanged if we have seen a severe error (i.e., we
 986     # are seeing illegal tokens and cannot continue.  Syntax errors do
 987     # not pass this route).  Calling routine can decide what to do, but
 988     # the default can be to just pass all lines as if they were after __END__
 989     elsif ( $tokenizer_self->[_in_error_] ) {
 990         $line_of_tokens->{_line_type} = 'ERROR';
 991         return $line_of_tokens;
 992     }
 993
 994     # print line unchanged if we are __DATA__ section
 995     elsif ( $tokenizer_self->[_in_data_] ) {
 996
 997         # ...but look for POD
 998         # Note that the _in_data and _in_end flags remain set
 999         # so that we return to that state after seeing the
1000         # end of a pod section
1001         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1002             $line_of_tokens->{_line_type} = 'POD_START';
1003             $write_logfile_entry->("Entering POD section\n");
1004             $tokenizer_self->[_in_pod_] = 1;
1005             return $line_of_tokens;
1006         }
1007         else {
1008             $line_of_tokens->{_line_type} = 'DATA';
1009             return $line_of_tokens;
1010         }
1011     }
1012
1013     # print line unchanged if we are in __END__ section
1014     elsif ( $tokenizer_self->[_in_end_] ) {
1015
1016         # ...but look for POD
1017         # Note that the _in_data and _in_end flags remain set
1018         # so that we return to that state after seeing the
1019         # end of a pod section
1020         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
1021             $line_of_tokens->{_line_type} = 'POD_START';
1022             $write_logfile_entry->("Entering POD section\n");
1023             $tokenizer_self->[_in_pod_] = 1;
1024             return $line_of_tokens;
1025         }
1026         else {
1027             $line_of_tokens->{_line_type} = 'END';
1028             return $line_of_tokens;
1029         }
1030     }
1031
1032     # check for a hash-bang line if we haven't seen one
1033     if ( !$tokenizer_self->[_saw_hash_bang_] ) {
1034         if ( $input_line =~ /^\#\!.*perl\b/ ) {
1035             $tokenizer_self->[_saw_hash_bang_] = $input_line_number;
1036
1037             # check for -w and -P flags
1038             if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
1039                 $tokenizer_self->[_saw_perl_dash_P_] = 1;
1040             }
1041
1042             if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
1043                 $tokenizer_self->[_saw_perl_dash_w_] = 1;
1044             }
1045
1046             if (
1047                 $input_line_number > 1
1048
1049                 # leave any hash bang in a BEGIN block alone
1050                 # i.e. see 'debugger-duck_type.t'
1051                 && !(
1052                        $last_nonblank_block_type
1053                     && $last_nonblank_block_type eq 'BEGIN'
1054                 )
1055                 && !$tokenizer_self->[_look_for_hash_bang_]
1056
1057                 # Try to avoid giving a false alarm at a simple comment.
1058                 # These look like valid hash-bang lines:
1059
1060                 #!/usr/bin/perl -w
1061                 #!   /usr/bin/perl -w
1062                 #!c:\perl\bin\perl.exe
1063
1064                 # These are comments:
1065                 #! I love perl
1066                 #!  sunos does not yet provide a /usr/bin/perl
1067
1068                 # Comments typically have multiple spaces, which suggests
1069                 # the filter
1070                 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
1071               )
1072             {
1073
1074                 # this is helpful for VMS systems; we may have accidentally
1075                 # tokenized some DCL commands
1076                 if ( $tokenizer_self->[_started_tokenizing_] ) {
1077                     warning(
1078 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
1079                     );
1080                 }
1081                 else {
1082                     complain("Useless hash-bang after line 1\n");
1083                 }
1084             }
1085
1086             # Report the leading hash-bang as a system line
1087             # This will prevent -dac from deleting it
1088             else {
1089                 $line_of_tokens->{_line_type} = 'SYSTEM';
1090                 return $line_of_tokens;
1091             }
1092         }
1093     }
1094
1095     # wait for a hash-bang before parsing if the user invoked us with -x
1096     if ( $tokenizer_self->[_look_for_hash_bang_]
1097         && !$tokenizer_self->[_saw_hash_bang_] )
1098     {
1099         $line_of_tokens->{_line_type} = 'SYSTEM';
1100         return $line_of_tokens;
1101     }
1102
1103     # a first line of the form ': #' will be marked as SYSTEM
1104     # since lines of this form may be used by tcsh
1105     if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
1106         $line_of_tokens->{_line_type} = 'SYSTEM';
1107         return $line_of_tokens;
1108     }
1109
1110     # now we know that it is ok to tokenize the line...
1111     # the line tokenizer will modify any of these private variables:
1112     #        _rhere_target_list_
1113     #        _in_data_
1114     #        _in_end_
1115     #        _in_format_
1116     #        _in_error_
1117     #        _in_skipped_
1118     #        _in_pod_
1119     #        _in_quote_
1120     my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
1121     tokenize_this_line($line_of_tokens);
1122
1123     # Now finish defining the return structure and return it
1124     $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
1125
1126     # handle severe error (binary data in script)
1127     if ( $tokenizer_self->[_in_error_] ) {
1128         $tokenizer_self->[_in_quote_] = 0;    # to avoid any more messages
1129         warning("Giving up after error\n");
1130         $line_of_tokens->{_line_type} = 'ERROR';
1131         reset_indentation_level(0);           # avoid error messages
1132         return $line_of_tokens;
1133     }
1134
1135     # handle start of pod documentation
1136     if ( $tokenizer_self->[_in_pod_] ) {
1137
1138         # This gets tricky..above a __DATA__ or __END__ section, perl
1139         # accepts '=cut' as the start of pod section. But afterwards,
1140         # only pod utilities see it and they may ignore an =cut without
1141         # leading =head.  In any case, this isn't good.
1142         if ( $input_line =~ /^=cut\b/ ) {
1143             if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
1144             {
1145                 complain("=cut while not in pod ignored\n");
1146                 $tokenizer_self->[_in_pod_] = 0;
1147                 $line_of_tokens->{_line_type} = 'POD_END';
1148             }
1149             else {
1150                 $line_of_tokens->{_line_type} = 'POD_START';
1151                 warning(
1152 "=cut starts a pod section .. this can fool pod utilities.\n"
1153                 ) unless (DEVEL_MODE);
1154                 $write_logfile_entry->("Entering POD section\n");
1155             }
1156         }
1157
1158         else {
1159             $line_of_tokens->{_line_type} = 'POD_START';
1160             $write_logfile_entry->("Entering POD section\n");
1161         }
1162
1163         return $line_of_tokens;
1164     }
1165
1166     # handle start of skipped section
1167     if ( $tokenizer_self->[_in_skipped_] ) {
1168
1169         $line_of_tokens->{_line_type} = 'SKIP';
1170         $write_logfile_entry->("Entering code-skipping section\n");
1171         return $line_of_tokens;
1172     }
1173
1174     # see if this line contains here doc targets
1175     my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
1176     if ( @{$rhere_target_list} ) {
1177
1178         my ( $here_doc_target, $here_quote_character ) =
1179           @{ shift @{$rhere_target_list} };
1180         $tokenizer_self->[_in_here_doc_]          = 1;
1181         $tokenizer_self->[_here_doc_target_]      = $here_doc_target;
1182         $tokenizer_self->[_here_quote_character_] = $here_quote_character;
1183         $write_logfile_entry->("Entering HERE document $here_doc_target\n");
1184         $tokenizer_self->[_started_looking_for_here_target_at_] =
1185           $input_line_number;
1186     }
1187
1188     # NOTE: __END__ and __DATA__ statements are written unformatted
1189     # because they can theoretically contain additional characters
1190     # which are not tokenized (and cannot be read with <DATA> either!).
1191     if ( $tokenizer_self->[_in_data_] ) {
1192         $line_of_tokens->{_line_type} = 'DATA_START';
1193         $write_logfile_entry->("Starting __DATA__ section\n");
1194         $tokenizer_self->[_saw_data_] = 1;
1195
1196         # keep parsing after __DATA__ if use SelfLoader was seen
1197         if ( $tokenizer_self->[_saw_selfloader_] ) {
1198             $tokenizer_self->[_in_data_] = 0;
1199             $write_logfile_entry->(
1200                 "SelfLoader seen, continuing; -nlsl deactivates\n");
1201         }
1202
1203         return $line_of_tokens;
1204     }
1205
1206     elsif ( $tokenizer_self->[_in_end_] ) {
1207         $line_of_tokens->{_line_type} = 'END_START';
1208         $write_logfile_entry->("Starting __END__ section\n");
1209         $tokenizer_self->[_saw_end_] = 1;
1210
1211         # keep parsing after __END__ if use AutoLoader was seen
1212         if ( $tokenizer_self->[_saw_autoloader_] ) {
1213             $tokenizer_self->[_in_end_] = 0;
1214             $write_logfile_entry->(
1215                 "AutoLoader seen, continuing; -nlal deactivates\n");
1216         }
1217         return $line_of_tokens;
1218     }
1219
1220     # now, finally, we know that this line is type 'CODE'
1221     $line_of_tokens->{_line_type} = 'CODE';
1222
1223     # remember if we have seen any real code
1224     if (  !$tokenizer_self->[_started_tokenizing_]
1225         && $input_line !~ /^\s*$/
1226         && $input_line !~ /^\s*#/ )
1227     {
1228         $tokenizer_self->[_started_tokenizing_] = 1;
1229     }
1230
1231     if ( $tokenizer_self->[_debugger_object_] ) {
1232         $tokenizer_self->[_debugger_object_]
1233           ->write_debug_entry($line_of_tokens);
1234     }
1235
1236     # Note: if keyword 'format' occurs in this line code, it is still CODE
1237     # (keyword 'format' need not start a line)
1238     if ( $tokenizer_self->[_in_format_] ) {
1239         $write_logfile_entry->("Entering format section\n");
1240     }
1241
1242     if ( $tokenizer_self->[_in_quote_]
1243         and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
1244     {
1245
1246         #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
1247         if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
1248             /^\s*$/ )
1249         {
1250             $tokenizer_self->[_line_start_quote_] = $input_line_number;
1251             $write_logfile_entry->(
1252                 "Start multi-line quote or pattern ending in $quote_target\n");
1253         }
1254     }
1255     elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
1256         && !$tokenizer_self->[_in_quote_] )
1257     {
1258         $tokenizer_self->[_line_start_quote_] = -1;
1259         $write_logfile_entry->("End of multi-line quote or pattern\n");
1260     }
1261
1262     # we are returning a line of CODE
1263     return $line_of_tokens;
1264 } ## end sub get_line
1265
1266 sub find_starting_indentation_level {
1267
1268     # We need to find the indentation level of the first line of the
1269     # script being formatted.  Often it will be zero for an entire file,
1270     # but if we are formatting a local block of code (within an editor for
1271     # example) it may not be zero.  The user may specify this with the
1272     # -sil=n parameter but normally doesn't so we have to guess.
1273     #
1274     # USES GLOBAL VARIABLES: $tokenizer_self
1275     my $starting_level = 0;
1276
1277     # use value if given as parameter
1278     if ( $tokenizer_self->[_know_starting_level_] ) {
1279         $starting_level = $tokenizer_self->[_starting_level_];
1280     }
1281
1282     # if we know there is a hash_bang line, the level must be zero
1283     elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
1284         $tokenizer_self->[_know_starting_level_] = 1;
1285     }
1286
1287     # otherwise figure it out from the input file
1288     else {
1289         my $line;
1290         my $i = 0;
1291
1292         # keep looking at lines until we find a hash bang or piece of code
1293         my $msg = EMPTY_STRING;
1294         while ( $line =
1295             $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
1296         {
1297
1298             # if first line is #! then assume starting level is zero
1299             if ( $i == 1 && $line =~ /^\#\!/ ) {
1300                 $starting_level = 0;
1301                 last;
1302             }
1303             next if ( $line =~ /^\s*#/ );    # skip past comments
1304             next if ( $line =~ /^\s*$/ );    # skip past blank lines
1305             $starting_level = guess_old_indentation_level($line);
1306             last;
1307         }
1308         $msg = "Line $i implies starting-indentation-level = $starting_level\n";
1309         write_logfile_entry("$msg");
1310     }
1311     $tokenizer_self->[_starting_level_] = $starting_level;
1312     reset_indentation_level($starting_level);
1313     return;
1314 } ## end sub find_starting_indentation_level
1315
1316 sub guess_old_indentation_level {
1317     my ($line) = @_;
1318
1319     # Guess the indentation level of an input line.
1320     #
1321     # For the first line of code this result will define the starting
1322     # indentation level.  It will mainly be non-zero when perltidy is applied
1323     # within an editor to a local block of code.
1324     #
1325     # This is an impossible task in general because we can't know what tabs
1326     # meant for the old script and how many spaces were used for one
1327     # indentation level in the given input script.  For example it may have
1328     # been previously formatted with -i=7 -et=3.  But we can at least try to
1329     # make sure that perltidy guesses correctly if it is applied repeatedly to
1330     # a block of code within an editor, so that the block stays at the same
1331     # level when perltidy is applied repeatedly.
1332     #
1333     # USES GLOBAL VARIABLES: $tokenizer_self
1334     my $level = 0;
1335
1336     # find leading tabs, spaces, and any statement label
1337     my $spaces = 0;
1338     if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1339
1340         # If there are leading tabs, we use the tab scheme for this run, if
1341         # any, so that the code will remain stable when editing.
1342         if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
1343
1344         if ($2) { $spaces += length($2) }
1345
1346         # correct for outdented labels
1347         if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
1348             $spaces += $tokenizer_self->[_continuation_indentation_];
1349         }
1350     }
1351
1352     # compute indentation using the value of -i for this run.
1353     # If -i=0 is used for this run (which is possible) it doesn't matter
1354     # what we do here but we'll guess that the old run used 4 spaces per level.
1355     my $indent_columns = $tokenizer_self->[_indent_columns_];
1356     $indent_columns = 4 if ( !$indent_columns );
1357     $level          = int( $spaces / $indent_columns );
1358     return ($level);
1359 } ## end sub guess_old_indentation_level
1360
1361 # This is a currently unused debug routine
1362 sub dump_functions {
1363
1364     my $fh = *STDOUT;
1365     foreach my $pkg ( keys %is_user_function ) {
1366         $fh->print("\nnon-constant subs in package $pkg\n");
1367
1368         foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1369             my $msg = EMPTY_STRING;
1370             if ( $is_block_list_function{$pkg}{$sub} ) {
1371                 $msg = 'block_list';
1372             }
1373
1374             if ( $is_block_function{$pkg}{$sub} ) {
1375                 $msg = 'block';
1376             }
1377             $fh->print("$sub $msg\n");
1378         }
1379     }
1380
1381     foreach my $pkg ( keys %is_constant ) {
1382         $fh->print("\nconstants and constant subs in package $pkg\n");
1383
1384         foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1385             $fh->print("$sub\n");
1386         }
1387     }
1388     return;
1389 } ## end sub dump_functions
1390
1391 sub prepare_for_a_new_file {
1392
1393     # previous tokens needed to determine what to expect next
1394     $last_nonblank_token      = ';';    # the only possible starting state which
1395     $last_nonblank_type       = ';';    # will make a leading brace a code block
1396     $last_nonblank_block_type = EMPTY_STRING;
1397
1398     # scalars for remembering statement types across multiple lines
1399     $statement_type    = EMPTY_STRING;    # '' or 'use' or 'sub..' or 'case..'
1400     $in_attribute_list = 0;
1401
1402     # scalars for remembering where we are in the file
1403     $current_package = "main";
1404     $context         = UNKNOWN_CONTEXT;
1405
1406     # hashes used to remember function information
1407     %is_constant             = ();        # user-defined constants
1408     %is_user_function        = ();        # user-defined functions
1409     %user_function_prototype = ();        # their prototypes
1410     %is_block_function       = ();
1411     %is_block_list_function  = ();
1412     %saw_function_definition = ();
1413     %saw_use_module          = ();
1414
1415     # variables used to track depths of various containers
1416     # and report nesting errors
1417     $paren_depth             = 0;
1418     $brace_depth             = 0;
1419     $square_bracket_depth    = 0;
1420     @current_depth           = (0) x scalar @closing_brace_names;
1421     $total_depth             = 0;
1422     @total_depth             = ();
1423     @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
1424     @current_sequence_number = ();
1425     $next_sequence_number    = 2;    # The value 1 is reserved for SEQ_ROOT
1426
1427     @paren_type                     = ();
1428     @paren_semicolon_count          = ();
1429     @paren_structural_type          = ();
1430     @brace_type                     = ();
1431     @brace_structural_type          = ();
1432     @brace_context                  = ();
1433     @brace_package                  = ();
1434     @square_bracket_type            = ();
1435     @square_bracket_structural_type = ();
1436     @depth_array                    = ();
1437     @nested_ternary_flag            = ();
1438     @nested_statement_type          = ();
1439     @starting_line_of_current_depth = ();
1440
1441     $paren_type[$paren_depth]            = EMPTY_STRING;
1442     $paren_semicolon_count[$paren_depth] = 0;
1443     $paren_structural_type[$brace_depth] = EMPTY_STRING;
1444     $brace_type[$brace_depth] = ';';    # identify opening brace as code block
1445     $brace_structural_type[$brace_depth]                   = EMPTY_STRING;
1446     $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
1447     $brace_package[$paren_depth]                           = $current_package;
1448     $square_bracket_type[$square_bracket_depth]            = EMPTY_STRING;
1449     $square_bracket_structural_type[$square_bracket_depth] = EMPTY_STRING;
1450
1451     initialize_tokenizer_state();
1452     return;
1453 } ## end sub prepare_for_a_new_file
1454
1455 {    ## closure for sub tokenize_this_line
1456
1457     use constant BRACE          => 0;
1458     use constant SQUARE_BRACKET => 1;
1459     use constant PAREN          => 2;
1460     use constant QUESTION_COLON => 3;
1461
1462     # TV1: scalars for processing one LINE.
1463     # Re-initialized on each entry to sub tokenize_this_line.
1464     my (
1465         $block_type,        $container_type,    $expecting,
1466         $i,                 $i_tok,             $input_line,
1467         $input_line_number, $last_nonblank_i,   $max_token_index,
1468         $next_tok,          $next_type,         $peeked_ahead,
1469         $prototype,         $rhere_target_list, $rtoken_map,
1470         $rtoken_type,       $rtokens,           $tok,
1471         $type,              $type_sequence,     $indent_flag,
1472     );
1473
1474     # TV2: refs to ARRAYS for processing one LINE
1475     # Re-initialized on each call.
1476     my $routput_token_list     = [];    # stack of output token indexes
1477     my $routput_token_type     = [];    # token types
1478     my $routput_block_type     = [];    # types of code block
1479     my $routput_container_type = [];    # paren types, such as if, elsif, ..
1480     my $routput_type_sequence  = [];    # nesting sequential number
1481     my $routput_indent_flag    = [];    #
1482
1483     # TV3: SCALARS for quote variables.  These are initialized with a
1484     # subroutine call and continually updated as lines are processed.
1485     my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1486         $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1487
1488     # TV4: SCALARS for multi-line identifiers and
1489     # statements. These are initialized with a subroutine call
1490     # and continually updated as lines are processed.
1491     my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1492
1493     # TV5: SCALARS for tracking indentation level.
1494     # Initialized once and continually updated as lines are
1495     # processed.
1496     my (
1497         $nesting_token_string,      $nesting_type_string,
1498         $nesting_block_string,      $nesting_block_flag,
1499         $nesting_list_string,       $nesting_list_flag,
1500         $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1501         $in_statement_continuation, $level_in_tokenizer,
1502         $slevel_in_tokenizer,       $rslevel_stack,
1503     );
1504
1505     # TV6: SCALARS for remembering several previous
1506     # tokens. Initialized once and continually updated as
1507     # lines are processed.
1508     my (
1509         $last_nonblank_container_type,     $last_nonblank_type_sequence,
1510         $last_last_nonblank_token,         $last_last_nonblank_type,
1511         $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
1512         $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1513     );
1514
1515     # ----------------------------------------------------------------
1516     # beginning of tokenizer variable access and manipulation routines
1517     # ----------------------------------------------------------------
1518
1519     sub initialize_tokenizer_state {
1520
1521         # TV1: initialized on each call
1522         # TV2: initialized on each call
1523         # TV3:
1524         $in_quote                = 0;
1525         $quote_type              = 'Q';
1526         $quote_character         = EMPTY_STRING;
1527         $quote_pos               = 0;
1528         $quote_depth             = 0;
1529         $quoted_string_1         = EMPTY_STRING;
1530         $quoted_string_2         = EMPTY_STRING;
1531         $allowed_quote_modifiers = EMPTY_STRING;
1532
1533         # TV4:
1534         $id_scan_state     = EMPTY_STRING;
1535         $identifier        = EMPTY_STRING;
1536         $want_paren        = EMPTY_STRING;
1537         $indented_if_level = 0;
1538
1539         # TV5:
1540         $nesting_token_string   = EMPTY_STRING;
1541         $nesting_type_string    = EMPTY_STRING;
1542         $nesting_block_string   = '1';            # initially in a block
1543         $nesting_block_flag     = 1;
1544         $nesting_list_string    = '0';            # initially not in a list
1545         $nesting_list_flag      = 0;              # initially not in a list
1546         $ci_string_in_tokenizer = EMPTY_STRING;
1547         $continuation_string_in_tokenizer = "0";
1548         $in_statement_continuation        = 0;
1549         $level_in_tokenizer               = 0;
1550         $slevel_in_tokenizer              = 0;
1551         $rslevel_stack                    = [];
1552
1553         # TV6:
1554         $last_nonblank_container_type      = EMPTY_STRING;
1555         $last_nonblank_type_sequence       = EMPTY_STRING;
1556         $last_last_nonblank_token          = ';';
1557         $last_last_nonblank_type           = ';';
1558         $last_last_nonblank_block_type     = EMPTY_STRING;
1559         $last_last_nonblank_container_type = EMPTY_STRING;
1560         $last_last_nonblank_type_sequence  = EMPTY_STRING;
1561         $last_nonblank_prototype           = EMPTY_STRING;
1562         return;
1563     } ## end sub initialize_tokenizer_state
1564
1565     sub save_tokenizer_state {
1566
1567         my $rTV1 = [
1568             $block_type,        $container_type,    $expecting,
1569             $i,                 $i_tok,             $input_line,
1570             $input_line_number, $last_nonblank_i,   $max_token_index,
1571             $next_tok,          $next_type,         $peeked_ahead,
1572             $prototype,         $rhere_target_list, $rtoken_map,
1573             $rtoken_type,       $rtokens,           $tok,
1574             $type,              $type_sequence,     $indent_flag,
1575         ];
1576
1577         my $rTV2 = [
1578             $routput_token_list,    $routput_token_type,
1579             $routput_block_type,    $routput_container_type,
1580             $routput_type_sequence, $routput_indent_flag,
1581         ];
1582
1583         my $rTV3 = [
1584             $in_quote,        $quote_type,
1585             $quote_character, $quote_pos,
1586             $quote_depth,     $quoted_string_1,
1587             $quoted_string_2, $allowed_quote_modifiers,
1588         ];
1589
1590         my $rTV4 =
1591           [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1592
1593         my $rTV5 = [
1594             $nesting_token_string,      $nesting_type_string,
1595             $nesting_block_string,      $nesting_block_flag,
1596             $nesting_list_string,       $nesting_list_flag,
1597             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1598             $in_statement_continuation, $level_in_tokenizer,
1599             $slevel_in_tokenizer,       $rslevel_stack,
1600         ];
1601
1602         my $rTV6 = [
1603             $last_nonblank_container_type,
1604             $last_nonblank_type_sequence,
1605             $last_last_nonblank_token,
1606             $last_last_nonblank_type,
1607             $last_last_nonblank_block_type,
1608             $last_last_nonblank_container_type,
1609             $last_last_nonblank_type_sequence,
1610             $last_nonblank_prototype,
1611         ];
1612         return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1613     } ## end sub save_tokenizer_state
1614
1615     sub restore_tokenizer_state {
1616         my ($rstate) = @_;
1617         my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1618         (
1619             $block_type,        $container_type,    $expecting,
1620             $i,                 $i_tok,             $input_line,
1621             $input_line_number, $last_nonblank_i,   $max_token_index,
1622             $next_tok,          $next_type,         $peeked_ahead,
1623             $prototype,         $rhere_target_list, $rtoken_map,
1624             $rtoken_type,       $rtokens,           $tok,
1625             $type,              $type_sequence,     $indent_flag,
1626         ) = @{$rTV1};
1627
1628         (
1629             $routput_token_list,    $routput_token_type,
1630             $routput_block_type,    $routput_container_type,
1631             $routput_type_sequence, $routput_indent_flag,
1632         ) = @{$rTV2};
1633
1634         (
1635             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1636             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1637         ) = @{$rTV3};
1638
1639         ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1640           @{$rTV4};
1641
1642         (
1643             $nesting_token_string,      $nesting_type_string,
1644             $nesting_block_string,      $nesting_block_flag,
1645             $nesting_list_string,       $nesting_list_flag,
1646             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1647             $in_statement_continuation, $level_in_tokenizer,
1648             $slevel_in_tokenizer,       $rslevel_stack,
1649         ) = @{$rTV5};
1650
1651         (
1652             $last_nonblank_container_type,
1653             $last_nonblank_type_sequence,
1654             $last_last_nonblank_token,
1655             $last_last_nonblank_type,
1656             $last_last_nonblank_block_type,
1657             $last_last_nonblank_container_type,
1658             $last_last_nonblank_type_sequence,
1659             $last_nonblank_prototype,
1660         ) = @{$rTV6};
1661         return;
1662     } ## end sub restore_tokenizer_state
1663
1664     sub split_pretoken {
1665
1666         my ($numc) = @_;
1667
1668      # Split the leading $numc characters from the current token (at index=$i)
1669      # which is pre-type 'w' and insert the remainder back into the pretoken
1670      # stream with appropriate settings.  Since we are splitting a pre-type 'w',
1671      # there are three cases, depending on if the remainder starts with a digit:
1672      # Case 1: remainder is type 'd', all digits
1673      # Case 2: remainder is type 'd' and type 'w': digits and other characters
1674      # Case 3: remainder is type 'w'
1675
1676         # Examples, for $numc=1:
1677         #   $tok    => $tok_0 $tok_1 $tok_2
1678         #   'x10'   => 'x'    '10'                # case 1
1679         #   'x10if' => 'x'    '10'   'if'         # case 2
1680         #   '0ne    => 'O'            'ne'        # case 3
1681
1682         # where:
1683         #   $tok_1 is a possible string of digits (pre-type 'd')
1684         #   $tok_2 is a possible word (pre-type 'w')
1685
1686         # return 1 if successful
1687         # return undef if error (shouldn't happen)
1688
1689         # Calling routine should update '$type' and '$tok' if successful.
1690
1691         my $pretoken = $rtokens->[$i];
1692         if (   $pretoken
1693             && length($pretoken) > $numc
1694             && substr( $pretoken, $numc ) =~ /^(\d*)(.*)$/ )
1695         {
1696
1697             # Split $tok into up to 3 tokens:
1698             my $tok_0 = substr( $pretoken, 0, $numc );
1699             my $tok_1 = defined($1) ? $1 : EMPTY_STRING;
1700             my $tok_2 = defined($2) ? $2 : EMPTY_STRING;
1701
1702             my $len_0 = length($tok_0);
1703             my $len_1 = length($tok_1);
1704             my $len_2 = length($tok_2);
1705
1706             my $pre_type_0 = 'w';
1707             my $pre_type_1 = 'd';
1708             my $pre_type_2 = 'w';
1709
1710             my $pos_0 = $rtoken_map->[$i];
1711             my $pos_1 = $pos_0 + $len_0;
1712             my $pos_2 = $pos_1 + $len_1;
1713
1714             my $isplice = $i + 1;
1715
1716             # Splice in any digits
1717             if ($len_1) {
1718                 splice @{$rtoken_map},  $isplice, 0, $pos_1;
1719                 splice @{$rtokens},     $isplice, 0, $tok_1;
1720                 splice @{$rtoken_type}, $isplice, 0, $pre_type_1;
1721                 $max_token_index++;
1722                 $isplice++;
1723             }
1724
1725             # Splice in any trailing word
1726             if ($len_2) {
1727                 splice @{$rtoken_map},  $isplice, 0, $pos_2;
1728                 splice @{$rtokens},     $isplice, 0, $tok_2;
1729                 splice @{$rtoken_type}, $isplice, 0, $pre_type_2;
1730                 $max_token_index++;
1731             }
1732
1733             $rtokens->[$i] = $tok_0;
1734             return 1;
1735         }
1736         else {
1737
1738             # Shouldn't get here
1739             if (DEVEL_MODE) {
1740                 Fault(<<EOM);
1741 While working near line number $input_line_number, bad arg '$tok' passed to sub split_pretoken()
1742 EOM
1743             }
1744         }
1745         return;
1746     } ## end sub split_pretoken
1747
1748     sub get_indentation_level {
1749
1750         # patch to avoid reporting error if indented if is not terminated
1751         if ($indented_if_level) { return $level_in_tokenizer - 1 }
1752         return $level_in_tokenizer;
1753     }
1754
1755     sub reset_indentation_level {
1756         $level_in_tokenizer = $slevel_in_tokenizer = shift;
1757         push @{$rslevel_stack}, $slevel_in_tokenizer;
1758         return;
1759     }
1760
1761     sub peeked_ahead {
1762         my $flag = shift;
1763         $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1764         return $peeked_ahead;
1765     }
1766
1767     # ------------------------------------------------------------
1768     # end of tokenizer variable access and manipulation routines
1769     # ------------------------------------------------------------
1770
1771     #------------------------------
1772     # beginning of tokenizer hashes
1773     #------------------------------
1774
1775     my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
1776
1777     # These block types terminate statements and do not need a trailing
1778     # semicolon
1779     # patched for SWITCH/CASE/
1780     my %is_zero_continuation_block_type;
1781     my @q;
1782     @q = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
1783       if elsif else unless while until for foreach switch case given when);
1784     @is_zero_continuation_block_type{@q} = (1) x scalar(@q);
1785
1786     my %is_logical_container;
1787     @q = qw(if elsif unless while and or err not && !  || for foreach);
1788     @is_logical_container{@q} = (1) x scalar(@q);
1789
1790     my %is_binary_type;
1791     @q = qw(|| &&);
1792     @is_binary_type{@q} = (1) x scalar(@q);
1793
1794     my %is_binary_keyword;
1795     @q = qw(and or err eq ne cmp);
1796     @is_binary_keyword{@q} = (1) x scalar(@q);
1797
1798     # 'L' is token for opening { at hash key
1799     my %is_opening_type;
1800     @q = qw< L { ( [ >;
1801     @is_opening_type{@q} = (1) x scalar(@q);
1802
1803     # 'R' is token for closing } at hash key
1804     my %is_closing_type;
1805     @q = qw< R } ) ] >;
1806     @is_closing_type{@q} = (1) x scalar(@q);
1807
1808     my %is_redo_last_next_goto;
1809     @q = qw(redo last next goto);
1810     @is_redo_last_next_goto{@q} = (1) x scalar(@q);
1811
1812     my %is_use_require;
1813     @q = qw(use require);
1814     @is_use_require{@q} = (1) x scalar(@q);
1815
1816     # This hash holds the array index in $tokenizer_self for these keywords:
1817     # Fix for issue c035: removed 'format' from this hash
1818     my %is_END_DATA = (
1819         '__END__'  => _in_end_,
1820         '__DATA__' => _in_data_,
1821     );
1822
1823     my %is_list_end_type;
1824     @q = qw( ; { } );
1825     push @q, ',';
1826     @is_list_end_type{@q} = (1) x scalar(@q);
1827
1828     # original ref: camel 3 p 147,
1829     # but perl may accept undocumented flags
1830     # perl 5.10 adds 'p' (preserve)
1831     # Perl version 5.22 added 'n'
1832     # From http://perldoc.perl.org/perlop.html we have
1833     # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
1834     # s/PATTERN/REPLACEMENT/msixpodualngcer
1835     # y/SEARCHLIST/REPLACEMENTLIST/cdsr
1836     # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
1837     # qr/STRING/msixpodualn
1838     my %quote_modifiers = (
1839         's'  => '[msixpodualngcer]',
1840         'y'  => '[cdsr]',
1841         'tr' => '[cdsr]',
1842         'm'  => '[msixpodualngc]',
1843         'qr' => '[msixpodualn]',
1844         'q'  => EMPTY_STRING,
1845         'qq' => EMPTY_STRING,
1846         'qw' => EMPTY_STRING,
1847         'qx' => EMPTY_STRING,
1848     );
1849
1850     # table showing how many quoted things to look for after quote operator..
1851     # s, y, tr have 2 (pattern and replacement)
1852     # others have 1 (pattern only)
1853     my %quote_items = (
1854         's'  => 2,
1855         'y'  => 2,
1856         'tr' => 2,
1857         'm'  => 1,
1858         'qr' => 1,
1859         'q'  => 1,
1860         'qq' => 1,
1861         'qw' => 1,
1862         'qx' => 1,
1863     );
1864
1865     my %is_for_foreach;
1866     @_ = qw(for foreach);
1867     @is_for_foreach{@_} = (1) x scalar(@_);
1868
1869     my %is_my_our_state;
1870     @_ = qw(my our state);
1871     @is_my_our_state{@_} = (1) x scalar(@_);
1872
1873     # These keywords may introduce blocks after parenthesized expressions,
1874     # in the form:
1875     # keyword ( .... ) { BLOCK }
1876     # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1877     my %is_blocktype_with_paren;
1878     @_ =
1879       qw(if elsif unless while until for foreach switch case given when catch);
1880     @is_blocktype_with_paren{@_} = (1) x scalar(@_);
1881
1882     my %is_case_default;
1883     @_ = qw(case default);
1884     @is_case_default{@_} = (1) x scalar(@_);
1885
1886     #------------------------
1887     # end of tokenizer hashes
1888     #------------------------
1889
1890     # ------------------------------------------------------------
1891     # beginning of various scanner interface routines
1892     # ------------------------------------------------------------
1893     sub scan_replacement_text {
1894
1895         # check for here-docs in replacement text invoked by
1896         # a substitution operator with executable modifier 'e'.
1897         #
1898         # given:
1899         #  $replacement_text
1900         # return:
1901         #  $rht = reference to any here-doc targets
1902         my ($replacement_text) = @_;
1903
1904         # quick check
1905         return unless ( $replacement_text =~ /<</ );
1906
1907         write_logfile_entry("scanning replacement text for here-doc targets\n");
1908
1909         # save the logger object for error messages
1910         my $logger_object = $tokenizer_self->[_logger_object_];
1911
1912         # localize all package variables
1913         local (
1914             $tokenizer_self,                 $last_nonblank_token,
1915             $last_nonblank_type,             $last_nonblank_block_type,
1916             $statement_type,                 $in_attribute_list,
1917             $current_package,                $context,
1918             %is_constant,                    %is_user_function,
1919             %user_function_prototype,        %is_block_function,
1920             %is_block_list_function,         %saw_function_definition,
1921             $brace_depth,                    $paren_depth,
1922             $square_bracket_depth,           @current_depth,
1923             @total_depth,                    $total_depth,
1924             @nesting_sequence_number,        @current_sequence_number,
1925             @paren_type,                     @paren_semicolon_count,
1926             @paren_structural_type,          @brace_type,
1927             @brace_structural_type,          @brace_context,
1928             @brace_package,                  @square_bracket_type,
1929             @square_bracket_structural_type, @depth_array,
1930             @starting_line_of_current_depth, @nested_ternary_flag,
1931             @nested_statement_type,          $next_sequence_number,
1932         );
1933
1934         # save all lexical variables
1935         my $rstate = save_tokenizer_state();
1936         _decrement_count();    # avoid error check for multiple tokenizers
1937
1938         # make a new tokenizer
1939         my $rOpts = {};
1940         my $rpending_logfile_message;
1941         my $source_object = Perl::Tidy::LineSource->new(
1942             input_file               => \$replacement_text,
1943             rOpts                    => $rOpts,
1944             rpending_logfile_message => $rpending_logfile_message,
1945         );
1946         my $tokenizer = Perl::Tidy::Tokenizer->new(
1947             source_object        => $source_object,
1948             logger_object        => $logger_object,
1949             starting_line_number => $input_line_number,
1950         );
1951
1952         # scan the replacement text
1953         1 while ( $tokenizer->get_line() );
1954
1955         # remove any here doc targets
1956         my $rht = undef;
1957         if ( $tokenizer_self->[_in_here_doc_] ) {
1958             $rht = [];
1959             push @{$rht},
1960               [
1961                 $tokenizer_self->[_here_doc_target_],
1962                 $tokenizer_self->[_here_quote_character_]
1963               ];
1964             if ( $tokenizer_self->[_rhere_target_list_] ) {
1965                 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
1966                 $tokenizer_self->[_rhere_target_list_] = undef;
1967             }
1968             $tokenizer_self->[_in_here_doc_] = undef;
1969         }
1970
1971         # now its safe to report errors
1972         my $severe_error = $tokenizer->report_tokenization_errors();
1973
1974         # TODO: Could propagate a severe error up
1975
1976         # restore all tokenizer lexical variables
1977         restore_tokenizer_state($rstate);
1978
1979         # return the here doc targets
1980         return $rht;
1981     } ## end sub scan_replacement_text
1982
1983     sub scan_bare_identifier {
1984         ( $i, $tok, $type, $prototype ) =
1985           scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1986             $rtoken_map, $max_token_index );
1987         return;
1988     }
1989
1990     sub scan_identifier {
1991         (
1992             $i, $tok, $type, $id_scan_state, $identifier,
1993             my $split_pretoken_flag
1994           )
1995           = scan_complex_identifier( $i, $id_scan_state, $identifier, $rtokens,
1996             $max_token_index, $expecting, $paren_type[$paren_depth] );
1997
1998         # Check for signal to fix a special variable adjacent to a keyword,
1999         # such as '$^One$0'.
2000         if ($split_pretoken_flag) {
2001
2002             # Try to fix it by splitting the pretoken
2003             if (   $i > 0
2004                 && $rtokens->[ $i - 1 ] eq '^'
2005                 && split_pretoken(1) )
2006             {
2007                 $identifier = substr( $identifier, 0, 3 );
2008                 $tok        = $identifier;
2009             }
2010             else {
2011
2012                 # This shouldn't happen ...
2013                 my $var    = substr( $tok, 0, 3 );
2014                 my $excess = substr( $tok, 3 );
2015                 interrupt_logfile();
2016                 warning(<<EOM);
2017 $input_line_number: Trouble parsing at characters '$excess' after special variable '$var'.
2018 A space may be needed after '$var'.
2019 EOM
2020                 resume_logfile();
2021             }
2022         }
2023         return;
2024     } ## end sub scan_identifier
2025
2026     use constant VERIFY_FASTSCAN => 0;
2027     my %fast_scan_context;
2028
2029     BEGIN {
2030         %fast_scan_context = (
2031             '$' => SCALAR_CONTEXT,
2032             '*' => SCALAR_CONTEXT,
2033             '@' => LIST_CONTEXT,
2034             '%' => LIST_CONTEXT,
2035             '&' => UNKNOWN_CONTEXT,
2036         );
2037     }
2038
2039     sub scan_simple_identifier {
2040
2041         # This is a wrapper for sub scan_identifier. It does a fast preliminary
2042         # scan for certain common identifiers:
2043         #   '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
2044         # If it does not find one of these, or this is a restart, it calls the
2045         # original scanner directly.
2046
2047         # This gives the same results as the full scanner in about 1/4 the
2048         # total runtime for a typical input stream.
2049
2050         my $i_begin   = $i;
2051         my $tok_begin = $tok;
2052         my $fast_scan_type;
2053
2054         ###############################
2055         # quick scan with leading sigil
2056         ###############################
2057         if (  !$id_scan_state
2058             && $i + 1 <= $max_token_index
2059             && $fast_scan_context{$tok} )
2060         {
2061             $context = $fast_scan_context{$tok};
2062
2063             # look for $var, @var, ...
2064             if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
2065                 my $pretype_next = EMPTY_STRING;
2066                 my $i_next       = $i + 2;
2067                 if ( $i_next <= $max_token_index ) {
2068                     if (   $rtoken_type->[$i_next] eq 'b'
2069                         && $i_next < $max_token_index )
2070                     {
2071                         $i_next += 1;
2072                     }
2073                     $pretype_next = $rtoken_type->[$i_next];
2074                 }
2075                 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
2076
2077                     # Found type 'i' like '$var', '@var', or '%var'
2078                     $identifier     = $tok . $rtokens->[ $i + 1 ];
2079                     $tok            = $identifier;
2080                     $type           = 'i';
2081                     $i              = $i + 1;
2082                     $fast_scan_type = $type;
2083                 }
2084             }
2085
2086             # Look for @{ or %{  .
2087             # But we must let the full scanner handle things ${ because it may
2088             # keep going to get a complete identifier like '${#}'  .
2089             elsif (
2090                 $rtoken_type->[ $i + 1 ] eq '{'
2091                 && (   $tok_begin eq '@'
2092                     || $tok_begin eq '%' )
2093               )
2094             {
2095
2096                 $identifier     = $tok;
2097                 $type           = 't';
2098                 $fast_scan_type = $type;
2099             }
2100         }
2101
2102         ############################
2103         # Quick scan with leading ->
2104         # Look for ->[ and ->{
2105         ############################
2106         elsif (
2107                $tok eq '->'
2108             && $i < $max_token_index
2109             && (   $rtokens->[ $i + 1 ] eq '{'
2110                 || $rtokens->[ $i + 1 ] eq '[' )
2111           )
2112         {
2113             $type           = $tok;
2114             $fast_scan_type = $type;
2115             $identifier     = $tok;
2116             $context        = UNKNOWN_CONTEXT;
2117         }
2118
2119         #######################################
2120         # Verify correctness during development
2121         #######################################
2122         if ( VERIFY_FASTSCAN && $fast_scan_type ) {
2123
2124             # We will call the full method
2125             my $identifier_simple = $identifier;
2126             my $tok_simple        = $tok;
2127             my $i_simple          = $i;
2128             my $context_simple    = $context;
2129
2130             $tok = $tok_begin;
2131             $i   = $i_begin;
2132             scan_identifier();
2133
2134             if (   $tok ne $tok_simple
2135                 || $type ne $fast_scan_type
2136                 || $i != $i_simple
2137                 || $identifier ne $identifier_simple
2138                 || $id_scan_state
2139                 || $context ne $context_simple )
2140             {
2141                 print STDERR <<EOM;
2142 scan_simple_identifier differs from scan_identifier:
2143 simple:  i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
2144 full:    i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
2145 EOM
2146             }
2147         }
2148
2149         ###################################################
2150         # call full scanner if fast method did not succeed
2151         ###################################################
2152         if ( !$fast_scan_type ) {
2153             scan_identifier();
2154         }
2155         return;
2156     } ## end sub scan_simple_identifier
2157
2158     sub scan_id {
2159         ( $i, $tok, $type, $id_scan_state ) =
2160           scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
2161             $id_scan_state, $max_token_index );
2162         return;
2163     }
2164
2165     sub scan_number {
2166         my $number;
2167         ( $i, $type, $number ) =
2168           scan_number_do( $input_line, $i, $rtoken_map, $type,
2169             $max_token_index );
2170         return $number;
2171     }
2172
2173     use constant VERIFY_FASTNUM => 0;
2174
2175     sub scan_number_fast {
2176
2177         # This is a wrapper for sub scan_number. It does a fast preliminary
2178         # scan for a simple integer.  It calls the original scan_number if it
2179         # does not find one.
2180
2181         my $i_begin   = $i;
2182         my $tok_begin = $tok;
2183         my $number;
2184
2185         ##################################
2186         # Quick check for (signed) integer
2187         ##################################
2188
2189         # This will be the string of digits:
2190         my $i_d   = $i;
2191         my $tok_d = $tok;
2192         my $typ_d = $rtoken_type->[$i_d];
2193
2194         # check for signed integer
2195         my $sign = EMPTY_STRING;
2196         if (   $typ_d ne 'd'
2197             && ( $typ_d eq '+' || $typ_d eq '-' )
2198             && $i_d < $max_token_index )
2199         {
2200             $sign = $tok_d;
2201             $i_d++;
2202             $tok_d = $rtokens->[$i_d];
2203             $typ_d = $rtoken_type->[$i_d];
2204         }
2205
2206         # Handle integers
2207         if (
2208             $typ_d eq 'd'
2209             && (
2210                 $i_d == $max_token_index
2211                 || (   $i_d < $max_token_index
2212                     && $rtoken_type->[ $i_d + 1 ] ne '.'
2213                     && $rtoken_type->[ $i_d + 1 ] ne 'w' )
2214             )
2215           )
2216         {
2217             # Let let full scanner handle multi-digit integers beginning with
2218             # '0' because there could be error messages.  For example, '009' is
2219             # not a valid number.
2220
2221             if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
2222                 $number = $sign . $tok_d;
2223                 $type   = 'n';
2224                 $i      = $i_d;
2225             }
2226         }
2227
2228         #######################################
2229         # Verify correctness during development
2230         #######################################
2231         if ( VERIFY_FASTNUM && defined($number) ) {
2232
2233             # We will call the full method
2234             my $type_simple   = $type;
2235             my $i_simple      = $i;
2236             my $number_simple = $number;
2237
2238             $tok    = $tok_begin;
2239             $i      = $i_begin;
2240             $number = scan_number();
2241
2242             if (   $type ne $type_simple
2243                 || ( $i != $i_simple && $i <= $max_token_index )
2244                 || $number ne $number_simple )
2245             {
2246                 print STDERR <<EOM;
2247 scan_number_fast differs from scan_number:
2248 simple:  i=$i_simple, type=$type_simple, number=$number_simple
2249 full:  i=$i, type=$type, number=$number
2250 EOM
2251             }
2252         }
2253
2254         #########################################
2255         # call full scanner if may not be integer
2256         #########################################
2257         if ( !defined($number) ) {
2258             $number = scan_number();
2259         }
2260         return $number;
2261     } ## end sub scan_number_fast
2262
2263     # a sub to warn if token found where term expected
2264     sub error_if_expecting_TERM {
2265         if ( $expecting == TERM ) {
2266             if ( $really_want_term{$last_nonblank_type} ) {
2267                 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
2268                     $rtoken_map, $rtoken_type, $input_line );
2269                 return 1;
2270             }
2271         }
2272         return;
2273     } ## end sub error_if_expecting_TERM
2274
2275     # a sub to warn if token found where operator expected
2276     sub error_if_expecting_OPERATOR {
2277         my $thing = shift;
2278         if ( $expecting == OPERATOR ) {
2279             if ( !defined($thing) ) { $thing = $tok }
2280             report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
2281                 $rtoken_map, $rtoken_type, $input_line );
2282             if ( $i_tok == 0 ) {
2283                 interrupt_logfile();
2284                 warning("Missing ';' or ',' above?\n");
2285                 resume_logfile();
2286             }
2287             return 1;
2288         }
2289         return;
2290     } ## end sub error_if_expecting_OPERATOR
2291
2292     # ------------------------------------------------------------
2293     # end scanner interfaces
2294     # ------------------------------------------------------------
2295
2296     #------------------
2297     # Tokenization subs
2298     #------------------
2299     sub do_GREATER_THAN_SIGN {
2300
2301         # '>'
2302         error_if_expecting_TERM()
2303           if ( $expecting == TERM );
2304         return;
2305     }
2306
2307     sub do_VERTICAL_LINE {
2308
2309         # '|'
2310         error_if_expecting_TERM()
2311           if ( $expecting == TERM );
2312         return;
2313     }
2314
2315     sub do_DOLLAR_SIGN {
2316
2317         # '$'
2318         # start looking for a scalar
2319         error_if_expecting_OPERATOR("Scalar")
2320           if ( $expecting == OPERATOR );
2321         scan_simple_identifier();
2322
2323         if ( $identifier eq '$^W' ) {
2324             $tokenizer_self->[_saw_perl_dash_w_] = 1;
2325         }
2326
2327         # Check for identifier in indirect object slot
2328         # (vorboard.pl, sort.t).  Something like:
2329         #   /^(print|printf|sort|exec|system)$/
2330         if (
2331             $is_indirect_object_taker{$last_nonblank_token}
2332             || ( ( $last_nonblank_token eq '(' )
2333                 && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
2334             || (   $last_nonblank_type eq 'w'
2335                 || $last_nonblank_type eq 'U' )    # possible object
2336           )
2337         {
2338
2339             # An identifier followed by '->' is not indirect object;
2340             # fixes b1175, b1176
2341             my ( $next_nonblank_type, $i_next ) =
2342               find_next_noncomment_type( $i, $rtokens, $max_token_index );
2343             $type = 'Z' if ( $next_nonblank_type ne '->' );
2344         }
2345         return;
2346     } ## end sub do_DOLLAR_SIGN
2347
2348     sub do_LEFT_PARENTHESIS {
2349
2350         # '('
2351         ++$paren_depth;
2352         $paren_semicolon_count[$paren_depth] = 0;
2353         if ($want_paren) {
2354             $container_type = $want_paren;
2355             $want_paren     = EMPTY_STRING;
2356         }
2357         elsif ( $statement_type =~ /^sub\b/ ) {
2358             $container_type = $statement_type;
2359         }
2360         else {
2361             $container_type = $last_nonblank_token;
2362
2363             # We can check for a syntax error here of unexpected '(',
2364             # but this is going to get messy...
2365             if (
2366                 $expecting == OPERATOR
2367
2368                 # Be sure this is not a method call of the form
2369                 # &method(...), $method->(..), &{method}(...),
2370                 # $ref[2](list) is ok & short for $ref[2]->(list)
2371                 # NOTE: at present, braces in something like &{ xxx }
2372                 # are not marked as a block, we might have a method call.
2373                 # Added ')' to fix case c017, something like ()()()
2374                 && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
2375
2376               )
2377             {
2378
2379                 # ref: camel 3 p 703.
2380                 if ( $last_last_nonblank_token eq 'do' ) {
2381                     complain(
2382 "do SUBROUTINE is deprecated; consider & or -> notation\n"
2383                     );
2384                 }
2385                 else {
2386
2387                     # if this is an empty list, (), then it is not an
2388                     # error; for example, we might have a constant pi and
2389                     # invoke it with pi() or just pi;
2390                     my ( $next_nonblank_token, $i_next ) =
2391                       find_next_nonblank_token( $i, $rtokens,
2392                         $max_token_index );
2393
2394                     # Patch for c029: give up error check if
2395                     # a side comment follows
2396                     if (   $next_nonblank_token ne ')'
2397                         && $next_nonblank_token ne '#' )
2398                     {
2399                         my $hint;
2400
2401                         error_if_expecting_OPERATOR('(');
2402
2403                         if ( $last_nonblank_type eq 'C' ) {
2404                             $hint =
2405                               "$last_nonblank_token has a void prototype\n";
2406                         }
2407                         elsif ( $last_nonblank_type eq 'i' ) {
2408                             if (   $i_tok > 0
2409                                 && $last_nonblank_token =~ /^\$/ )
2410                             {
2411                                 $hint =
2412                                   "Do you mean '$last_nonblank_token->(' ?\n";
2413                             }
2414                         }
2415                         if ($hint) {
2416                             interrupt_logfile();
2417                             warning($hint);
2418                             resume_logfile();
2419                         }
2420                     } ## end if ( $next_nonblank_token...
2421                 } ## end else [ if ( $last_last_nonblank_token...
2422             } ## end if ( $expecting == OPERATOR...
2423         }
2424         $paren_type[$paren_depth] = $container_type;
2425         ( $type_sequence, $indent_flag ) =
2426           increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2427
2428         # propagate types down through nested parens
2429         # for example: the second paren in 'if ((' would be structural
2430         # since the first is.
2431
2432         if ( $last_nonblank_token eq '(' ) {
2433             $type = $last_nonblank_type;
2434         }
2435
2436         #     We exclude parens as structural after a ',' because it
2437         #     causes subtle problems with continuation indentation for
2438         #     something like this, where the first 'or' will not get
2439         #     indented.
2440         #
2441         #         assert(
2442         #             __LINE__,
2443         #             ( not defined $check )
2444         #               or ref $check
2445         #               or $check eq "new"
2446         #               or $check eq "old",
2447         #         );
2448         #
2449         #     Likewise, we exclude parens where a statement can start
2450         #     because of problems with continuation indentation, like
2451         #     these:
2452         #
2453         #         ($firstline =~ /^#\!.*perl/)
2454         #         and (print $File::Find::name, "\n")
2455         #           and (return 1);
2456         #
2457         #         (ref($usage_fref) =~ /CODE/)
2458         #         ? &$usage_fref
2459         #           : (&blast_usage, &blast_params, &blast_general_params);
2460
2461         else {
2462             $type = '{';
2463         }
2464
2465         if ( $last_nonblank_type eq ')' ) {
2466             warning(
2467                 "Syntax error? found token '$last_nonblank_type' then '('\n");
2468         }
2469         $paren_structural_type[$paren_depth] = $type;
2470         return;
2471
2472     } ## end sub do_LEFT_PARENTHESIS
2473
2474     sub do_RIGHT_PARENTHESIS {
2475
2476         # ')'
2477         ( $type_sequence, $indent_flag ) =
2478           decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
2479
2480         if ( $paren_structural_type[$paren_depth] eq '{' ) {
2481             $type = '}';
2482         }
2483
2484         $container_type = $paren_type[$paren_depth];
2485
2486         # restore statement type as 'sub' at closing paren of a signature
2487         # so that a subsequent ':' is identified as an attribute
2488         if ( $container_type =~ /^sub\b/ ) {
2489             $statement_type = $container_type;
2490         }
2491
2492         #    /^(for|foreach)$/
2493         if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
2494             my $num_sc = $paren_semicolon_count[$paren_depth];
2495             if ( $num_sc > 0 && $num_sc != 2 ) {
2496                 warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
2497             }
2498         }
2499
2500         if ( $paren_depth > 0 ) { $paren_depth-- }
2501         return;
2502     } ## end sub do_RIGHT_PARENTHESIS
2503
2504     sub do_COMMA {
2505
2506         # ','
2507         if ( $last_nonblank_type eq ',' ) {
2508             complain("Repeated ','s \n");
2509         }
2510
2511         # Note that we have to check both token and type here because a
2512         # comma following a qw list can have last token='(' but type = 'q'
2513         elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' ) {
2514             warning("Unexpected leading ',' after a '('\n");
2515         }
2516
2517         # patch for operator_expected: note if we are in the list (use.t)
2518         if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2519         return;
2520
2521     } ## end sub do_COMMA
2522
2523     sub do_SEMICOLON {
2524
2525         # ';'
2526         $context        = UNKNOWN_CONTEXT;
2527         $statement_type = EMPTY_STRING;
2528         $want_paren     = EMPTY_STRING;
2529
2530         #    /^(for|foreach)$/
2531         if ( $is_for_foreach{ $paren_type[$paren_depth] } )
2532         {    # mark ; in for loop
2533
2534             # Be careful: we do not want a semicolon such as the
2535             # following to be included:
2536             #
2537             #    for (sort {strcoll($a,$b);} keys %investments) {
2538
2539             if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
2540                 && $square_bracket_depth ==
2541                 $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
2542             {
2543
2544                 $type = 'f';
2545                 $paren_semicolon_count[$paren_depth]++;
2546             }
2547         }
2548         return;
2549     } ## end sub do_SEMICOLON
2550
2551     sub do_QUOTATION_MARK {
2552
2553         # '"'
2554         error_if_expecting_OPERATOR("String")
2555           if ( $expecting == OPERATOR );
2556         $in_quote                = 1;
2557         $type                    = 'Q';
2558         $allowed_quote_modifiers = EMPTY_STRING;
2559         return;
2560     } ## end sub do_QUOTATION_MARK
2561
2562     sub do_APOSTROPHE {
2563
2564         # "'"
2565         error_if_expecting_OPERATOR("String")
2566           if ( $expecting == OPERATOR );
2567         $in_quote                = 1;
2568         $type                    = 'Q';
2569         $allowed_quote_modifiers = EMPTY_STRING;
2570         return;
2571     } ## end sub do_APOSTROPHE
2572
2573     sub do_BACKTICK {
2574
2575         # '`'
2576         error_if_expecting_OPERATOR("String")
2577           if ( $expecting == OPERATOR );
2578         $in_quote                = 1;
2579         $type                    = 'Q';
2580         $allowed_quote_modifiers = EMPTY_STRING;
2581         return;
2582     } ## end sub do_BACKTICK
2583
2584     sub do_SLASH {
2585
2586         # '/'
2587         my $is_pattern;
2588
2589         # a pattern cannot follow certain keywords which take optional
2590         # arguments, like 'shift' and 'pop'. See also '?'.
2591         if (
2592             $last_nonblank_type eq 'k'
2593             && $is_keyword_rejecting_slash_as_pattern_delimiter{
2594                 $last_nonblank_token}
2595           )
2596         {
2597             $is_pattern = 0;
2598         }
2599         elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
2600             my $msg;
2601             ( $is_pattern, $msg ) =
2602               guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
2603                 $max_token_index );
2604
2605             if ($msg) {
2606                 write_diagnostics("DIVIDE:$msg\n");
2607                 write_logfile_entry($msg);
2608             }
2609         }
2610         else { $is_pattern = ( $expecting == TERM ) }
2611
2612         if ($is_pattern) {
2613             $in_quote                = 1;
2614             $type                    = 'Q';
2615             $allowed_quote_modifiers = '[msixpodualngc]';
2616         }
2617         else {    # not a pattern; check for a /= token
2618
2619             if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
2620                 $i++;
2621                 $tok  = '/=';
2622                 $type = $tok;
2623             }
2624
2625            #DEBUG - collecting info on what tokens follow a divide
2626            # for development of guessing algorithm
2627            #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
2628            #    #write_diagnostics( "DIVIDE? $input_line\n" );
2629            #}
2630         }
2631         return;
2632     } ## end sub do_SLASH
2633
2634     sub do_LEFT_CURLY_BRACKET {
2635
2636         # '{'
2637         # if we just saw a ')', we will label this block with
2638         # its type.  We need to do this to allow sub
2639         # code_block_type to determine if this brace starts a
2640         # code block or anonymous hash.  (The type of a paren
2641         # pair is the preceding token, such as 'if', 'else',
2642         # etc).
2643         $container_type = EMPTY_STRING;
2644
2645         # ATTRS: for a '{' following an attribute list, reset
2646         # things to look like we just saw the sub name
2647         if ( $statement_type =~ /^sub\b/ ) {
2648             $last_nonblank_token = $statement_type;
2649             $last_nonblank_type  = 'i';
2650             $statement_type      = EMPTY_STRING;
2651         }
2652
2653         # patch for SWITCH/CASE: hide these keywords from an immediately
2654         # following opening brace
2655         elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
2656             && $statement_type eq $last_nonblank_token )
2657         {
2658             $last_nonblank_token = ";";
2659         }
2660
2661         elsif ( $last_nonblank_token eq ')' ) {
2662             $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
2663
2664             # defensive move in case of a nesting error (pbug.t)
2665             # in which this ')' had no previous '('
2666             # this nesting error will have been caught
2667             if ( !defined($last_nonblank_token) ) {
2668                 $last_nonblank_token = 'if';
2669             }
2670
2671             # check for syntax error here;
2672             unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
2673                 if ( $tokenizer_self->[_extended_syntax_] ) {
2674
2675                     # we append a trailing () to mark this as an unknown
2676                     # block type.  This allows perltidy to format some
2677                     # common extensions of perl syntax.
2678                     # This is used by sub code_block_type
2679                     $last_nonblank_token .= '()';
2680                 }
2681                 else {
2682                     my $list =
2683                       join( SPACE, sort keys %is_blocktype_with_paren );
2684                     warning(
2685 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
2686                     );
2687                 }
2688             }
2689         }
2690
2691         # patch for paren-less for/foreach glitch, part 2.
2692         # see note below under 'qw'
2693         elsif ($last_nonblank_token eq 'qw'
2694             && $is_for_foreach{$want_paren} )
2695         {
2696             $last_nonblank_token = $want_paren;
2697             if ( $last_last_nonblank_token eq $want_paren ) {
2698                 warning(
2699 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
2700                 );
2701
2702             }
2703             $want_paren = EMPTY_STRING;
2704         }
2705
2706         # now identify which of the three possible types of
2707         # curly braces we have: hash index container, anonymous
2708         # hash reference, or code block.
2709
2710         # non-structural (hash index) curly brace pair
2711         # get marked 'L' and 'R'
2712         if ( is_non_structural_brace() ) {
2713             $type = 'L';
2714
2715             # patch for SWITCH/CASE:
2716             # allow paren-less identifier after 'when'
2717             # if the brace is preceded by a space
2718             if (   $statement_type eq 'when'
2719                 && $last_nonblank_type eq 'i'
2720                 && $last_last_nonblank_type eq 'k'
2721                 && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
2722             {
2723                 $type       = '{';
2724                 $block_type = $statement_type;
2725             }
2726         }
2727
2728         # code and anonymous hash have the same type, '{', but are
2729         # distinguished by 'block_type',
2730         # which will be blank for an anonymous hash
2731         else {
2732
2733             $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
2734                 $max_token_index );
2735
2736             # patch to promote bareword type to function taking block
2737             if (   $block_type
2738                 && $last_nonblank_type eq 'w'
2739                 && $last_nonblank_i >= 0 )
2740             {
2741                 if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
2742                     $routput_token_type->[$last_nonblank_i] =
2743                       $is_grep_alias{$block_type} ? 'k' : 'G';
2744                 }
2745             }
2746
2747             # patch for SWITCH/CASE: if we find a stray opening block brace
2748             # where we might accept a 'case' or 'when' block, then take it
2749             if (   $statement_type eq 'case'
2750                 || $statement_type eq 'when' )
2751             {
2752                 if ( !$block_type || $block_type eq '}' ) {
2753                     $block_type = $statement_type;
2754                 }
2755             }
2756         }
2757
2758         $brace_type[ ++$brace_depth ]        = $block_type;
2759         $brace_package[$brace_depth]         = $current_package;
2760         $brace_structural_type[$brace_depth] = $type;
2761         $brace_context[$brace_depth]         = $context;
2762         ( $type_sequence, $indent_flag ) =
2763           increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2764         return;
2765     } ## end sub do_LEFT_CURLY_BRACKET
2766
2767     sub do_RIGHT_CURLY_BRACKET {
2768
2769         # '}'
2770         $block_type = $brace_type[$brace_depth];
2771         if ($block_type) { $statement_type = EMPTY_STRING }
2772         if ( defined( $brace_package[$brace_depth] ) ) {
2773             $current_package = $brace_package[$brace_depth];
2774         }
2775
2776         # can happen on brace error (caught elsewhere)
2777         else {
2778         }
2779         ( $type_sequence, $indent_flag ) =
2780           decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
2781
2782         if ( $brace_structural_type[$brace_depth] eq 'L' ) {
2783             $type = 'R';
2784         }
2785
2786         # propagate type information for 'do' and 'eval' blocks, and also
2787         # for smartmatch operator.  This is necessary to enable us to know
2788         # if an operator or term is expected next.
2789         if ( $is_block_operator{$block_type} ) {
2790             $tok = $block_type;
2791         }
2792
2793         $context = $brace_context[$brace_depth];
2794         if ( $brace_depth > 0 ) { $brace_depth--; }
2795         return;
2796     } ## end sub do_RIGHT_CURLY_BRACKET
2797
2798     sub do_AMPERSAND {
2799
2800         # '&' = maybe sub call? start looking
2801         # We have to check for sub call unless we are sure we
2802         # are expecting an operator.  This example from s2p
2803         # got mistaken as a q operator in an early version:
2804         #   print BODY &q(<<'EOT');
2805         if ( $expecting != OPERATOR ) {
2806
2807             # But only look for a sub call if we are expecting a term or
2808             # if there is no existing space after the &.
2809             # For example we probably don't want & as sub call here:
2810             #    Fcntl::S_IRUSR & $mode;
2811             if ( $expecting == TERM || $next_type ne 'b' ) {
2812                 scan_simple_identifier();
2813             }
2814         }
2815         else {
2816         }
2817         return;
2818     } ## end sub do_AMPERSAND
2819
2820     sub do_LESS_THAN_SIGN {
2821
2822         # '<' - angle operator or less than?
2823         if ( $expecting != OPERATOR ) {
2824             ( $i, $type ) =
2825               find_angle_operator_termination( $input_line, $i, $rtoken_map,
2826                 $expecting, $max_token_index );
2827
2828             ##  This message is not very helpful and quite confusing if the above
2829             ##  routine decided not to write a message with the line number.
2830             ##  if ( $type eq '<' && $expecting == TERM ) {
2831             ##      error_if_expecting_TERM();
2832             ##      interrupt_logfile();
2833             ##      warning("Unterminated <> operator?\n");
2834             ##      resume_logfile();
2835             ##  }
2836
2837         }
2838         else {
2839         }
2840         return;
2841     } ## end sub do_LESS_THAN_SIGN
2842
2843     sub do_QUESTION_MARK {
2844
2845         # '?' = conditional or starting pattern?
2846         my $is_pattern;
2847
2848         # Patch for rt #126965
2849         # a pattern cannot follow certain keywords which take optional
2850         # arguments, like 'shift' and 'pop'. See also '/'.
2851         if (
2852             $last_nonblank_type eq 'k'
2853             && $is_keyword_rejecting_question_as_pattern_delimiter{
2854                 $last_nonblank_token}
2855           )
2856         {
2857             $is_pattern = 0;
2858         }
2859
2860         # patch for RT#131288, user constant function without prototype
2861         # last type is 'U' followed by ?.
2862         elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
2863             $is_pattern = 0;
2864         }
2865         elsif ( $expecting == UNKNOWN ) {
2866
2867             # In older versions of Perl, a bare ? can be a pattern
2868             # delimiter.  In perl version 5.22 this was
2869             # dropped, but we have to support it in order to format
2870             # older programs. See:
2871             ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
2872             # For example, the following line worked
2873             # at one time:
2874             #      ?(.*)? && (print $1,"\n");
2875             # In current versions it would have to be written with slashes:
2876             #      /(.*)/ && (print $1,"\n");
2877             my $msg;
2878             ( $is_pattern, $msg ) =
2879               guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2880                 $max_token_index );
2881
2882             if ($msg) { write_logfile_entry($msg) }
2883         }
2884         else { $is_pattern = ( $expecting == TERM ) }
2885
2886         if ($is_pattern) {
2887             $in_quote                = 1;
2888             $type                    = 'Q';
2889             $allowed_quote_modifiers = '[msixpodualngc]';
2890         }
2891         else {
2892             ( $type_sequence, $indent_flag ) =
2893               increase_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
2894         }
2895         return;
2896     } ## end sub do_QUESTION_MARK
2897
2898     sub do_STAR {
2899
2900         # '*' = typeglob, or multiply?
2901         if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
2902             if (   $next_type ne 'b'
2903                 && $next_type ne '('
2904                 && $next_type ne '#' )    # Fix c036
2905             {
2906                 $expecting = TERM;
2907             }
2908         }
2909         if ( $expecting == TERM ) {
2910             scan_simple_identifier();
2911         }
2912         else {
2913
2914             if ( $rtokens->[ $i + 1 ] eq '=' ) {
2915                 $tok  = '*=';
2916                 $type = $tok;
2917                 $i++;
2918             }
2919             elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2920                 $tok  = '**';
2921                 $type = $tok;
2922                 $i++;
2923                 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2924                     $tok  = '**=';
2925                     $type = $tok;
2926                     $i++;
2927                 }
2928             }
2929         }
2930         return;
2931     } ## end sub do_STAR
2932
2933     sub do_DOT {
2934
2935         # '.' =  what kind of . ?
2936         if ( $expecting != OPERATOR ) {
2937             scan_number();
2938             if ( $type eq '.' ) {
2939                 error_if_expecting_TERM()
2940                   if ( $expecting == TERM );
2941             }
2942         }
2943         else {
2944         }
2945         return;
2946     } ## end sub do_DOT
2947
2948     sub do_COLON {
2949
2950         # ':' = label, ternary, attribute, ?
2951
2952         # if this is the first nonblank character, call it a label
2953         # since perl seems to just swallow it
2954         if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2955             $type = 'J';
2956         }
2957
2958         # ATTRS: check for a ':' which introduces an attribute list
2959         # either after a 'sub' keyword or within a paren list
2960         elsif ( $statement_type =~ /^sub\b/ ) {
2961             $type              = 'A';
2962             $in_attribute_list = 1;
2963         }
2964
2965         # Within a signature, unless we are in a ternary.  For example,
2966         # from 't/filter_example.t':
2967         #    method foo4 ( $class: $bar ) { $class->bar($bar) }
2968         elsif ( $paren_type[$paren_depth] =~ /^sub\b/
2969             && !is_balanced_closing_container(QUESTION_COLON) )
2970         {
2971             $type              = 'A';
2972             $in_attribute_list = 1;
2973         }
2974
2975         # check for scalar attribute, such as
2976         # my $foo : shared = 1;
2977         elsif ($is_my_our_state{$statement_type}
2978             && $current_depth[QUESTION_COLON] == 0 )
2979         {
2980             $type              = 'A';
2981             $in_attribute_list = 1;
2982         }
2983
2984         # Look for Switch::Plain syntax if an error would otherwise occur
2985         # here. Note that we do not need to check if the extended syntax
2986         # flag is set because otherwise an error would occur, and we would
2987         # then have to output a message telling the user to set the
2988         # extended syntax flag to avoid the error.
2989         #  case 1: {
2990         #  default: {
2991         #  default:
2992         # Note that the line 'default:' will be parsed as a label elsewhere.
2993         elsif ( $is_case_default{$statement_type}
2994             && !is_balanced_closing_container(QUESTION_COLON) )
2995         {
2996             # mark it as a perltidy label type
2997             $type = 'J';
2998         }
2999
3000         # otherwise, it should be part of a ?/: operator
3001         else {
3002             ( $type_sequence, $indent_flag ) =
3003               decrease_nesting_depth( QUESTION_COLON, $rtoken_map->[$i_tok] );
3004             if ( $last_nonblank_token eq '?' ) {
3005                 warning("Syntax error near ? :\n");
3006             }
3007         }
3008         return;
3009     } ## end sub do_COLON
3010
3011     sub do_PLUS_SIGN {
3012
3013         # '+' = what kind of plus?
3014         if ( $expecting == TERM ) {
3015             my $number = scan_number_fast();
3016
3017             # unary plus is safest assumption if not a number
3018             if ( !defined($number) ) { $type = 'p'; }
3019         }
3020         elsif ( $expecting == OPERATOR ) {
3021         }
3022         else {
3023             if ( $next_type eq 'w' ) { $type = 'p' }
3024         }
3025         return;
3026     } ## end sub do_PLUS_SIGN
3027
3028     sub do_AT_SIGN {
3029
3030         # '@' = sigil for array?
3031         error_if_expecting_OPERATOR("Array")
3032           if ( $expecting == OPERATOR );
3033         scan_simple_identifier();
3034         return;
3035     }
3036
3037     sub do_PERCENT_SIGN {
3038
3039         # '%' = hash or modulo?
3040         # first guess is hash if no following blank or paren
3041         if ( $expecting == UNKNOWN ) {
3042             if ( $next_type ne 'b' && $next_type ne '(' ) {
3043                 $expecting = TERM;
3044             }
3045         }
3046         if ( $expecting == TERM ) {
3047             scan_simple_identifier();
3048         }
3049         return;
3050     } ## end sub do_PERCENT_SIGN
3051
3052     sub do_LEFT_SQUARE_BRACKET {
3053
3054         # '['
3055         $square_bracket_type[ ++$square_bracket_depth ] = $last_nonblank_token;
3056         ( $type_sequence, $indent_flag ) =
3057           increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3058
3059         # It may seem odd, but structural square brackets have
3060         # type '{' and '}'.  This simplifies the indentation logic.
3061         if ( !is_non_structural_brace() ) {
3062             $type = '{';
3063         }
3064         $square_bracket_structural_type[$square_bracket_depth] = $type;
3065         return;
3066     } ## end sub do_LEFT_SQUARE_BRACKET
3067
3068     sub do_RIGHT_SQUARE_BRACKET {
3069
3070         # ']'
3071         ( $type_sequence, $indent_flag ) =
3072           decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
3073
3074         if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' ) {
3075             $type = '}';
3076         }
3077
3078         # propagate type information for smartmatch operator.  This is
3079         # necessary to enable us to know if an operator or term is expected
3080         # next.
3081         if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
3082             $tok = $square_bracket_type[$square_bracket_depth];
3083         }
3084
3085         if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
3086         return;
3087     } ## end sub do_RIGHT_SQUARE_BRACKET
3088
3089     sub do_MINUS_SIGN {
3090
3091         # '-' = what kind of minus?
3092         if ( ( $expecting != OPERATOR )
3093             && $is_file_test_operator{$next_tok} )
3094         {
3095             my ( $next_nonblank_token, $i_next ) =
3096               find_next_nonblank_token( $i + 1, $rtokens, $max_token_index );
3097
3098             # check for a quoted word like "-w=>xx";
3099             # it is sufficient to just check for a following '='
3100             if ( $next_nonblank_token eq '=' ) {
3101                 $type = 'm';
3102             }
3103             else {
3104                 $i++;
3105                 $tok .= $next_tok;
3106                 $type = 'F';
3107             }
3108         }
3109         elsif ( $expecting == TERM ) {
3110             my $number = scan_number_fast();
3111
3112             # maybe part of bareword token? unary is safest
3113             if ( !defined($number) ) { $type = 'm'; }
3114
3115         }
3116         elsif ( $expecting == OPERATOR ) {
3117         }
3118         else {
3119
3120             if ( $next_type eq 'w' ) {
3121                 $type = 'm';
3122             }
3123         }
3124         return;
3125     } ## end sub do_MINUS_SIGN
3126
3127     sub do_CARAT_SIGN {
3128
3129         # '^'
3130         # check for special variables like ${^WARNING_BITS}
3131         if ( $expecting == TERM ) {
3132
3133             if (   $last_nonblank_token eq '{'
3134                 && ( $next_tok !~ /^\d/ )
3135                 && ( $next_tok =~ /^\w/ ) )
3136             {
3137
3138                 if ( $next_tok eq 'W' ) {
3139                     $tokenizer_self->[_saw_perl_dash_w_] = 1;
3140                 }
3141                 $tok  = $tok . $next_tok;
3142                 $i    = $i + 1;
3143                 $type = 'w';
3144
3145                 # Optional coding to try to catch syntax errors. This can
3146                 # be removed if it ever causes incorrect warning messages.
3147                 # The '{^' should be preceded by either by a type or '$#'
3148                 # Examples:
3149                 #   $#{^CAPTURE}       ok
3150                 #   *${^LAST_FH}{NAME} ok
3151                 #   @{^HOWDY}          ok
3152                 #   $hash{^HOWDY}      error
3153
3154                 # Note that a type sigil '$' may be tokenized as 'Z'
3155                 # after something like 'print', so allow type 'Z'
3156                 if (   $last_last_nonblank_type ne 't'
3157                     && $last_last_nonblank_type ne 'Z'
3158                     && $last_last_nonblank_token ne '$#' )
3159                 {
3160                     warning("Possible syntax error near '{^'\n");
3161                 }
3162             }
3163
3164             else {
3165                 unless ( error_if_expecting_TERM() ) {
3166
3167                     # Something like this is valid but strange:
3168                     # undef ^I;
3169                     complain("The '^' seems unusual here\n");
3170                 }
3171             }
3172         }
3173         return;
3174     } ## end sub do_CARAT_SIGN
3175
3176     sub do_DOUBLE_COLON {
3177
3178         #  '::' = probably a sub call
3179         scan_bare_identifier();
3180         return;
3181     }
3182
3183     sub do_LEFT_SHIFT {
3184
3185         # '<<' = maybe a here-doc?
3186
3187 ##      This check removed because it could be a deprecated here-doc with
3188 ##      no specified target.  See example in log 16 Sep 2020.
3189 ##            return
3190 ##              unless ( $i < $max_token_index )
3191 ##              ;          # here-doc not possible if end of line
3192
3193         if ( $expecting != OPERATOR ) {
3194             my ( $found_target, $here_doc_target, $here_quote_character,
3195                 $saw_error );
3196             (
3197                 $found_target, $here_doc_target, $here_quote_character, $i,
3198                 $saw_error
3199               )
3200               = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3201                 $max_token_index );
3202
3203             if ($found_target) {
3204                 push @{$rhere_target_list},
3205                   [ $here_doc_target, $here_quote_character ];
3206                 $type = 'h';
3207                 if ( length($here_doc_target) > 80 ) {
3208                     my $truncated = substr( $here_doc_target, 0, 80 );
3209                     complain("Long here-target: '$truncated' ...\n");
3210                 }
3211                 elsif ( !$here_doc_target ) {
3212                     warning(
3213                         'Use of bare << to mean <<"" is deprecated' . "\n" )
3214                       unless ($here_quote_character);
3215                 }
3216                 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3217                     complain(
3218                         "Unconventional here-target: '$here_doc_target'\n");
3219                 }
3220             }
3221             elsif ( $expecting == TERM ) {
3222                 unless ($saw_error) {
3223
3224                     # shouldn't happen..arriving here implies an error in
3225                     # the logic in sub 'find_here_doc'
3226                     if (DEVEL_MODE) {
3227                         Fault(<<EOM);
3228 Program bug; didn't find here doc target
3229 EOM
3230                     }
3231                     warning(
3232                         "Possible program error: didn't find here doc target\n"
3233                     );
3234                     report_definite_bug();
3235                 }
3236             }
3237         }
3238         else {
3239         }
3240         return;
3241     } ## end sub do_LEFT_SHIFT
3242
3243     sub do_NEW_HERE_DOC {
3244
3245         # '<<~' = a here-doc, new type added in v26
3246         return
3247           unless ( $i < $max_token_index )
3248           ;    # here-doc not possible if end of line
3249         if ( $expecting != OPERATOR ) {
3250             my ( $found_target, $here_doc_target, $here_quote_character,
3251                 $saw_error );
3252             (
3253                 $found_target, $here_doc_target, $here_quote_character, $i,
3254                 $saw_error
3255               )
3256               = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
3257                 $max_token_index );
3258
3259             if ($found_target) {
3260
3261                 if ( length($here_doc_target) > 80 ) {
3262                     my $truncated = substr( $here_doc_target, 0, 80 );
3263                     complain("Long here-target: '$truncated' ...\n");
3264                 }
3265                 elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
3266                     complain(
3267                         "Unconventional here-target: '$here_doc_target'\n");
3268                 }
3269
3270                 # Note that we put a leading space on the here quote
3271                 # character indicate that it may be preceded by spaces
3272                 $here_quote_character = SPACE . $here_quote_character;
3273                 push @{$rhere_target_list},
3274                   [ $here_doc_target, $here_quote_character ];
3275                 $type = 'h';
3276             }
3277             elsif ( $expecting == TERM ) {
3278                 unless ($saw_error) {
3279
3280                     # shouldn't happen..arriving here implies an error in
3281                     # the logic in sub 'find_here_doc'
3282                     if (DEVEL_MODE) {
3283                         Fault(<<EOM);
3284 Program bug; didn't find here doc target
3285 EOM
3286                     }
3287                     warning(
3288                         "Possible program error: didn't find here doc target\n"
3289                     );
3290                     report_definite_bug();
3291                 }
3292             }
3293         }
3294         else {
3295             error_if_expecting_OPERATOR();
3296         }
3297         return;
3298     } ## end sub do_NEW_HERE_DOC
3299
3300     sub do_POINTER {
3301
3302         #  '->'
3303         # if -> points to a bare word, we must scan for an identifier,
3304         # otherwise something like ->y would look like the y operator
3305
3306         # NOTE: this will currently allow things like
3307         #     '->@array'    '->*VAR'  '->%hash'
3308         # to get parsed as identifiers, even though these are not currently
3309         # allowed syntax.  To catch syntax errors like this we could first
3310         # check that the next character and skip this call if it is one of
3311         # ' @ % * '.  A disadvantage with doing this is that this would
3312         # have to be fixed if the perltidy syntax is ever extended to make
3313         # any of these valid.  So for now this check is not done.
3314         scan_simple_identifier();
3315         return;
3316     } ## end sub do_POINTER
3317
3318     sub do_PLUS_PLUS {
3319
3320         # '++'
3321         # type = 'pp' for pre-increment, '++' for post-increment
3322         if    ( $expecting == TERM ) { $type = 'pp' }
3323         elsif ( $expecting == UNKNOWN ) {
3324
3325             my ( $next_nonblank_token, $i_next ) =
3326               find_next_nonblank_token( $i, $rtokens, $max_token_index );
3327
3328             # Fix for c042: look past a side comment
3329             if ( $next_nonblank_token eq '#' ) {
3330                 ( $next_nonblank_token, $i_next ) =
3331                   find_next_nonblank_token( $max_token_index,
3332                     $rtokens, $max_token_index );
3333             }
3334
3335             if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
3336         }
3337         return;
3338     } ## end sub do_PLUS_PLUS
3339
3340     sub do_FAT_COMMA {
3341
3342         # '=>'
3343         if ( $last_nonblank_type eq $tok ) {
3344             complain("Repeated '=>'s \n");
3345         }
3346
3347         # patch for operator_expected: note if we are in the list (use.t)
3348         # TODO: make version numbers a new token type
3349         if ( $statement_type eq 'use' ) { $statement_type = '_use' }
3350         return;
3351     } ## end sub do_FAT_COMMA
3352
3353     sub do_MINUS_MINUS {
3354
3355         # '--'
3356         # type = 'mm' for pre-decrement, '--' for post-decrement
3357
3358         if    ( $expecting == TERM ) { $type = 'mm' }
3359         elsif ( $expecting == UNKNOWN ) {
3360             my ( $next_nonblank_token, $i_next ) =
3361               find_next_nonblank_token( $i, $rtokens, $max_token_index );
3362
3363             # Fix for c042: look past a side comment
3364             if ( $next_nonblank_token eq '#' ) {
3365                 ( $next_nonblank_token, $i_next ) =
3366                   find_next_nonblank_token( $max_token_index,
3367                     $rtokens, $max_token_index );
3368             }
3369
3370             if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
3371         }
3372         return;
3373     } ## end sub do_MINUS_MINUS
3374
3375     sub do_LOGICAL_AND {
3376
3377         # '&&'
3378         error_if_expecting_TERM()
3379           if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
3380         return;
3381     }
3382
3383     sub do_LOGICAL_OR {
3384
3385         # '||'
3386         error_if_expecting_TERM()
3387           if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
3388         return;
3389     }
3390
3391     sub do_SLASH_SLASH {
3392
3393         # '//'
3394         error_if_expecting_TERM()
3395           if ( $expecting == TERM );
3396         return;
3397     }
3398
3399     sub do_DIGITS {
3400
3401         # 'd' = string of digits
3402         error_if_expecting_OPERATOR("Number")
3403           if ( $expecting == OPERATOR );
3404
3405         my $number = scan_number_fast();
3406         if ( !defined($number) ) {
3407
3408             # shouldn't happen - we should always get a number
3409             if (DEVEL_MODE) {
3410                 Fault(<<EOM);
3411 non-number beginning with digit--program bug
3412 EOM
3413             }
3414             warning(
3415                 "Unexpected error condition: non-number beginning with digit\n"
3416             );
3417             report_definite_bug();
3418         }
3419         return;
3420     } ## end sub do_DIGITS
3421
3422     sub do_ATTRIBUTE_LIST {
3423
3424         my ($next_nonblank_token) = @_;
3425
3426         # Called at a bareword encountered while in an attribute list
3427         # returns 'is_attribute':
3428         #    true if attribute found
3429         #    false if an attribute (continue parsing bareword)
3430
3431         # treat bare word followed by open paren like qw(
3432         if ( $next_nonblank_token eq '(' ) {
3433
3434             # For something like:
3435             #     : prototype($$)
3436             # we should let do_scan_sub see it so that it can see
3437             # the prototype.  All other attributes get parsed as a
3438             # quoted string.
3439             if ( $tok eq 'prototype' ) {
3440                 $id_scan_state = 'prototype';
3441
3442                 # start just after the word 'prototype'
3443                 my $i_beg = $i + 1;
3444                 ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
3445                     {
3446                         input_line      => $input_line,
3447                         i               => $i,
3448                         i_beg           => $i_beg,
3449                         tok             => $tok,
3450                         type            => $type,
3451                         rtokens         => $rtokens,
3452                         rtoken_map      => $rtoken_map,
3453                         id_scan_state   => $id_scan_state,
3454                         max_token_index => $max_token_index
3455                     }
3456                 );
3457
3458                 # If successful, mark as type 'q' to be consistent
3459                 # with other attributes.  Type 'w' would also work.
3460                 if ( $i > $i_beg ) {
3461                     $type = 'q';
3462                     return 1;
3463                 }
3464
3465                 # If not successful, continue and parse as a quote.
3466             }
3467
3468             # All other attribute lists must be parsed as quotes
3469             # (see 'signatures.t' for good examples)
3470             $in_quote                = $quote_items{'q'};
3471             $allowed_quote_modifiers = $quote_modifiers{'q'};
3472             $type                    = 'q';
3473             $quote_type              = 'q';
3474             return 1;
3475         }
3476
3477         # handle bareword not followed by open paren
3478         else {
3479             $type = 'w';
3480             return 1;
3481         }
3482
3483         # attribute not found
3484         return;
3485     } ## end sub do_ATTRIBUTE_LIST
3486
3487     sub do_QUOTED_BAREWORD {
3488
3489         # find type of a bareword followed by a '=>'
3490         if ( $is_constant{$current_package}{$tok} ) {
3491             $type = 'C';
3492         }
3493         elsif ( $is_user_function{$current_package}{$tok} ) {
3494             $type      = 'U';
3495             $prototype = $user_function_prototype{$current_package}{$tok};
3496         }
3497         elsif ( $tok =~ /^v\d+$/ ) {
3498             $type = 'v';
3499             report_v_string($tok);
3500         }
3501         else {
3502
3503             # Bareword followed by a fat comma - see 'git18.in'
3504             # If tok is something like 'x17' then it could
3505             # actually be operator x followed by number 17.
3506             # For example, here:
3507             #     123x17 => [ 792, 1224 ],
3508             # (a key of 123 repeated 17 times, perhaps not
3509             # what was intended). We will mark x17 as type
3510             # 'n' and it will be split. If the previous token
3511             # was also a bareword then it is not very clear is
3512             # going on.  In this case we will not be sure that
3513             # an operator is expected, so we just mark it as a
3514             # bareword.  Perl is a little murky in what it does
3515             # with stuff like this, and its behavior can change
3516             # over time.  Something like
3517             #    a x18 => [792, 1224], will compile as
3518             # a key with 18 a's.  But something like
3519             #    push @array, a x18;
3520             # is a syntax error.
3521             if (
3522                    $expecting == OPERATOR
3523                 && substr( $tok, 0, 1 ) eq 'x'
3524                 && ( length($tok) == 1
3525                     || substr( $tok, 1, 1 ) =~ /^\d/ )
3526               )
3527             {
3528                 $type = 'n';
3529                 if ( split_pretoken(1) ) {
3530                     $type = 'x';
3531                     $tok  = 'x';
3532                 }
3533             }
3534             else {
3535
3536                 # git #18
3537                 $type = 'w';
3538                 error_if_expecting_OPERATOR();
3539             }
3540         }
3541         return;
3542     } ## end sub do_QUOTED_BAREWORD
3543
3544     sub do_X_OPERATOR {
3545
3546         if ( $tok eq 'x' ) {
3547             if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
3548                 $tok  = 'x=';
3549                 $type = $tok;
3550                 $i++;
3551             }
3552             else {
3553                 $type = 'x';
3554             }
3555         }
3556         else {
3557
3558             # Split a pretoken like 'x10' into 'x' and '10'.
3559             # Note: In previous versions of perltidy it was marked
3560             # as a number, $type = 'n', and fixed downstream by the
3561             # Formatter.
3562             $type = 'n';
3563             if ( split_pretoken(1) ) {
3564                 $type = 'x';
3565                 $tok  = 'x';
3566             }
3567         }
3568         return;
3569     } ## end sub do_X_OPERATOR
3570
3571     sub do_USE_CONSTANT {
3572         scan_bare_identifier();
3573         my ( $next_nonblank_tok2, $i_next2 ) =
3574           find_next_nonblank_token( $i, $rtokens, $max_token_index );
3575
3576         if ($next_nonblank_tok2) {
3577
3578             if ( $is_keyword{$next_nonblank_tok2} ) {
3579
3580                 # Assume qw is used as a quote and okay, as in:
3581                 #  use constant qw{ DEBUG 0 };
3582                 # Not worth trying to parse for just a warning
3583
3584                 # NOTE: This warning is deactivated because recent
3585                 # versions of perl do not complain here, but
3586                 # the coding is retained for reference.
3587                 if ( 0 && $next_nonblank_tok2 ne 'qw' ) {
3588                     warning(
3589 "Attempting to define constant '$next_nonblank_tok2' which is a perl keyword\n"
3590                     );
3591                 }
3592             }
3593
3594             else {
3595                 $is_constant{$current_package}{$next_nonblank_tok2} = 1;
3596             }
3597         }
3598         return;
3599     } ## end sub do_USE_CONSTANT
3600
3601     sub do_KEYWORD {
3602
3603         # found a keyword - set any associated flags
3604         $type = 'k';
3605
3606         # Since for and foreach may not be followed immediately
3607         # by an opening paren, we have to remember which keyword
3608         # is associated with the next '('
3609         if ( $is_for_foreach{$tok} ) {
3610             if ( new_statement_ok() ) {
3611                 $want_paren = $tok;
3612             }
3613         }
3614
3615         # recognize 'use' statements, which are special
3616         elsif ( $is_use_require{$tok} ) {
3617             $statement_type = $tok;
3618             error_if_expecting_OPERATOR()
3619               if ( $expecting == OPERATOR );
3620         }
3621
3622         # remember my and our to check for trailing ": shared"
3623         elsif ( $is_my_our_state{$tok} ) {
3624             $statement_type = $tok;
3625         }
3626
3627         # Check for misplaced 'elsif' and 'else', but allow isolated
3628         # else or elsif blocks to be formatted.  This is indicated
3629         # by a last noblank token of ';'
3630         elsif ( $tok eq 'elsif' ) {
3631             if (
3632                 $last_nonblank_token ne ';'
3633
3634                 ## !~ /^(if|elsif|unless)$/
3635                 && !$is_if_elsif_unless{$last_nonblank_block_type}
3636               )
3637             {
3638                 warning(
3639                     "expecting '$tok' to follow one of 'if|elsif|unless'\n");
3640             }
3641         }
3642         elsif ( $tok eq 'else' ) {
3643
3644             # patched for SWITCH/CASE
3645             if (
3646                 $last_nonblank_token ne ';'
3647
3648                 ## !~ /^(if|elsif|unless|case|when)$/
3649                 && !$is_if_elsif_unless_case_when{$last_nonblank_block_type}
3650
3651                 # patch to avoid an unwanted error message for
3652                 # the case of a parenless 'case' (RT 105484):
3653                 # switch ( 1 ) { case x { 2 } else { } }
3654                 ## !~ /^(if|elsif|unless|case|when)$/
3655                 && !$is_if_elsif_unless_case_when{$statement_type}
3656               )
3657             {
3658                 warning(
3659 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3660                 );
3661             }
3662         }
3663         elsif ( $tok eq 'continue' ) {
3664             if (   $last_nonblank_token ne ';'
3665                 && $last_nonblank_block_type !~
3666                 /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
3667             {
3668
3669                 # note: ';' '{' and '}' in list above
3670                 # because continues can follow bare blocks;
3671                 # ':' is labeled block
3672                 #
3673                 ############################################
3674                 # NOTE: This check has been deactivated because
3675                 # continue has an alternative usage for given/when
3676                 # blocks in perl 5.10
3677                 ## warning("'$tok' should follow a block\n");
3678                 ############################################
3679             }
3680         }
3681
3682         # patch for SWITCH/CASE if 'case' and 'when are
3683         # treated as keywords.  Also 'default' for Switch::Plain
3684         elsif ($tok eq 'when'
3685             || $tok eq 'case'
3686             || $tok eq 'default' )
3687         {
3688             $statement_type = $tok;    # next '{' is block
3689         }
3690
3691         #
3692         # indent trailing if/unless/while/until
3693         # outdenting will be handled by later indentation loop
3694 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
3695 ##$opt_o = 1
3696 ##  if !(
3697 ##             $opt_b
3698 ##          || $opt_c
3699 ##          || $opt_d
3700 ##          || $opt_f
3701 ##          || $opt_i
3702 ##          || $opt_l
3703 ##          || $opt_o
3704 ##          || $opt_x
3705 ##  );
3706 ##                    if (   $tok =~ /^(if|unless|while|until)$/
3707 ##                        && $next_nonblank_token ne '(' )
3708 ##                    {
3709 ##                        $indent_flag = 1;
3710 ##                    }
3711         return;
3712     } ## end sub do_KEYWORD
3713
3714     sub do_QUOTE_OPERATOR {
3715 ##NICOL PATCH
3716         if ( $expecting == OPERATOR ) {
3717
3718             # Be careful not to call an error for a qw quote
3719             # where a parenthesized list is allowed.  For example,
3720             # it could also be a for/foreach construct such as
3721             #
3722             #    foreach my $key qw\Uno Due Tres Quadro\ {
3723             #        print "Set $key\n";
3724             #    }
3725             #
3726
3727             # Or it could be a function call.
3728             # NOTE: Braces in something like &{ xxx } are not
3729             # marked as a block, we might have a method call.
3730             # &method(...), $method->(..), &{method}(...),
3731             # $ref[2](list) is ok & short for $ref[2]->(list)
3732             #
3733             # See notes in 'sub code_block_type' and
3734             # 'sub is_non_structural_brace'
3735
3736             unless (
3737                 $tok eq 'qw'
3738                 && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3739                     || $is_for_foreach{$want_paren} )
3740               )
3741             {
3742                 error_if_expecting_OPERATOR();
3743             }
3744         }
3745         $in_quote                = $quote_items{$tok};
3746         $allowed_quote_modifiers = $quote_modifiers{$tok};
3747
3748         # All quote types are 'Q' except possibly qw quotes.
3749         # qw quotes are special in that they may generally be trimmed
3750         # of leading and trailing whitespace.  So they are given a
3751         # separate type, 'q', unless requested otherwise.
3752         $type =
3753           ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
3754           ? 'q'
3755           : 'Q';
3756         $quote_type = $type;
3757         return;
3758     } ## end sub do_QUOTE_OPERATOR
3759
3760     sub do_UNKNOWN_BAREWORD {
3761
3762         my ($next_nonblank_token) = @_;
3763
3764         scan_bare_identifier();
3765
3766         if (   $statement_type eq 'use'
3767             && $last_nonblank_token eq 'use' )
3768         {
3769             $saw_use_module{$current_package}->{$tok} = 1;
3770         }
3771
3772         if ( $type eq 'w' ) {
3773
3774             if ( $expecting == OPERATOR ) {
3775
3776                 # Patch to avoid error message for RPerl overloaded
3777                 # operator functions: use overload
3778                 #    '+' => \&sse_add,
3779                 #    '-' => \&sse_sub,
3780                 #    '*' => \&sse_mul,
3781                 #    '/' => \&sse_div;
3782                 # FIXME: this should eventually be generalized
3783                 if (   $saw_use_module{$current_package}->{'RPerl'}
3784                     && $tok =~ /^sse_(mul|div|add|sub)$/ )
3785                 {
3786
3787                 }
3788
3789                 # Fix part 1 for git #63 in which a comment falls
3790                 # between an -> and the following word.  An
3791                 # alternate fix would be to change operator_expected
3792                 # to return an UNKNOWN for this type.
3793                 elsif ( $last_nonblank_type eq '->' ) {
3794
3795                 }
3796
3797                 # don't complain about possible indirect object
3798                 # notation.
3799                 # For example:
3800                 #   package main;
3801                 #   sub new($) { ... }
3802                 #   $b = new A::;  # calls A::new
3803                 #   $c = new A;    # same thing but suspicious
3804                 # This will call A::new but we have a 'new' in
3805                 # main:: which looks like a constant.
3806                 #
3807                 elsif ( $last_nonblank_type eq 'C' ) {
3808                     if ( $tok !~ /::$/ ) {
3809                         complain(<<EOM);
3810 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3811        Maybe indirectet object notation?
3812 EOM
3813                     }
3814                 }
3815                 else {
3816                     error_if_expecting_OPERATOR("bareword");
3817                 }
3818             }
3819
3820             # mark bare words immediately followed by a paren as
3821             # functions
3822             $next_tok = $rtokens->[ $i + 1 ];
3823             if ( $next_tok eq '(' ) {
3824
3825                 # Fix part 2 for git #63.  Leave type as 'w' to keep
3826                 # the type the same as if the -> were not separated
3827                 $type = 'U' unless ( $last_nonblank_type eq '->' );
3828             }
3829
3830             # underscore after file test operator is file handle
3831             if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3832                 $type = 'Z';
3833             }
3834
3835             # patch for SWITCH/CASE if 'case' and 'when are
3836             # not treated as keywords:
3837             if (
3838                 ( $tok eq 'case' && $brace_type[$brace_depth] eq 'switch' )
3839                 || (   $tok eq 'when'
3840                     && $brace_type[$brace_depth] eq 'given' )
3841               )
3842             {
3843                 $statement_type = $tok;    # next '{' is block
3844                 $type           = 'k';     # for keyword syntax coloring
3845             }
3846
3847             # patch for SWITCH/CASE if switch and given not keywords
3848             # Switch is not a perl 5 keyword, but we will gamble
3849             # and mark switch followed by paren as a keyword.  This
3850             # is only necessary to get html syntax coloring nice,
3851             # and does not commit this as being a switch/case.
3852             if ( $next_nonblank_token eq '('
3853                 && ( $tok eq 'switch' || $tok eq 'given' ) )
3854             {
3855                 $type = 'k';    # for keyword syntax coloring
3856             }
3857         }
3858         return;
3859     } ## end sub do_UNKNOWN_BAREWORD
3860
3861     sub sub_attribute_ok_here {
3862
3863         my ( $tok_kw, $next_nonblank_token, $i_next ) = @_;
3864
3865         # Decide if 'sub :' can be the start of a sub attribute list.
3866         # We will decide based on if the colon is followed by a
3867         # bareword which is not a keyword.
3868         # Changed inext+1 to inext to fixed case b1190.
3869         my $sub_attribute_ok_here;
3870         if (   $is_sub{$tok_kw}
3871             && $expecting != OPERATOR
3872             && $next_nonblank_token eq ':' )
3873         {
3874             my ( $nn_nonblank_token, $i_nn ) =
3875               find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
3876             $sub_attribute_ok_here =
3877                  $nn_nonblank_token =~ /^\w/
3878               && $nn_nonblank_token !~ /^\d/
3879               && !$is_keyword{$nn_nonblank_token};
3880         }
3881         return $sub_attribute_ok_here;
3882     } ## end sub sub_attribute_ok_here
3883
3884     sub do_BAREWORD {
3885
3886         my ($is_END_or_DATA) = @_;
3887
3888         # handle a bareword token:
3889         # returns
3890         #    true if this token ends the current line
3891         #    false otherwise
3892
3893         # Patch for c043, part 3: A bareword after '->' expects a TERM
3894         # FIXME: It would be cleaner to give method calls a new type 'M'
3895         # and update sub operator_expected to handle this.
3896         if ( $last_nonblank_type eq '->' ) {
3897             $expecting = TERM;
3898         }
3899
3900         my ( $next_nonblank_token, $i_next ) =
3901           find_next_nonblank_token( $i, $rtokens, $max_token_index );
3902
3903         # a bare word immediately followed by :: is not a keyword;
3904         # use $tok_kw when testing for keywords to avoid a mistake
3905         my $tok_kw = $tok;
3906         if (   $rtokens->[ $i + 1 ] eq ':'
3907             && $rtokens->[ $i + 2 ] eq ':' )
3908         {
3909             $tok_kw .= '::';
3910         }
3911
3912         if ($in_attribute_list) {
3913             my $is_attribute = do_ATTRIBUTE_LIST($next_nonblank_token);
3914             return if ($is_attribute);
3915         }
3916
3917         #----------------------------------------
3918         # Starting final if-elsif- chain of tests
3919         #----------------------------------------
3920
3921         # This is the return flag:
3922         #   true => this is the last token on the line
3923         #   false => keep tokenizing the line
3924         my $is_last;
3925
3926         # The following blocks of code must update these vars:
3927         # $type - the final token type, must always be set
3928
3929         # In addition, if additional pretokens are added:
3930         # $tok  - the final token
3931         # $i    - the index of the last pretoken
3932
3933         # They may also need to check and set various flags
3934
3935         # Quote a word followed by => operator
3936         # unless the word __END__ or __DATA__ and the only word on
3937         # the line.
3938         if (  !$is_END_or_DATA
3939             && $next_nonblank_token eq '='
3940             && $rtokens->[ $i_next + 1 ] eq '>' )
3941         {
3942             do_QUOTED_BAREWORD();
3943         }
3944
3945         # quote a bare word within braces..like xxx->{s}; note that we
3946         # must be sure this is not a structural brace, to avoid
3947         # mistaking {s} in the following for a quoted bare word:
3948         #     for(@[){s}bla}BLA}
3949         # Also treat q in something like var{-q} as a bare word, not
3950         # a quote operator
3951         elsif (
3952             $next_nonblank_token eq '}'
3953             && (
3954                 $last_nonblank_type eq 'L'
3955                 || (   $last_nonblank_type eq 'm'
3956                     && $last_last_nonblank_type eq 'L' )
3957             )
3958           )
3959         {
3960             $type = 'w';
3961         }
3962
3963         # Scan a bare word following a -> as an identifier; it could
3964         # have a long package name.  Fixes c037, c041.
3965         elsif ( $last_nonblank_token eq '->' ) {
3966             scan_bare_identifier();
3967
3968             # Patch for c043, part 4; use type 'w' after a '->'.
3969             # This is just a safety check on sub scan_bare_identifier,
3970             # which should get this case correct.
3971             $type = 'w';
3972         }
3973
3974         # handle operator x (now we know it isn't $x=)
3975         elsif (
3976                $expecting == OPERATOR
3977             && substr( $tok, 0, 1 ) eq 'x'
3978             && ( length($tok) == 1
3979                 || substr( $tok, 1, 1 ) =~ /^\d/ )
3980           )
3981         {
3982             do_X_OPERATOR();
3983         }
3984         elsif ( $tok_kw eq 'CORE::' ) {
3985             $type = $tok = $tok_kw;
3986             $i += 2;
3987         }
3988         elsif ( ( $tok eq 'strict' )
3989             and ( $last_nonblank_token eq 'use' ) )
3990         {
3991             $tokenizer_self->[_saw_use_strict_] = 1;
3992             scan_bare_identifier();
3993         }
3994
3995         elsif ( ( $tok eq 'warnings' )
3996             and ( $last_nonblank_token eq 'use' ) )
3997         {
3998             $tokenizer_self->[_saw_perl_dash_w_] = 1;
3999
4000             # scan as identifier, so that we pick up something like:
4001             # use warnings::register
4002             scan_bare_identifier();
4003         }
4004
4005         elsif (
4006                $tok eq 'AutoLoader'
4007             && $tokenizer_self->[_look_for_autoloader_]
4008             && (
4009                 $last_nonblank_token eq 'use'
4010
4011                 # these regexes are from AutoSplit.pm, which we want
4012                 # to mimic
4013                 || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
4014                 || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
4015             )
4016           )
4017         {
4018             write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
4019             $tokenizer_self->[_saw_autoloader_]      = 1;
4020             $tokenizer_self->[_look_for_autoloader_] = 0;
4021             scan_bare_identifier();
4022         }
4023
4024         elsif (
4025                $tok eq 'SelfLoader'
4026             && $tokenizer_self->[_look_for_selfloader_]
4027             && (   $last_nonblank_token eq 'use'
4028                 || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
4029                 || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
4030           )
4031         {
4032             write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
4033             $tokenizer_self->[_saw_selfloader_]      = 1;
4034             $tokenizer_self->[_look_for_selfloader_] = 0;
4035             scan_bare_identifier();
4036         }
4037
4038         elsif ( ( $tok eq 'constant' )
4039             and ( $last_nonblank_token eq 'use' ) )
4040         {
4041             do_USE_CONSTANT();
4042         }
4043
4044         # various quote operators
4045         elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
4046             do_QUOTE_OPERATOR();
4047         }
4048
4049         # check for a statement label
4050         elsif (
4051                ( $next_nonblank_token eq ':' )
4052             && ( $rtokens->[ $i_next + 1 ] ne ':' )
4053             && ( $i_next <= $max_token_index )    # colon on same line
4054
4055             # like 'sub : lvalue' ?
4056             ##&& !$sub_attribute_ok_here            # like 'sub : lvalue' ?
4057             && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
4058             && label_ok()
4059           )
4060         {
4061             if ( $tok !~ /[A-Z]/ ) {
4062                 push @{ $tokenizer_self->[_rlower_case_labels_at_] },
4063                   $input_line_number;
4064             }
4065             $type = 'J';
4066             $tok .= ':';
4067             $i = $i_next;
4068         }
4069
4070         #      'sub' or alias
4071         elsif ( $is_sub{$tok_kw} ) {
4072             error_if_expecting_OPERATOR()
4073               if ( $expecting == OPERATOR );
4074             initialize_subname();
4075             scan_id();
4076         }
4077
4078         #      'package'
4079         elsif ( $is_package{$tok_kw} ) {
4080             error_if_expecting_OPERATOR()
4081               if ( $expecting == OPERATOR );
4082             scan_id();
4083         }
4084
4085         # Fix for c035: split 'format' from 'is_format_END_DATA' to be
4086         # more restrictive. Require a new statement to be ok here.
4087         elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
4088             $type = ';';    # make tokenizer look for TERM next
4089             $tokenizer_self->[_in_format_] = 1;
4090             $is_last = 1;                          ## is last token on this line
4091         }
4092
4093         # Note on token types for format, __DATA__, __END__:
4094         # It simplifies things to give these type ';', so that when we
4095         # start rescanning we will be expecting a token of type TERM.
4096         # We will switch to type 'k' before outputting the tokens.
4097         elsif ( $is_END_DATA{$tok_kw} ) {
4098             $type = ';';    # make tokenizer look for TERM next
4099
4100             # Remember that we are in one of these three sections
4101             $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
4102             $is_last = 1;    ## is last token on this line
4103         }
4104
4105         elsif ( $is_keyword{$tok_kw} ) {
4106             do_KEYWORD();
4107         }
4108
4109         # check for inline label following
4110         #         /^(redo|last|next|goto)$/
4111         elsif (( $last_nonblank_type eq 'k' )
4112             && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
4113         {
4114             $type = 'j';
4115         }
4116
4117         # something else --
4118         else {
4119             do_UNKNOWN_BAREWORD($next_nonblank_token);
4120         }
4121
4122         return $is_last;
4123
4124     } ## end sub do_BAREWORD
4125
4126     sub do_FOLLOW_QUOTE {
4127
4128         # Continue following a quote on a new line
4129         $type = $quote_type;
4130
4131         unless ( @{$routput_token_list} ) {    # initialize if continuation line
4132             push( @{$routput_token_list}, $i );
4133             $routput_token_type->[$i] = $type;
4134
4135         }
4136
4137         # Removed to fix b1280.  This is not needed and was causing the
4138         # starting type 'qw' to be lost, leading to mis-tokenization of
4139         # a trailing block brace in a parenless for stmt 'for .. qw.. {'
4140         ##$tok = $quote_character if ($quote_character);
4141
4142         # scan for the end of the quote or pattern
4143         (
4144             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
4145             $quoted_string_1, $quoted_string_2
4146           )
4147           = do_quote(
4148             $i,               $in_quote,    $quote_character,
4149             $quote_pos,       $quote_depth, $quoted_string_1,
4150             $quoted_string_2, $rtokens,     $rtoken_map,
4151             $max_token_index
4152           );
4153
4154         # all done if we didn't find it
4155         if ($in_quote) { return }
4156
4157         # save pattern and replacement text for rescanning
4158         my $qs1 = $quoted_string_1;
4159
4160         # re-initialize for next search
4161         $quote_character = EMPTY_STRING;
4162         $quote_pos       = 0;
4163         $quote_type      = 'Q';
4164         $quoted_string_1 = EMPTY_STRING;
4165         $quoted_string_2 = EMPTY_STRING;
4166         if ( ++$i > $max_token_index ) { return }
4167
4168         # look for any modifiers
4169         if ($allowed_quote_modifiers) {
4170
4171             # check for exact quote modifiers
4172             if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
4173                 my $str = $rtokens->[$i];
4174                 my $saw_modifier_e;
4175                 while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
4176                     my $pos  = pos($str);
4177                     my $char = substr( $str, $pos - 1, 1 );
4178                     $saw_modifier_e ||= ( $char eq 'e' );
4179                 }
4180
4181                 # For an 'e' quote modifier we must scan the replacement
4182                 # text for here-doc targets...
4183                 # but if the modifier starts a new line we can skip
4184                 # this because either the here doc will be fully
4185                 # contained in the replacement text (so we can
4186                 # ignore it) or Perl will not find it.
4187                 # See test 'here2.in'.
4188                 if ( $saw_modifier_e && $i_tok >= 0 ) {
4189
4190                     my $rht = scan_replacement_text($qs1);
4191
4192                     # Change type from 'Q' to 'h' for quotes with
4193                     # here-doc targets so that the formatter (see sub
4194                     # process_line_of_CODE) will not make any line
4195                     # breaks after this point.
4196                     if ($rht) {
4197                         push @{$rhere_target_list}, @{$rht};
4198                         $type = 'h';
4199                         if ( $i_tok < 0 ) {
4200                             my $ilast = $routput_token_list->[-1];
4201                             $routput_token_type->[$ilast] = $type;
4202                         }
4203                     }
4204                 }
4205
4206                 if ( defined( pos($str) ) ) {
4207
4208                     # matched
4209                     if ( pos($str) == length($str) ) {
4210                         if ( ++$i > $max_token_index ) { return }
4211                     }
4212
4213                     # Looks like a joined quote modifier
4214                     # and keyword, maybe something like
4215                     # s/xxx/yyy/gefor @k=...
4216                     # Example is "galgen.pl".  Would have to split
4217                     # the word and insert a new token in the
4218                     # pre-token list.  This is so rare that I haven't
4219                     # done it.  Will just issue a warning citation.
4220
4221                     # This error might also be triggered if my quote
4222                     # modifier characters are incomplete
4223                     else {
4224                         warning(<<EOM);
4225
4226 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
4227 Please put a space between quote modifiers and trailing keywords.
4228 EOM
4229
4230                         # print "token $rtokens->[$i]\n";
4231                         # my $num = length($str) - pos($str);
4232                         # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
4233                         # print "continuing with new token $rtokens->[$i]\n";
4234
4235                         # skipping past this token does least damage
4236                         if ( ++$i > $max_token_index ) { return }
4237                     }
4238                 }
4239                 else {
4240
4241                     # example file: rokicki4.pl
4242                     # This error might also be triggered if my quote
4243                     # modifier characters are incomplete
4244                     write_logfile_entry(
4245                         "Note: found word $str at quote modifier location\n");
4246                 }
4247             }
4248
4249             # re-initialize
4250             $allowed_quote_modifiers = EMPTY_STRING;
4251         }
4252         return;
4253     } ## end sub do_FOLLOW_QUOTE
4254
4255     # ------------------------------------------------------------
4256     # begin hash of code for handling most token types
4257     # ------------------------------------------------------------
4258     my $tokenization_code = {
4259
4260         '>'   => \&do_GREATER_THAN_SIGN,
4261         '|'   => \&do_VERTICAL_LINE,
4262         '$'   => \&do_DOLLAR_SIGN,
4263         '('   => \&do_LEFT_PARENTHESIS,
4264         ')'   => \&do_RIGHT_PARENTHESIS,
4265         ','   => \&do_COMMA,
4266         ';'   => \&do_SEMICOLON,
4267         '"'   => \&do_QUOTATION_MARK,
4268         "'"   => \&do_APOSTROPHE,
4269         '`'   => \&do_BACKTICK,
4270         '/'   => \&do_SLASH,
4271         '{'   => \&do_LEFT_CURLY_BRACKET,
4272         '}'   => \&do_RIGHT_CURLY_BRACKET,
4273         '&'   => \&do_AMPERSAND,
4274         '<'   => \&do_LESS_THAN_SIGN,
4275         '?'   => \&do_QUESTION_MARK,
4276         '*'   => \&do_STAR,
4277         '.'   => \&do_DOT,
4278         ':'   => \&do_COLON,
4279         '+'   => \&do_PLUS_SIGN,
4280         '@'   => \&do_AT_SIGN,
4281         '%'   => \&do_PERCENT_SIGN,
4282         '['   => \&do_LEFT_SQUARE_BRACKET,
4283         ']'   => \&do_RIGHT_SQUARE_BRACKET,
4284         '-'   => \&do_MINUS_SIGN,
4285         '^'   => \&do_CARAT_SIGN,
4286         '::'  => \&do_DOUBLE_COLON,
4287         '<<'  => \&do_LEFT_SHIFT,
4288         '<<~' => \&do_NEW_HERE_DOC,
4289         '->'  => \&do_POINTER,
4290         '++'  => \&do_PLUS_PLUS,
4291         '=>'  => \&do_FAT_COMMA,
4292         '--'  => \&do_MINUS_MINUS,
4293         '&&'  => \&do_LOGICAL_AND,
4294         '||'  => \&do_LOGICAL_OR,
4295         '//'  => \&do_SLASH_SLASH,
4296
4297         # No special code for these types yet, but syntax checks
4298         # could be added.
4299         ##  '!'   => undef,
4300         ##  '!='  => undef,
4301         ##  '!~'  => undef,
4302         ##  '%='  => undef,
4303         ##  '&&=' => undef,
4304         ##  '&='  => undef,
4305         ##  '+='  => undef,
4306         ##  '-='  => undef,
4307         ##  '..'  => undef,
4308         ##  '..'  => undef,
4309         ##  '...' => undef,
4310         ##  '.='  => undef,
4311         ##  '<<=' => undef,
4312         ##  '<='  => undef,
4313         ##  '<=>' => undef,
4314         ##  '<>'  => undef,
4315         ##  '='   => undef,
4316         ##  '=='  => undef,
4317         ##  '=~'  => undef,
4318         ##  '>='  => undef,
4319         ##  '>>'  => undef,
4320         ##  '>>=' => undef,
4321         ##  '\\'  => undef,
4322         ##  '^='  => undef,
4323         ##  '|='  => undef,
4324         ##  '||=' => undef,
4325         ##  '//=' => undef,
4326         ##  '~'   => undef,
4327         ##  '~~'  => undef,
4328         ##  '!~~' => undef,
4329
4330     };
4331
4332     # ------------------------------------------------------------
4333     # end hash of code for handling individual token types
4334     # ------------------------------------------------------------
4335
4336     use constant DEBUG_TOKENIZE => 0;
4337
4338     sub tokenize_this_line {
4339
4340   # This routine breaks a line of perl code into tokens which are of use in
4341   # indentation and reformatting.  One of my goals has been to define tokens
4342   # such that a newline may be inserted between any pair of tokens without
4343   # changing or invalidating the program. This version comes close to this,
4344   # although there are necessarily a few exceptions which must be caught by
4345   # the formatter.  Many of these involve the treatment of bare words.
4346   #
4347   # The tokens and their types are returned in arrays.  See previous
4348   # routine for their names.
4349   #
4350   # See also the array "valid_token_types" in the BEGIN section for an
4351   # up-to-date list.
4352   #
4353   # To simplify things, token types are either a single character, or they
4354   # are identical to the tokens themselves.
4355   #
4356   # As a debugging aid, the -D flag creates a file containing a side-by-side
4357   # comparison of the input string and its tokenization for each line of a file.
4358   # This is an invaluable debugging aid.
4359   #
4360   # In addition to tokens, and some associated quantities, the tokenizer
4361   # also returns flags indication any special line types.  These include
4362   # quotes, here_docs, formats.
4363   #
4364   # -----------------------------------------------------------------------
4365   #
4366   # How to add NEW_TOKENS:
4367   #
4368   # New token types will undoubtedly be needed in the future both to keep up
4369   # with changes in perl and to help adapt the tokenizer to other applications.
4370   #
4371   # Here are some notes on the minimal steps.  I wrote these notes while
4372   # adding the 'v' token type for v-strings, which are things like version
4373   # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
4374   # can use your editor to search for the string "NEW_TOKENS" to find the
4375   # appropriate sections to change):
4376   #
4377   # *. Try to talk somebody else into doing it!  If not, ..
4378   #
4379   # *. Make a backup of your current version in case things don't work out!
4380   #
4381   # *. Think of a new, unused character for the token type, and add to
4382   # the array @valid_token_types in the BEGIN section of this package.
4383   # For example, I used 'v' for v-strings.
4384   #
4385   # *. Implement coding to recognize the $type of the token in this routine.
4386   # This is the hardest part, and is best done by imitating or modifying
4387   # some of the existing coding.  For example, to recognize v-strings, I
4388   # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
4389   # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
4390   #
4391   # *. Update sub operator_expected.  This update is critically important but
4392   # the coding is trivial.  Look at the comments in that routine for help.
4393   # For v-strings, which should behave like numbers, I just added 'v' to the
4394   # regex used to handle numbers and strings (types 'n' and 'Q').
4395   #
4396   # *. Implement a 'bond strength' rule in sub set_bond_strengths in
4397   # Perl::Tidy::Formatter for breaking lines around this token type.  You can
4398   # skip this step and take the default at first, then adjust later to get
4399   # desired results.  For adding type 'v', I looked at sub bond_strength and
4400   # saw that number type 'n' was using default strengths, so I didn't do
4401   # anything.  I may tune it up someday if I don't like the way line
4402   # breaks with v-strings look.
4403   #
4404   # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
4405   # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
4406   # and saw that type 'n' used spaces on both sides, so I just added 'v'
4407   # to the array @spaces_both_sides.
4408   #
4409   # *. Update HtmlWriter package so that users can colorize the token as
4410   # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
4411   # that package.  For v-strings, I initially chose to use a default color
4412   # equal to the default for numbers, but it might be nice to change that
4413   # eventually.
4414   #
4415   # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
4416   #
4417   # *. Run lots and lots of debug tests.  Start with special files designed
4418   # to test the new token type.  Run with the -D flag to create a .DEBUG
4419   # file which shows the tokenization.  When these work ok, test as many old
4420   # scripts as possible.  Start with all of the '.t' files in the 'test'
4421   # directory of the distribution file.  Compare .tdy output with previous
4422   # version and updated version to see the differences.  Then include as
4423   # many more files as possible. My own technique has been to collect a huge
4424   # number of perl scripts (thousands!) into one directory and run perltidy
4425   # *, then run diff between the output of the previous version and the
4426   # current version.
4427   #
4428   # *. For another example, search for the smartmatch operator '~~'
4429   # with your editor to see where updates were made for it.
4430   #
4431   # -----------------------------------------------------------------------
4432
4433         my $line_of_tokens = shift;
4434         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
4435
4436         # patch while coding change is underway
4437         # make callers private data to allow access
4438         # $tokenizer_self = $caller_tokenizer_self;
4439
4440         # extract line number for use in error messages
4441         $input_line_number = $line_of_tokens->{_line_number};
4442
4443         # reinitialize for multi-line quote
4444         $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
4445
4446         # check for pod documentation
4447         if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
4448             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
4449         {
4450
4451             # must not be in multi-line quote
4452             # and must not be in an equation
4453             if ( !$in_quote
4454                 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
4455             {
4456                 $tokenizer_self->[_in_pod_] = 1;
4457                 return;
4458             }
4459         }
4460
4461         $input_line = $untrimmed_input_line;
4462
4463         chomp $input_line;
4464
4465         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
4466         # This will be used below to avoid quoting a bare word followed by
4467         # a fat comma.
4468         my $is_END_or_DATA;
4469
4470         # trim start of this line unless we are continuing a quoted line
4471         # do not trim end because we might end in a quote (test: deken4.pl)
4472         # Perl::Tidy::Formatter will delete needless trailing blanks
4473         unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
4474             $input_line =~ s/^(\s+)//;    # trim left end
4475
4476             # calculate a guessed level for nonblank lines to avoid calls to
4477             #    sub guess_old_indentation_level()
4478             if ( $input_line && $1 ) {
4479                 my $leading_spaces = $1;
4480                 my $spaces         = length($leading_spaces);
4481
4482                 # handle leading tabs
4483                 if ( ord( substr( $leading_spaces, 0, 1 ) ) == 9
4484                     && $leading_spaces =~ /^(\t+)/ )
4485                 {
4486                     my $tabsize = $tokenizer_self->[_tabsize_];
4487                     $spaces += length($1) * ( $tabsize - 1 );
4488                 }
4489
4490                 my $indent_columns = $tokenizer_self->[_indent_columns_];
4491                 $line_of_tokens->{_guessed_indentation_level} =
4492                   int( $spaces / $indent_columns );
4493             }
4494
4495             $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
4496               && $input_line =~ /^__(END|DATA)__\s*$/;
4497         }
4498
4499         # update the copy of the line for use in error messages
4500         # This must be exactly what we give the pre_tokenizer
4501         $tokenizer_self->[_line_of_text_] = $input_line;
4502
4503         # re-initialize for the main loop
4504         $routput_token_list     = [];    # stack of output token indexes
4505         $routput_token_type     = [];    # token types
4506         $routput_block_type     = [];    # types of code block
4507         $routput_container_type = [];    # paren types, such as if, elsif, ..
4508         $routput_type_sequence  = [];    # nesting sequential number
4509
4510         $rhere_target_list = [];
4511
4512         $tok             = $last_nonblank_token;
4513         $type            = $last_nonblank_type;
4514         $prototype       = $last_nonblank_prototype;
4515         $last_nonblank_i = -1;
4516         $block_type      = $last_nonblank_block_type;
4517         $container_type  = $last_nonblank_container_type;
4518         $type_sequence   = $last_nonblank_type_sequence;
4519         $indent_flag     = 0;
4520         $peeked_ahead    = 0;
4521
4522         # This variable signals pre_tokenize to get all tokens.
4523         # But note that it is no longer needed with fast block comment
4524         # option below.
4525         my $max_tokens_wanted = 0;
4526
4527         # optimize for a full-line comment
4528         if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
4529             $max_tokens_wanted = 1;    # no use tokenizing a comment
4530
4531             # and check for skipped section
4532             if (   $rOpts_code_skipping
4533                 && $input_line =~ /$code_skipping_pattern_begin/ )
4534             {
4535                 $tokenizer_self->[_in_skipped_] = 1;
4536                 return;
4537             }
4538
4539             # Optional fast processing of a block comment
4540             my $ci_string_sum =
4541               ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4542             my $ci_string_i = $ci_string_sum + $in_statement_continuation;
4543             $line_of_tokens->{_line_type}        = 'CODE';
4544             $line_of_tokens->{_rtokens}          = [$input_line];
4545             $line_of_tokens->{_rtoken_type}      = ['#'];
4546             $line_of_tokens->{_rlevels}          = [$level_in_tokenizer];
4547             $line_of_tokens->{_rci_levels}       = [$ci_string_i];
4548             $line_of_tokens->{_rblock_type}      = [EMPTY_STRING];
4549             $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
4550             $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
4551             return;
4552         }
4553
4554         tokenizer_main_loop( $max_tokens_wanted, $is_END_or_DATA );
4555
4556         #-----------------------------------------------
4557         # all done tokenizing this line ...
4558         # now prepare the final list of tokens and types
4559         #-----------------------------------------------
4560
4561         tokenizer_wrapup_line($line_of_tokens);
4562
4563         return;
4564     } ## end sub tokenize_this_line
4565
4566     sub tokenizer_main_loop {
4567         my ( $max_tokens_wanted, $is_END_or_DATA ) = @_;
4568
4569         # tokenization is done in two stages..
4570         # stage 1 is a very simple pre-tokenization
4571
4572         # start by breaking the line into pre-tokens
4573         ( $rtokens, $rtoken_map, $rtoken_type ) =
4574           pre_tokenize( $input_line, $max_tokens_wanted );
4575
4576         $max_token_index = scalar( @{$rtokens} ) - 1;
4577         push( @{$rtokens}, SPACE, SPACE, SPACE )
4578           ;    # extra whitespace simplifies logic
4579         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
4580         push( @{$rtoken_type}, 'b', 'b', 'b' );
4581
4582         # initialize for main loop
4583         if (0) { #<<< this is not necessary
4584         foreach my $ii ( 0 .. $max_token_index + 3 ) {
4585             $routput_token_type->[$ii]     = EMPTY_STRING;
4586             $routput_block_type->[$ii]     = EMPTY_STRING;
4587             $routput_container_type->[$ii] = EMPTY_STRING;
4588             $routput_type_sequence->[$ii]  = EMPTY_STRING;
4589             $routput_indent_flag->[$ii]    = 0;
4590         }
4591         }
4592
4593         $i     = -1;
4594         $i_tok = -1;
4595
4596         # ------------------------------------------------------------
4597         # begin main tokenization loop
4598         # ------------------------------------------------------------
4599
4600         # we are looking at each pre-token of one line and combining them
4601         # into tokens
4602         while ( ++$i <= $max_token_index ) {
4603
4604             # continue looking for the end of a quote
4605             if ($in_quote) {
4606                 do_FOLLOW_QUOTE();
4607                 last if ( $in_quote || $i > $max_token_index );
4608             }
4609
4610             if ( $type ne 'b' && $tok ne 'CORE::' ) {
4611
4612                 # try to catch some common errors
4613                 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
4614
4615                     if ( $last_nonblank_token eq 'eq' ) {
4616                         complain("Should 'eq' be '==' here ?\n");
4617                     }
4618                     elsif ( $last_nonblank_token eq 'ne' ) {
4619                         complain("Should 'ne' be '!=' here ?\n");
4620                     }
4621                 }
4622
4623                 # fix c090, only rotate vars if a new token will be stored
4624                 if ( $i_tok >= 0 ) {
4625                     $last_last_nonblank_token      = $last_nonblank_token;
4626                     $last_last_nonblank_type       = $last_nonblank_type;
4627                     $last_last_nonblank_block_type = $last_nonblank_block_type;
4628                     $last_last_nonblank_container_type =
4629                       $last_nonblank_container_type;
4630                     $last_last_nonblank_type_sequence =
4631                       $last_nonblank_type_sequence;
4632
4633                     # Fix part #3 for git82: propagate type 'Z' though L-R pair
4634                     unless ( $type eq 'R' && $last_nonblank_type eq 'Z' ) {
4635                         $last_nonblank_token = $tok;
4636                         $last_nonblank_type  = $type;
4637                     }
4638                     $last_nonblank_prototype      = $prototype;
4639                     $last_nonblank_block_type     = $block_type;
4640                     $last_nonblank_container_type = $container_type;
4641                     $last_nonblank_type_sequence  = $type_sequence;
4642                     $last_nonblank_i              = $i_tok;
4643                 }
4644
4645                 # Patch for c030: Fix things in case a '->' got separated from
4646                 # the subsequent identifier by a side comment.  We need the
4647                 # last_nonblank_token to have a leading -> to avoid triggering
4648                 # an operator expected error message at the next '('. See also
4649                 # fix for git #63.
4650                 if ( $last_last_nonblank_token eq '->' ) {
4651                     if (   $last_nonblank_type eq 'w'
4652                         || $last_nonblank_type eq 'i'
4653                         && substr( $last_nonblank_token, 0, 1 ) eq '$' )
4654                     {
4655                         $last_nonblank_token = '->' . $last_nonblank_token;
4656                         $last_nonblank_type  = 'i';
4657                     }
4658                 }
4659             }
4660
4661             # store previous token type
4662             if ( $i_tok >= 0 ) {
4663                 $routput_token_type->[$i_tok]     = $type;
4664                 $routput_block_type->[$i_tok]     = $block_type;
4665                 $routput_container_type->[$i_tok] = $container_type;
4666                 $routput_type_sequence->[$i_tok]  = $type_sequence;
4667                 $routput_indent_flag->[$i_tok]    = $indent_flag;
4668             }
4669
4670             # get the next pre-token and type
4671             # $tok and $type will be modified to make the output token
4672             my $pre_tok  = $tok  = $rtokens->[$i];      # get the next pre-token
4673             my $pre_type = $type = $rtoken_type->[$i];  # and type
4674
4675             # remember the starting index of this token; we will be updating $i
4676             $i_tok = $i;
4677
4678             # re-initialize various flags for the next output token
4679             $block_type     &&= EMPTY_STRING;
4680             $container_type &&= EMPTY_STRING;
4681             $type_sequence  &&= EMPTY_STRING;
4682             $indent_flag    &&= 0;
4683             $prototype      &&= EMPTY_STRING;
4684
4685             # this pre-token will start an output token
4686             push( @{$routput_token_list}, $i_tok );
4687
4688             #--------------------------
4689             # handle a whitespace token
4690             #--------------------------
4691             next if ( $pre_type eq 'b' );
4692
4693             #-----------------
4694             # handle a comment
4695             #-----------------
4696             last if ( $pre_type eq '#' );
4697
4698             # continue gathering identifier if necessary
4699             if ($id_scan_state) {
4700
4701                 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
4702                     scan_id();
4703                 }
4704                 else {
4705                     scan_identifier();
4706                 }
4707
4708                 if ($id_scan_state) {
4709
4710                     # Still scanning ...
4711                     # Check for side comment between sub and prototype (c061)
4712
4713                     # done if nothing left to scan on this line
4714                     last if ( $i > $max_token_index );
4715
4716                     my ( $next_nonblank_token, $i_next ) =
4717                       find_next_nonblank_token_on_this_line( $i, $rtokens,
4718                         $max_token_index );
4719
4720                     # done if it was just some trailing space
4721                     last if ( $i_next > $max_token_index );
4722
4723                     # something remains on the line ... must be a side comment
4724                     next;
4725                 }
4726
4727                 next if ( ( $i > 0 ) || $type );
4728
4729                 # didn't find any token; start over
4730                 $type = $pre_type;
4731                 $tok  = $pre_tok;
4732             }
4733
4734             my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : SPACE;
4735             my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
4736
4737             #-----------------------------------------------------------
4738             # Combine pre-tokens into digraphs and trigraphs if possible
4739             #-----------------------------------------------------------
4740
4741             # See if we can make a digraph...
4742             # The following tokens are excluded and handled specially:
4743             # '/=' is excluded because the / might start a pattern.
4744             # 'x=' is excluded since it might be $x=, with $ on previous line
4745             # '**' and *= might be typeglobs of punctuation variables
4746             # I have allowed tokens starting with <, such as <=,
4747             # because I don't think these could be valid angle operators.
4748             # test file: storrs4.pl
4749             if (   $can_start_digraph{$tok}
4750                 && $i < $max_token_index
4751                 && $is_digraph{ $tok . $rtokens->[ $i + 1 ] } )
4752             {
4753
4754                 my $combine_ok = 1;
4755                 my $test_tok   = $tok . $rtokens->[ $i + 1 ];
4756
4757                 # check for special cases which cannot be combined
4758
4759                 # '//' must be defined_or operator if an operator is expected.
4760                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
4761                 # could be migrated here for clarity
4762
4763                 # Patch for RT#102371, misparsing a // in the following snippet:
4764                 #     state $b //= ccc();
4765                 # The solution is to always accept the digraph (or trigraph)
4766                 # after type 'Z' (possible file handle).  The reason is that
4767                 # sub operator_expected gives TERM expected here, which is
4768                 # wrong in this case.
4769                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
4770
4771                     # note that here $tok = '/' and the next tok and type is '/'
4772                     $expecting = operator_expected( [ $prev_type, $tok, '/' ] );
4773
4774                     # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
4775                     $combine_ok = 0 if ( $expecting == TERM );
4776                 }
4777
4778                 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
4779                 # Accept the digraphs '**' only after type 'Z'
4780                 # Otherwise postpone the decision.
4781                 if ( $test_tok eq '**' ) {
4782                     if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
4783                 }
4784
4785                 if (
4786
4787                     # still ok to combine?
4788                     $combine_ok
4789
4790                     && ( $test_tok ne '/=' )    # might be pattern
4791                     && ( $test_tok ne 'x=' )    # might be $x
4792                     && ( $test_tok ne '*=' )    # typeglob?
4793
4794                     # Moved above as part of fix for
4795                     # RT #114359: Missparsing of "print $x ** 0.5;
4796                     # && ( $test_tok ne '**' )    # typeglob?
4797                   )
4798                 {
4799                     $tok = $test_tok;
4800                     $i++;
4801
4802                     # Now try to assemble trigraphs.  Note that all possible
4803                     # perl trigraphs can be constructed by appending a character
4804                     # to a digraph.
4805                     $test_tok = $tok . $rtokens->[ $i + 1 ];
4806
4807                     if ( $is_trigraph{$test_tok} ) {
4808                         $tok = $test_tok;
4809                         $i++;
4810                     }
4811
4812                     # The only current tetragraph is the double diamond operator
4813                     # and its first three characters are not a trigraph, so
4814                     # we do can do a special test for it
4815                     elsif ( $test_tok eq '<<>' ) {
4816                         $test_tok .= $rtokens->[ $i + 2 ];
4817                         if ( $is_tetragraph{$test_tok} ) {
4818                             $tok = $test_tok;
4819                             $i += 2;
4820                         }
4821                     }
4822                 }
4823             }
4824
4825             $type      = $tok;
4826             $next_tok  = $rtokens->[ $i + 1 ];
4827             $next_type = $rtoken_type->[ $i + 1 ];
4828
4829             DEBUG_TOKENIZE && do {
4830                 local $LIST_SEPARATOR = ')(';
4831                 my @debug_list = (
4832                     $last_nonblank_token,      $tok,
4833                     $next_tok,                 $brace_depth,
4834                     $brace_type[$brace_depth], $paren_depth,
4835                     $paren_type[$paren_depth],
4836                 );
4837                 print STDOUT "TOKENIZE:(@debug_list)\n";
4838             };
4839
4840             # Turn off attribute list on first non-blank, non-bareword.
4841             # Added '#' to fix c038 (later moved above).
4842             if ( $in_attribute_list && $pre_type ne 'w' ) {
4843                 $in_attribute_list = 0;
4844             }
4845
4846             ###############################################################
4847             # We have the next token, $tok.
4848             # Now we have to examine this token and decide what it is
4849             # and define its $type
4850             #
4851             # section 1: bare words
4852             ###############################################################
4853
4854             if ( $pre_type eq 'w' ) {
4855                 $expecting =
4856                   operator_expected( [ $prev_type, $tok, $next_type ] );
4857                 my $is_last = do_BAREWORD($is_END_or_DATA);
4858                 last if ($is_last);
4859             }
4860
4861             ###############################################################
4862             # section 2: strings of digits
4863             ###############################################################
4864             elsif ( $pre_type eq 'd' ) {
4865                 $expecting =
4866                   operator_expected( [ $prev_type, $tok, $next_type ] );
4867                 do_DIGITS();
4868             }
4869
4870             ###############################################################
4871             # section 3: all other tokens
4872             ###############################################################
4873             else {
4874                 my $code = $tokenization_code->{$tok};
4875                 if ($code) {
4876                     $expecting =
4877                       operator_expected( [ $prev_type, $tok, $next_type ] );
4878                     $code->();
4879                     redo if $in_quote;
4880                 }
4881             }
4882         }
4883
4884         # -----------------------------
4885         # end of main tokenization loop
4886         # -----------------------------
4887
4888         # Store the final token
4889         if ( $i_tok >= 0 ) {
4890             $routput_token_type->[$i_tok]     = $type;
4891             $routput_block_type->[$i_tok]     = $block_type;
4892             $routput_container_type->[$i_tok] = $container_type;
4893             $routput_type_sequence->[$i_tok]  = $type_sequence;
4894             $routput_indent_flag->[$i_tok]    = $indent_flag;
4895         }
4896
4897         # Remember last nonblank values
4898         unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
4899             $last_last_nonblank_token          = $last_nonblank_token;
4900             $last_last_nonblank_type           = $last_nonblank_type;
4901             $last_last_nonblank_block_type     = $last_nonblank_block_type;
4902             $last_last_nonblank_container_type = $last_nonblank_container_type;
4903             $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
4904             $last_nonblank_token               = $tok;
4905             $last_nonblank_type                = $type;
4906             $last_nonblank_block_type          = $block_type;
4907             $last_nonblank_container_type      = $container_type;
4908             $last_nonblank_type_sequence       = $type_sequence;
4909             $last_nonblank_prototype           = $prototype;
4910         }
4911
4912         # reset indentation level if necessary at a sub or package
4913         # in an attempt to recover from a nesting error
4914         if ( $level_in_tokenizer < 0 ) {
4915             if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
4916                 reset_indentation_level(0);
4917                 brace_warning("resetting level to 0 at $1 $2\n");
4918             }
4919         }
4920
4921         $tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
4922         $tokenizer_self->[_in_quote_]          = $in_quote;
4923         $tokenizer_self->[_quote_target_] =
4924           $in_quote ? matching_end_token($quote_character) : EMPTY_STRING;
4925         $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
4926
4927         return;
4928     } ## end sub tokenizer_main_loop
4929
4930     sub tokenizer_wrapup_line {
4931         my ($line_of_tokens) = @_;
4932
4933         # We have broken the current line into tokens. Now we have to wrap up
4934         # the result for shipping.  Most of the remaining work involves
4935         # defining the various indentation parameters that the formatter needs
4936         # (indentation level and continuation indentation).  This turns out to
4937         # be somewhat complicated.
4938
4939         my @token_type    = ();    # stack of output token types
4940         my @block_type    = ();    # stack of output code block types
4941         my @type_sequence = ();    # stack of output type sequence numbers
4942         my @tokens        = ();    # output tokens
4943         my @levels        = ();    # structural brace levels of output tokens
4944         my @ci_string = ();  # string needed to compute continuation indentation
4945         my $container_environment = EMPTY_STRING;
4946         my $im                    = -1;             # previous $i value
4947         my $num;
4948
4949         # Count the number of '1's in the string (previously sub ones_count)
4950         my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
4951
4952 # Computing Token Indentation
4953 #
4954 #     The final section of the tokenizer forms tokens and also computes
4955 #     parameters needed to find indentation.  It is much easier to do it
4956 #     in the tokenizer than elsewhere.  Here is a brief description of how
4957 #     indentation is computed.  Perl::Tidy computes indentation as the sum
4958 #     of 2 terms:
4959 #
4960 #     (1) structural indentation, such as if/else/elsif blocks
4961 #     (2) continuation indentation, such as long parameter call lists.
4962 #
4963 #     These are occasionally called primary and secondary indentation.
4964 #
4965 #     Structural indentation is introduced by tokens of type '{', although
4966 #     the actual tokens might be '{', '(', or '['.  Structural indentation
4967 #     is of two types: BLOCK and non-BLOCK.  Default structural indentation
4968 #     is 4 characters if the standard indentation scheme is used.
4969 #
4970 #     Continuation indentation is introduced whenever a line at BLOCK level
4971 #     is broken before its termination.  Default continuation indentation
4972 #     is 2 characters in the standard indentation scheme.
4973 #
4974 #     Both types of indentation may be nested arbitrarily deep and
4975 #     interlaced.  The distinction between the two is somewhat arbitrary.
4976 #
4977 #     For each token, we will define two variables which would apply if
4978 #     the current statement were broken just before that token, so that
4979 #     that token started a new line:
4980 #
4981 #     $level = the structural indentation level,
4982 #     $ci_level = the continuation indentation level
4983 #
4984 #     The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
4985 #     assuming defaults.  However, in some special cases it is customary
4986 #     to modify $ci_level from this strict value.
4987 #
4988 #     The total structural indentation is easy to compute by adding and
4989 #     subtracting 1 from a saved value as types '{' and '}' are seen.  The
4990 #     running value of this variable is $level_in_tokenizer.
4991 #
4992 #     The total continuation is much more difficult to compute, and requires
4993 #     several variables.  These variables are:
4994 #
4995 #     $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
4996 #       each indentation level, if there are intervening open secondary
4997 #       structures just prior to that level.
4998 #     $continuation_string_in_tokenizer = a string of 1's and 0's indicating
4999 #       if the last token at that level is "continued", meaning that it
5000 #       is not the first token of an expression.
5001 #     $nesting_block_string = a string of 1's and 0's indicating, for each
5002 #       indentation level, if the level is of type BLOCK or not.
5003 #     $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
5004 #     $nesting_list_string = a string of 1's and 0's indicating, for each
5005 #       indentation level, if it is appropriate for list formatting.
5006 #       If so, continuation indentation is used to indent long list items.
5007 #     $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
5008 #     @{$rslevel_stack} = a stack of total nesting depths at each
5009 #       structural indentation level, where "total nesting depth" means
5010 #       the nesting depth that would occur if every nesting token -- '{', '[',
5011 #       and '(' -- , regardless of context, is used to compute a nesting
5012 #       depth.
5013
5014         $line_of_tokens->{_nesting_tokens_0} = $nesting_token_string;
5015
5016         my ( $ci_string_i, $level_i );
5017
5018         # loop over the list of pre-tokens indexes
5019         foreach my $i ( @{$routput_token_list} ) {
5020
5021             # Get $tok_i, the PRE-token.  It only equals the token for symbols
5022             my $type_i = $routput_token_type->[$i];
5023             my $tok_i  = $rtokens->[$i];
5024
5025             # Quick handling of indentation levels for blanks and comments
5026             if ( $type_i eq 'b' || $type_i eq '#' ) {
5027                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
5028                 $level_i     = $level_in_tokenizer;
5029             }
5030
5031             # All other types
5032             else {
5033
5034                 # Check for an invalid token type..
5035                 # This can happen by running perltidy on non-scripts although
5036                 # it could also be bug introduced by programming change.  Perl
5037                 # silently accepts a 032 (^Z) and takes it as the end
5038                 if ( !$is_valid_token_type{$type_i} ) {
5039                     my $val = ord($type_i);
5040                     warning(
5041 "unexpected character decimal $val ($type_i) in script\n"
5042                     );
5043                     $tokenizer_self->[_in_error_] = 1;
5044                 }
5045
5046              # See if we should undo the $forced_indentation_flag.
5047              # Forced indentation after 'if', 'unless', 'while' and 'until'
5048              # expressions without trailing parens is optional and doesn't
5049              # always look good.  It is usually okay for a trailing logical
5050              # expression, but if the expression is a function call, code block,
5051              # or some kind of list it puts in an unwanted extra indentation
5052              # level which is hard to remove.
5053              #
5054              # Example where extra indentation looks ok:
5055              # return 1
5056              #   if $det_a < 0 and $det_b > 0
5057              #       or $det_a > 0 and $det_b < 0;
5058              #
5059              # Example where extra indentation is not needed because
5060              # the eval brace also provides indentation:
5061              # print "not " if defined eval {
5062              #     reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
5063              # };
5064              #
5065              # The following rule works fairly well:
5066              #   Undo the flag if the end of this line, or start of the next
5067              #   line, is an opening container token or a comma.
5068              # This almost always works, but if not after another pass it will
5069              # be stable.
5070                 my $forced_indentation_flag = $routput_indent_flag->[$i];
5071                 if ( $forced_indentation_flag && $type_i eq 'k' ) {
5072                     my $ixlast  = -1;
5073                     my $ilast   = $routput_token_list->[$ixlast];
5074                     my $toklast = $routput_token_type->[$ilast];
5075                     if ( $toklast eq '#' ) {
5076                         $ixlast--;
5077                         $ilast   = $routput_token_list->[$ixlast];
5078                         $toklast = $routput_token_type->[$ilast];
5079                     }
5080                     if ( $toklast eq 'b' ) {
5081                         $ixlast--;
5082                         $ilast   = $routput_token_list->[$ixlast];
5083                         $toklast = $routput_token_type->[$ilast];
5084                     }
5085                     if ( $toklast =~ /^[\{,]$/ ) {
5086                         $forced_indentation_flag = 0;
5087                     }
5088                     else {
5089                         ( $toklast, my $i_next ) =
5090                           find_next_nonblank_token( $max_token_index, $rtokens,
5091                             $max_token_index );
5092                         if ( $toklast =~ /^[\{,]$/ ) {
5093                             $forced_indentation_flag = 0;
5094                         }
5095                     }
5096                 } ## end if ( $forced_indentation_flag...)
5097
5098                 # if we are already in an indented if, see if we should outdent
5099                 if ($indented_if_level) {
5100
5101                     # don't try to nest trailing if's - shouldn't happen
5102                     if ( $type_i eq 'k' ) {
5103                         $forced_indentation_flag = 0;
5104                     }
5105
5106                     # check for the normal case - outdenting at next ';'
5107                     elsif ( $type_i eq ';' ) {
5108                         if ( $level_in_tokenizer == $indented_if_level ) {
5109                             $forced_indentation_flag = -1;
5110                             $indented_if_level       = 0;
5111                         }
5112                     }
5113
5114                     # handle case of missing semicolon
5115                     elsif ( $type_i eq '}' ) {
5116                         if ( $level_in_tokenizer == $indented_if_level ) {
5117                             $indented_if_level = 0;
5118
5119                             $level_in_tokenizer--;
5120                             if ( @{$rslevel_stack} > 1 ) {
5121                                 pop( @{$rslevel_stack} );
5122                             }
5123                             if ( length($nesting_block_string) > 1 )
5124                             {    # true for valid script
5125                                 chop $nesting_block_string;
5126                                 chop $nesting_list_string;
5127                             }
5128                         }
5129                     }
5130                 } ## end if ($indented_if_level)
5131
5132                 # Now we have the first approximation to the level
5133                 $level_i = $level_in_tokenizer;
5134
5135                 # set primary indentation levels based on structural braces
5136                 # Note: these are set so that the leading braces have a HIGHER
5137                 # level than their CONTENTS, which is convenient for indentation
5138                 # Also, define continuation indentation for each token.
5139                 if (   $type_i eq '{'
5140                     || $type_i eq 'L'
5141                     || $forced_indentation_flag > 0 )
5142                 {
5143
5144                     # use environment before updating
5145                     $container_environment =
5146                         $nesting_block_flag ? 'BLOCK'
5147                       : $nesting_list_flag  ? 'LIST'
5148                       :                       EMPTY_STRING;
5149
5150                     # if the difference between total nesting levels is not 1,
5151                     # there are intervening non-structural nesting types between
5152                     # this '{' and the previous unclosed '{'
5153                     my $intervening_secondary_structure = 0;
5154                     if ( @{$rslevel_stack} ) {
5155                         $intervening_secondary_structure =
5156                           $slevel_in_tokenizer - $rslevel_stack->[-1];
5157                     }
5158
5159      # Continuation Indentation
5160      #
5161      # Having tried setting continuation indentation both in the formatter and
5162      # in the tokenizer, I can say that setting it in the tokenizer is much,
5163      # much easier.  The formatter already has too much to do, and can't
5164      # make decisions on line breaks without knowing what 'ci' will be at
5165      # arbitrary locations.
5166      #
5167      # But a problem with setting the continuation indentation (ci) here
5168      # in the tokenizer is that we do not know where line breaks will actually
5169      # be.  As a result, we don't know if we should propagate continuation
5170      # indentation to higher levels of structure.
5171      #
5172      # For nesting of only structural indentation, we never need to do this.
5173      # For example, in a long if statement, like this
5174      #
5175      #   if ( !$output_block_type[$i]
5176      #     && ($in_statement_continuation) )
5177      #   {           <--outdented
5178      #       do_something();
5179      #   }
5180      #
5181      # the second line has ci but we do normally give the lines within the BLOCK
5182      # any ci.  This would be true if we had blocks nested arbitrarily deeply.
5183      #
5184      # But consider something like this, where we have created a break after
5185      # an opening paren on line 1, and the paren is not (currently) a
5186      # structural indentation token:
5187      #
5188      # my $file = $menubar->Menubutton(
5189      #   qw/-text File -underline 0 -menuitems/ => [
5190      #       [
5191      #           Cascade    => '~View',
5192      #           -menuitems => [
5193      #           ...
5194      #
5195      # The second line has ci, so it would seem reasonable to propagate it
5196      # down, giving the third line 1 ci + 1 indentation.  This suggests the
5197      # following rule, which is currently used to propagating ci down: if there
5198      # are any non-structural opening parens (or brackets, or braces), before
5199      # an opening structural brace, then ci is propagated down, and otherwise
5200      # not.  The variable $intervening_secondary_structure contains this
5201      # information for the current token, and the string
5202      # "$ci_string_in_tokenizer" is a stack of previous values of this
5203      # variable.
5204
5205                     # save the current states
5206                     push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
5207                     $level_in_tokenizer++;
5208
5209                     if ( $level_in_tokenizer >
5210                         $tokenizer_self->[_maximum_level_] )
5211                     {
5212                         $tokenizer_self->[_maximum_level_] =
5213                           $level_in_tokenizer;
5214                     }
5215
5216                     if ($forced_indentation_flag) {
5217
5218                         # break BEFORE '?' when there is forced indentation
5219                         if ( $type_i eq '?' ) {
5220                             $level_i = $level_in_tokenizer;
5221                         }
5222                         if ( $type_i eq 'k' ) {
5223                             $indented_if_level = $level_in_tokenizer;
5224                         }
5225
5226                         # do not change container environment here if we are not
5227                         # at a real list. Adding this check prevents "blinkers"
5228                         # often near 'unless" clauses, such as in the following
5229                         # code:
5230 ##          next
5231 ##            unless -e (
5232 ##                    $archive =
5233 ##                      File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
5234 ##            );
5235
5236                         $nesting_block_string .= "$nesting_block_flag";
5237                     } ## end if ($forced_indentation_flag)
5238                     else {
5239
5240                         if ( $routput_block_type->[$i] ) {
5241                             $nesting_block_flag = 1;
5242                             $nesting_block_string .= '1';
5243                         }
5244                         else {
5245                             $nesting_block_flag = 0;
5246                             $nesting_block_string .= '0';
5247                         }
5248                     }
5249
5250                     # we will use continuation indentation within containers
5251                     # which are not blocks and not logical expressions
5252                     my $bit = 0;
5253                     if ( !$routput_block_type->[$i] ) {
5254
5255                         # propagate flag down at nested open parens
5256                         if ( $routput_container_type->[$i] eq '(' ) {
5257                             $bit = 1 if $nesting_list_flag;
5258                         }
5259
5260                   # use list continuation if not a logical grouping
5261                   # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
5262                         else {
5263                             $bit = 1
5264                               unless
5265                               $is_logical_container{ $routput_container_type
5266                                   ->[$i] };
5267                         }
5268                     }
5269                     $nesting_list_string .= $bit;
5270                     $nesting_list_flag = $bit;
5271
5272                     $ci_string_in_tokenizer .=
5273                       ( $intervening_secondary_structure != 0 ) ? '1' : '0';
5274                     $ci_string_sum =
5275                       ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5276                     $continuation_string_in_tokenizer .=
5277                       ( $in_statement_continuation > 0 ) ? '1' : '0';
5278
5279    #  Sometimes we want to give an opening brace continuation indentation,
5280    #  and sometimes not.  For code blocks, we don't do it, so that the leading
5281    #  '{' gets outdented, like this:
5282    #
5283    #   if ( !$output_block_type[$i]
5284    #     && ($in_statement_continuation) )
5285    #   {           <--outdented
5286    #
5287    #  For other types, we will give them continuation indentation.  For example,
5288    #  here is how a list looks with the opening paren indented:
5289    #
5290    #     @LoL =
5291    #       ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
5292    #         [ "homer", "marge", "bart" ], );
5293    #
5294    #  This looks best when 'ci' is one-half of the indentation  (i.e., 2 and 4)
5295
5296                     my $total_ci = $ci_string_sum;
5297                     if (
5298                         !$routput_block_type->[$i]    # patch: skip for BLOCK
5299                         && ($in_statement_continuation)
5300                         && !( $forced_indentation_flag && $type_i eq ':' )
5301                       )
5302                     {
5303                         $total_ci += $in_statement_continuation
5304                           unless (
5305                             substr( $ci_string_in_tokenizer, -1 ) eq '1' );
5306                     }
5307
5308                     $ci_string_i               = $total_ci;
5309                     $in_statement_continuation = 0;
5310                 } ## end if ( $type_i eq '{' ||...})
5311
5312                 elsif ($type_i eq '}'
5313                     || $type_i eq 'R'
5314                     || $forced_indentation_flag < 0 )
5315                 {
5316
5317                  # only a nesting error in the script would prevent popping here
5318                     if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
5319
5320                     $level_i = --$level_in_tokenizer;
5321
5322                     # restore previous level values
5323                     if ( length($nesting_block_string) > 1 )
5324                     {    # true for valid script
5325                         chop $nesting_block_string;
5326                         $nesting_block_flag =
5327                           substr( $nesting_block_string, -1 ) eq '1';
5328                         chop $nesting_list_string;
5329                         $nesting_list_flag =
5330                           substr( $nesting_list_string, -1 ) eq '1';
5331
5332                         chop $ci_string_in_tokenizer;
5333                         $ci_string_sum =
5334                           ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
5335
5336                         $in_statement_continuation =
5337                           chop $continuation_string_in_tokenizer;
5338
5339                         # zero continuation flag at terminal BLOCK '}' which
5340                         # ends a statement.
5341                         my $block_type_i = $routput_block_type->[$i];
5342                         if ($block_type_i) {
5343
5344                             # ...These include non-anonymous subs
5345                             # note: could be sub ::abc { or sub 'abc
5346                             if ( $block_type_i =~ m/^sub\s*/gc ) {
5347
5348                                 # note: older versions of perl require the /gc
5349                                 # modifier here or else the \G does not work.
5350                                 if ( $block_type_i =~ /\G('|::|\w)/gc ) {
5351                                     $in_statement_continuation = 0;
5352                                 }
5353                             }
5354
5355                             # ...and include all block types except user subs
5356                             # with block prototypes and these:
5357                             # (sort|grep|map|do|eval)
5358                             elsif (
5359                                 $is_zero_continuation_block_type{$block_type_i}
5360                               )
5361                             {
5362                                 $in_statement_continuation = 0;
5363                             }
5364
5365                             # ..but these are not terminal types:
5366                             #     /^(sort|grep|map|do|eval)$/ )
5367                             elsif ($is_sort_map_grep_eval_do{$block_type_i}
5368                                 || $is_grep_alias{$block_type_i} )
5369                             {
5370                             }
5371
5372                             # ..and a block introduced by a label
5373                             # /^\w+\s*:$/gc ) {
5374                             elsif ( $block_type_i =~ /:$/ ) {
5375                                 $in_statement_continuation = 0;
5376                             }
5377
5378                             # user function with block prototype
5379                             else {
5380                                 $in_statement_continuation = 0;
5381                             }
5382                         } ## end if ($block_type_i)
5383
5384                         # If we are in a list, then
5385                         # we must set continuation indentation at the closing
5386                         # paren of something like this (paren after $check):
5387                         #     assert(
5388                         #         __LINE__,
5389                         #         ( not defined $check )
5390                         #           or ref $check
5391                         #           or $check eq "new"
5392                         #           or $check eq "old",
5393                         #     );
5394                         elsif ( $tok_i eq ')' ) {
5395                             $in_statement_continuation = 1
5396                               if (
5397                                 $is_list_end_type{
5398                                     $routput_container_type->[$i]
5399                                 }
5400                               );
5401                             ##if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
5402                         }
5403
5404                         elsif ( $tok_i eq ';' ) {
5405                             $in_statement_continuation = 0;
5406                         }
5407                     } ## end if ( length($nesting_block_string...))
5408
5409                     # use environment after updating
5410                     $container_environment =
5411                         $nesting_block_flag ? 'BLOCK'
5412                       : $nesting_list_flag  ? 'LIST'
5413                       :                       EMPTY_STRING;
5414                     $ci_string_i = $ci_string_sum + $in_statement_continuation;
5415                 } ## end elsif ( $type_i eq '}' ||...{)
5416
5417                 # not a structural indentation type..
5418                 else {
5419
5420                     $container_environment =
5421                         $nesting_block_flag ? 'BLOCK'
5422                       : $nesting_list_flag  ? 'LIST'
5423                       :                       EMPTY_STRING;
5424
5425                     # zero the continuation indentation at certain tokens so
5426                     # that they will be at the same level as its container.  For
5427                     # commas, this simplifies the -lp indentation logic, which
5428                     # counts commas.  For ?: it makes them stand out.
5429                     if ($nesting_list_flag) {
5430                         ##      $type_i =~ /^[,\?\:]$/
5431                         if ( $is_comma_question_colon{$type_i} ) {
5432                             $in_statement_continuation = 0;
5433                         }
5434                     }
5435
5436                     # be sure binary operators get continuation indentation
5437                     if (
5438                         $container_environment
5439                         && (   $type_i eq 'k' && $is_binary_keyword{$tok_i}
5440                             || $is_binary_type{$type_i} )
5441                       )
5442                     {
5443                         $in_statement_continuation = 1;
5444                     }
5445
5446                     # continuation indentation is sum of any open ci from
5447                     # previous levels plus the current level
5448                     $ci_string_i = $ci_string_sum + $in_statement_continuation;
5449
5450                     # update continuation flag ...
5451
5452                     ## if ( $type_i ne 'b' && $type_i ne '#' ) {  # moved above
5453
5454                     # if we are in a BLOCK
5455                     if ($nesting_block_flag) {
5456
5457                         # the next token after a ';' and label starts a new stmt
5458                         if ( $type_i eq ';' || $type_i eq 'J' ) {
5459                             $in_statement_continuation = 0;
5460                         }
5461
5462                         # otherwise, we are continuing the current statement
5463                         else {
5464                             $in_statement_continuation = 1;
5465                         }
5466                     }
5467
5468                     # if we are not in a BLOCK..
5469                     else {
5470
5471                         # do not use continuation indentation if not list
5472                         # environment (could be within if/elsif clause)
5473                         if ( !$nesting_list_flag ) {
5474                             $in_statement_continuation = 0;
5475                         }
5476
5477                         # otherwise, the token after a ',' starts a new term
5478
5479                         # Patch FOR RT#99961; no continuation after a ';'
5480                         # This is needed because perltidy currently marks
5481                         # a block preceded by a type character like % or @
5482                         # as a non block, to simplify formatting. But these
5483                         # are actually blocks and can have semicolons.
5484                         # See code_block_type() and is_non_structural_brace().
5485                         elsif ( $type_i eq ',' || $type_i eq ';' ) {
5486                             $in_statement_continuation = 0;
5487                         }
5488
5489                         # otherwise, we are continuing the current term
5490                         else {
5491                             $in_statement_continuation = 1;
5492                         }
5493                     } ## end else [ if ($nesting_block_flag)]
5494
5495                     ##}  ## end if ( $type_i ne 'b' ... # (old moved above)
5496
5497                 } ## end else [ if ( $type_i eq '{' ||...})]
5498
5499                 if ( $level_in_tokenizer < 0 ) {
5500                     unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
5501                         $tokenizer_self->[_saw_negative_indentation_] = 1;
5502                         warning("Starting negative indentation\n");
5503                     }
5504                 }
5505
5506                 # set secondary nesting levels based on all containment token
5507                 # types Note: these are set so that the nesting depth is the
5508                 # depth of the PREVIOUS TOKEN, which is convenient for setting
5509                 # the strength of token bonds
5510
5511                 #    /^[L\{\(\[]$/
5512                 if ( $is_opening_type{$type_i} ) {
5513                     $slevel_in_tokenizer++;
5514                     $nesting_token_string .= $tok_i;
5515                     $nesting_type_string  .= $type_i;
5516                 }
5517
5518                 #       /^[R\}\)\]]$/
5519                 elsif ( $is_closing_type{$type_i} ) {
5520                     $slevel_in_tokenizer--;
5521                     my $char = chop $nesting_token_string;
5522
5523                     if ( $char ne $matching_start_token{$tok_i} ) {
5524                         $nesting_token_string .= $char . $tok_i;
5525                         $nesting_type_string  .= $type_i;
5526                     }
5527                     else {
5528                         chop $nesting_type_string;
5529                     }
5530                 }
5531
5532                 # apply token type patch:
5533                 # - output anonymous 'sub' as keyword (type 'k')
5534                 # - output __END__, __DATA__, and format as type 'k' instead
5535                 #   of ';' to make html colors correct, etc.
5536                 # The following hash tests are equivalent to these older tests:
5537                 #   if ( $type_i eq 't' && $is_sub{$tok_i} ) { $fix_type = 'k' }
5538                 #   if ( $type_i eq ';' && $tok_i =~ /\w/ ) { $fix_type = 'k' }
5539                 if (   $is_END_DATA_format_sub{$tok_i}
5540                     && $is_semicolon_or_t{$type_i} )
5541                 {
5542                     $type_i = 'k';
5543                 }
5544             } ## end else [ if ( $type_i eq 'b' ||...)]
5545
5546             # Store the values for this token
5547             push( @ci_string,     $ci_string_i );
5548             push( @levels,        $level_i );
5549             push( @block_type,    $routput_block_type->[$i] );
5550             push( @type_sequence, $routput_type_sequence->[$i] );
5551             push( @token_type,    $type_i );
5552
5553             # Form and store the previous token
5554             if ( $im >= 0 ) {
5555                 $num =
5556                   $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
5557
5558                 if ( $num > 0 ) {
5559                     push( @tokens,
5560                         substr( $input_line, $rtoken_map->[$im], $num ) );
5561                 }
5562             }
5563
5564             # or grab some values for the leading token (needed for log output)
5565             else {
5566                 $line_of_tokens->{_nesting_blocks_0} = $nesting_block_string;
5567             }
5568
5569             $im = $i;
5570         } ## end foreach my $i ( @{$routput_token_list...})
5571
5572         # Form and store the final token
5573         $num = length($input_line) - $rtoken_map->[$im];   # make the last token
5574         if ( $num > 0 ) {
5575             push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
5576         }
5577
5578         $line_of_tokens->{_rtoken_type}    = \@token_type;
5579         $line_of_tokens->{_rtokens}        = \@tokens;
5580         $line_of_tokens->{_rblock_type}    = \@block_type;
5581         $line_of_tokens->{_rtype_sequence} = \@type_sequence;
5582         $line_of_tokens->{_rlevels}        = \@levels;
5583         $line_of_tokens->{_rci_levels}     = \@ci_string;
5584
5585         return;
5586     } ## end sub tokenizer_wrapup_line
5587 } ## end tokenize_this_line
5588
5589 #########i#############################################################
5590 # Tokenizer routines which assist in identifying token types
5591 #######################################################################
5592
5593 # hash lookup table of operator expected values
5594 my %op_expected_table;
5595
5596 # exceptions to perl's weird parsing rules after type 'Z'
5597 my %is_weird_parsing_rule_exception;
5598
5599 my %is_paren_dollar;
5600
5601 my %is_n_v;
5602
5603 BEGIN {
5604
5605     # Always expecting TERM following these types:
5606     # note: this is identical to '@value_requestor_type' defined later.
5607     my @q = qw(
5608       ; ! + x & ?  F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
5609       || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
5610       &= // >> ~. &. |. ^.
5611       ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
5612     );
5613     push @q, ',';
5614     push @q, '(';    # for completeness, not currently a token type
5615     @{op_expected_table}{@q} = (TERM) x scalar(@q);
5616
5617     # Always UNKNOWN following these types:
5618     # Fix for c030: added '->' to this list
5619     @q = qw( w -> );
5620     @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
5621
5622     # Always expecting OPERATOR ...
5623     # 'n' and 'v' are currently excluded because they might be VERSION numbers
5624     # 'i' is currently excluded because it might be a package
5625     # 'q' is currently excluded because it might be a prototype
5626     # Fix for c030: removed '->' from this list:
5627     @q = qw( -- C h R ++ ] Q <> );    ## n v q i );
5628     push @q, ')';
5629     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
5630
5631     # Fix for git #62: added '*' and '%'
5632     @q = qw( < ? * % );
5633     @{is_weird_parsing_rule_exception}{@q} = (1) x scalar(@q);
5634
5635     @q = qw<) $>;
5636     @{is_paren_dollar}{@q} = (1) x scalar(@q);
5637
5638     @q = qw( n v );
5639     @{is_n_v}{@q} = (1) x scalar(@q);
5640
5641 }
5642
5643 use constant DEBUG_OPERATOR_EXPECTED => 0;
5644
5645 sub operator_expected {
5646
5647     # Returns a parameter indicating what types of tokens can occur next
5648
5649     # Call format:
5650     #    $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
5651     # where
5652     #    $prev_type is the type of the previous token (blank or not)
5653     #    $tok is the current token
5654     #    $next_type is the type of the next token (blank or not)
5655
5656     # Many perl symbols have two or more meanings.  For example, '<<'
5657     # can be a shift operator or a here-doc operator.  The
5658     # interpretation of these symbols depends on the current state of
5659     # the tokenizer, which may either be expecting a term or an
5660     # operator.  For this example, a << would be a shift if an OPERATOR
5661     # is expected, and a here-doc if a TERM is expected.  This routine
5662     # is called to make this decision for any current token.  It returns
5663     # one of three possible values:
5664     #
5665     #     OPERATOR - operator expected (or at least, not a term)
5666     #     UNKNOWN  - can't tell
5667     #     TERM     - a term is expected (or at least, not an operator)
5668     #
5669     # The decision is based on what has been seen so far.  This
5670     # information is stored in the "$last_nonblank_type" and
5671     # "$last_nonblank_token" variables.  For example, if the
5672     # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
5673     # if $last_nonblank_type is 'n' (numeric), we are expecting an
5674     # OPERATOR.
5675     #
5676     # If a UNKNOWN is returned, the calling routine must guess. A major
5677     # goal of this tokenizer is to minimize the possibility of returning
5678     # UNKNOWN, because a wrong guess can spoil the formatting of a
5679     # script.
5680     #
5681     # Adding NEW_TOKENS: it is critically important that this routine be
5682     # updated to allow it to determine if an operator or term is to be
5683     # expected after the new token.  Doing this simply involves adding
5684     # the new token character to one of the regexes in this routine or
5685     # to one of the hash lists
5686     # that it uses, which are initialized in the BEGIN section.
5687     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
5688     # $statement_type
5689
5690     # When possible, token types should be selected such that we can determine
5691     # the 'operator_expected' value by a simple hash lookup.  If there are
5692     # exceptions, that is an indication that a new type is needed.
5693
5694     my ($rarg) = @_;
5695
5696     my $msg = EMPTY_STRING;
5697
5698     ##############
5699     # Table lookup
5700     ##############
5701
5702     # Many types are can be obtained by a table lookup given the previous type.
5703     # This typically handles half or more of the calls.
5704     my $op_expected = $op_expected_table{$last_nonblank_type};
5705     if ( defined($op_expected) ) {
5706         $msg = "Table lookup";
5707         goto RETURN;
5708     }
5709
5710     ######################
5711     # Handle special cases
5712     ######################
5713
5714     $op_expected = UNKNOWN;
5715     my ( $prev_type, $tok, $next_type ) = @{$rarg};
5716
5717     # Types 'k', '}' and 'Z' depend on context
5718     # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
5719     # context but that dependence could eventually be eliminated with better
5720     # token type definition
5721
5722     # identifier...
5723     if ( $last_nonblank_type eq 'i' ) {
5724         $op_expected = OPERATOR;
5725
5726         # FIXME: it would be cleaner to make this a special type
5727         # expecting VERSION or {} after package NAMESPACE
5728         # TODO: maybe mark these words as type 'Y'?
5729         if (   substr( $last_nonblank_token, 0, 7 ) eq 'package'
5730             && $statement_type      =~ /^package\b/
5731             && $last_nonblank_token =~ /^package\b/ )
5732         {
5733             $op_expected = TERM;
5734         }
5735     }
5736
5737     # keyword...
5738     elsif ( $last_nonblank_type eq 'k' ) {
5739         $op_expected = TERM;
5740         if ( $expecting_operator_token{$last_nonblank_token} ) {
5741             $op_expected = OPERATOR;
5742         }
5743         elsif ( $expecting_term_token{$last_nonblank_token} ) {
5744
5745             # Exceptions from TERM:
5746
5747             # // may follow perl functions which may be unary operators
5748             # see test file dor.t (defined or);
5749             if (
5750                    $tok eq '/'
5751                 && $next_type eq '/'
5752                 && $is_keyword_rejecting_slash_as_pattern_delimiter{
5753                     $last_nonblank_token}
5754               )
5755             {
5756                 $op_expected = OPERATOR;
5757             }
5758
5759             # Patch to allow a ? following 'split' to be a deprecated pattern
5760             # delimiter.  This patch is coordinated with the omission of split
5761             # from the list
5762             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
5763             # will force perltidy to guess.
5764             elsif ($tok eq '?'
5765                 && $last_nonblank_token eq 'split' )
5766             {
5767                 $op_expected = UNKNOWN;
5768             }
5769         }
5770     } ## end type 'k'
5771
5772     # closing container token...
5773
5774     # Note that the actual token for type '}' may also be a ')'.
5775
5776     # Also note that $last_nonblank_token is not the token corresponding to
5777     # $last_nonblank_type when the type is a closing container.  In that
5778     # case it is the token before the corresponding opening container token.
5779     # So for example, for this snippet
5780     #       $a = do { BLOCK } / 2;
5781     # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
5782
5783     elsif ( $last_nonblank_type eq '}' ) {
5784         $op_expected = UNKNOWN;
5785
5786         # handle something after 'do' and 'eval'
5787         if ( $is_block_operator{$last_nonblank_token} ) {
5788
5789             # something like $a = do { BLOCK } / 2;
5790             $op_expected = OPERATOR;    # block mode following }
5791         }
5792
5793         ##elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
5794         elsif ( $is_paren_dollar{ substr( $last_nonblank_token, 0, 1 ) }
5795             || substr( $last_nonblank_token, 0, 2 ) eq '->' )
5796         {
5797             $op_expected = OPERATOR;
5798             if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
5799         }
5800
5801         # Check for smartmatch operator before preceding brace or square
5802         # bracket.  For example, at the ? after the ] in the following
5803         # expressions we are expecting an operator:
5804         #
5805         # qr/3/ ~~ ['1234'] ? 1 : 0;
5806         # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
5807         elsif ( $last_nonblank_token eq '~~' ) {
5808             $op_expected = OPERATOR;
5809         }
5810
5811         # A right brace here indicates the end of a simple block.  All
5812         # non-structural right braces have type 'R' all braces associated with
5813         # block operator keywords have been given those keywords as
5814         # "last_nonblank_token" and caught above.  (This statement is order
5815         # dependent, and must come after checking $last_nonblank_token).
5816         else {
5817
5818             # patch for dor.t (defined or).
5819             if (   $tok eq '/'
5820                 && $next_type eq '/'
5821                 && $last_nonblank_token eq ']' )
5822             {
5823                 $op_expected = OPERATOR;
5824             }
5825
5826             # Patch for RT #116344: misparse a ternary operator after an
5827             # anonymous hash, like this:
5828             #   return ref {} ? 1 : 0;
5829             # The right brace should really be marked type 'R' in this case,
5830             # and it is safest to return an UNKNOWN here. Expecting a TERM will
5831             # cause the '?' to always be interpreted as a pattern delimiter
5832             # rather than introducing a ternary operator.
5833             elsif ( $tok eq '?' ) {
5834                 $op_expected = UNKNOWN;
5835             }
5836             else {
5837                 $op_expected = TERM;
5838             }
5839         }
5840     } ## end type '}'
5841
5842     # number or v-string...
5843     # An exception is for VERSION numbers a 'use' statement. It has the format
5844     #     use Module VERSION LIST
5845     # We could avoid this exception by writing a special sub to parse 'use'
5846     # statements and perhaps mark these numbers with a new type V (for VERSION)
5847     ##elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
5848     elsif ( $is_n_v{$last_nonblank_type} ) {
5849         $op_expected = OPERATOR;
5850         if ( $statement_type eq 'use' ) {
5851             $op_expected = UNKNOWN;
5852         }
5853     }
5854
5855     # quote...
5856     # FIXME: labeled prototype words should probably be given type 'A' or maybe
5857     # 'J'; not 'q'; or maybe mark as type 'Y'
5858     elsif ( $last_nonblank_type eq 'q' ) {
5859         $op_expected = OPERATOR;
5860         if ( $last_nonblank_token eq 'prototype' )
5861           ##|| $last_nonblank_token eq 'switch' )
5862         {
5863             $op_expected = TERM;
5864         }
5865     }
5866
5867     # file handle or similar
5868     elsif ( $last_nonblank_type eq 'Z' ) {
5869
5870         $op_expected = UNKNOWN;
5871
5872         # angle.t
5873         if ( $last_nonblank_token =~ /^\w/ ) {
5874             $op_expected = UNKNOWN;
5875         }
5876
5877         # Exception to weird parsing rules for 'x(' ... see case b1205:
5878         # In something like 'print $vv x(...' the x is an operator;
5879         # Likewise in 'print $vv x$ww' the x is an operator (case b1207)
5880         # otherwise x follows the weird parsing rules.
5881         elsif ( $tok eq 'x' && $next_type =~ /^[\(\$\@\%]$/ ) {
5882             $op_expected = OPERATOR;
5883         }
5884
5885         # The 'weird parsing rules' of next section do not work for '<' and '?'
5886         # It is best to mark them as unknown.  Test case:
5887         #  print $fh <DATA>;
5888         elsif ( $is_weird_parsing_rule_exception{$tok} ) {
5889             $op_expected = UNKNOWN;
5890         }
5891
5892         # For possible file handle like "$a", Perl uses weird parsing rules.
5893         # For example:
5894         # print $a/2,"/hi";   - division
5895         # print $a / 2,"/hi"; - division
5896         # print $a/ 2,"/hi";  - division
5897         # print $a /2,"/hi";  - pattern (and error)!
5898         # Some examples where this logic works okay, for '&','*','+':
5899         #    print $fh &xsi_protos(@mods);
5900         #    my $x = new $CompressClass *FH;
5901         #    print $OUT +( $count % 15 ? ", " : "\n\t" );
5902         elsif ($prev_type eq 'b'
5903             && $next_type ne 'b' )
5904         {
5905             $op_expected = TERM;
5906         }
5907
5908         # Note that '?' and '<' have been moved above
5909         # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
5910         elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
5911
5912             # Do not complain in 'use' statements, which have special syntax.
5913             # For example, from RT#130344:
5914             #   use lib $FindBin::Bin . '/lib';
5915             if ( $statement_type ne 'use' ) {
5916                 complain(
5917 "operator in possible indirect object location not recommended\n"
5918                 );
5919             }
5920             $op_expected = OPERATOR;
5921         }
5922     }
5923
5924     # anything else...
5925     else {
5926         $op_expected = UNKNOWN;
5927     }
5928
5929   RETURN:
5930
5931     DEBUG_OPERATOR_EXPECTED && do {
5932         print STDOUT
5933 "OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
5934     };
5935
5936     return $op_expected;
5937
5938 } ## end sub operator_expected
5939
5940 sub new_statement_ok {
5941
5942     # return true if the current token can start a new statement
5943     # USES GLOBAL VARIABLES: $last_nonblank_type
5944
5945     return label_ok()    # a label would be ok here
5946
5947       || $last_nonblank_type eq 'J';    # or we follow a label
5948
5949 } ## end sub new_statement_ok
5950
5951 sub label_ok {
5952
5953     # Decide if a bare word followed by a colon here is a label
5954     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5955     # $brace_depth, @brace_type
5956
5957     # if it follows an opening or closing code block curly brace..
5958     if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
5959         && $last_nonblank_type eq $last_nonblank_token )
5960     {
5961
5962         # it is a label if and only if the curly encloses a code block
5963         return $brace_type[$brace_depth];
5964     }
5965
5966     # otherwise, it is a label if and only if it follows a ';' (real or fake)
5967     # or another label
5968     else {
5969         return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
5970     }
5971 } ## end sub label_ok
5972
5973 sub code_block_type {
5974
5975     # Decide if this is a block of code, and its type.
5976     # Must be called only when $type = $token = '{'
5977     # The problem is to distinguish between the start of a block of code
5978     # and the start of an anonymous hash reference
5979     # Returns "" if not code block, otherwise returns 'last_nonblank_token'
5980     # to indicate the type of code block.  (For example, 'last_nonblank_token'
5981     # might be 'if' for an if block, 'else' for an else block, etc).
5982     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
5983     # $last_nonblank_block_type, $brace_depth, @brace_type
5984
5985     # handle case of multiple '{'s
5986
5987 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
5988
5989     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
5990     if (   $last_nonblank_token eq '{'
5991         && $last_nonblank_type eq $last_nonblank_token )
5992     {
5993
5994         # opening brace where a statement may appear is probably
5995         # a code block but might be and anonymous hash reference
5996         if ( $brace_type[$brace_depth] ) {
5997             return decide_if_code_block( $i, $rtokens, $rtoken_type,
5998                 $max_token_index );
5999         }
6000
6001         # cannot start a code block within an anonymous hash
6002         else {
6003             return EMPTY_STRING;
6004         }
6005     }
6006
6007     elsif ( $last_nonblank_token eq ';' ) {
6008
6009         # an opening brace where a statement may appear is probably
6010         # a code block but might be and anonymous hash reference
6011         return decide_if_code_block( $i, $rtokens, $rtoken_type,
6012             $max_token_index );
6013     }
6014
6015     # handle case of '}{'
6016     elsif ($last_nonblank_token eq '}'
6017         && $last_nonblank_type eq $last_nonblank_token )
6018     {
6019
6020         # a } { situation ...
6021         # could be hash reference after code block..(blktype1.t)
6022         if ($last_nonblank_block_type) {
6023             return decide_if_code_block( $i, $rtokens, $rtoken_type,
6024                 $max_token_index );
6025         }
6026
6027         # must be a block if it follows a closing hash reference
6028         else {
6029             return $last_nonblank_token;
6030         }
6031     }
6032
6033     ################################################################
6034     # NOTE: braces after type characters start code blocks, but for
6035     # simplicity these are not identified as such.  See also
6036     # sub is_non_structural_brace.
6037     ################################################################
6038
6039 ##    elsif ( $last_nonblank_type eq 't' ) {
6040 ##       return $last_nonblank_token;
6041 ##    }
6042
6043     # brace after label:
6044     elsif ( $last_nonblank_type eq 'J' ) {
6045         return $last_nonblank_token;
6046     }
6047
6048 # otherwise, look at previous token.  This must be a code block if
6049 # it follows any of these:
6050 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
6051     elsif ($is_code_block_token{$last_nonblank_token}
6052         || $is_grep_alias{$last_nonblank_token} )
6053     {
6054
6055         # Bug Patch: Note that the opening brace after the 'if' in the following
6056         # snippet is an anonymous hash ref and not a code block!
6057         #   print 'hi' if { x => 1, }->{x};
6058         # We can identify this situation because the last nonblank type
6059         # will be a keyword (instead of a closing paren)
6060         if (
6061             $last_nonblank_type eq 'k'
6062             && (   $last_nonblank_token eq 'if'
6063                 || $last_nonblank_token eq 'unless' )
6064           )
6065         {
6066             return EMPTY_STRING;
6067         }
6068         else {
6069             return $last_nonblank_token;
6070         }
6071     }
6072
6073     # or a sub or package BLOCK
6074     elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6075         && $last_nonblank_token =~ /^(sub|package)\b/ )
6076     {
6077         return $last_nonblank_token;
6078     }
6079
6080     # or a sub alias
6081     elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
6082         && ( $is_sub{$last_nonblank_token} ) )
6083     {
6084         return 'sub';
6085     }
6086
6087     elsif ( $statement_type =~ /^(sub|package)\b/ ) {
6088         return $statement_type;
6089     }
6090
6091     # user-defined subs with block parameters (like grep/map/eval)
6092     elsif ( $last_nonblank_type eq 'G' ) {
6093         return $last_nonblank_token;
6094     }
6095
6096     # check bareword
6097     elsif ( $last_nonblank_type eq 'w' ) {
6098
6099         # check for syntax 'use MODULE LIST'
6100         # This fixes b1022 b1025 b1027 b1028 b1029 b1030 b1031
6101         return EMPTY_STRING if ( $statement_type eq 'use' );
6102
6103         return decide_if_code_block( $i, $rtokens, $rtoken_type,
6104             $max_token_index );
6105     }
6106
6107     # Patch for bug # RT #94338 reported by Daniel Trizen
6108     # for-loop in a parenthesized block-map triggering an error message:
6109     #    map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
6110     # Check for a code block within a parenthesized function call
6111     elsif ( $last_nonblank_token eq '(' ) {
6112         my $paren_type = $paren_type[$paren_depth];
6113
6114         #                   /^(map|grep|sort)$/
6115         if ( $paren_type && $is_sort_map_grep{$paren_type} ) {
6116
6117             # We will mark this as a code block but use type 't' instead
6118             # of the name of the containing function.  This will allow for
6119             # correct parsing but will usually produce better formatting.
6120             # Braces with block type 't' are not broken open automatically
6121             # in the formatter as are other code block types, and this usually
6122             # works best.
6123             return 't';    # (Not $paren_type)
6124         }
6125         else {
6126             return EMPTY_STRING;
6127         }
6128     }
6129
6130     # handle unknown syntax ') {'
6131     # we previously appended a '()' to mark this case
6132     elsif ( $last_nonblank_token =~ /\(\)$/ ) {
6133         return $last_nonblank_token;
6134     }
6135
6136     # anything else must be anonymous hash reference
6137     else {
6138         return EMPTY_STRING;
6139     }
6140 } ## end sub code_block_type
6141
6142 sub decide_if_code_block {
6143
6144     # USES GLOBAL VARIABLES: $last_nonblank_token
6145     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
6146
6147     my ( $next_nonblank_token, $i_next ) =
6148       find_next_nonblank_token( $i, $rtokens, $max_token_index );
6149
6150     # we are at a '{' where a statement may appear.
6151     # We must decide if this brace starts an anonymous hash or a code
6152     # block.
6153     # return "" if anonymous hash, and $last_nonblank_token otherwise
6154
6155     # initialize to be code BLOCK
6156     my $code_block_type = $last_nonblank_token;
6157
6158     # Check for the common case of an empty anonymous hash reference:
6159     # Maybe something like sub { { } }
6160     if ( $next_nonblank_token eq '}' ) {
6161         $code_block_type = EMPTY_STRING;
6162     }
6163
6164     else {
6165
6166         # To guess if this '{' is an anonymous hash reference, look ahead
6167         # and test as follows:
6168         #
6169         # it is a hash reference if next come:
6170         #   - a string or digit followed by a comma or =>
6171         #   - bareword followed by =>
6172         # otherwise it is a code block
6173         #
6174         # Examples of anonymous hash ref:
6175         # {'aa',};
6176         # {1,2}
6177         #
6178         # Examples of code blocks:
6179         # {1; print "hello\n", 1;}
6180         # {$a,1};
6181
6182         # We are only going to look ahead one more (nonblank/comment) line.
6183         # Strange formatting could cause a bad guess, but that's unlikely.
6184         my @pre_types;
6185         my @pre_tokens;
6186
6187         # Ignore the rest of this line if it is a side comment
6188         if ( $next_nonblank_token ne '#' ) {
6189             @pre_types  = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
6190             @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
6191         }
6192         my ( $rpre_tokens, $rpre_types ) =
6193           peek_ahead_for_n_nonblank_pre_tokens(20);    # 20 is arbitrary but
6194                                                        # generous, and prevents
6195                                                        # wasting lots of
6196                                                        # time in mangled files
6197         if ( defined($rpre_types) && @{$rpre_types} ) {
6198             push @pre_types,  @{$rpre_types};
6199             push @pre_tokens, @{$rpre_tokens};
6200         }
6201
6202         # put a sentinel token to simplify stopping the search
6203         push @pre_types, '}';
6204         push @pre_types, '}';
6205
6206         my $jbeg = 0;
6207         $jbeg = 1 if $pre_types[0] eq 'b';
6208
6209         # first look for one of these
6210         #  - bareword
6211         #  - bareword with leading -
6212         #  - digit
6213         #  - quoted string
6214         my $j = $jbeg;
6215         if ( $pre_types[$j] =~ /^[\'\"]/ ) {
6216
6217             # find the closing quote; don't worry about escapes
6218             my $quote_mark = $pre_types[$j];
6219             foreach my $k ( $j + 1 .. @pre_types - 2 ) {
6220                 if ( $pre_types[$k] eq $quote_mark ) {
6221                     $j = $k + 1;
6222                     my $next = $pre_types[$j];
6223                     last;
6224                 }
6225             }
6226         }
6227         elsif ( $pre_types[$j] eq 'd' ) {
6228             $j++;
6229         }
6230         elsif ( $pre_types[$j] eq 'w' ) {
6231             $j++;
6232         }
6233         elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
6234             $j++;
6235         }
6236         if ( $j > $jbeg ) {
6237
6238             $j++ if $pre_types[$j] eq 'b';
6239
6240             # Patched for RT #95708
6241             if (
6242
6243                 # it is a comma which is not a pattern delimiter except for qw
6244                 (
6245                     $pre_types[$j] eq ','
6246                     ## !~ /^(s|m|y|tr|qr|q|qq|qx)$/
6247                     && !$is_q_qq_qx_qr_s_y_tr_m{ $pre_tokens[$jbeg] }
6248                 )
6249
6250                 # or a =>
6251                 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
6252               )
6253             {
6254                 $code_block_type = EMPTY_STRING;
6255             }
6256         }
6257
6258         if ($code_block_type) {
6259
6260             # Patch for cases b1085 b1128: It is uncertain if this is a block.
6261             # If this brace follows a bareword, then append a space as a signal
6262             # to the formatter that this may not be a block brace.  To find the
6263             # corresponding code in Formatter.pm search for 'b1085'.
6264             $code_block_type .= SPACE if ( $code_block_type =~ /^\w/ );
6265         }
6266     }
6267
6268     return $code_block_type;
6269 } ## end sub decide_if_code_block
6270
6271 sub report_unexpected {
6272
6273     # report unexpected token type and show where it is
6274     # USES GLOBAL VARIABLES: $tokenizer_self
6275     my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
6276         $rpretoken_type, $input_line )
6277       = @_;
6278
6279     if ( ++$tokenizer_self->[_unexpected_error_count_] <= MAX_NAG_MESSAGES ) {
6280         my $msg = "found $found where $expecting expected";
6281         my $pos = $rpretoken_map->[$i_tok];
6282         interrupt_logfile();
6283         my $input_line_number = $tokenizer_self->[_last_line_number_];
6284         my ( $offset, $numbered_line, $underline ) =
6285           make_numbered_line( $input_line_number, $input_line, $pos );
6286         $underline = write_on_underline( $underline, $pos - $offset, '^' );
6287
6288         my $trailer = EMPTY_STRING;
6289         if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
6290             my $pos_prev = $rpretoken_map->[$last_nonblank_i];
6291             my $num;
6292             if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
6293                 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
6294             }
6295             else {
6296                 $num = $pos - $pos_prev;
6297             }
6298             if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
6299
6300             $underline =
6301               write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
6302             $trailer = " (previous token underlined)";
6303         }
6304         $underline =~ s/\s+$//;
6305         warning( $numbered_line . "\n" );
6306         warning( $underline . "\n" );
6307         warning( $msg . $trailer . "\n" );
6308         resume_logfile();
6309     }
6310     return;
6311 } ## end sub report_unexpected
6312
6313 my %is_sigil_or_paren;
6314 my %is_R_closing_sb;
6315
6316 BEGIN {
6317
6318     my @q = qw< $ & % * @ ) >;
6319     @{is_sigil_or_paren}{@q} = (1) x scalar(@q);
6320
6321     @q = qw(R ]);
6322     @{is_R_closing_sb}{@q} = (1) x scalar(@q);
6323 }
6324
6325 sub is_non_structural_brace {
6326
6327     # Decide if a brace or bracket is structural or non-structural
6328     # by looking at the previous token and type
6329     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
6330
6331     # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
6332     # Tentatively deactivated because it caused the wrong operator expectation
6333     # for this code:
6334     #      $user = @vars[1] / 100;
6335     # Must update sub operator_expected before re-implementing.
6336     # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
6337     #    return 0;
6338     # }
6339
6340     ################################################################
6341     # NOTE: braces after type characters start code blocks, but for
6342     # simplicity these are not identified as such.  See also
6343     # sub code_block_type
6344     ################################################################
6345
6346     ##if ($last_nonblank_type eq 't') {return 0}
6347
6348     # otherwise, it is non-structural if it is decorated
6349     # by type information.
6350     # For example, the '{' here is non-structural:   ${xxx}
6351     # Removed '::' to fix c074
6352     ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
6353     return (
6354         ## $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->)/
6355         $is_sigil_or_paren{ substr( $last_nonblank_token, 0, 1 ) }
6356           || substr( $last_nonblank_token, 0, 2 ) eq '->'
6357
6358           # or if we follow a hash or array closing curly brace or bracket
6359           # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
6360           # because the first '}' would have been given type 'R'
6361           ##|| $last_nonblank_type =~ /^([R\]])$/
6362           || $is_R_closing_sb{$last_nonblank_type}
6363     );
6364 } ## end sub is_non_structural_brace
6365
6366 #########i#############################################################
6367 # Tokenizer routines for tracking container nesting depths
6368 #######################################################################
6369
6370 # The following routines keep track of nesting depths of the nesting
6371 # types, ( [ { and ?.  This is necessary for determining the indentation
6372 # level, and also for debugging programs.  Not only do they keep track of
6373 # nesting depths of the individual brace types, but they check that each
6374 # of the other brace types is balanced within matching pairs.  For
6375 # example, if the program sees this sequence:
6376 #
6377 #         {  ( ( ) }
6378 #
6379 # then it can determine that there is an extra left paren somewhere
6380 # between the { and the }.  And so on with every other possible
6381 # combination of outer and inner brace types.  For another
6382 # example:
6383 #
6384 #         ( [ ..... ]  ] )
6385 #
6386 # which has an extra ] within the parens.
6387 #
6388 # The brace types have indexes 0 .. 3 which are indexes into
6389 # the matrices.
6390 #
6391 # The pair ? : are treated as just another nesting type, with ? acting
6392 # as the opening brace and : acting as the closing brace.
6393 #
6394 # The matrix
6395 #
6396 #         $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
6397 #
6398 # saves the nesting depth of brace type $b (where $b is either of the other
6399 # nesting types) when brace type $a enters a new depth.  When this depth
6400 # decreases, a check is made that the current depth of brace types $b is
6401 # unchanged, or otherwise there must have been an error.  This can
6402 # be very useful for localizing errors, particularly when perl runs to
6403 # the end of a large file (such as this one) and announces that there
6404 # is a problem somewhere.
6405 #
6406 # A numerical sequence number is maintained for every nesting type,
6407 # so that each matching pair can be uniquely identified in a simple
6408 # way.
6409
6410 sub increase_nesting_depth {
6411     my ( $aa, $pos ) = @_;
6412
6413     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6414     # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
6415     # $statement_type
6416     $current_depth[$aa]++;
6417     $total_depth++;
6418     $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
6419     my $input_line_number = $tokenizer_self->[_last_line_number_];
6420     my $input_line        = $tokenizer_self->[_line_of_text_];
6421
6422     # Sequence numbers increment by number of items.  This keeps
6423     # a unique set of numbers but still allows the relative location
6424     # of any type to be determined.
6425
6426     ########################################################################
6427     # OLD SEQNO METHOD for incrementing sequence numbers.
6428     # Keep this coding awhile for possible testing.
6429     ## $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
6430     ## my $seqno = $nesting_sequence_number[$aa];
6431
6432     # NEW SEQNO METHOD, continuous sequence numbers. This allows sequence
6433     # numbers to be used as array indexes, and allows them to be compared.
6434     my $seqno = $next_sequence_number++;
6435     ########################################################################
6436
6437     $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
6438
6439     $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
6440       [ $input_line_number, $input_line, $pos ];
6441
6442     for my $bb ( 0 .. @closing_brace_names - 1 ) {
6443         next if ( $bb == $aa );
6444         $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
6445     }
6446
6447     # set a flag for indenting a nested ternary statement
6448     my $indent = 0;
6449     if ( $aa == QUESTION_COLON ) {
6450         $nested_ternary_flag[ $current_depth[$aa] ] = 0;
6451         if ( $current_depth[$aa] > 1 ) {
6452             if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
6453                 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
6454                 if ( $pdepth == $total_depth - 1 ) {
6455                     $indent = 1;
6456                     $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
6457                 }
6458             }
6459         }
6460     }
6461
6462     # Fix part #1 for git82: save last token type for propagation of type 'Z'
6463     $nested_statement_type[$aa][ $current_depth[$aa] ] =
6464       [ $statement_type, $last_nonblank_type, $last_nonblank_token ];
6465     $statement_type = EMPTY_STRING;
6466     return ( $seqno, $indent );
6467 } ## end sub increase_nesting_depth
6468
6469 sub is_balanced_closing_container {
6470
6471     # Return true if a closing container can go here without error
6472     # Return false if not
6473     my ($aa) = @_;
6474
6475     # cannot close if there was no opening
6476     return unless ( $current_depth[$aa] > 0 );
6477
6478     # check that any other brace types $bb contained within would be balanced
6479     for my $bb ( 0 .. @closing_brace_names - 1 ) {
6480         next if ( $bb == $aa );
6481         return
6482           unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6483             $current_depth[$bb] );
6484     }
6485
6486     # OK, everything will be balanced
6487     return 1;
6488 } ## end sub is_balanced_closing_container
6489
6490 sub decrease_nesting_depth {
6491
6492     my ( $aa, $pos ) = @_;
6493
6494     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
6495     # @current_sequence_number, @depth_array, @starting_line_of_current_depth
6496     # $statement_type
6497     my $seqno             = 0;
6498     my $input_line_number = $tokenizer_self->[_last_line_number_];
6499     my $input_line        = $tokenizer_self->[_line_of_text_];
6500
6501     my $outdent = 0;
6502     $total_depth--;
6503     if ( $current_depth[$aa] > 0 ) {
6504
6505         # set a flag for un-indenting after seeing a nested ternary statement
6506         $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
6507         if ( $aa == QUESTION_COLON ) {
6508             $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
6509         }
6510
6511         # Fix part #2 for git82: use saved type for propagation of type 'Z'
6512         # through type L-R braces.  Perl seems to allow ${bareword}
6513         # as an indirect object, but nothing much more complex than that.
6514         ( $statement_type, my $saved_type, my $saved_token ) =
6515           @{ $nested_statement_type[$aa][ $current_depth[$aa] ] };
6516         if (   $aa == BRACE
6517             && $saved_type eq 'Z'
6518             && $last_nonblank_type eq 'w'
6519             && $brace_structural_type[$brace_depth] eq 'L' )
6520         {
6521             $last_nonblank_type = $saved_type;
6522         }
6523
6524         # check that any brace types $bb contained within are balanced
6525         for my $bb ( 0 .. @closing_brace_names - 1 ) {
6526             next if ( $bb == $aa );
6527
6528             unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
6529                 $current_depth[$bb] )
6530             {
6531                 my $diff =
6532                   $current_depth[$bb] -
6533                   $depth_array[$aa][$bb][ $current_depth[$aa] ];
6534
6535                 # don't whine too many times
6536                 my $saw_brace_error = get_saw_brace_error();
6537                 if (
6538                     $saw_brace_error <= MAX_NAG_MESSAGES
6539
6540                     # if too many closing types have occurred, we probably
6541                     # already caught this error
6542                     && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
6543                   )
6544                 {
6545                     interrupt_logfile();
6546                     my $rsl =
6547                       $starting_line_of_current_depth[$aa]
6548                       [ $current_depth[$aa] ];
6549                     my $sl  = $rsl->[0];
6550                     my $rel = [ $input_line_number, $input_line, $pos ];
6551                     my $el  = $rel->[0];
6552                     my ($ess);
6553
6554                     if ( $diff == 1 || $diff == -1 ) {
6555                         $ess = EMPTY_STRING;
6556                     }
6557                     else {
6558                         $ess = 's';
6559                     }
6560                     my $bname =
6561                       ( $diff > 0 )
6562                       ? $opening_brace_names[$bb]
6563                       : $closing_brace_names[$bb];
6564                     write_error_indicator_pair( @{$rsl}, '^' );
6565                     my $msg = <<"EOM";
6566 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
6567 EOM
6568
6569                     if ( $diff > 0 ) {
6570                         my $rml =
6571                           $starting_line_of_current_depth[$bb]
6572                           [ $current_depth[$bb] ];
6573                         my $ml = $rml->[0];
6574                         $msg .=
6575 "    The most recent un-matched $bname is on line $ml\n";
6576                         write_error_indicator_pair( @{$rml}, '^' );
6577                     }
6578                     write_error_indicator_pair( @{$rel}, '^' );
6579                     warning($msg);
6580                     resume_logfile();
6581                 }
6582                 increment_brace_error();
6583             }
6584         }
6585         $current_depth[$aa]--;
6586     }
6587     else {
6588
6589         my $saw_brace_error = get_saw_brace_error();
6590         if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
6591             my $msg = <<"EOM";
6592 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
6593 EOM
6594             indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
6595         }
6596         increment_brace_error();
6597
6598         # keep track of errors in braces alone (ignoring ternary nesting errors)
6599         $tokenizer_self->[_true_brace_error_count_]++
6600           if ( $closing_brace_names[$aa] ne "':'" );
6601     }
6602     return ( $seqno, $outdent );
6603 } ## end sub decrease_nesting_depth
6604
6605 sub check_final_nesting_depths {
6606
6607     # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
6608
6609     for my $aa ( 0 .. @closing_brace_names - 1 ) {
6610
6611         if ( $current_depth[$aa] ) {
6612             my $rsl =
6613               $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
6614             my $sl  = $rsl->[0];
6615             my $msg = <<"EOM";
6616 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
6617 The most recent un-matched $opening_brace_names[$aa] is on line $sl
6618 EOM
6619             indicate_error( $msg, @{$rsl}, '^' );
6620             increment_brace_error();
6621         }
6622     }
6623     return;
6624 } ## end sub check_final_nesting_depths
6625
6626 #########i#############################################################
6627 # Tokenizer routines for looking ahead in input stream
6628 #######################################################################
6629
6630 sub peek_ahead_for_n_nonblank_pre_tokens {
6631
6632     # returns next n pretokens if they exist
6633     # returns undef's if hits eof without seeing any pretokens
6634     # USES GLOBAL VARIABLES: $tokenizer_self
6635     my $max_pretokens = shift;
6636     my $line;
6637     my $i = 0;
6638     my ( $rpre_tokens, $rmap, $rpre_types );
6639
6640     while ( $line =
6641         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6642     {
6643         $line =~ s/^\s*//;                 # trim leading blanks
6644         next if ( length($line) <= 0 );    # skip blank
6645         next if ( $line =~ /^#/ );         # skip comment
6646         ( $rpre_tokens, $rmap, $rpre_types ) =
6647           pre_tokenize( $line, $max_pretokens );
6648         last;
6649     }
6650     return ( $rpre_tokens, $rpre_types );
6651 } ## end sub peek_ahead_for_n_nonblank_pre_tokens
6652
6653 # look ahead for next non-blank, non-comment line of code
6654 sub peek_ahead_for_nonblank_token {
6655
6656     # USES GLOBAL VARIABLES: $tokenizer_self
6657     my ( $rtokens, $max_token_index ) = @_;
6658     my $line;
6659     my $i = 0;
6660
6661     while ( $line =
6662         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
6663     {
6664         $line =~ s/^\s*//;                 # trim leading blanks
6665         next if ( length($line) <= 0 );    # skip blank
6666         next if ( $line =~ /^#/ );         # skip comment
6667
6668         # Updated from 2 to 3 to get trigraphs, added for case b1175
6669         my ( $rtok, $rmap, $rtype ) = pre_tokenize( $line, 3 );
6670         my $j = $max_token_index + 1;
6671
6672         foreach my $tok ( @{$rtok} ) {
6673             last if ( $tok =~ "\n" );
6674             $rtokens->[ ++$j ] = $tok;
6675         }
6676         last;
6677     }
6678     return;
6679 } ## end sub peek_ahead_for_nonblank_token
6680
6681 #########i#############################################################
6682 # Tokenizer guessing routines for ambiguous situations
6683 #######################################################################
6684
6685 sub guess_if_pattern_or_conditional {
6686
6687     # this routine is called when we have encountered a ? following an
6688     # unknown bareword, and we must decide if it starts a pattern or not
6689     # input parameters:
6690     #   $i - token index of the ? starting possible pattern
6691     # output parameters:
6692     #   $is_pattern = 0 if probably not pattern,  =1 if probably a pattern
6693     #   msg = a warning or diagnostic message
6694     # USES GLOBAL VARIABLES: $last_nonblank_token
6695
6696     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6697     my $is_pattern = 0;
6698     my $msg        = "guessing that ? after $last_nonblank_token starts a ";
6699
6700     if ( $i >= $max_token_index ) {
6701         $msg .= "conditional (no end to pattern found on the line)\n";
6702     }
6703     else {
6704         my $ibeg = $i;
6705         $i = $ibeg + 1;
6706         my $next_token = $rtokens->[$i];    # first token after ?
6707
6708         # look for a possible ending ? on this line..
6709         my $in_quote        = 1;
6710         my $quote_depth     = 0;
6711         my $quote_character = EMPTY_STRING;
6712         my $quote_pos       = 0;
6713         my $quoted_string;
6714         (
6715             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6716             $quoted_string
6717           )
6718           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6719             $quote_pos, $quote_depth, $max_token_index );
6720
6721         if ($in_quote) {
6722
6723             # we didn't find an ending ? on this line,
6724             # so we bias towards conditional
6725             $is_pattern = 0;
6726             $msg .= "conditional (no ending ? on this line)\n";
6727
6728             # we found an ending ?, so we bias towards a pattern
6729         }
6730         else {
6731
6732             # Watch out for an ending ? in quotes, like this
6733             #    my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
6734             my $s_quote = 0;
6735             my $d_quote = 0;
6736             my $colons  = 0;
6737             foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
6738                 my $tok = $rtokens->[$ii];
6739                 if ( $tok eq ":" ) { $colons++ }
6740                 if ( $tok eq "'" ) { $s_quote++ }
6741                 if ( $tok eq '"' ) { $d_quote++ }
6742             }
6743             if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
6744                 $is_pattern = 0;
6745                 $msg .= "found ending ? but unbalanced quote chars\n";
6746             }
6747             elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
6748                 $is_pattern = 1;
6749                 $msg .= "pattern (found ending ? and pattern expected)\n";
6750             }
6751             else {
6752                 $msg .= "pattern (uncertain, but found ending ?)\n";
6753             }
6754         }
6755     }
6756     return ( $is_pattern, $msg );
6757 } ## end sub guess_if_pattern_or_conditional
6758
6759 my %is_known_constant;
6760 my %is_known_function;
6761
6762 BEGIN {
6763
6764     # Constants like 'pi' in Trig.pm are common
6765     my @q = qw(pi pi2 pi4 pip2 pip4);
6766     @{is_known_constant}{@q} = (1) x scalar(@q);
6767
6768     # parenless calls of 'ok' are common
6769     @q = qw( ok );
6770     @{is_known_function}{@q} = (1) x scalar(@q);
6771 }
6772
6773 sub guess_if_pattern_or_division {
6774
6775     # this routine is called when we have encountered a / following an
6776     # unknown bareword, and we must decide if it starts a pattern or is a
6777     # division
6778     # input parameters:
6779     #   $i - token index of the / starting possible pattern
6780     # output parameters:
6781     #   $is_pattern = 0 if probably division,  =1 if probably a pattern
6782     #   msg = a warning or diagnostic message
6783     # USES GLOBAL VARIABLES: $last_nonblank_token
6784     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6785     my $is_pattern = 0;
6786     my $msg        = "guessing that / after $last_nonblank_token starts a ";
6787
6788     if ( $i >= $max_token_index ) {
6789         $msg .= "division (no end to pattern found on the line)\n";
6790     }
6791     else {
6792         my $ibeg = $i;
6793         my $divide_possible =
6794           is_possible_numerator( $i, $rtokens, $max_token_index );
6795
6796         if ( $divide_possible < 0 ) {
6797             $msg        = "pattern (division not possible here)\n";
6798             $is_pattern = 1;
6799             goto RETURN;
6800         }
6801
6802         $i = $ibeg + 1;
6803         my $next_token = $rtokens->[$i];    # first token after slash
6804
6805         # One of the things we can look at is the spacing around the slash.
6806         # There # are four possible spacings around the first slash:
6807         #
6808         #     return pi/two;#/;     -/-
6809         #     return pi/ two;#/;    -/+
6810         #     return pi / two;#/;   +/+
6811         #     return pi /two;#/;    +/-   <-- possible pattern
6812         #
6813         # Spacing rule: a space before the slash but not after the slash
6814         # usually indicates a pattern.  We can use this to break ties.
6815
6816         my $is_pattern_by_spacing =
6817           ( $i > 1 && $next_token !~ m/^\s/ && $rtokens->[ $i - 2 ] =~ m/^\s/ );
6818
6819         # look for a possible ending / on this line..
6820         my $in_quote        = 1;
6821         my $quote_depth     = 0;
6822         my $quote_character = EMPTY_STRING;
6823         my $quote_pos       = 0;
6824         my $quoted_string;
6825         (
6826             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6827             $quoted_string
6828           )
6829           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6830             $quote_pos, $quote_depth, $max_token_index );
6831
6832         if ($in_quote) {
6833
6834             # we didn't find an ending / on this line, so we bias towards
6835             # division
6836             if ( $divide_possible >= 0 ) {
6837                 $is_pattern = 0;
6838                 $msg .= "division (no ending / on this line)\n";
6839             }
6840             else {
6841
6842                 # assuming a multi-line pattern ... this is risky, but division
6843                 # does not seem possible.  If this fails, it would either be due
6844                 # to a syntax error in the code, or the division_expected logic
6845                 # needs to be fixed.
6846                 $msg        = "multi-line pattern (division not possible)\n";
6847                 $is_pattern = 1;
6848             }
6849         }
6850
6851         # we found an ending /, so we bias slightly towards a pattern
6852         else {
6853
6854             my $pattern_expected =
6855               pattern_expected( $i, $rtokens, $max_token_index );
6856
6857             if ( $pattern_expected >= 0 ) {
6858
6859                 # pattern looks possible...
6860                 if ( $divide_possible >= 0 ) {
6861
6862                     # Both pattern and divide can work here...
6863
6864                     # Increase weight of divide if a pure number follows
6865                     $divide_possible += $next_token =~ /^\d+$/;
6866
6867                     # Check for known constants in the numerator, like 'pi'
6868                     if ( $is_known_constant{$last_nonblank_token} ) {
6869                         $msg .=
6870 "division (pattern works too but saw known constant '$last_nonblank_token')\n";
6871                         $is_pattern = 0;
6872                     }
6873
6874                     # A very common bare word in pattern expressions is 'ok'
6875                     elsif ( $is_known_function{$last_nonblank_token} ) {
6876                         $msg .=
6877 "pattern (division works too but saw '$last_nonblank_token')\n";
6878                         $is_pattern = 1;
6879                     }
6880
6881                     # If one rule is more definite, use it
6882                     elsif ( $divide_possible > $pattern_expected ) {
6883                         $msg .=
6884                           "division (more likely based on following tokens)\n";
6885                         $is_pattern = 0;
6886                     }
6887
6888                     # otherwise, use the spacing rule
6889                     elsif ($is_pattern_by_spacing) {
6890                         $msg .=
6891 "pattern (guess on spacing, but division possible too)\n";
6892                         $is_pattern = 1;
6893                     }
6894                     else {
6895                         $msg .=
6896 "division (guess on spacing, but pattern is possible too)\n";
6897                         $is_pattern = 0;
6898                     }
6899                 }
6900
6901                 # divide_possible < 0 means divide can not work here
6902                 else {
6903                     $is_pattern = 1;
6904                     $msg .= "pattern (division not possible)\n";
6905                 }
6906             }
6907
6908             # pattern does not look possible...
6909             else {
6910
6911                 if ( $divide_possible >= 0 ) {
6912                     $is_pattern = 0;
6913                     $msg .= "division (pattern not possible)\n";
6914                 }
6915
6916                 # Neither pattern nor divide look possible...go by spacing
6917                 else {
6918                     if ($is_pattern_by_spacing) {
6919                         $msg .= "pattern (guess on spacing)\n";
6920                         $is_pattern = 1;
6921                     }
6922                     else {
6923                         $msg .= "division (guess on spacing)\n";
6924                         $is_pattern = 0;
6925                     }
6926                 }
6927             }
6928         }
6929     }
6930
6931   RETURN:
6932     return ( $is_pattern, $msg );
6933 } ## end sub guess_if_pattern_or_division
6934
6935 # try to resolve here-doc vs. shift by looking ahead for
6936 # non-code or the end token (currently only looks for end token)
6937 # returns 1 if it is probably a here doc, 0 if not
6938 sub guess_if_here_doc {
6939
6940     # This is how many lines we will search for a target as part of the
6941     # guessing strategy.  It is a constant because there is probably
6942     # little reason to change it.
6943     # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
6944     # %is_constant,
6945     my $HERE_DOC_WINDOW = 40;
6946
6947     my $next_token        = shift;
6948     my $here_doc_expected = 0;
6949     my $line;
6950     my $k   = 0;
6951     my $msg = "checking <<";
6952
6953     while ( $line =
6954         $tokenizer_self->[_line_buffer_object_]->peek_ahead( $k++ ) )
6955     {
6956         chomp $line;
6957
6958         if ( $line =~ /^$next_token$/ ) {
6959             $msg .= " -- found target $next_token ahead $k lines\n";
6960             $here_doc_expected = 1;    # got it
6961             last;
6962         }
6963         last if ( $k >= $HERE_DOC_WINDOW );
6964     }
6965
6966     unless ($here_doc_expected) {
6967
6968         if ( !defined($line) ) {
6969             $here_doc_expected = -1;    # hit eof without seeing target
6970             $msg .= " -- must be shift; target $next_token not in file\n";
6971
6972         }
6973         else {                          # still unsure..taking a wild guess
6974
6975             if ( !$is_constant{$current_package}{$next_token} ) {
6976                 $here_doc_expected = 1;
6977                 $msg .=
6978                   " -- guessing it's a here-doc ($next_token not a constant)\n";
6979             }
6980             else {
6981                 $msg .=
6982                   " -- guessing it's a shift ($next_token is a constant)\n";
6983             }
6984         }
6985     }
6986     write_logfile_entry($msg);
6987     return $here_doc_expected;
6988 } ## end sub guess_if_here_doc
6989
6990 #########i#############################################################
6991 # Tokenizer Routines for scanning identifiers and related items
6992 #######################################################################
6993
6994 sub scan_bare_identifier_do {
6995
6996     # this routine is called to scan a token starting with an alphanumeric
6997     # variable or package separator, :: or '.
6998     # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
6999     # $last_nonblank_type,@paren_type, $paren_depth
7000
7001     my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
7002         $max_token_index )
7003       = @_;
7004     my $i_begin = $i;
7005     my $package = undef;
7006
7007     my $i_beg = $i;
7008
7009     # we have to back up one pretoken at a :: since each : is one pretoken
7010     if ( $tok eq '::' ) { $i_beg-- }
7011     if ( $tok eq '->' ) { $i_beg-- }
7012     my $pos_beg = $rtoken_map->[$i_beg];
7013     pos($input_line) = $pos_beg;
7014
7015     #  Examples:
7016     #   A::B::C
7017     #   A::
7018     #   ::A
7019     #   A'B
7020     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
7021
7022         my $pos  = pos($input_line);
7023         my $numc = $pos - $pos_beg;
7024         $tok = substr( $input_line, $pos_beg, $numc );
7025
7026         # type 'w' includes anything without leading type info
7027         # ($,%,@,*) including something like abc::def::ghi
7028         $type = 'w';
7029
7030         my $sub_name = EMPTY_STRING;
7031         if ( defined($2) ) { $sub_name = $2; }
7032         if ( defined($1) ) {
7033             $package = $1;
7034
7035             # patch: don't allow isolated package name which just ends
7036             # in the old style package separator (single quote).  Example:
7037             #   use CGI':all';
7038             if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
7039                 $pos--;
7040             }
7041
7042             $package =~ s/\'/::/g;
7043             if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7044             $package =~ s/::$//;
7045         }
7046         else {
7047             $package = $current_package;
7048
7049             # patched for c043, part 1: keyword does not follow '->'
7050             if ( $is_keyword{$tok} && $last_nonblank_type ne '->' ) {
7051                 $type = 'k';
7052             }
7053         }
7054
7055         # if it is a bareword..  patched for c043, part 2: not following '->'
7056         if ( $type eq 'w' && $last_nonblank_type ne '->' ) {
7057
7058             # check for v-string with leading 'v' type character
7059             # (This seems to have precedence over filehandle, type 'Y')
7060             if ( $tok =~ /^v\d[_\d]*$/ ) {
7061
7062                 # we only have the first part - something like 'v101' -
7063                 # look for more
7064                 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
7065                     $pos  = pos($input_line);
7066                     $numc = $pos - $pos_beg;
7067                     $tok  = substr( $input_line, $pos_beg, $numc );
7068                 }
7069                 $type = 'v';
7070
7071                 # warn if this version can't handle v-strings
7072                 report_v_string($tok);
7073             }
7074
7075             elsif ( $is_constant{$package}{$sub_name} ) {
7076                 $type = 'C';
7077             }
7078
7079             # bareword after sort has implied empty prototype; for example:
7080             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
7081             # This has priority over whatever the user has specified.
7082             elsif ($last_nonblank_token eq 'sort'
7083                 && $last_nonblank_type eq 'k' )
7084             {
7085                 $type = 'Z';
7086             }
7087
7088             # Note: strangely, perl does not seem to really let you create
7089             # functions which act like eval and do, in the sense that eval
7090             # and do may have operators following the final }, but any operators
7091             # that you create with prototype (&) apparently do not allow
7092             # trailing operators, only terms.  This seems strange.
7093             # If this ever changes, here is the update
7094             # to make perltidy behave accordingly:
7095
7096             # elsif ( $is_block_function{$package}{$tok} ) {
7097             #    $tok='eval'; # patch to do braces like eval  - doesn't work
7098             #    $type = 'k';
7099             #}
7100             # FIXME: This could become a separate type to allow for different
7101             # future behavior:
7102             elsif ( $is_block_function{$package}{$sub_name} ) {
7103                 $type = 'G';
7104             }
7105             elsif ( $is_block_list_function{$package}{$sub_name} ) {
7106                 $type = 'G';
7107             }
7108             elsif ( $is_user_function{$package}{$sub_name} ) {
7109                 $type      = 'U';
7110                 $prototype = $user_function_prototype{$package}{$sub_name};
7111             }
7112
7113             # check for indirect object
7114             elsif (
7115
7116                 # added 2001-03-27: must not be followed immediately by '('
7117                 # see fhandle.t
7118                 ( $input_line !~ m/\G\(/gc )
7119
7120                 # and
7121                 && (
7122
7123                     # preceded by keyword like 'print', 'printf' and friends
7124                     $is_indirect_object_taker{$last_nonblank_token}
7125
7126                     # or preceded by something like 'print(' or 'printf('
7127                     || (
7128                         ( $last_nonblank_token eq '(' )
7129                         && $is_indirect_object_taker{ $paren_type[$paren_depth]
7130                         }
7131
7132                     )
7133                 )
7134               )
7135             {
7136
7137                 # may not be indirect object unless followed by a space;
7138                 # updated 2021-01-16 to consider newline to be a space.
7139                 # updated for case b990 to look for either ';' or space
7140                 if ( pos($input_line) == length($input_line)
7141                     || $input_line =~ m/\G[;\s]/gc )
7142                 {
7143                     $type = 'Y';
7144
7145                     # Abandon Hope ...
7146                     # Perl's indirect object notation is a very bad
7147                     # thing and can cause subtle bugs, especially for
7148                     # beginning programmers.  And I haven't even been
7149                     # able to figure out a sane warning scheme which
7150                     # doesn't get in the way of good scripts.
7151
7152                     # Complain if a filehandle has any lower case
7153                     # letters.  This is suggested good practice.
7154                     # Use 'sub_name' because something like
7155                     # main::MYHANDLE is ok for filehandle
7156                     if ( $sub_name =~ /[a-z]/ ) {
7157
7158                         # could be bug caused by older perltidy if
7159                         # followed by '('
7160                         if ( $input_line =~ m/\G\s*\(/gc ) {
7161                             complain(
7162 "Caution: unknown word '$tok' in indirect object slot\n"
7163                             );
7164                         }
7165                     }
7166                 }
7167
7168                 # bareword not followed by a space -- may not be filehandle
7169                 # (may be function call defined in a 'use' statement)
7170                 else {
7171                     $type = 'Z';
7172                 }
7173             }
7174         }
7175
7176         # Now we must convert back from character position
7177         # to pre_token index.
7178         # I don't think an error flag can occur here ..but who knows
7179         my $error;
7180         ( $i, $error ) =
7181           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7182         if ($error) {
7183             warning("scan_bare_identifier: Possibly invalid tokenization\n");
7184         }
7185     }
7186
7187     # no match but line not blank - could be syntax error
7188     # perl will take '::' alone without complaint
7189     else {
7190         $type = 'w';
7191
7192         # change this warning to log message if it becomes annoying
7193         warning("didn't find identifier after leading ::\n");
7194     }
7195     return ( $i, $tok, $type, $prototype );
7196 } ## end sub scan_bare_identifier_do
7197
7198 sub scan_id_do {
7199
7200 # This is the new scanner and will eventually replace scan_identifier.
7201 # Only type 'sub' and 'package' are implemented.
7202 # Token types $ * % @ & -> are not yet implemented.
7203 #
7204 # Scan identifier following a type token.
7205 # The type of call depends on $id_scan_state: $id_scan_state = ''
7206 # for starting call, in which case $tok must be the token defining
7207 # the type.
7208 #
7209 # If the type token is the last nonblank token on the line, a value
7210 # of $id_scan_state = $tok is returned, indicating that further
7211 # calls must be made to get the identifier.  If the type token is
7212 # not the last nonblank token on the line, the identifier is
7213 # scanned and handled and a value of '' is returned.
7214 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
7215 # $statement_type, $tokenizer_self
7216
7217     my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
7218         $max_token_index )
7219       = @_;
7220     use constant DEBUG_NSCAN => 0;
7221     my $type = EMPTY_STRING;
7222     my ( $i_beg, $pos_beg );
7223
7224     #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7225     #my ($a,$b,$c) = caller;
7226     #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
7227
7228     # on re-entry, start scanning at first token on the line
7229     if ($id_scan_state) {
7230         $i_beg = $i;
7231         $type  = EMPTY_STRING;
7232     }
7233
7234     # on initial entry, start scanning just after type token
7235     else {
7236         $i_beg         = $i + 1;
7237         $id_scan_state = $tok;
7238         $type          = 't';
7239     }
7240
7241     # find $i_beg = index of next nonblank token,
7242     # and handle empty lines
7243     my $blank_line          = 0;
7244     my $next_nonblank_token = $rtokens->[$i_beg];
7245     if ( $i_beg > $max_token_index ) {
7246         $blank_line = 1;
7247     }
7248     else {
7249
7250         # only a '#' immediately after a '$' is not a comment
7251         if ( $next_nonblank_token eq '#' ) {
7252             unless ( $tok eq '$' ) {
7253                 $blank_line = 1;
7254             }
7255         }
7256
7257         if ( $next_nonblank_token =~ /^\s/ ) {
7258             ( $next_nonblank_token, $i_beg ) =
7259               find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
7260                 $max_token_index );
7261             if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
7262                 $blank_line = 1;
7263             }
7264         }
7265     }
7266
7267     # handle non-blank line; identifier, if any, must follow
7268     unless ($blank_line) {
7269
7270         if ( $is_sub{$id_scan_state} ) {
7271             ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
7272                 {
7273                     input_line      => $input_line,
7274                     i               => $i,
7275                     i_beg           => $i_beg,
7276                     tok             => $tok,
7277                     type            => $type,
7278                     rtokens         => $rtokens,
7279                     rtoken_map      => $rtoken_map,
7280                     id_scan_state   => $id_scan_state,
7281                     max_token_index => $max_token_index
7282                 }
7283             );
7284         }
7285
7286         elsif ( $is_package{$id_scan_state} ) {
7287             ( $i, $tok, $type ) =
7288               do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
7289                 $rtoken_map, $max_token_index );
7290             $id_scan_state = EMPTY_STRING;
7291         }
7292
7293         else {
7294             warning("invalid token in scan_id: $tok\n");
7295             $id_scan_state = EMPTY_STRING;
7296         }
7297     }
7298
7299     if ( $id_scan_state && ( !defined($type) || !$type ) ) {
7300
7301         # shouldn't happen:
7302         if (DEVEL_MODE) {
7303             Fault(<<EOM);
7304 Program bug in scan_id: undefined type but scan_state=$id_scan_state
7305 EOM
7306         }
7307         warning(
7308 "Possible program bug in sub scan_id: undefined type but scan_state=$id_scan_state\n"
7309         );
7310         report_definite_bug();
7311     }
7312
7313     DEBUG_NSCAN && do {
7314         print STDOUT
7315           "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
7316     };
7317     return ( $i, $tok, $type, $id_scan_state );
7318 } ## end sub scan_id_do
7319
7320 sub check_prototype {
7321     my ( $proto, $package, $subname ) = @_;
7322     return unless ( defined($package) && defined($subname) );
7323     if ( defined($proto) ) {
7324         $proto =~ s/^\s*\(\s*//;
7325         $proto =~ s/\s*\)$//;
7326         if ($proto) {
7327             $is_user_function{$package}{$subname}        = 1;
7328             $user_function_prototype{$package}{$subname} = "($proto)";
7329
7330             # prototypes containing '&' must be treated specially..
7331             if ( $proto =~ /\&/ ) {
7332
7333                 # right curly braces of prototypes ending in
7334                 # '&' may be followed by an operator
7335                 if ( $proto =~ /\&$/ ) {
7336                     $is_block_function{$package}{$subname} = 1;
7337                 }
7338
7339                 # right curly braces of prototypes NOT ending in
7340                 # '&' may NOT be followed by an operator
7341                 elsif ( $proto !~ /\&$/ ) {
7342                     $is_block_list_function{$package}{$subname} = 1;
7343                 }
7344             }
7345         }
7346         else {
7347             $is_constant{$package}{$subname} = 1;
7348         }
7349     }
7350     else {
7351         $is_user_function{$package}{$subname} = 1;
7352     }
7353     return;
7354 } ## end sub check_prototype
7355
7356 sub do_scan_package {
7357
7358     # do_scan_package parses a package name
7359     # it is called with $i_beg equal to the index of the first nonblank
7360     # token following a 'package' token.
7361     # USES GLOBAL VARIABLES: $current_package,
7362
7363     # package NAMESPACE
7364     # package NAMESPACE VERSION
7365     # package NAMESPACE BLOCK
7366     # package NAMESPACE VERSION BLOCK
7367     #
7368     # If VERSION is provided, package sets the $VERSION variable in the given
7369     # namespace to a version object with the VERSION provided. VERSION must be
7370     # a "strict" style version number as defined by the version module: a
7371     # positive decimal number (integer or decimal-fraction) without
7372     # exponentiation or else a dotted-decimal v-string with a leading 'v'
7373     # character and at least three components.
7374     # reference http://perldoc.perl.org/functions/package.html
7375
7376     my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
7377         $max_token_index )
7378       = @_;
7379     my $package = undef;
7380     my $pos_beg = $rtoken_map->[$i_beg];
7381     pos($input_line) = $pos_beg;
7382
7383     # handle non-blank line; package name, if any, must follow
7384     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w*)/gc ) {
7385         $package = $1;
7386         $package = ( defined($1) && $1 ) ? $1 : 'main';
7387         $package =~ s/\'/::/g;
7388         if ( $package =~ /^\:/ ) { $package = 'main' . $package }
7389         $package =~ s/::$//;
7390         my $pos  = pos($input_line);
7391         my $numc = $pos - $pos_beg;
7392         $tok  = 'package ' . substr( $input_line, $pos_beg, $numc );
7393         $type = 'i';
7394
7395         # Now we must convert back from character position
7396         # to pre_token index.
7397         # I don't think an error flag can occur here ..but ?
7398         my $error;
7399         ( $i, $error ) =
7400           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
7401         if ($error) { warning("Possibly invalid package\n") }
7402         $current_package = $package;
7403
7404         # we should now have package NAMESPACE
7405         # now expecting VERSION, BLOCK, or ; to follow ...
7406         # package NAMESPACE VERSION
7407         # package NAMESPACE BLOCK
7408         # package NAMESPACE VERSION BLOCK
7409         my ( $next_nonblank_token, $i_next ) =
7410           find_next_nonblank_token( $i, $rtokens, $max_token_index );
7411
7412         # check that something recognizable follows, but do not parse.
7413         # A VERSION number will be parsed later as a number or v-string in the
7414         # normal way.  What is important is to set the statement type if
7415         # everything looks okay so that the operator_expected() routine
7416         # knows that the number is in a package statement.
7417         # Examples of valid primitive tokens that might follow are:
7418         #  1235  . ; { } v3  v
7419         # FIX: added a '#' since a side comment may also follow
7420         if ( $next_nonblank_token =~ /^([v\.\d;\{\}\#])|v\d|\d+$/ ) {
7421             $statement_type = $tok;
7422         }
7423         else {
7424             warning(
7425                 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
7426             );
7427         }
7428     }
7429
7430     # no match but line not blank --
7431     # could be a label with name package, like package:  , for example.
7432     else {
7433         $type = 'k';
7434     }
7435
7436     return ( $i, $tok, $type );
7437 } ## end sub do_scan_package
7438
7439 my %is_special_variable_char;
7440
7441 BEGIN {
7442
7443     # These are the only characters which can (currently) form special
7444     # variables, like $^W: (issue c066).
7445     my @q =
7446       qw{ ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ };
7447     @{is_special_variable_char}{@q} = (1) x scalar(@q);
7448 }
7449
7450 {    ## begin closure for sub scan_complex_identifier
7451
7452     use constant DEBUG_SCAN_ID => 0;
7453
7454     # These are the possible states for this scanner:
7455     my $scan_state_SIGIL     = '$';
7456     my $scan_state_ALPHA     = 'A';
7457     my $scan_state_COLON     = ':';
7458     my $scan_state_LPAREN    = '(';
7459     my $scan_state_RPAREN    = ')';
7460     my $scan_state_AMPERSAND = '&';
7461     my $scan_state_SPLIT     = '^';
7462
7463     # Only these non-blank states may be returned to caller:
7464     my %is_returnable_scan_state = (
7465         $scan_state_SIGIL     => 1,
7466         $scan_state_AMPERSAND => 1,
7467     );
7468
7469     # USES GLOBAL VARIABLES:
7470     #    $context, $last_nonblank_token, $last_nonblank_type
7471
7472     #-----------
7473     # call args:
7474     #-----------
7475     my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
7476         $expecting, $container_type );
7477
7478     #-------------------------------------------
7479     # my variables, re-initialized on each call:
7480     #-------------------------------------------
7481     my $i_begin;                # starting index $i
7482     my $type;                   # returned identifier type
7483     my $tok_begin;              # starting token
7484     my $tok;                    # returned token
7485     my $id_scan_state_begin;    # starting scan state
7486     my $identifier_begin;       # starting identifier
7487     my $i_save;                 # a last good index, in case of error
7488     my $message;                # hold error message for log file
7489     my $tok_is_blank;
7490     my $last_tok_is_blank;
7491     my $in_prototype_or_signature;
7492     my $saw_alpha;
7493     my $saw_type;
7494     my $allow_tick;
7495
7496     sub initialize_my_scan_id_vars {
7497
7498         # Initialize all 'my' vars on entry
7499         $i_begin   = $i;
7500         $type      = EMPTY_STRING;
7501         $tok_begin = $rtokens->[$i_begin];
7502         $tok       = $tok_begin;
7503         if ( $tok_begin eq ':' ) { $tok_begin = '::' }
7504         $id_scan_state_begin = $id_scan_state;
7505         $identifier_begin    = $identifier;
7506         $i_save              = undef;
7507
7508         $message           = EMPTY_STRING;
7509         $tok_is_blank      = undef;          # a flag to speed things up
7510         $last_tok_is_blank = undef;
7511
7512         $in_prototype_or_signature =
7513           $container_type && $container_type =~ /^sub\b/;
7514
7515         # these flags will be used to help figure out the type:
7516         $saw_alpha = undef;
7517         $saw_type  = undef;
7518
7519         # allow old package separator (') except in 'use' statement
7520         $allow_tick = ( $last_nonblank_token ne 'use' );
7521         return;
7522     } ## end sub initialize_my_scan_id_vars
7523
7524     #----------------------------------
7525     # Routines for handling scan states
7526     #----------------------------------
7527     sub do_id_scan_state_dollar {
7528
7529         # We saw a sigil, now looking to start a variable name
7530
7531         if ( $tok eq '$' ) {
7532
7533             $identifier .= $tok;
7534
7535             # we've got a punctuation variable if end of line (punct.t)
7536             if ( $i == $max_token_index ) {
7537                 $type          = 'i';
7538                 $id_scan_state = EMPTY_STRING;
7539             }
7540         }
7541         elsif ( $tok =~ /^\w/ ) {    # alphanumeric ..
7542             $saw_alpha     = 1;
7543             $id_scan_state = $scan_state_COLON;    # now need ::
7544             $identifier .= $tok;
7545         }
7546         elsif ( $tok eq '::' ) {
7547             $id_scan_state = $scan_state_ALPHA;
7548             $identifier .= $tok;
7549         }
7550
7551         # POSTDEFREF ->@ ->% ->& ->*
7552         elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
7553             $identifier .= $tok;
7554         }
7555         elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
7556             $saw_alpha     = 1;
7557             $id_scan_state = $scan_state_COLON;    # now need ::
7558             $identifier .= $tok;
7559
7560             # Perl will accept leading digits in identifiers,
7561             # although they may not always produce useful results.
7562             # Something like $main::0 is ok.  But this also works:
7563             #
7564             #  sub howdy::123::bubba{ print "bubba $54321!\n" }
7565             #  howdy::123::bubba();
7566             #
7567         }
7568         elsif ( $tok eq '#' ) {
7569
7570             my $is_punct_var = $identifier eq '$$';
7571
7572             # side comment or identifier?
7573             if (
7574
7575                 # A '#' starts a comment if it follows a space. For example,
7576                 # the following is equivalent to $ans=40.
7577                 #   my $ #
7578                 #     ans = 40;
7579                 !$last_tok_is_blank
7580
7581                 # a # inside a prototype or signature can only start a
7582                 # comment
7583                 && !$in_prototype_or_signature
7584
7585                 # these are valid punctuation vars: *# %# @# $#
7586                 # May also be '$#array' or POSTDEFREF ->$#
7587                 && (   $identifier =~ /^[\%\@\$\*]$/
7588                     || $identifier =~ /\$$/ )
7589
7590                 # but a '#' after '$$' is a side comment; see c147
7591                 && !$is_punct_var
7592
7593               )
7594             {
7595                 $identifier .= $tok;    # keep same state, a $ could follow
7596             }
7597             else {
7598
7599                 # otherwise it is a side comment
7600                 if    ( $identifier eq '->' )                 { }
7601                 elsif ($is_punct_var)                         { $type = 'i' }
7602                 elsif ( $id_scan_state eq $scan_state_SIGIL ) { $type = 't' }
7603                 else                                          { $type = 'i' }
7604                 $i             = $i_save;
7605                 $id_scan_state = EMPTY_STRING;
7606             }
7607         }
7608
7609         elsif ( $tok eq '{' ) {
7610
7611             # check for something like ${#} or ${©}
7612             if (
7613                 (
7614                        $identifier eq '$'
7615                     || $identifier eq '@'
7616                     || $identifier eq '$#'
7617                 )
7618                 && $i + 2 <= $max_token_index
7619                 && $rtokens->[ $i + 2 ] eq '}'
7620                 && $rtokens->[ $i + 1 ] !~ /[\s\w]/
7621               )
7622             {
7623                 my $next2 = $rtokens->[ $i + 2 ];
7624                 my $next1 = $rtokens->[ $i + 1 ];
7625                 $identifier .= $tok . $next1 . $next2;
7626                 $i += 2;
7627                 $id_scan_state = EMPTY_STRING;
7628             }
7629             else {
7630
7631                 # skip something like ${xxx} or ->{
7632                 $id_scan_state = EMPTY_STRING;
7633
7634                 # if this is the first token of a line, any tokens for this
7635                 # identifier have already been accumulated
7636                 if ( $identifier eq '$' || $i == 0 ) {
7637                     $identifier = EMPTY_STRING;
7638                 }
7639                 $i = $i_save;
7640             }
7641         }
7642
7643         # space ok after leading $ % * & @
7644         elsif ( $tok =~ /^\s*$/ ) {
7645
7646             $tok_is_blank = 1;
7647
7648             # note: an id with a leading '&' does not actually come this way
7649             if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
7650
7651                 if ( length($identifier) > 1 ) {
7652                     $id_scan_state = EMPTY_STRING;
7653                     $i             = $i_save;
7654                     $type          = 'i';    # probably punctuation variable
7655                 }
7656                 else {
7657
7658                     # fix c139: trim line-ending type 't'
7659                     if ( $i == $max_token_index ) {
7660                         $i    = $i_save;
7661                         $type = 't';
7662                     }
7663
7664                     # spaces after $'s are common, and space after @
7665                     # is harmless, so only complain about space
7666                     # after other type characters. Space after $ and
7667                     # @ will be removed in formatting.  Report space
7668                     # after % and * because they might indicate a
7669                     # parsing error.  In other words '% ' might be a
7670                     # modulo operator.  Delete this warning if it
7671                     # gets annoying.
7672                     elsif ( $identifier !~ /^[\@\$]$/ ) {
7673                         $message =
7674                           "Space in identifier, following $identifier\n";
7675                     }
7676                     else {
7677                         ## ok: silently accept space after '$' and '@' sigils
7678                     }
7679                 }
7680             }
7681
7682             elsif ( $identifier eq '->' ) {
7683
7684                 # space after '->' is ok except at line end ..
7685                 # so trim line-ending in type '->' (fixes c139)
7686                 if ( $i == $max_token_index ) {
7687                     $i    = $i_save;
7688                     $type = '->';
7689                 }
7690             }
7691
7692             # stop at space after something other than -> or sigil
7693             # Example of what can arrive here:
7694             #   eval { $MyClass->$$ };
7695             else {
7696                 $id_scan_state = EMPTY_STRING;
7697                 $i             = $i_save;
7698                 $type          = 'i';
7699             }
7700         }
7701         elsif ( $tok eq '^' ) {
7702
7703             # check for some special variables like $^ $^W
7704             if ( $identifier =~ /^[\$\*\@\%]$/ ) {
7705                 $identifier .= $tok;
7706                 $type = 'i';
7707
7708                 # There may be one more character, not a space, after the ^
7709                 my $next1 = $rtokens->[ $i + 1 ];
7710                 my $chr   = substr( $next1, 0, 1 );
7711                 if ( $is_special_variable_char{$chr} ) {
7712
7713                     # It is something like $^W
7714                     # Test case (c066) : $^Oeq'linux'
7715                     $i++;
7716                     $identifier .= $next1;
7717
7718                     # If pretoken $next1 is more than one character long,
7719                     # set a flag indicating that it needs to be split.
7720                     $id_scan_state =
7721                       ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7722                 }
7723                 else {
7724
7725                     # it is just $^
7726                     # Simple test case (c065): '$aa=$^if($bb)';
7727                     $id_scan_state = EMPTY_STRING;
7728                 }
7729             }
7730             else {
7731                 $id_scan_state = EMPTY_STRING;
7732                 $i             = $i_save;
7733             }
7734         }
7735         else {    # something else
7736
7737             if ( $in_prototype_or_signature && $tok =~ /^[\),=#]/ ) {
7738
7739                 # We might be in an extrusion of
7740                 #     sub foo2 ( $first, $, $third ) {
7741                 # looking at a line starting with a comma, like
7742                 #   $
7743                 #   ,
7744                 # in this case the comma ends the signature variable
7745                 # '$' which will have been previously marked type 't'
7746                 # rather than 'i'.
7747                 if ( $i == $i_begin ) {
7748                     $identifier = EMPTY_STRING;
7749                     $type       = EMPTY_STRING;
7750                 }
7751
7752                 # at a # we have to mark as type 't' because more may
7753                 # follow, otherwise, in a signature we can let '$' be an
7754                 # identifier here for better formatting.
7755                 # See 'mangle4.in' for a test case.
7756                 else {
7757                     $type = 'i';
7758                     if ( $id_scan_state eq $scan_state_SIGIL && $tok eq '#' ) {
7759                         $type = 't';
7760                     }
7761                     $i = $i_save;
7762                 }
7763                 $id_scan_state = EMPTY_STRING;
7764             }
7765
7766             # check for various punctuation variables
7767             elsif ( $identifier =~ /^[\$\*\@\%]$/ ) {
7768                 $identifier .= $tok;
7769             }
7770
7771             # POSTDEFREF: Postfix reference ->$* ->%*  ->@* ->** ->&* ->$#*
7772             elsif ($tok eq '*'
7773                 && $identifier =~ /\-\>([\@\%\$\*\&]|\$\#)$/ )
7774             {
7775                 $identifier .= $tok;
7776             }
7777
7778             elsif ( $identifier eq '$#' ) {
7779
7780                 if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
7781
7782                 # perl seems to allow just these: $#: $#- $#+
7783                 elsif ( $tok =~ /^[\:\-\+]$/ ) {
7784                     $type = 'i';
7785                     $identifier .= $tok;
7786                 }
7787                 else {
7788                     $i = $i_save;
7789                     write_logfile_entry( 'Use of $# is deprecated' . "\n" );
7790                 }
7791             }
7792             elsif ( $identifier eq '$$' ) {
7793
7794                 # perl does not allow references to punctuation
7795                 # variables without braces.  For example, this
7796                 # won't work:
7797                 #  $:=\4;
7798                 #  $a = $$:;
7799                 # You would have to use
7800                 #  $a = ${$:};
7801
7802                 # '$$' alone is punctuation variable for PID
7803                 $i = $i_save;
7804                 if   ( $tok eq '{' ) { $type = 't' }
7805                 else                 { $type = 'i' }
7806             }
7807             elsif ( $identifier eq '->' ) {
7808                 $i = $i_save;
7809             }
7810             else {
7811                 $i = $i_save;
7812                 if ( length($identifier) == 1 ) {
7813                     $identifier = EMPTY_STRING;
7814                 }
7815             }
7816             $id_scan_state = EMPTY_STRING;
7817         }
7818         return;
7819     } ## end sub do_id_scan_state_dollar
7820
7821     sub do_id_scan_state_alpha {
7822
7823         # looking for alphanumeric after ::
7824         $tok_is_blank = $tok =~ /^\s*$/;
7825
7826         if ( $tok =~ /^\w/ ) {    # found it
7827             $identifier .= $tok;
7828             $id_scan_state = $scan_state_COLON;    # now need ::
7829             $saw_alpha     = 1;
7830         }
7831         elsif ( $tok eq "'" && $allow_tick ) {
7832             $identifier .= $tok;
7833             $id_scan_state = $scan_state_COLON;    # now need ::
7834             $saw_alpha     = 1;
7835         }
7836         elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7837             $id_scan_state = $scan_state_LPAREN;
7838             $identifier .= $tok;
7839         }
7840         elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7841             $id_scan_state = $scan_state_RPAREN;
7842             $identifier .= $tok;
7843         }
7844         else {
7845             $id_scan_state = EMPTY_STRING;
7846             $i             = $i_save;
7847         }
7848         return;
7849     } ## end sub do_id_scan_state_alpha
7850
7851     sub do_id_scan_state_colon {
7852
7853         # looking for possible :: after alphanumeric
7854
7855         $tok_is_blank = $tok =~ /^\s*$/;
7856
7857         if ( $tok eq '::' ) {    # got it
7858             $identifier .= $tok;
7859             $id_scan_state = $scan_state_ALPHA;    # now require alpha
7860         }
7861         elsif ( $tok =~ /^\w/ ) {    # more alphanumeric is ok here
7862             $identifier .= $tok;
7863             $id_scan_state = $scan_state_COLON;    # now need ::
7864             $saw_alpha     = 1;
7865         }
7866         elsif ( $tok eq "'" && $allow_tick ) {     # tick
7867
7868             if ( $is_keyword{$identifier} ) {
7869                 $id_scan_state = EMPTY_STRING;     # that's all
7870                 $i             = $i_save;
7871             }
7872             else {
7873                 $identifier .= $tok;
7874             }
7875         }
7876         elsif ( $tok_is_blank && $identifier =~ /^sub / ) {
7877             $id_scan_state = $scan_state_LPAREN;
7878             $identifier .= $tok;
7879         }
7880         elsif ( $tok eq '(' && $identifier =~ /^sub / ) {
7881             $id_scan_state = $scan_state_RPAREN;
7882             $identifier .= $tok;
7883         }
7884         else {
7885             $id_scan_state = EMPTY_STRING;    # that's all
7886             $i             = $i_save;
7887         }
7888         return;
7889     } ## end sub do_id_scan_state_colon
7890
7891     sub do_id_scan_state_left_paren {
7892
7893         # looking for possible '(' of a prototype
7894
7895         if ( $tok eq '(' ) {    # got it
7896             $identifier .= $tok;
7897             $id_scan_state = $scan_state_RPAREN;    # now find the end of it
7898         }
7899         elsif ( $tok =~ /^\s*$/ ) {                 # blank - keep going
7900             $identifier .= $tok;
7901             $tok_is_blank = 1;
7902         }
7903         else {
7904             $id_scan_state = EMPTY_STRING;          # that's all - no prototype
7905             $i             = $i_save;
7906         }
7907         return;
7908     } ## end sub do_id_scan_state_left_paren
7909
7910     sub do_id_scan_state_right_paren {
7911
7912         # looking for a ')' of prototype to close a '('
7913
7914         $tok_is_blank = $tok =~ /^\s*$/;
7915
7916         if ( $tok eq ')' ) {    # got it
7917             $identifier .= $tok;
7918             $id_scan_state = EMPTY_STRING;    # all done
7919         }
7920         elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
7921             $identifier .= $tok;
7922         }
7923         else {    # probable error in script, but keep going
7924             warning("Unexpected '$tok' while seeking end of prototype\n");
7925             $identifier .= $tok;
7926         }
7927         return;
7928     } ## end sub do_id_scan_state_right_paren
7929
7930     sub do_id_scan_state_ampersand {
7931
7932         # Starting sub call after seeing an '&'
7933
7934         if ( $tok =~ /^[\$\w]/ ) {    # alphanumeric ..
7935             $id_scan_state = $scan_state_COLON;    # now need ::
7936             $saw_alpha     = 1;
7937             $identifier .= $tok;
7938         }
7939         elsif ( $tok eq "'" && $allow_tick ) {     # alphanumeric ..
7940             $id_scan_state = $scan_state_COLON;    # now need ::
7941             $saw_alpha     = 1;
7942             $identifier .= $tok;
7943         }
7944         elsif ( $tok =~ /^\s*$/ ) {                # allow space
7945             $tok_is_blank = 1;
7946
7947             # fix c139: trim line-ending type 't'
7948             if ( length($identifier) == 1 && $i == $max_token_index ) {
7949                 $i    = $i_save;
7950                 $type = 't';
7951             }
7952         }
7953         elsif ( $tok eq '::' ) {                   # leading ::
7954             $id_scan_state = $scan_state_ALPHA;    # accept alpha next
7955             $identifier .= $tok;
7956         }
7957         elsif ( $tok eq '{' ) {
7958             if ( $identifier eq '&' || $i == 0 ) {
7959                 $identifier = EMPTY_STRING;
7960             }
7961             $i             = $i_save;
7962             $id_scan_state = EMPTY_STRING;
7963         }
7964         elsif ( $tok eq '^' ) {
7965             if ( $identifier eq '&' ) {
7966
7967                 # Special variable (c066)
7968                 $identifier .= $tok;
7969                 $type = '&';
7970
7971                 # There may be one more character, not a space, after the ^
7972                 my $next1 = $rtokens->[ $i + 1 ];
7973                 my $chr   = substr( $next1, 0, 1 );
7974                 if ( $is_special_variable_char{$chr} ) {
7975
7976                     # It is something like &^O
7977                     $i++;
7978                     $identifier .= $next1;
7979
7980                     # If pretoken $next1 is more than one character long,
7981                     # set a flag indicating that it needs to be split.
7982                     $id_scan_state =
7983                       ( length($next1) > 1 ) ? $scan_state_SPLIT : EMPTY_STRING;
7984                 }
7985                 else {
7986
7987                     # it is &^
7988                     $id_scan_state = EMPTY_STRING;
7989                 }
7990             }
7991             else {
7992                 $identifier = EMPTY_STRING;
7993                 $i          = $i_save;
7994             }
7995         }
7996         else {
7997
7998             # punctuation variable?
7999             # testfile: cunningham4.pl
8000             #
8001             # We have to be careful here.  If we are in an unknown state,
8002             # we will reject the punctuation variable.  In the following
8003             # example the '&' is a binary operator but we are in an unknown
8004             # state because there is no sigil on 'Prima', so we don't
8005             # know what it is.  But it is a bad guess that
8006             # '&~' is a function variable.
8007             # $self->{text}->{colorMap}->[
8008             #   Prima::PodView::COLOR_CODE_FOREGROUND
8009             #   & ~tb::COLOR_INDEX ] =
8010             #   $sec->{ColorCode}
8011
8012             # Fix for case c033: a '#' here starts a side comment
8013             if ( $identifier eq '&' && $expecting && $tok ne '#' ) {
8014                 $identifier .= $tok;
8015             }
8016             else {
8017                 $identifier = EMPTY_STRING;
8018                 $i          = $i_save;
8019                 $type       = '&';
8020             }
8021             $id_scan_state = EMPTY_STRING;
8022         }
8023         return;
8024     } ## end sub do_id_scan_state_ampersand
8025
8026     #-------------------
8027     # hash of scanner subs
8028     #-------------------
8029     my $scan_identifier_code = {
8030         $scan_state_SIGIL     => \&do_id_scan_state_dollar,
8031         $scan_state_ALPHA     => \&do_id_scan_state_alpha,
8032         $scan_state_COLON     => \&do_id_scan_state_colon,
8033         $scan_state_LPAREN    => \&do_id_scan_state_left_paren,
8034         $scan_state_RPAREN    => \&do_id_scan_state_right_paren,
8035         $scan_state_AMPERSAND => \&do_id_scan_state_ampersand,
8036     };
8037
8038     sub scan_complex_identifier {
8039
8040         # This routine assembles tokens into identifiers.  It maintains a
8041         # scan state, id_scan_state.  It updates id_scan_state based upon
8042         # current id_scan_state and token, and returns an updated
8043         # id_scan_state and the next index after the identifier.
8044
8045         # This routine now serves a a backup for sub scan_simple_identifier
8046         # which handles most identifiers.
8047
8048         (
8049             $i,         $id_scan_state, $identifier, $rtokens, $max_token_index,
8050             $expecting, $container_type
8051         ) = @_;
8052
8053         # return flag telling caller to split the pretoken
8054         my $split_pretoken_flag;
8055
8056         ####################
8057         # Initialize my vars
8058         ####################
8059
8060         initialize_my_scan_id_vars();
8061
8062         #########################################################
8063         # get started by defining a type and a state if necessary
8064         #########################################################
8065
8066         if ( !$id_scan_state ) {
8067             $context = UNKNOWN_CONTEXT;
8068
8069             # fixup for digraph
8070             if ( $tok eq '>' ) {
8071                 $tok       = '->';
8072                 $tok_begin = $tok;
8073             }
8074             $identifier = $tok;
8075
8076             if ( $tok eq '$' || $tok eq '*' ) {
8077                 $id_scan_state = $scan_state_SIGIL;
8078                 $context       = SCALAR_CONTEXT;
8079             }
8080             elsif ( $tok eq '%' || $tok eq '@' ) {
8081                 $id_scan_state = $scan_state_SIGIL;
8082                 $context       = LIST_CONTEXT;
8083             }
8084             elsif ( $tok eq '&' ) {
8085                 $id_scan_state = $scan_state_AMPERSAND;
8086             }
8087             elsif ( $tok eq 'sub' or $tok eq 'package' ) {
8088                 $saw_alpha     = 0;    # 'sub' is considered type info here
8089                 $id_scan_state = $scan_state_SIGIL;
8090                 $identifier .=
8091                   SPACE;    # need a space to separate sub from sub name
8092             }
8093             elsif ( $tok eq '::' ) {
8094                 $id_scan_state = $scan_state_ALPHA;
8095             }
8096             elsif ( $tok =~ /^\w/ ) {
8097                 $id_scan_state = $scan_state_COLON;
8098                 $saw_alpha     = 1;
8099             }
8100             elsif ( $tok eq '->' ) {
8101                 $id_scan_state = $scan_state_SIGIL;
8102             }
8103             else {
8104
8105                 # shouldn't happen: bad call parameter
8106                 my $msg =
8107 "Program bug detected: scan_identifier received bad starting token = '$tok'\n";
8108                 if (DEVEL_MODE) { Fault($msg) }
8109                 if ( !$tokenizer_self->[_in_error_] ) {
8110                     warning($msg);
8111                     $tokenizer_self->[_in_error_] = 1;
8112                 }
8113                 $id_scan_state = EMPTY_STRING;
8114                 goto RETURN;
8115             }
8116             $saw_type = !$saw_alpha;
8117         }
8118         else {
8119             $i--;
8120             $saw_alpha = ( $tok =~ /^\w/ );
8121             $saw_type  = ( $tok =~ /([\$\%\@\*\&])/ );
8122
8123             # check for a valid starting state
8124             if ( DEVEL_MODE && !$is_returnable_scan_state{$id_scan_state} ) {
8125                 Fault(<<EOM);
8126 Unexpected starting scan state in sub scan_complex_identifier: '$id_scan_state'
8127 EOM
8128             }
8129         }
8130
8131         ###############################
8132         # loop to gather the identifier
8133         ###############################
8134
8135         $i_save = $i;
8136
8137         while ( $i < $max_token_index && $id_scan_state ) {
8138
8139             # Be sure we have code to handle this state before we proceed
8140             my $code = $scan_identifier_code->{$id_scan_state};
8141             if ( !$code ) {
8142
8143                 if ( $id_scan_state eq $scan_state_SPLIT ) {
8144                     ## OK: this is the signal to exit and split the pretoken
8145                 }
8146
8147                 # unknown state - should not happen
8148                 else {
8149                     if (DEVEL_MODE) {
8150                         Fault(<<EOM);
8151 Unknown scan state in sub scan_complex_identifier: '$id_scan_state'
8152 Scan state at sub entry was '$id_scan_state_begin'
8153 EOM
8154                     }
8155                     $id_scan_state = EMPTY_STRING;
8156                     $i             = $i_save;
8157                 }
8158                 last;
8159             }
8160
8161             # Remember the starting index for progress check below
8162             my $i_start_loop = $i;
8163
8164             $last_tok_is_blank = $tok_is_blank;
8165             if   ($tok_is_blank) { $tok_is_blank = undef }
8166             else                 { $i_save       = $i }
8167
8168             $tok = $rtokens->[ ++$i ];
8169
8170             # patch to make digraph :: if necessary
8171             if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
8172                 $tok = '::';
8173                 $i++;
8174             }
8175
8176             $code->();
8177
8178             # check for forward progress: a decrease in the index $i
8179             # implies that scanning has finished
8180             last if ( $i <= $i_start_loop );
8181
8182         } ## end of main loop
8183
8184         ##############
8185         # Check result
8186         ##############
8187
8188         # Be sure a valid state is returned
8189         if ($id_scan_state) {
8190
8191             if ( !$is_returnable_scan_state{$id_scan_state} ) {
8192
8193                 if ( $id_scan_state eq $scan_state_SPLIT ) {
8194                     $split_pretoken_flag = 1;
8195                 }
8196
8197                 if ( $id_scan_state eq $scan_state_RPAREN ) {
8198                     warning(
8199                         "Hit end of line while seeking ) to end prototype\n");
8200                 }
8201
8202                 $id_scan_state = EMPTY_STRING;
8203             }
8204
8205             # Patch: the deprecated variable $# does not combine with anything
8206             # on the next line.
8207             if ( $identifier eq '$#' ) { $id_scan_state = EMPTY_STRING }
8208         }
8209
8210         # Be sure the token index is valid
8211         if ( $i < 0 ) { $i = 0 }
8212
8213         # Be sure a token type is defined
8214         if ( !$type ) {
8215
8216             if ($saw_type) {
8217
8218                 if ($saw_alpha) {
8219
8220                   # The type without the -> should be the same as with the -> so
8221                   # that if they get separated we get the same bond strengths,
8222                   # etc.  See b1234
8223                     if (   $identifier =~ /^->/
8224                         && $last_nonblank_type eq 'w'
8225                         && substr( $identifier, 2, 1 ) =~ /^\w/ )
8226                     {
8227                         $type = 'w';
8228                     }
8229                     else { $type = 'i' }
8230                 }
8231                 elsif ( $identifier eq '->' ) {
8232                     $type = '->';
8233                 }
8234                 elsif (
8235                     ( length($identifier) > 1 )
8236
8237                     # In something like '@$=' we have an identifier '@$'
8238                     # In something like '$${' we have type '$$' (and only
8239                     # part of an identifier)
8240                     && !( $identifier =~ /\$$/ && $tok eq '{' )
8241
8242                     ## && ( $identifier !~ /^(sub |package )$/ )
8243                     && $identifier ne 'sub '
8244                     && $identifier ne 'package '
8245                   )
8246                 {
8247                     $type = 'i';
8248                 }
8249                 else { $type = 't' }
8250             }
8251             elsif ($saw_alpha) {
8252
8253                 # type 'w' includes anything without leading type info
8254                 # ($,%,@,*) including something like abc::def::ghi
8255                 $type = 'w';
8256
8257                 # Fix for b1337, if restarting scan after line break between
8258                 # '->' or sigil and identifier name, use type 'i'
8259                 if (   $id_scan_state_begin
8260                     && $identifier =~ /^([\$\%\@\*\&]|->)/ )
8261                 {
8262                     $type = 'i';
8263                 }
8264             }
8265             else {
8266                 $type = EMPTY_STRING;
8267             }    # this can happen on a restart
8268         }
8269
8270         # See if we formed an identifier...
8271         if ($identifier) {
8272             $tok = $identifier;
8273             if ($message) { write_logfile_entry($message) }
8274         }
8275
8276         # did not find an identifier, back  up
8277         else {
8278             $tok = $tok_begin;
8279             $i   = $i_begin;
8280         }
8281
8282       RETURN:
8283
8284         DEBUG_SCAN_ID && do {
8285             my ( $a, $b, $c ) = caller;
8286             print STDOUT
8287 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
8288             print STDOUT
8289 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
8290         };
8291         return ( $i, $tok, $type, $id_scan_state, $identifier,
8292             $split_pretoken_flag );
8293     } ## end sub scan_complex_identifier
8294 } ## end closure for sub scan_complex_identifier
8295
8296 {    ## closure for sub do_scan_sub
8297
8298     my %warn_if_lexical;
8299
8300     BEGIN {
8301
8302         # lexical subs with these names can cause parsing errors in this version
8303         my @q = qw( m q qq qr qw qx s tr y );
8304         @{warn_if_lexical}{@q} = (1) x scalar(@q);
8305     }
8306
8307     # saved package and subnames in case prototype is on separate line
8308     my ( $package_saved, $subname_saved );
8309
8310     # initialize subname each time a new 'sub' keyword is encountered
8311     sub initialize_subname {
8312         $package_saved = EMPTY_STRING;
8313         $subname_saved = EMPTY_STRING;
8314         return;
8315     }
8316
8317     use constant {
8318         SUB_CALL       => 1,
8319         PAREN_CALL     => 2,
8320         PROTOTYPE_CALL => 3,
8321     };
8322
8323     sub do_scan_sub {
8324
8325         # do_scan_sub parses a sub name and prototype.
8326
8327         # At present there are three basic CALL TYPES which are
8328         # distinguished by the starting value of '$tok':
8329         # 1. $tok='sub', id_scan_state='sub'
8330         #    it is called with $i_beg equal to the index of the first nonblank
8331         #    token following a 'sub' token.
8332         # 2. $tok='(', id_scan_state='sub',
8333         #    it is called with $i_beg equal to the index of a '(' which may
8334         #    start a prototype.
8335         # 3. $tok='prototype', id_scan_state='prototype'
8336         #    it is called with $i_beg equal to the index of a '(' which is
8337         #    preceded by ': prototype' and has $id_scan_state eq 'prototype'
8338
8339         # Examples:
8340
8341         # A single type 1 call will get both the sub and prototype
8342         #   sub foo1 ( $$ ) { }
8343         #   ^
8344
8345         # The subname will be obtained with a 'sub' call
8346         # The prototype on line 2 will be obtained with a '(' call
8347         #   sub foo1
8348         #   ^                    <---call type 1
8349         #     ( $$ ) { }
8350         #     ^                  <---call type 2
8351
8352         # The subname will be obtained with a 'sub' call
8353         # The prototype will be obtained with a 'prototype' call
8354         #   sub foo1 ( $x, $y ) : prototype ( $$ ) { }
8355         #   ^ <---type 1                    ^ <---type 3
8356
8357         # TODO: add future error checks to be sure we have a valid
8358         # sub name.  For example, 'sub &doit' is wrong.  Also, be sure
8359         # a name is given if and only if a non-anonymous sub is
8360         # appropriate.
8361         # USES GLOBAL VARS: $current_package, $last_nonblank_token,
8362         # $in_attribute_list, %saw_function_definition,
8363         # $statement_type
8364
8365         my ($rinput_hash) = @_;
8366
8367         my $input_line      = $rinput_hash->{input_line};
8368         my $i               = $rinput_hash->{i};
8369         my $i_beg           = $rinput_hash->{i_beg};
8370         my $tok             = $rinput_hash->{tok};
8371         my $type            = $rinput_hash->{type};
8372         my $rtokens         = $rinput_hash->{rtokens};
8373         my $rtoken_map      = $rinput_hash->{rtoken_map};
8374         my $id_scan_state   = $rinput_hash->{id_scan_state};
8375         my $max_token_index = $rinput_hash->{max_token_index};
8376
8377         my $i_entry = $i;
8378
8379         # Determine the CALL TYPE
8380         # 1=sub
8381         # 2=(
8382         # 3=prototype
8383         my $call_type =
8384             $tok eq 'prototype' ? PROTOTYPE_CALL
8385           : $tok eq '('         ? PAREN_CALL
8386           :                       SUB_CALL;
8387
8388         $id_scan_state = EMPTY_STRING;  # normally we get everything in one call
8389         my $subname = $subname_saved;
8390         my $package = $package_saved;
8391         my $proto   = undef;
8392         my $attrs   = undef;
8393         my $match;
8394
8395         my $pos_beg = $rtoken_map->[$i_beg];
8396         pos($input_line) = $pos_beg;
8397
8398         # Look for the sub NAME if this is a SUB call
8399         if (
8400                $call_type == SUB_CALL
8401             && $input_line =~ m/\G\s*
8402         ((?:\w*(?:'|::))*)  # package - something that ends in :: or '
8403         (\w+)               # NAME    - required
8404         /gcx
8405           )
8406         {
8407             $match   = 1;
8408             $subname = $2;
8409
8410             my $is_lexical_sub =
8411               $last_nonblank_type eq 'k' && $last_nonblank_token eq 'my';
8412             if ( $is_lexical_sub && $1 ) {
8413                 warning("'my' sub $subname cannot be in package '$1'\n");
8414                 $is_lexical_sub = 0;
8415             }
8416
8417             if ($is_lexical_sub) {
8418
8419                 # lexical subs use the block sequence number as a package name
8420                 my $seqno =
8421                   $current_sequence_number[BRACE][ $current_depth[BRACE] ];
8422                 $seqno   = 1 unless ( defined($seqno) );
8423                 $package = $seqno;
8424                 if ( $warn_if_lexical{$subname} ) {
8425                     warning(
8426 "'my' sub '$subname' matches a builtin name and may not be handled correctly in this perltidy version.\n"
8427                     );
8428                 }
8429             }
8430             else {
8431                 $package = ( defined($1) && $1 ) ? $1 : $current_package;
8432                 $package =~ s/\'/::/g;
8433                 if ( $package =~ /^\:/ ) { $package = 'main' . $package }
8434                 $package =~ s/::$//;
8435             }
8436
8437             my $pos  = pos($input_line);
8438             my $numc = $pos - $pos_beg;
8439             $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
8440             $type = 'i';
8441
8442             # remember the sub name in case another call is needed to
8443             # get the prototype
8444             $package_saved = $package;
8445             $subname_saved = $subname;
8446         }
8447
8448         # Now look for PROTO ATTRS for all call types
8449         # Look for prototype/attributes which are usually on the same
8450         # line as the sub name but which might be on a separate line.
8451         # For example, we might have an anonymous sub with attributes,
8452         # or a prototype on a separate line from its sub name
8453
8454         # NOTE: We only want to parse PROTOTYPES here. If we see anything that
8455         # does not look like a prototype, we assume it is a SIGNATURE and we
8456         # will stop and let the the standard tokenizer handle it.  In
8457         # particular, we stop if we see any nested parens, braces, or commas.
8458         # Also note, a valid prototype cannot contain any alphabetic character
8459         #  -- see https://perldoc.perl.org/perlsub
8460         # But it appears that an underscore is valid in a prototype, so the
8461         # regex below uses [A-Za-z] rather than \w
8462         # This is the old regex which has been replaced:
8463         # $input_line =~ m/\G(\s*\([^\)\(\}\{\,#]*\))?  # PROTO
8464         my $saw_opening_paren = $input_line =~ /\G\s*\(/;
8465         if (
8466             $input_line =~ m/\G(\s*\([^\)\(\}\{\,#A-Za-z]*\))?  # PROTO
8467             (\s*:)?                              # ATTRS leading ':'
8468             /gcx
8469             && ( $1 || $2 )
8470           )
8471         {
8472             $proto = $1;
8473             $attrs = $2;
8474
8475             # Append the prototype to the starting token if it is 'sub' or
8476             # 'prototype'.  This is not necessary but for compatibility with
8477             # previous versions when the -csc flag is used:
8478             if ( $proto && ( $match || $call_type == PROTOTYPE_CALL ) ) {
8479                 $tok .= $proto;
8480             }
8481
8482             # If we just entered the sub at an opening paren on this call, not
8483             # a following :prototype, label it with the previous token.  This is
8484             # necessary to propagate the sub name to its opening block.
8485             elsif ( $call_type == PAREN_CALL ) {
8486                 $tok = $last_nonblank_token;
8487             }
8488
8489             $match ||= 1;
8490
8491             # Patch part #1 to fixes cases b994 and b1053:
8492             # Mark an anonymous sub keyword without prototype as type 'k', i.e.
8493             #    'sub : lvalue { ...'
8494             $type = 'i';
8495             if ( $tok eq 'sub' && !$proto ) { $type = 'k' }
8496         }
8497
8498         if ($match) {
8499
8500             # ATTRS: if there are attributes, back up and let the ':' be
8501             # found later by the scanner.
8502             my $pos = pos($input_line);
8503             if ($attrs) {
8504                 $pos -= length($attrs);
8505             }
8506
8507             my $next_nonblank_token = $tok;
8508
8509             # catch case of line with leading ATTR ':' after anonymous sub
8510             if ( $pos == $pos_beg && $tok eq ':' ) {
8511                 $type              = 'A';
8512                 $in_attribute_list = 1;
8513             }
8514
8515             # Otherwise, if we found a match we must convert back from
8516             # string position to the pre_token index for continued parsing.
8517             else {
8518
8519                 # I don't think an error flag can occur here ..but ?
8520                 my $error;
8521                 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
8522                     $max_token_index );
8523                 if ($error) { warning("Possibly invalid sub\n") }
8524
8525                 # Patch part #2 to fixes cases b994 and b1053:
8526                 # Do not let spaces be part of the token of an anonymous sub
8527                 # keyword which we marked as type 'k' above...i.e. for
8528                 # something like:
8529                 #    'sub : lvalue { ...'
8530                 # Back up and let it be parsed as a blank
8531                 if (   $type eq 'k'
8532                     && $attrs
8533                     && $i > $i_entry
8534                     && substr( $rtokens->[$i], 0, 1 ) =~ m/\s/ )
8535                 {
8536                     $i--;
8537                 }
8538
8539                 # check for multiple definitions of a sub
8540                 ( $next_nonblank_token, my $i_next ) =
8541                   find_next_nonblank_token_on_this_line( $i, $rtokens,
8542                     $max_token_index );
8543             }
8544
8545             if ( $next_nonblank_token =~ /^(\s*|#)$/ )
8546             {    # skip blank or side comment
8547                 my ( $rpre_tokens, $rpre_types ) =
8548                   peek_ahead_for_n_nonblank_pre_tokens(1);
8549                 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
8550                     $next_nonblank_token = $rpre_tokens->[0];
8551                 }
8552                 else {
8553                     $next_nonblank_token = '}';
8554                 }
8555             }
8556
8557             # See what's next...
8558             if ( $next_nonblank_token eq '{' ) {
8559                 if ($subname) {
8560
8561                     # Check for multiple definitions of a sub, but
8562                     # it is ok to have multiple sub BEGIN, etc,
8563                     # so we do not complain if name is all caps
8564                     if (   $saw_function_definition{$subname}{$package}
8565                         && $subname !~ /^[A-Z]+$/ )
8566                     {
8567                         my $lno = $saw_function_definition{$subname}{$package};
8568                         if ( $package =~ /^\d/ ) {
8569                             warning(
8570 "already saw definition of lexical 'sub $subname' at line $lno\n"
8571                             );
8572
8573                         }
8574                         else {
8575                             warning(
8576 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
8577                             ) unless (DEVEL_MODE);
8578                         }
8579                     }
8580                     $saw_function_definition{$subname}{$package} =
8581                       $tokenizer_self->[_last_line_number_];
8582                 }
8583             }
8584             elsif ( $next_nonblank_token eq ';' ) {
8585             }
8586             elsif ( $next_nonblank_token eq '}' ) {
8587             }
8588
8589             # ATTRS - if an attribute list follows, remember the name
8590             # of the sub so the next opening brace can be labeled.
8591             # Setting 'statement_type' causes any ':'s to introduce
8592             # attributes.
8593             elsif ( $next_nonblank_token eq ':' ) {
8594                 if ( $call_type == SUB_CALL ) {
8595                     $statement_type =
8596                       substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8597                 }
8598             }
8599
8600             # if we stopped before an open paren ...
8601             elsif ( $next_nonblank_token eq '(' ) {
8602
8603                 # If we DID NOT see this paren above then it must be on the
8604                 # next line so we will set a flag to come back here and see if
8605                 # it is a PROTOTYPE
8606
8607                 # Otherwise, we assume it is a SIGNATURE rather than a
8608                 # PROTOTYPE and let the normal tokenizer handle it as a list
8609                 if ( !$saw_opening_paren ) {
8610                     $id_scan_state = 'sub';    # we must come back to get proto
8611                 }
8612                 if ( $call_type == SUB_CALL ) {
8613                     $statement_type =
8614                       substr( $tok, 0, 3 ) eq 'sub' ? $tok : 'sub';
8615                 }
8616             }
8617             elsif ($next_nonblank_token) {    # EOF technically ok
8618                 $subname = EMPTY_STRING unless defined($subname);
8619                 warning(
8620 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
8621                 );
8622             }
8623             check_prototype( $proto, $package, $subname );
8624         }
8625
8626         # no match to either sub name or prototype, but line not blank
8627         else {
8628
8629         }
8630         return ( $i, $tok, $type, $id_scan_state );
8631     } ## end sub do_scan_sub
8632 }
8633
8634 #########i###############################################################
8635 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
8636 #########################################################################
8637
8638 sub find_next_nonblank_token {
8639     my ( $i, $rtokens, $max_token_index ) = @_;
8640
8641     # Returns the next nonblank token after the token at index $i
8642     # To skip past a side comment, and any subsequent block comments
8643     # and blank lines, call with i=$max_token_index
8644
8645     if ( $i >= $max_token_index ) {
8646         if ( !peeked_ahead() ) {
8647             peeked_ahead(1);
8648             peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
8649         }
8650     }
8651
8652     my $next_nonblank_token = $rtokens->[ ++$i ];
8653     return ( SPACE, $i ) unless defined($next_nonblank_token);
8654
8655     if ( $next_nonblank_token =~ /^\s*$/ ) {
8656         $next_nonblank_token = $rtokens->[ ++$i ];
8657         return ( SPACE, $i ) unless defined($next_nonblank_token);
8658     }
8659     return ( $next_nonblank_token, $i );
8660 } ## end sub find_next_nonblank_token
8661
8662 sub find_next_noncomment_type {
8663     my ( $i, $rtokens, $max_token_index ) = @_;
8664
8665     # Given the current character position, look ahead past any comments
8666     # and blank lines and return the next token, including digraphs and
8667     # trigraphs.
8668
8669     my ( $next_nonblank_token, $i_next ) =
8670       find_next_nonblank_token( $i, $rtokens, $max_token_index );
8671
8672     # skip past any side comment
8673     if ( $next_nonblank_token eq '#' ) {
8674         ( $next_nonblank_token, $i_next ) =
8675           find_next_nonblank_token( $i_next, $rtokens, $max_token_index );
8676     }
8677
8678     goto RETURN if ( !$next_nonblank_token || $next_nonblank_token eq SPACE );
8679
8680     # check for possible a digraph
8681     goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8682     my $test2 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8683     goto RETURN if ( !$is_digraph{$test2} );
8684     $next_nonblank_token = $test2;
8685     $i_next              = $i_next + 1;
8686
8687     # check for possible a trigraph
8688     goto RETURN if ( !defined( $rtokens->[ $i_next + 1 ] ) );
8689     my $test3 = $next_nonblank_token . $rtokens->[ $i_next + 1 ];
8690     goto RETURN if ( !$is_trigraph{$test3} );
8691     $next_nonblank_token = $test3;
8692     $i_next              = $i_next + 1;
8693
8694   RETURN:
8695     return ( $next_nonblank_token, $i_next );
8696 } ## end sub find_next_noncomment_type
8697
8698 sub is_possible_numerator {
8699
8700     # Look at the next non-comment character and decide if it could be a
8701     # numerator.  Return
8702     #   1 - yes
8703     #   0 - can't tell
8704     #  -1 - no
8705
8706     my ( $i, $rtokens, $max_token_index ) = @_;
8707     my $is_possible_numerator = 0;
8708
8709     my $next_token = $rtokens->[ $i + 1 ];
8710     if ( $next_token eq '=' ) { $i++; }    # handle /=
8711     my ( $next_nonblank_token, $i_next ) =
8712       find_next_nonblank_token( $i, $rtokens, $max_token_index );
8713
8714     if ( $next_nonblank_token eq '#' ) {
8715         ( $next_nonblank_token, $i_next ) =
8716           find_next_nonblank_token( $max_token_index, $rtokens,
8717             $max_token_index );
8718     }
8719
8720     if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
8721         $is_possible_numerator = 1;
8722     }
8723     elsif ( $next_nonblank_token =~ /^\s*$/ ) {
8724         $is_possible_numerator = 0;
8725     }
8726     else {
8727         $is_possible_numerator = -1;
8728     }
8729
8730     return $is_possible_numerator;
8731 } ## end sub is_possible_numerator
8732
8733 {    ## closure for sub pattern_expected
8734     my %pattern_test;
8735
8736     BEGIN {
8737
8738         # List of tokens which may follow a pattern.  Note that we will not
8739         # have formed digraphs at this point, so we will see '&' instead of
8740         # '&&' and '|' instead of '||'
8741
8742         # /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/
8743         my @q = qw( & && | || ? : + - * and or while if unless);
8744         push @q, ')', '}', ']', '>', ',', ';';
8745         @{pattern_test}{@q} = (1) x scalar(@q);
8746     }
8747
8748     sub pattern_expected {
8749
8750         # This a filter for a possible pattern.
8751         # It looks at the token after a possible pattern and tries to
8752         # determine if that token could end a pattern.
8753         # returns -
8754         #   1 - yes
8755         #   0 - can't tell
8756         #  -1 - no
8757         my ( $i, $rtokens, $max_token_index ) = @_;
8758         my $is_pattern = 0;
8759
8760         my $next_token = $rtokens->[ $i + 1 ];
8761         if ( $next_token =~ /^[msixpodualgc]/ ) {
8762             $i++;
8763         }    # skip possible modifier
8764         my ( $next_nonblank_token, $i_next ) =
8765           find_next_nonblank_token( $i, $rtokens, $max_token_index );
8766
8767         if ( $pattern_test{$next_nonblank_token} ) {
8768             $is_pattern = 1;
8769         }
8770         else {
8771
8772             # Added '#' to fix issue c044
8773             if (   $next_nonblank_token =~ /^\s*$/
8774                 || $next_nonblank_token eq '#' )
8775             {
8776                 $is_pattern = 0;
8777             }
8778             else {
8779                 $is_pattern = -1;
8780             }
8781         }
8782         return $is_pattern;
8783     } ## end sub pattern_expected
8784 }
8785
8786 sub find_next_nonblank_token_on_this_line {
8787     my ( $i, $rtokens, $max_token_index ) = @_;
8788     my $next_nonblank_token;
8789
8790     if ( $i < $max_token_index ) {
8791         $next_nonblank_token = $rtokens->[ ++$i ];
8792
8793         if ( $next_nonblank_token =~ /^\s*$/ ) {
8794
8795             if ( $i < $max_token_index ) {
8796                 $next_nonblank_token = $rtokens->[ ++$i ];
8797             }
8798         }
8799     }
8800     else {
8801         $next_nonblank_token = EMPTY_STRING;
8802     }
8803     return ( $next_nonblank_token, $i );
8804 } ## end sub find_next_nonblank_token_on_this_line
8805
8806 sub find_angle_operator_termination {
8807
8808     # We are looking at a '<' and want to know if it is an angle operator.
8809     # We are to return:
8810     #   $i = pretoken index of ending '>' if found, current $i otherwise
8811     #   $type = 'Q' if found, '>' otherwise
8812     my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
8813     my $i    = $i_beg;
8814     my $type = '<';
8815     pos($input_line) = 1 + $rtoken_map->[$i];
8816
8817     my $filter;
8818
8819     # we just have to find the next '>' if a term is expected
8820     if ( $expecting == TERM ) { $filter = '[\>]' }
8821
8822     # we have to guess if we don't know what is expected
8823     elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
8824
8825     # shouldn't happen - we shouldn't be here if operator is expected
8826     else {
8827         if (DEVEL_MODE) {
8828             Fault(<<EOM);
8829 Bad call to find_angle_operator_termination
8830 EOM
8831         }
8832         return ( $i, $type );
8833     }
8834
8835     # To illustrate what we might be looking at, in case we are
8836     # guessing, here are some examples of valid angle operators
8837     # (or file globs):
8838     #  <tmp_imp/*>
8839     #  <FH>
8840     #  <$fh>
8841     #  <*.c *.h>
8842     #  <_>
8843     #  <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
8844     #  <${PREFIX}*img*.$IMAGE_TYPE>
8845     #  <img*.$IMAGE_TYPE>
8846     #  <Timg*.$IMAGE_TYPE>
8847     #  <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
8848     #
8849     # Here are some examples of lines which do not have angle operators:
8850     #  return unless $self->[2]++ < $#{$self->[1]};
8851     #  < 2  || @$t >
8852     #
8853     # the following line from dlister.pl caused trouble:
8854     #  print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
8855     #
8856     # If the '<' starts an angle operator, it must end on this line and
8857     # it must not have certain characters like ';' and '=' in it.  I use
8858     # this to limit the testing.  This filter should be improved if
8859     # possible.
8860
8861     if ( $input_line =~ /($filter)/g ) {
8862
8863         if ( $1 eq '>' ) {
8864
8865             # We MAY have found an angle operator termination if we get
8866             # here, but we need to do more to be sure we haven't been
8867             # fooled.
8868             my $pos = pos($input_line);
8869
8870             my $pos_beg = $rtoken_map->[$i];
8871             my $str     = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
8872
8873             # Test for '<' after possible filehandle, issue c103
8874             # print $fh <>;          # syntax error
8875             # print $fh <DATA>;      # ok
8876             # print $fh < DATA>;     # syntax error at '>'
8877             # print STDERR < DATA>;  # ok, prints word 'DATA'
8878             # print BLABLA <DATA>;   # ok; does nothing unless BLABLA is defined
8879             if ( $last_nonblank_type eq 'Z' ) {
8880
8881                 # $str includes brackets; something like '<DATA>'
8882                 if (   substr( $last_nonblank_token, 0, 1 ) !~ /[A-Za-z_]/
8883                     && substr( $str, 1, 1 ) !~ /[A-Za-z_]/ )
8884                 {
8885                     return ( $i, $type );
8886                 }
8887             }
8888
8889             # Reject if the closing '>' follows a '-' as in:
8890             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
8891             if ( $expecting eq UNKNOWN ) {
8892                 my $check = substr( $input_line, $pos - 2, 1 );
8893                 if ( $check eq '-' ) {
8894                     return ( $i, $type );
8895                 }
8896             }
8897
8898             ######################################debug#####
8899             #write_diagnostics( "ANGLE? :$str\n");
8900             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
8901             ######################################debug#####
8902             $type = 'Q';
8903             my $error;
8904             ( $i, $error ) =
8905               inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
8906
8907             # It may be possible that a quote ends midway in a pretoken.
8908             # If this happens, it may be necessary to split the pretoken.
8909             if ($error) {
8910                 if (DEVEL_MODE) {
8911                     Fault(<<EOM);
8912 unexpected error condition returned by inverse_pretoken_map
8913 EOM
8914                 }
8915                 warning(
8916                     "Possible tokinization error..please check this line\n");
8917             }
8918
8919             # count blanks on inside of brackets
8920             my $blank_count = 0;
8921             $blank_count++ if ( $str =~ /<\s+/ );
8922             $blank_count++ if ( $str =~ /\s+>/ );
8923
8924             # Now let's see where we stand....
8925             # OK if math op not possible
8926             if ( $expecting == TERM ) {
8927             }
8928
8929             # OK if there are no more than 2 non-blank pre-tokens inside
8930             # (not possible to write 2 token math between < and >)
8931             # This catches most common cases
8932             elsif ( $i <= $i_beg + 3 + $blank_count ) {
8933
8934                 # No longer any need to document this common case
8935                 ## write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
8936             }
8937
8938             # OK if there is some kind of identifier inside
8939             #   print $fh <tvg::INPUT>;
8940             elsif ( $str =~ /^<\s*\$?(\w|::|\s)+\s*>$/ ) {
8941                 write_diagnostics("ANGLE (contains identifier): $str\n");
8942             }
8943
8944             # Not sure..
8945             else {
8946
8947                 # Let's try a Brace Test: any braces inside must balance
8948                 my $br = 0;
8949                 while ( $str =~ /\{/g ) { $br++ }
8950                 while ( $str =~ /\}/g ) { $br-- }
8951                 my $sb = 0;
8952                 while ( $str =~ /\[/g ) { $sb++ }
8953                 while ( $str =~ /\]/g ) { $sb-- }
8954                 my $pr = 0;
8955                 while ( $str =~ /\(/g ) { $pr++ }
8956                 while ( $str =~ /\)/g ) { $pr-- }
8957
8958                 # if braces do not balance - not angle operator
8959                 if ( $br || $sb || $pr ) {
8960                     $i    = $i_beg;
8961                     $type = '<';
8962                     write_diagnostics(
8963                         "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
8964                 }
8965
8966                 # we should keep doing more checks here...to be continued
8967                 # Tentatively accepting this as a valid angle operator.
8968                 # There are lots more things that can be checked.
8969                 else {
8970                     write_diagnostics(
8971                         "ANGLE-Guessing yes: $str expecting=$expecting\n");
8972                     write_logfile_entry("Guessing angle operator here: $str\n");
8973                 }
8974             }
8975         }
8976
8977         # didn't find ending >
8978         else {
8979             if ( $expecting == TERM ) {
8980                 warning("No ending > for angle operator\n");
8981             }
8982         }
8983     }
8984     return ( $i, $type );
8985 } ## end sub find_angle_operator_termination
8986
8987 sub scan_number_do {
8988
8989     #  scan a number in any of the formats that Perl accepts
8990     #  Underbars (_) are allowed in decimal numbers.
8991     #  input parameters -
8992     #      $input_line  - the string to scan
8993     #      $i           - pre_token index to start scanning
8994     #    $rtoken_map    - reference to the pre_token map giving starting
8995     #                    character position in $input_line of token $i
8996     #  output parameters -
8997     #    $i            - last pre_token index of the number just scanned
8998     #    number        - the number (characters); or undef if not a number
8999
9000     my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
9001     my $pos_beg = $rtoken_map->[$i];
9002     my $pos;
9003     my $i_begin = $i;
9004     my $number  = undef;
9005     my $type    = $input_type;
9006
9007     my $first_char = substr( $input_line, $pos_beg, 1 );
9008
9009     # Look for bad starting characters; Shouldn't happen..
9010     if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
9011         if (DEVEL_MODE) {
9012             Fault(<<EOM);
9013 Program bug - scan_number given bad first character = '$first_char'
9014 EOM
9015         }
9016         return ( $i, $type, $number );
9017     }
9018
9019     # handle v-string without leading 'v' character ('Two Dot' rule)
9020     # (vstring.t)
9021     # Here is the format prior to including underscores:
9022     ## if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
9023     pos($input_line) = $pos_beg;
9024     if ( $input_line =~ /\G((\d[_\d]*)?\.[\d_]+(\.[\d_]+)+)/g ) {
9025         $pos = pos($input_line);
9026         my $numc = $pos - $pos_beg;
9027         $number = substr( $input_line, $pos_beg, $numc );
9028         $type   = 'v';
9029         report_v_string($number);
9030     }
9031
9032     # handle octal, hex, binary
9033     if ( !defined($number) ) {
9034         pos($input_line) = $pos_beg;
9035
9036         # Perl 5.22 added floating point literals, like '0x0.b17217f7d1cf78p0'
9037         # For reference, the format prior to hex floating point is:
9038         #   /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
9039         #             (hex)               (octal)   (binary)
9040         if (
9041             $input_line =~
9042
9043             /\G[+-]?0(                   # leading [signed] 0
9044
9045            # a hex float, i.e. '0x0.b17217f7d1cf78p0'
9046            ([xX][0-9a-fA-F_]*            # X and optional leading digits
9047            (\.([0-9a-fA-F][0-9a-fA-F_]*)?)?   # optional decimal and fraction
9048            [Pp][+-]?[0-9a-fA-F]          # REQUIRED exponent with digit
9049            [0-9a-fA-F_]*)                # optional Additional exponent digits
9050
9051            # or hex integer
9052            |([xX][0-9a-fA-F_]+)
9053
9054            # or octal fraction
9055            |([oO]?[0-7_]+          # string of octal digits
9056            (\.([0-7][0-7_]*)?)?    # optional decimal and fraction
9057            [Pp][+-]?[0-7]          # REQUIRED exponent, no underscore
9058            [0-7_]*)                # Additional exponent digits with underscores
9059
9060            # or octal integer
9061            |([oO]?[0-7_]+)         # string of octal digits
9062
9063            # or a binary float
9064            |([bB][01_]*            # 'b' with string of binary digits
9065            (\.([01][01_]*)?)?      # optional decimal and fraction
9066            [Pp][+-]?[01]           # Required exponent indicator, no underscore
9067            [01_]*)                 # additional exponent bits
9068
9069            # or binary integer
9070            |([bB][01_]+)           # 'b' with string of binary digits
9071
9072            )/gx
9073           )
9074         {
9075             $pos = pos($input_line);
9076             my $numc = $pos - $pos_beg;
9077             $number = substr( $input_line, $pos_beg, $numc );
9078             $type   = 'n';
9079         }
9080     }
9081
9082     # handle decimal
9083     if ( !defined($number) ) {
9084         pos($input_line) = $pos_beg;
9085
9086         if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
9087             $pos = pos($input_line);
9088
9089             # watch out for things like 0..40 which would give 0. by this;
9090             if (   ( substr( $input_line, $pos - 1, 1 ) eq '.' )
9091                 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
9092             {
9093                 $pos--;
9094             }
9095             my $numc = $pos - $pos_beg;
9096             $number = substr( $input_line, $pos_beg, $numc );
9097             $type   = 'n';
9098         }
9099     }
9100
9101     # filter out non-numbers like e + - . e2  .e3 +e6
9102     # the rule: at least one digit, and any 'e' must be preceded by a digit
9103     if (
9104         $number !~ /\d/    # no digits
9105         || (   $number =~ /^(.*)[eE]/
9106             && $1 !~ /\d/ )    # or no digits before the 'e'
9107       )
9108     {
9109         $number = undef;
9110         $type   = $input_type;
9111         return ( $i, $type, $number );
9112     }
9113
9114     # Found a number; now we must convert back from character position
9115     # to pre_token index. An error here implies user syntax error.
9116     # An example would be an invalid octal number like '009'.
9117     my $error;
9118     ( $i, $error ) =
9119       inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
9120     if ($error) { warning("Possibly invalid number\n") }
9121
9122     return ( $i, $type, $number );
9123 } ## end sub scan_number_do
9124
9125 sub inverse_pretoken_map {
9126
9127     # Starting with the current pre_token index $i, scan forward until
9128     # finding the index of the next pre_token whose position is $pos.
9129     my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
9130     my $error = 0;
9131
9132     while ( ++$i <= $max_token_index ) {
9133
9134         if ( $pos <= $rtoken_map->[$i] ) {
9135
9136             # Let the calling routine handle errors in which we do not
9137             # land on a pre-token boundary.  It can happen by running
9138             # perltidy on some non-perl scripts, for example.
9139             if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
9140             $i--;
9141             last;
9142         }
9143     }
9144     return ( $i, $error );
9145 } ## end sub inverse_pretoken_map
9146
9147 sub find_here_doc {
9148
9149     # find the target of a here document, if any
9150     # input parameters:
9151     #   $i - token index of the second < of <<
9152     #   ($i must be less than the last token index if this is called)
9153     # output parameters:
9154     #   $found_target = 0 didn't find target; =1 found target
9155     #   HERE_TARGET - the target string (may be empty string)
9156     #   $i - unchanged if not here doc,
9157     #    or index of the last token of the here target
9158     #   $saw_error - flag noting unbalanced quote on here target
9159     my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
9160     my $ibeg                 = $i;
9161     my $found_target         = 0;
9162     my $here_doc_target      = EMPTY_STRING;
9163     my $here_quote_character = EMPTY_STRING;
9164     my $saw_error            = 0;
9165     my ( $next_nonblank_token, $i_next_nonblank, $next_token );
9166     $next_token = $rtokens->[ $i + 1 ];
9167
9168     # perl allows a backslash before the target string (heredoc.t)
9169     my $backslash = 0;
9170     if ( $next_token eq '\\' ) {
9171         $backslash  = 1;
9172         $next_token = $rtokens->[ $i + 2 ];
9173     }
9174
9175     ( $next_nonblank_token, $i_next_nonblank ) =
9176       find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
9177
9178     if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
9179
9180         my $in_quote    = 1;
9181         my $quote_depth = 0;
9182         my $quote_pos   = 0;
9183         my $quoted_string;
9184
9185         (
9186             $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
9187             $quoted_string
9188           )
9189           = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
9190             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
9191
9192         if ($in_quote) {    # didn't find end of quote, so no target found
9193             $i = $ibeg;
9194             if ( $expecting == TERM ) {
9195                 warning(
9196 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
9197                 );
9198                 $saw_error = 1;
9199             }
9200         }
9201         else {              # found ending quote
9202             $found_target = 1;
9203
9204             my $tokj;
9205             foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
9206                 $tokj = $rtokens->[$j];
9207
9208                 # we have to remove any backslash before the quote character
9209                 # so that the here-doc-target exactly matches this string
9210                 next
9211                   if ( $tokj eq "\\"
9212                     && $j < $i - 1
9213                     && $rtokens->[ $j + 1 ] eq $here_quote_character );
9214                 $here_doc_target .= $tokj;
9215             }
9216         }
9217     }
9218
9219     elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
9220         $found_target = 1;
9221         write_logfile_entry(
9222             "found blank here-target after <<; suggest using \"\"\n");
9223         $i = $ibeg;
9224     }
9225     elsif ( $next_token =~ /^\w/ ) {    # simple bareword or integer after <<
9226
9227         my $here_doc_expected;
9228         if ( $expecting == UNKNOWN ) {
9229             $here_doc_expected = guess_if_here_doc($next_token);
9230         }
9231         else {
9232             $here_doc_expected = 1;
9233         }
9234
9235         if ($here_doc_expected) {
9236             $found_target    = 1;
9237             $here_doc_target = $next_token;
9238             $i               = $ibeg + 1;
9239         }
9240
9241     }
9242     else {
9243
9244         if ( $expecting == TERM ) {
9245             $found_target = 1;
9246             write_logfile_entry("Note: bare here-doc operator <<\n");
9247         }
9248         else {
9249             $i = $ibeg;
9250         }
9251     }
9252
9253     # patch to neglect any prepended backslash
9254     if ( $found_target && $backslash ) { $i++ }
9255
9256     return ( $found_target, $here_doc_target, $here_quote_character, $i,
9257         $saw_error );
9258 } ## end sub find_here_doc
9259
9260 sub do_quote {
9261
9262     # follow (or continue following) quoted string(s)
9263     # $in_quote return code:
9264     #   0 - ok, found end
9265     #   1 - still must find end of quote whose target is $quote_character
9266     #   2 - still looking for end of first of two quotes
9267     #
9268     # Returns updated strings:
9269     #  $quoted_string_1 = quoted string seen while in_quote=1
9270     #  $quoted_string_2 = quoted string seen while in_quote=2
9271     my (
9272         $i,               $in_quote,    $quote_character,
9273         $quote_pos,       $quote_depth, $quoted_string_1,
9274         $quoted_string_2, $rtokens,     $rtoken_map,
9275         $max_token_index
9276     ) = @_;
9277
9278     my $in_quote_starting = $in_quote;
9279
9280     my $quoted_string;
9281     if ( $in_quote == 2 ) {    # two quotes/quoted_string_1s to follow
9282         my $ibeg = $i;
9283         (
9284             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9285             $quoted_string
9286           )
9287           = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
9288             $quote_pos, $quote_depth, $max_token_index );
9289         $quoted_string_2 .= $quoted_string;
9290         if ( $in_quote == 1 ) {
9291             if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
9292             $quote_character = EMPTY_STRING;
9293         }
9294         else {
9295             $quoted_string_2 .= "\n";
9296         }
9297     }
9298
9299     if ( $in_quote == 1 ) {    # one (more) quote to follow
9300         my $ibeg = $i;
9301         (
9302             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9303             $quoted_string
9304           )
9305           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
9306             $quote_pos, $quote_depth, $max_token_index );
9307         $quoted_string_1 .= $quoted_string;
9308         if ( $in_quote == 1 ) {
9309             $quoted_string_1 .= "\n";
9310         }
9311     }
9312     return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
9313         $quoted_string_1, $quoted_string_2 );
9314 } ## end sub do_quote
9315
9316 sub follow_quoted_string {
9317
9318     # scan for a specific token, skipping escaped characters
9319     # if the quote character is blank, use the first non-blank character
9320     # input parameters:
9321     #   $rtokens = reference to the array of tokens
9322     #   $i = the token index of the first character to search
9323     #   $in_quote = number of quoted strings being followed
9324     #   $beginning_tok = the starting quote character
9325     #   $quote_pos = index to check next for alphanumeric delimiter
9326     # output parameters:
9327     #   $i = the token index of the ending quote character
9328     #   $in_quote = decremented if found end, unchanged if not
9329     #   $beginning_tok = the starting quote character
9330     #   $quote_pos = index to check next for alphanumeric delimiter
9331     #   $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
9332     #   $quoted_string = the text of the quote (without quotation tokens)
9333     my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
9334         $max_token_index )
9335       = @_;
9336     my ( $tok, $end_tok );
9337     my $i             = $i_beg - 1;
9338     my $quoted_string = EMPTY_STRING;
9339
9340     0 && do {
9341         print STDOUT
9342 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
9343     };
9344
9345     # get the corresponding end token
9346     if ( $beginning_tok !~ /^\s*$/ ) {
9347         $end_tok = matching_end_token($beginning_tok);
9348     }
9349
9350     # a blank token means we must find and use the first non-blank one
9351     else {
9352         my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
9353
9354         while ( $i < $max_token_index ) {
9355             $tok = $rtokens->[ ++$i ];
9356
9357             if ( $tok !~ /^\s*$/ ) {
9358
9359                 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
9360                     $i = $max_token_index;
9361                 }
9362                 else {
9363
9364                     if ( length($tok) > 1 ) {
9365                         if ( $quote_pos <= 0 ) { $quote_pos = 1 }
9366                         $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
9367                     }
9368                     else {
9369                         $beginning_tok = $tok;
9370                         $quote_pos     = 0;
9371                     }
9372                     $end_tok     = matching_end_token($beginning_tok);
9373                     $quote_depth = 1;
9374                     last;
9375                 }
9376             }
9377             else {
9378                 $allow_quote_comments = 1;
9379             }
9380         }
9381     }
9382
9383     # There are two different loops which search for the ending quote
9384     # character.  In the rare case of an alphanumeric quote delimiter, we
9385     # have to look through alphanumeric tokens character-by-character, since
9386     # the pre-tokenization process combines multiple alphanumeric
9387     # characters, whereas for a non-alphanumeric delimiter, only tokens of
9388     # length 1 can match.
9389
9390     ###################################################################
9391     # Case 1 (rare): loop for case of alphanumeric quote delimiter..
9392     # "quote_pos" is the position the current word to begin searching
9393     ###################################################################
9394     if ( $beginning_tok =~ /\w/ ) {
9395
9396         # Note this because it is not recommended practice except
9397         # for obfuscated perl contests
9398         if ( $in_quote == 1 ) {
9399             write_logfile_entry(
9400                 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
9401         }
9402
9403         # Note: changed < to <= here to fix c109. Relying on extra end blanks.
9404         while ( $i <= $max_token_index ) {
9405
9406             if ( $quote_pos == 0 || ( $i < 0 ) ) {
9407                 $tok = $rtokens->[ ++$i ];
9408
9409                 if ( $tok eq '\\' ) {
9410
9411                     # retain backslash unless it hides the end token
9412                     $quoted_string .= $tok
9413                       unless $rtokens->[ $i + 1 ] eq $end_tok;
9414                     $quote_pos++;
9415                     last if ( $i >= $max_token_index );
9416                     $tok = $rtokens->[ ++$i ];
9417                 }
9418             }
9419             my $old_pos = $quote_pos;
9420
9421             unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
9422             {
9423
9424             }
9425             $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
9426
9427             if ( $quote_pos > 0 ) {
9428
9429                 $quoted_string .=
9430                   substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
9431
9432                 # NOTE: any quote modifiers will be at the end of '$tok'. If we
9433                 # wanted to check them, this is the place to get them.  But
9434                 # this quote form is rarely used in practice, so it isn't
9435                 # worthwhile.
9436
9437                 $quote_depth--;
9438
9439                 if ( $quote_depth == 0 ) {
9440                     $in_quote--;
9441                     last;
9442                 }
9443             }
9444             else {
9445                 if ( $old_pos <= length($tok) ) {
9446                     $quoted_string .= substr( $tok, $old_pos );
9447                 }
9448             }
9449         }
9450     }
9451
9452     ########################################################################
9453     # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
9454     ########################################################################
9455     else {
9456
9457         while ( $i < $max_token_index ) {
9458             $tok = $rtokens->[ ++$i ];
9459
9460             if ( $tok eq $end_tok ) {
9461                 $quote_depth--;
9462
9463                 if ( $quote_depth == 0 ) {
9464                     $in_quote--;
9465                     last;
9466                 }
9467             }
9468             elsif ( $tok eq $beginning_tok ) {
9469                 $quote_depth++;
9470             }
9471             elsif ( $tok eq '\\' ) {
9472
9473                 # retain backslash unless it hides the beginning or end token
9474                 $tok = $rtokens->[ ++$i ];
9475                 $quoted_string .= '\\'
9476                   unless ( $tok eq $end_tok || $tok eq $beginning_tok );
9477             }
9478             $quoted_string .= $tok;
9479         }
9480     }
9481     if ( $i > $max_token_index ) { $i = $max_token_index }
9482     return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
9483         $quoted_string );
9484 } ## end sub follow_quoted_string
9485
9486 sub indicate_error {
9487     my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
9488     interrupt_logfile();
9489     warning($msg);
9490     write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
9491     resume_logfile();
9492     return;
9493 }
9494
9495 sub write_error_indicator_pair {
9496     my ( $line_number, $input_line, $pos, $carrat ) = @_;
9497     my ( $offset, $numbered_line, $underline ) =
9498       make_numbered_line( $line_number, $input_line, $pos );
9499     $underline = write_on_underline( $underline, $pos - $offset, $carrat );
9500     warning( $numbered_line . "\n" );
9501     $underline =~ s/\s*$//;
9502     warning( $underline . "\n" );
9503     return;
9504 } ## end sub write_error_indicator_pair
9505
9506 sub make_numbered_line {
9507
9508     #  Given an input line, its line number, and a character position of
9509     #  interest, create a string not longer than 80 characters of the form
9510     #     $lineno: sub_string
9511     #  such that the sub_string of $str contains the position of interest
9512     #
9513     #  Here is an example of what we want, in this case we add trailing
9514     #  '...' because the line is long.
9515     #
9516     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9517     #
9518     #  Here is another example, this time in which we used leading '...'
9519     #  because of excessive length:
9520     #
9521     # 2: ... er of the World Wide Web Consortium's
9522     #
9523     #  input parameters are:
9524     #   $lineno = line number
9525     #   $str = the text of the line
9526     #   $pos = position of interest (the error) : 0 = first character
9527     #
9528     #   We return :
9529     #     - $offset = an offset which corrects the position in case we only
9530     #       display part of a line, such that $pos-$offset is the effective
9531     #       position from the start of the displayed line.
9532     #     - $numbered_line = the numbered line as above,
9533     #     - $underline = a blank 'underline' which is all spaces with the same
9534     #       number of characters as the numbered line.
9535
9536     my ( $lineno, $str, $pos ) = @_;
9537     my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
9538     my $excess = length($str) - $offset - 68;
9539     my $numc   = ( $excess > 0 ) ? 68 : undef;
9540
9541     if ( defined($numc) ) {
9542         if ( $offset == 0 ) {
9543             $str = substr( $str, $offset, $numc - 4 ) . " ...";
9544         }
9545         else {
9546             $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
9547         }
9548     }
9549     else {
9550
9551         if ( $offset == 0 ) {
9552         }
9553         else {
9554             $str = "... " . substr( $str, $offset + 4 );
9555         }
9556     }
9557
9558     my $numbered_line = sprintf( "%d: ", $lineno );
9559     $offset -= length($numbered_line);
9560     $numbered_line .= $str;
9561     my $underline = SPACE x length($numbered_line);
9562     return ( $offset, $numbered_line, $underline );
9563 } ## end sub make_numbered_line
9564
9565 sub write_on_underline {
9566
9567     # The "underline" is a string that shows where an error is; it starts
9568     # out as a string of blanks with the same length as the numbered line of
9569     # code above it, and we have to add marking to show where an error is.
9570     # In the example below, we want to write the string '--^' just below
9571     # the line of bad code:
9572     #
9573     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
9574     #                 ---^
9575     # We are given the current underline string, plus a position and a
9576     # string to write on it.
9577     #
9578     # In the above example, there will be 2 calls to do this:
9579     # First call:  $pos=19, pos_chr=^
9580     # Second call: $pos=16, pos_chr=---
9581     #
9582     # This is a trivial thing to do with substr, but there is some
9583     # checking to do.
9584
9585     my ( $underline, $pos, $pos_chr ) = @_;
9586
9587     # check for error..shouldn't happen
9588     unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
9589         return $underline;
9590     }
9591     my $excess = length($pos_chr) + $pos - length($underline);
9592     if ( $excess > 0 ) {
9593         $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
9594     }
9595     substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
9596     return ($underline);
9597 } ## end sub write_on_underline
9598
9599 sub pre_tokenize {
9600
9601     # Break a string, $str, into a sequence of preliminary tokens.  We
9602     # are interested in these types of tokens:
9603     #   words       (type='w'),            example: 'max_tokens_wanted'
9604     #   digits      (type = 'd'),          example: '0755'
9605     #   whitespace  (type = 'b'),          example: '   '
9606     #   any other single character (i.e. punct; type = the character itself).
9607     # We cannot do better than this yet because we might be in a quoted
9608     # string or pattern.  Caller sets $max_tokens_wanted to 0 to get all
9609     # tokens.
9610
9611     # An advantage of doing this pre-tokenization step is that it keeps almost
9612     # all of the regex work highly localized.  A disadvantage is that in some
9613     # very rare instances we will have to go back and split a pre-token.
9614     my ( $str, $max_tokens_wanted ) = @_;
9615
9616     # we return references to these 3 arrays:
9617     my @tokens    = ();     # array of the tokens themselves
9618     my @token_map = (0);    # string position of start of each token
9619     my @type      = ();     # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
9620
9621     do {
9622
9623         # whitespace
9624         if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
9625
9626         # numbers
9627         # note that this must come before words!
9628         elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
9629
9630         # words
9631         elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
9632
9633         # single-character punctuation
9634         elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
9635
9636         # that's all..
9637         else {
9638             return ( \@tokens, \@token_map, \@type );
9639         }
9640
9641         push @tokens,    $1;
9642         push @token_map, pos($str);
9643
9644     } while ( --$max_tokens_wanted != 0 );
9645
9646     return ( \@tokens, \@token_map, \@type );
9647 } ## end sub pre_tokenize
9648
9649 sub show_tokens {
9650
9651     # this is an old debug routine
9652     # not called, but saved for reference
9653     my ( $rtokens, $rtoken_map ) = @_;
9654     my $num = scalar( @{$rtokens} );
9655
9656     foreach my $i ( 0 .. $num - 1 ) {
9657         my $len = length( $rtokens->[$i] );
9658         print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
9659     }
9660     return;
9661 } ## end sub show_tokens
9662
9663 {    ## closure for sub matching end token
9664     my %matching_end_token;
9665
9666     BEGIN {
9667         %matching_end_token = (
9668             '{' => '}',
9669             '(' => ')',
9670             '[' => ']',
9671             '<' => '>',
9672         );
9673     }
9674
9675     sub matching_end_token {
9676
9677         # return closing character for a pattern
9678         my $beginning_token = shift;
9679         if ( $matching_end_token{$beginning_token} ) {
9680             return $matching_end_token{$beginning_token};
9681         }
9682         return ($beginning_token);
9683     }
9684 }
9685
9686 sub dump_token_types {
9687     my ( $class, $fh ) = @_;
9688
9689     # This should be the latest list of token types in use
9690     # adding NEW_TOKENS: add a comment here
9691     $fh->print(<<'END_OF_LIST');
9692
9693 Here is a list of the token types currently used for lines of type 'CODE'.
9694 For the following tokens, the "type" of a token is just the token itself.
9695
9696 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
9697 ( ) <= >= == =~ !~ != ++ -- /= x=
9698 ... **= <<= >>= &&= ||= //= <=>
9699 , + - / * | % ! x ~ = \ ? : . < > ^ &
9700
9701 The following additional token types are defined:
9702
9703  type    meaning
9704     b    blank (white space)
9705     {    indent: opening structural curly brace or square bracket or paren
9706          (code block, anonymous hash reference, or anonymous array reference)
9707     }    outdent: right structural curly brace or square bracket or paren
9708     [    left non-structural square bracket (enclosing an array index)
9709     ]    right non-structural square bracket
9710     (    left non-structural paren (all but a list right of an =)
9711     )    right non-structural paren
9712     L    left non-structural curly brace (enclosing a key)
9713     R    right non-structural curly brace
9714     ;    terminal semicolon
9715     f    indicates a semicolon in a "for" statement
9716     h    here_doc operator <<
9717     #    a comment
9718     Q    indicates a quote or pattern
9719     q    indicates a qw quote block
9720     k    a perl keyword
9721     C    user-defined constant or constant function (with void prototype = ())
9722     U    user-defined function taking parameters
9723     G    user-defined function taking block parameter (like grep/map/eval)
9724     M    (unused, but reserved for subroutine definition name)
9725     P    (unused, but -html uses it to label pod text)
9726     t    type indicater such as %,$,@,*,&,sub
9727     w    bare word (perhaps a subroutine call)
9728     i    identifier of some type (with leading %, $, @, *, &, sub, -> )
9729     n    a number
9730     v    a v-string
9731     F    a file test operator (like -e)
9732     Y    File handle
9733     Z    identifier in indirect object slot: may be file handle, object
9734     J    LABEL:  code block label
9735     j    LABEL after next, last, redo, goto
9736     p    unary +
9737     m    unary -
9738     pp   pre-increment operator ++
9739     mm   pre-decrement operator --
9740     A    : used as attribute separator
9741
9742     Here are the '_line_type' codes used internally:
9743     SYSTEM         - system-specific code before hash-bang line
9744     CODE           - line of perl code (including comments)
9745     POD_START      - line starting pod, such as '=head'
9746     POD            - pod documentation text
9747     POD_END        - last line of pod section, '=cut'
9748     HERE           - text of here-document
9749     HERE_END       - last line of here-doc (target word)
9750     FORMAT         - format section
9751     FORMAT_END     - last line of format section, '.'
9752     SKIP           - code skipping section
9753     SKIP_END       - last line of code skipping section, '#>>V'
9754     DATA_START     - __DATA__ line
9755     DATA           - unidentified text following __DATA__
9756     END_START      - __END__ line
9757     END            - unidentified text following __END__
9758     ERROR          - we are in big trouble, probably not a perl script
9759 END_OF_LIST
9760
9761     return;
9762 } ## end sub dump_token_types
9763
9764 BEGIN {
9765
9766     # These names are used in error messages
9767     @opening_brace_names = qw# '{' '[' '(' '?' #;
9768     @closing_brace_names = qw# '}' ']' ')' ':' #;
9769
9770     my @q;
9771
9772     my @digraphs = qw(
9773       .. :: << >> ** && || // -> => += -= .= %= &= |= ^= *= <>
9774       <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
9775     );
9776     @is_digraph{@digraphs} = (1) x scalar(@digraphs);
9777
9778     @q = qw(
9779       . : < > * & | / - = + -  %  ^ !  x ~
9780     );
9781     @can_start_digraph{@q} = (1) x scalar(@q);
9782
9783     my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
9784     @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
9785
9786     my @tetragraphs = qw( <<>> );
9787     @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
9788
9789     # make a hash of all valid token types for self-checking the tokenizer
9790     # (adding NEW_TOKENS : select a new character and add to this list)
9791     my @valid_token_types = qw#
9792       A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
9793       { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
9794       #;
9795     push( @valid_token_types, @digraphs );
9796     push( @valid_token_types, @trigraphs );
9797     push( @valid_token_types, @tetragraphs );
9798     push( @valid_token_types, ( '#', ',', 'CORE::' ) );
9799     @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
9800
9801     # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
9802     my @file_test_operators =
9803       qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
9804     @is_file_test_operator{@file_test_operators} =
9805       (1) x scalar(@file_test_operators);
9806
9807     # these functions have prototypes of the form (&), so when they are
9808     # followed by a block, that block MAY BE followed by an operator.
9809     # Smartmatch operator ~~ may be followed by anonymous hash or array ref
9810     @q = qw( do eval );
9811     @is_block_operator{@q} = (1) x scalar(@q);
9812
9813     # these functions allow an identifier in the indirect object slot
9814     @q = qw( print printf sort exec system say);
9815     @is_indirect_object_taker{@q} = (1) x scalar(@q);
9816
9817     # These tokens may precede a code block
9818     # patched for SWITCH/CASE/CATCH.  Actually these could be removed
9819     # now and we could let the extended-syntax coding handle them.
9820     # Added 'default' for Switch::Plain.
9821     @q =
9822       qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
9823       unless do while until eval for foreach map grep sort
9824       switch case given when default catch try finally);
9825     @is_code_block_token{@q} = (1) x scalar(@q);
9826
9827     # Note: this hash was formerly named '%is_not_zero_continuation_block_type'
9828     # to contrast it with the block types in '%is_zero_continuation_block_type'
9829     @q = qw( sort map grep eval do );
9830     @is_sort_map_grep_eval_do{@q} = (1) x scalar(@q);
9831
9832     @q = qw( sort map grep );
9833     @is_sort_map_grep{@q} = (1) x scalar(@q);
9834
9835     %is_grep_alias = ();
9836
9837     # I'll build the list of keywords incrementally
9838     my @Keywords = ();
9839
9840     # keywords and tokens after which a value or pattern is expected,
9841     # but not an operator.  In other words, these should consume terms
9842     # to their right, or at least they are not expected to be followed
9843     # immediately by operators.
9844     my @value_requestor = qw(
9845       AUTOLOAD
9846       BEGIN
9847       CHECK
9848       DESTROY
9849       END
9850       EQ
9851       GE
9852       GT
9853       INIT
9854       LE
9855       LT
9856       NE
9857       UNITCHECK
9858       abs
9859       accept
9860       alarm
9861       and
9862       atan2
9863       bind
9864       binmode
9865       bless
9866       break
9867       caller
9868       chdir
9869       chmod
9870       chomp
9871       chop
9872       chown
9873       chr
9874       chroot
9875       close
9876       closedir
9877       cmp
9878       connect
9879       continue
9880       cos
9881       crypt
9882       dbmclose
9883       dbmopen
9884       defined
9885       delete
9886       die
9887       dump
9888       each
9889       else
9890       elsif
9891       eof
9892       eq
9893       evalbytes
9894       exec
9895       exists
9896       exit
9897       exp
9898       fc
9899       fcntl
9900       fileno
9901       flock
9902       for
9903       foreach
9904       formline
9905       ge
9906       getc
9907       getgrgid
9908       getgrnam
9909       gethostbyaddr
9910       gethostbyname
9911       getnetbyaddr
9912       getnetbyname
9913       getpeername
9914       getpgrp
9915       getpriority
9916       getprotobyname
9917       getprotobynumber
9918       getpwnam
9919       getpwuid
9920       getservbyname
9921       getservbyport
9922       getsockname
9923       getsockopt
9924       glob
9925       gmtime
9926       goto
9927       grep
9928       gt
9929       hex
9930       if
9931       index
9932       int
9933       ioctl
9934       join
9935       keys
9936       kill
9937       last
9938       lc
9939       lcfirst
9940       le
9941       length
9942       link
9943       listen
9944       local
9945       localtime
9946       lock
9947       log
9948       lstat
9949       lt
9950       map
9951       mkdir
9952       msgctl
9953       msgget
9954       msgrcv
9955       msgsnd
9956       my
9957       ne
9958       next
9959       no
9960       not
9961       oct
9962       open
9963       opendir
9964       or
9965       ord
9966       our
9967       pack
9968       pipe
9969       pop
9970       pos
9971       print
9972       printf
9973       prototype
9974       push
9975       quotemeta
9976       rand
9977       read
9978       readdir
9979       readlink
9980       readline
9981       readpipe
9982       recv
9983       redo
9984       ref
9985       rename
9986       require
9987       reset
9988       return
9989       reverse
9990       rewinddir
9991       rindex
9992       rmdir
9993       scalar
9994       seek
9995       seekdir
9996       select
9997       semctl
9998       semget
9999       semop
10000       send
10001       sethostent
10002       setnetent
10003       setpgrp
10004       setpriority
10005       setprotoent
10006       setservent
10007       setsockopt
10008       shift
10009       shmctl
10010       shmget
10011       shmread
10012       shmwrite
10013       shutdown
10014       sin
10015       sleep
10016       socket
10017       socketpair
10018       sort
10019       splice
10020       split
10021       sprintf
10022       sqrt
10023       srand
10024       stat
10025       state
10026       study
10027       substr
10028       symlink
10029       syscall
10030       sysopen
10031       sysread
10032       sysseek
10033       system
10034       syswrite
10035       tell
10036       telldir
10037       tie
10038       tied
10039       truncate
10040       uc
10041       ucfirst
10042       umask
10043       undef
10044       unless
10045       unlink
10046       unpack
10047       unshift
10048       untie
10049       until
10050       use
10051       utime
10052       values
10053       vec
10054       waitpid
10055       warn
10056       while
10057       write
10058       xor
10059
10060       switch
10061       case
10062       default
10063       given
10064       when
10065       err
10066       say
10067       isa
10068
10069       catch
10070     );
10071
10072     # patched above for SWITCH/CASE given/when err say
10073     # 'err' is a fairly safe addition.
10074     # Added 'default' for Switch::Plain. Note that we could also have
10075     # a separate set of keywords to include if we see 'use Switch::Plain'
10076     push( @Keywords, @value_requestor );
10077
10078     # These are treated the same but are not keywords:
10079     my @extra_vr = qw(
10080       constant
10081       vars
10082     );
10083     push( @value_requestor, @extra_vr );
10084
10085     @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
10086
10087     # this list contains keywords which do not look for arguments,
10088     # so that they might be followed by an operator, or at least
10089     # not a term.
10090     my @operator_requestor = qw(
10091       endgrent
10092       endhostent
10093       endnetent
10094       endprotoent
10095       endpwent
10096       endservent
10097       fork
10098       getgrent
10099       gethostent
10100       getlogin
10101       getnetent
10102       getppid
10103       getprotoent
10104       getpwent
10105       getservent
10106       setgrent
10107       setpwent
10108       time
10109       times
10110       wait
10111       wantarray
10112     );
10113
10114     push( @Keywords, @operator_requestor );
10115
10116     # These are treated the same but are not considered keywords:
10117     my @extra_or = qw(
10118       STDERR
10119       STDIN
10120       STDOUT
10121     );
10122
10123     push( @operator_requestor, @extra_or );
10124
10125     @expecting_operator_token{@operator_requestor} =
10126       (1) x scalar(@operator_requestor);
10127
10128     # these token TYPES expect trailing operator but not a term
10129     # note: ++ and -- are post-increment and decrement, 'C' = constant
10130     my @operator_requestor_types = qw( ++ -- C <> q );
10131     @expecting_operator_types{@operator_requestor_types} =
10132       (1) x scalar(@operator_requestor_types);
10133
10134     # these token TYPES consume values (terms)
10135     # note: pp and mm are pre-increment and decrement
10136     # f=semicolon in for,  F=file test operator
10137     my @value_requestor_type = qw#
10138       L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
10139       **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
10140       <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~ <<~
10141       f F pp mm Y p m U J G j >> << ^ t
10142       ~. ^. |. &. ^.= |.= &.=
10143       #;
10144     push( @value_requestor_type, ',' )
10145       ;    # (perl doesn't like a ',' in a qw block)
10146     @expecting_term_types{@value_requestor_type} =
10147       (1) x scalar(@value_requestor_type);
10148
10149     # Note: the following valid token types are not assigned here to
10150     # hashes requesting to be followed by values or terms, but are
10151     # instead currently hard-coded into sub operator_expected:
10152     # ) -> :: Q R Z ] b h i k n v w } #
10153
10154     # For simple syntax checking, it is nice to have a list of operators which
10155     # will really be unhappy if not followed by a term.  This includes most
10156     # of the above...
10157     %really_want_term = %expecting_term_types;
10158
10159     # with these exceptions...
10160     delete $really_want_term{'U'}; # user sub, depends on prototype
10161     delete $really_want_term{'F'}; # file test works on $_ if no following term
10162     delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
10163                                    # let perl do it
10164     @q = qw(q qq qx qr s y tr m);
10165     @is_q_qq_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10166
10167     # Note added 'qw' here
10168     @q = qw(q qq qw qx qr s y tr m);
10169     @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
10170
10171     @q = qw(package);
10172     @is_package{@q} = (1) x scalar(@q);
10173
10174     @q = qw( ? : );
10175     push @q, ',';
10176     @is_comma_question_colon{@q} = (1) x scalar(@q);
10177
10178     @q = qw( if elsif unless );
10179     @is_if_elsif_unless{@q} = (1) x scalar(@q);
10180
10181     @q = qw( ; t );
10182     @is_semicolon_or_t{@q} = (1) x scalar(@q);
10183
10184     @q = qw( if elsif unless case when );
10185     @is_if_elsif_unless_case_when{@q} = (1) x scalar(@q);
10186
10187     # Hash of other possible line endings which may occur.
10188     # Keep these coordinated with the regex where this is used.
10189     # Note: chr(13) = chr(015)="\r".
10190     @q = ( chr(13), chr(29), chr(26) );
10191     @other_line_endings{@q} = (1) x scalar(@q);
10192
10193     # These keywords are handled specially in the tokenizer code:
10194     my @special_keywords = qw(
10195       do
10196       eval
10197       format
10198       m
10199       package
10200       q
10201       qq
10202       qr
10203       qw
10204       qx
10205       s
10206       sub
10207       tr
10208       y
10209     );
10210     push( @Keywords, @special_keywords );
10211
10212     # Keywords after which list formatting may be used
10213     # WARNING: do not include |map|grep|eval or perl may die on
10214     # syntax errors (map1.t).
10215     my @keyword_taking_list = qw(
10216       and
10217       chmod
10218       chomp
10219       chop
10220       chown
10221       dbmopen
10222       die
10223       elsif
10224       exec
10225       fcntl
10226       for
10227       foreach
10228       formline
10229       getsockopt
10230       if
10231       index
10232       ioctl
10233       join
10234       kill
10235       local
10236       msgctl
10237       msgrcv
10238       msgsnd
10239       my
10240       open
10241       or
10242       our
10243       pack
10244       print
10245       printf
10246       push
10247       read
10248       readpipe
10249       recv
10250       return
10251       reverse
10252       rindex
10253       seek
10254       select
10255       semctl
10256       semget
10257       send
10258       setpriority
10259       setsockopt
10260       shmctl
10261       shmget
10262       shmread
10263       shmwrite
10264       socket
10265       socketpair
10266       sort
10267       splice
10268       split
10269       sprintf
10270       state
10271       substr
10272       syscall
10273       sysopen
10274       sysread
10275       sysseek
10276       system
10277       syswrite
10278       tie
10279       unless
10280       unlink
10281       unpack
10282       unshift
10283       until
10284       vec
10285       warn
10286       while
10287       given
10288       when
10289     );
10290     @is_keyword_taking_list{@keyword_taking_list} =
10291       (1) x scalar(@keyword_taking_list);
10292
10293     # perl functions which may be unary operators.
10294
10295     # This list is used to decide if a pattern delimited by slashes, /pattern/,
10296     # can follow one of these keywords.
10297     @q = qw(
10298       chomp eof eval fc lc pop shift uc undef
10299     );
10300
10301     @is_keyword_rejecting_slash_as_pattern_delimiter{@q} =
10302       (1) x scalar(@q);
10303
10304     # These are keywords for which an arg may optionally be omitted.  They are
10305     # currently only used to disambiguate a ? used as a ternary from one used
10306     # as a (deprecated) pattern delimiter.  In the future, they might be used
10307     # to give a warning about ambiguous syntax before a /.
10308     # Note: split has been omitted (see not below).
10309     my @keywords_taking_optional_arg = qw(
10310       abs
10311       alarm
10312       caller
10313       chdir
10314       chomp
10315       chop
10316       chr
10317       chroot
10318       close
10319       cos
10320       defined
10321       die
10322       eof
10323       eval
10324       evalbytes
10325       exit
10326       exp
10327       fc
10328       getc
10329       glob
10330       gmtime
10331       hex
10332       int
10333       last
10334       lc
10335       lcfirst
10336       length
10337       localtime
10338       log
10339       lstat
10340       mkdir
10341       next
10342       oct
10343       ord
10344       pop
10345       pos
10346       print
10347       printf
10348       prototype
10349       quotemeta
10350       rand
10351       readline
10352       readlink
10353       readpipe
10354       redo
10355       ref
10356       require
10357       reset
10358       reverse
10359       rmdir
10360       say
10361       select
10362       shift
10363       sin
10364       sleep
10365       sqrt
10366       srand
10367       stat
10368       study
10369       tell
10370       uc
10371       ucfirst
10372       umask
10373       undef
10374       unlink
10375       warn
10376       write
10377     );
10378     @is_keyword_taking_optional_arg{@keywords_taking_optional_arg} =
10379       (1) x scalar(@keywords_taking_optional_arg);
10380
10381     # This list is used to decide if a pattern delimited by question marks,
10382     # ?pattern?, can follow one of these keywords.  Note that from perl 5.22
10383     # on, a ?pattern? is not recognized, so we can be much more strict than
10384     # with a /pattern/. Note that 'split' is not in this list. In current
10385     # versions of perl a question following split must be a ternary, but
10386     # in older versions it could be a pattern.  The guessing algorithm will
10387     # decide.  We are combining two lists here to simplify the test.
10388     @q = ( @keywords_taking_optional_arg, @operator_requestor );
10389     @is_keyword_rejecting_question_as_pattern_delimiter{@q} =
10390       (1) x scalar(@q);
10391
10392     # These are not used in any way yet
10393     #    my @unused_keywords = qw(
10394     #     __FILE__
10395     #     __LINE__
10396     #     __PACKAGE__
10397     #     );
10398
10399     #  The list of keywords was originally extracted from function 'keyword' in
10400     #  perl file toke.c version 5.005.03, using this utility, plus a
10401     #  little editing: (file getkwd.pl):
10402     #  while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
10403     #  Add 'get' prefix where necessary, then split into the above lists.
10404     #  This list should be updated as necessary.
10405     #  The list should not contain these special variables:
10406     #  ARGV DATA ENV SIG STDERR STDIN STDOUT
10407     #  __DATA__ __END__
10408
10409     @is_keyword{@Keywords} = (1) x scalar(@Keywords);
10410 }
10411 1;