lib/Perl/Tidy/Tokenizer.pm

   1 ########################################################################
   2 #
   3 # the Perl::Tidy::Tokenizer package is essentially a filter which
   4 # reads lines of perl source code from a source object and provides
   5 # corresponding tokenized lines through its get_line() method.  Lines
   6 # flow from the source_object to the caller like this:
   7 #
   8 # source_object --> LineBuffer_object --> Tokenizer -->  calling routine
   9 #   get_line()         get_line()           get_line()     line_of_tokens
  10 #
  11 # The source object can be any object with a get_line() method which
  12 # supplies one line (a character string) perl call.
  13 # The LineBuffer object is created by the Tokenizer.
  14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
  15 # containing one tokenized line for each call to its get_line() method.
  16 #
  17 # WARNING: This is not a real class yet.  Only one tokenizer my be used.
  18 #
  19 ########################################################################
  20
  21 package Perl::Tidy::Tokenizer;
  22 use strict;
  23 use warnings;
  24 our $VERSION = '20190601';
  25
  26 use Perl::Tidy::LineBuffer;
  27
  28 BEGIN {
  29
  30     # Caution: these debug flags produce a lot of output
  31     # They should all be 0 except when debugging small scripts
  32
  33     use constant TOKENIZER_DEBUG_FLAG_EXPECT   => 0;
  34     use constant TOKENIZER_DEBUG_FLAG_NSCAN    => 0;
  35     use constant TOKENIZER_DEBUG_FLAG_QUOTE    => 0;
  36     use constant TOKENIZER_DEBUG_FLAG_SCAN_ID  => 0;
  37     use constant TOKENIZER_DEBUG_FLAG_TOKENIZE => 0;
  38
  39     my $debug_warning = sub {
  40         print STDOUT "TOKENIZER_DEBUGGING with key $_[0]\n";
  41     };
  42
  43     TOKENIZER_DEBUG_FLAG_EXPECT   && $debug_warning->('EXPECT');
  44     TOKENIZER_DEBUG_FLAG_NSCAN    && $debug_warning->('NSCAN');
  45     TOKENIZER_DEBUG_FLAG_QUOTE    && $debug_warning->('QUOTE');
  46     TOKENIZER_DEBUG_FLAG_SCAN_ID  && $debug_warning->('SCAN_ID');
  47     TOKENIZER_DEBUG_FLAG_TOKENIZE && $debug_warning->('TOKENIZE');
  48
  49 }
  50
  51 use Carp;
  52
  53 # PACKAGE VARIABLES for processing an entire FILE.
  54 use vars qw{
  55   $tokenizer_self
  56
  57   $last_nonblank_token
  58   $last_nonblank_type
  59   $last_nonblank_block_type
  60   $statement_type
  61   $in_attribute_list
  62   $current_package
  63   $context
  64
  65   %is_constant
  66   %is_user_function
  67   %user_function_prototype
  68   %is_block_function
  69   %is_block_list_function
  70   %saw_function_definition
  71
  72   $brace_depth
  73   $paren_depth
  74   $square_bracket_depth
  75
  76   @current_depth
  77   @total_depth
  78   $total_depth
  79   @nesting_sequence_number
  80   @current_sequence_number
  81   @paren_type
  82   @paren_semicolon_count
  83   @paren_structural_type
  84   @brace_type
  85   @brace_structural_type
  86   @brace_context
  87   @brace_package
  88   @square_bracket_type
  89   @square_bracket_structural_type
  90   @depth_array
  91   @nested_ternary_flag
  92   @nested_statement_type
  93   @starting_line_of_current_depth
  94 };
  95
  96 # GLOBAL CONSTANTS for routines in this package
  97 use vars qw{
  98   %is_indirect_object_taker
  99   %is_block_operator
 100   %expecting_operator_token
 101   %expecting_operator_types
 102   %expecting_term_types
 103   %expecting_term_token
 104   %is_digraph
 105   %is_file_test_operator
 106   %is_trigraph
 107   %is_tetragraph
 108   %is_valid_token_type
 109   %is_keyword
 110   %is_code_block_token
 111   %really_want_term
 112   @opening_brace_names
 113   @closing_brace_names
 114   %is_keyword_taking_list
 115   %is_keyword_taking_optional_args
 116   %is_q_qq_qw_qx_qr_s_y_tr_m
 117 };
 118
 119 # possible values of operator_expected()
 120 use constant TERM     => -1;
 121 use constant UNKNOWN  => 0;
 122 use constant OPERATOR => 1;
 123
 124 # possible values of context
 125 use constant SCALAR_CONTEXT  => -1;
 126 use constant UNKNOWN_CONTEXT => 0;
 127 use constant LIST_CONTEXT    => 1;
 128
 129 # Maximum number of little messages; probably need not be changed.
 130 use constant MAX_NAG_MESSAGES => 6;
 131
 132 {
 133
 134     # methods to count instances
 135     my $_count = 0;
 136     sub get_count        { return $_count; }
 137     sub _increment_count { return ++$_count }
 138     sub _decrement_count { return --$_count }
 139 }
 140
 141 sub DESTROY {
 142     my $self = shift;
 143     $self->_decrement_count();
 144     return;
 145 }
 146
 147 sub new {
 148
 149     my ( $class, @args ) = @_;
 150
 151     # Note: 'tabs' and 'indent_columns' are temporary and should be
 152     # removed asap
 153     my %defaults = (
 154         source_object        => undef,
 155         debugger_object      => undef,
 156         diagnostics_object   => undef,
 157         logger_object        => undef,
 158         starting_level       => undef,
 159         indent_columns       => 4,
 160         tabsize              => 8,
 161         look_for_hash_bang   => 0,
 162         trim_qw              => 1,
 163         look_for_autoloader  => 1,
 164         look_for_selfloader  => 1,
 165         starting_line_number => 1,
 166         extended_syntax      => 0,
 167     );
 168     my %args = ( %defaults, @args );
 169
 170     # we are given an object with a get_line() method to supply source lines
 171     my $source_object = $args{source_object};
 172
 173     # we create another object with a get_line() and peek_ahead() method
 174     my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
 175
 176     # Tokenizer state data is as follows:
 177     # _rhere_target_list    reference to list of here-doc targets
 178     # _here_doc_target      the target string for a here document
 179     # _here_quote_character the type of here-doc quoting (" ' ` or none)
 180     #                       to determine if interpolation is done
 181     # _quote_target         character we seek if chasing a quote
 182     # _line_start_quote     line where we started looking for a long quote
 183     # _in_here_doc          flag indicating if we are in a here-doc
 184     # _in_pod               flag set if we are in pod documentation
 185     # _in_error             flag set if we saw severe error (binary in script)
 186     # _in_data              flag set if we are in __DATA__ section
 187     # _in_end               flag set if we are in __END__ section
 188     # _in_format            flag set if we are in a format description
 189     # _in_attribute_list    flag telling if we are looking for attributes
 190     # _in_quote             flag telling if we are chasing a quote
 191     # _starting_level       indentation level of first line
 192     # _line_buffer_object   object with get_line() method to supply source code
 193     # _diagnostics_object   place to write debugging information
 194     # _unexpected_error_count  error count used to limit output
 195     # _lower_case_labels_at  line numbers where lower case labels seen
 196     # _hit_bug               program bug detected
 197     $tokenizer_self = {
 198         _rhere_target_list                  => [],
 199         _in_here_doc                        => 0,
 200         _here_doc_target                    => "",
 201         _here_quote_character               => "",
 202         _in_data                            => 0,
 203         _in_end                             => 0,
 204         _in_format                          => 0,
 205         _in_error                           => 0,
 206         _in_pod                             => 0,
 207         _in_attribute_list                  => 0,
 208         _in_quote                           => 0,
 209         _quote_target                       => "",
 210         _line_start_quote                   => -1,
 211         _starting_level                     => $args{starting_level},
 212         _know_starting_level                => defined( $args{starting_level} ),
 213         _tabsize                            => $args{tabsize},
 214         _indent_columns                     => $args{indent_columns},
 215         _look_for_hash_bang                 => $args{look_for_hash_bang},
 216         _trim_qw                            => $args{trim_qw},
 217         _continuation_indentation           => $args{continuation_indentation},
 218         _outdent_labels                     => $args{outdent_labels},
 219         _last_line_number                   => $args{starting_line_number} - 1,
 220         _saw_perl_dash_P                    => 0,
 221         _saw_perl_dash_w                    => 0,
 222         _saw_use_strict                     => 0,
 223         _saw_v_string                       => 0,
 224         _hit_bug                            => 0,
 225         _look_for_autoloader                => $args{look_for_autoloader},
 226         _look_for_selfloader                => $args{look_for_selfloader},
 227         _saw_autoloader                     => 0,
 228         _saw_selfloader                     => 0,
 229         _saw_hash_bang                      => 0,
 230         _saw_end                            => 0,
 231         _saw_data                           => 0,
 232         _saw_negative_indentation           => 0,
 233         _started_tokenizing                 => 0,
 234         _line_buffer_object                 => $line_buffer_object,
 235         _debugger_object                    => $args{debugger_object},
 236         _diagnostics_object                 => $args{diagnostics_object},
 237         _logger_object                      => $args{logger_object},
 238         _unexpected_error_count             => 0,
 239         _started_looking_for_here_target_at => 0,
 240         _nearly_matched_here_target_at      => undef,
 241         _line_text                          => "",
 242         _rlower_case_labels_at              => undef,
 243         _extended_syntax                    => $args{extended_syntax},
 244     };
 245
 246     prepare_for_a_new_file();
 247     find_starting_indentation_level();
 248
 249     bless $tokenizer_self, $class;
 250
 251     # This is not a full class yet, so die if an attempt is made to
 252     # create more than one object.
 253
 254     if ( _increment_count() > 1 ) {
 255         confess
 256 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
 257     }
 258
 259     return $tokenizer_self;
 260
 261 }
 262
 263 # interface to Perl::Tidy::Logger routines
 264 sub warning {
 265     my $msg           = shift;
 266     my $logger_object = $tokenizer_self->{_logger_object};
 267     if ($logger_object) {
 268         $logger_object->warning($msg);
 269     }
 270     return;
 271 }
 272
 273 sub complain {
 274     my $msg           = shift;
 275     my $logger_object = $tokenizer_self->{_logger_object};
 276     if ($logger_object) {
 277         $logger_object->complain($msg);
 278     }
 279     return;
 280 }
 281
 282 sub write_logfile_entry {
 283     my $msg           = shift;
 284     my $logger_object = $tokenizer_self->{_logger_object};
 285     if ($logger_object) {
 286         $logger_object->write_logfile_entry($msg);
 287     }
 288     return;
 289 }
 290
 291 sub interrupt_logfile {
 292     my $logger_object = $tokenizer_self->{_logger_object};
 293     if ($logger_object) {
 294         $logger_object->interrupt_logfile();
 295     }
 296     return;
 297 }
 298
 299 sub resume_logfile {
 300     my $logger_object = $tokenizer_self->{_logger_object};
 301     if ($logger_object) {
 302         $logger_object->resume_logfile();
 303     }
 304     return;
 305 }
 306
 307 sub increment_brace_error {
 308     my $logger_object = $tokenizer_self->{_logger_object};
 309     if ($logger_object) {
 310         $logger_object->increment_brace_error();
 311     }
 312     return;
 313 }
 314
 315 sub report_definite_bug {
 316     $tokenizer_self->{_hit_bug} = 1;
 317     my $logger_object = $tokenizer_self->{_logger_object};
 318     if ($logger_object) {
 319         $logger_object->report_definite_bug();
 320     }
 321     return;
 322 }
 323
 324 sub brace_warning {
 325     my $msg           = shift;
 326     my $logger_object = $tokenizer_self->{_logger_object};
 327     if ($logger_object) {
 328         $logger_object->brace_warning($msg);
 329     }
 330     return;
 331 }
 332
 333 sub get_saw_brace_error {
 334     my $logger_object = $tokenizer_self->{_logger_object};
 335     if ($logger_object) {
 336         return $logger_object->get_saw_brace_error();
 337     }
 338     else {
 339         return 0;
 340     }
 341 }
 342
 343 # interface to Perl::Tidy::Diagnostics routines
 344 sub write_diagnostics {
 345     my $msg = shift;
 346     if ( $tokenizer_self->{_diagnostics_object} ) {
 347         $tokenizer_self->{_diagnostics_object}->write_diagnostics($msg);
 348     }
 349     return;
 350 }
 351
 352 sub report_tokenization_errors {
 353
 354     my $self         = shift;
 355     my $severe_error = $self->{_in_error};
 356
 357     my $level = get_indentation_level();
 358     if ( $level != $tokenizer_self->{_starting_level} ) {
 359         warning("final indentation level: $level\n");
 360     }
 361
 362     check_final_nesting_depths();
 363
 364     if ( $tokenizer_self->{_look_for_hash_bang}
 365         && !$tokenizer_self->{_saw_hash_bang} )
 366     {
 367         warning(
 368             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
 369     }
 370
 371     if ( $tokenizer_self->{_in_format} ) {
 372         warning("hit EOF while in format description\n");
 373     }
 374
 375     if ( $tokenizer_self->{_in_pod} ) {
 376
 377         # Just write log entry if this is after __END__ or __DATA__
 378         # because this happens to often, and it is not likely to be
 379         # a parsing error.
 380         if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
 381             write_logfile_entry(
 382 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 383             );
 384         }
 385
 386         else {
 387             complain(
 388 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
 389             );
 390         }
 391
 392     }
 393
 394     if ( $tokenizer_self->{_in_here_doc} ) {
 395         $severe_error = 1;
 396         my $here_doc_target = $tokenizer_self->{_here_doc_target};
 397         my $started_looking_for_here_target_at =
 398           $tokenizer_self->{_started_looking_for_here_target_at};
 399         if ($here_doc_target) {
 400             warning(
 401 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
 402             );
 403         }
 404         else {
 405             warning(
 406 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
 407             );
 408         }
 409         my $nearly_matched_here_target_at =
 410           $tokenizer_self->{_nearly_matched_here_target_at};
 411         if ($nearly_matched_here_target_at) {
 412             warning(
 413 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
 414             );
 415         }
 416     }
 417
 418     if ( $tokenizer_self->{_in_quote} ) {
 419         $severe_error = 1;
 420         my $line_start_quote = $tokenizer_self->{_line_start_quote};
 421         my $quote_target     = $tokenizer_self->{_quote_target};
 422         my $what =
 423           ( $tokenizer_self->{_in_attribute_list} )
 424           ? "attribute list"
 425           : "quote/pattern";
 426         warning(
 427 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
 428         );
 429     }
 430
 431     if ( $tokenizer_self->{_hit_bug} ) {
 432         $severe_error = 1;
 433     }
 434
 435     my $logger_object = $tokenizer_self->{_logger_object};
 436
 437 # TODO: eventually may want to activate this to cause file to be output verbatim
 438     if (0) {
 439
 440         # Set the severe error for a fairly high warning count because
 441         # some of the warnings do not harm formatting, such as duplicate
 442         # sub names.
 443         my $warning_count = $logger_object->{_warning_count};
 444         if ( $warning_count > 50 ) {
 445             $severe_error = 1;
 446         }
 447
 448         # Brace errors are significant, so set the severe error flag at
 449         # a low number.
 450         my $saw_brace_error = $logger_object->{_saw_brace_error};
 451         if ( $saw_brace_error > 2 ) {
 452             $severe_error = 1;
 453         }
 454     }
 455
 456     unless ( $tokenizer_self->{_saw_perl_dash_w} ) {
 457         if ( $] < 5.006 ) {
 458             write_logfile_entry("Suggest including '-w parameter'\n");
 459         }
 460         else {
 461             write_logfile_entry("Suggest including 'use warnings;'\n");
 462         }
 463     }
 464
 465     if ( $tokenizer_self->{_saw_perl_dash_P} ) {
 466         write_logfile_entry("Use of -P parameter for defines is discouraged\n");
 467     }
 468
 469     unless ( $tokenizer_self->{_saw_use_strict} ) {
 470         write_logfile_entry("Suggest including 'use strict;'\n");
 471     }
 472
 473     # it is suggested that labels have at least one upper case character
 474     # for legibility and to avoid code breakage as new keywords are introduced
 475     if ( $tokenizer_self->{_rlower_case_labels_at} ) {
 476         my @lower_case_labels_at =
 477           @{ $tokenizer_self->{_rlower_case_labels_at} };
 478         write_logfile_entry(
 479             "Suggest using upper case characters in label(s)\n");
 480         local $" = ')(';
 481         write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
 482     }
 483     return $severe_error;
 484 }
 485
 486 sub report_v_string {
 487
 488     # warn if this version can't handle v-strings
 489     my $tok = shift;
 490     unless ( $tokenizer_self->{_saw_v_string} ) {
 491         $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number};
 492     }
 493     if ( $] < 5.006 ) {
 494         warning(
 495 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
 496         );
 497     }
 498     return;
 499 }
 500
 501 sub get_input_line_number {
 502     return $tokenizer_self->{_last_line_number};
 503 }
 504
 505 # returns the next tokenized line
 506 sub get_line {
 507
 508     my $self = shift;
 509
 510     # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
 511     # $square_bracket_depth, $paren_depth
 512
 513     my $input_line = $tokenizer_self->{_line_buffer_object}->get_line();
 514     $tokenizer_self->{_line_text} = $input_line;
 515
 516     return unless ($input_line);
 517
 518     my $input_line_number = ++$tokenizer_self->{_last_line_number};
 519
 520     # Find and remove what characters terminate this line, including any
 521     # control r
 522     my $input_line_separator = "";
 523     if ( chomp($input_line) ) { $input_line_separator = $/ }
 524
 525     # TODO: what other characters should be included here?
 526     if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
 527         $input_line_separator = $2 . $input_line_separator;
 528     }
 529
 530     # for backwards compatibility we keep the line text terminated with
 531     # a newline character
 532     $input_line .= "\n";
 533     $tokenizer_self->{_line_text} = $input_line;    # update
 534
 535     # create a data structure describing this line which will be
 536     # returned to the caller.
 537
 538     # _line_type codes are:
 539     #   SYSTEM         - system-specific code before hash-bang line
 540     #   CODE           - line of perl code (including comments)
 541     #   POD_START      - line starting pod, such as '=head'
 542     #   POD            - pod documentation text
 543     #   POD_END        - last line of pod section, '=cut'
 544     #   HERE           - text of here-document
 545     #   HERE_END       - last line of here-doc (target word)
 546     #   FORMAT         - format section
 547     #   FORMAT_END     - last line of format section, '.'
 548     #   DATA_START     - __DATA__ line
 549     #   DATA           - unidentified text following __DATA__
 550     #   END_START      - __END__ line
 551     #   END            - unidentified text following __END__
 552     #   ERROR          - we are in big trouble, probably not a perl script
 553
 554     # Other variables:
 555     #   _curly_brace_depth     - depth of curly braces at start of line
 556     #   _square_bracket_depth  - depth of square brackets at start of line
 557     #   _paren_depth           - depth of parens at start of line
 558     #   _starting_in_quote     - this line continues a multi-line quote
 559     #                            (so don't trim leading blanks!)
 560     #   _ending_in_quote       - this line ends in a multi-line quote
 561     #                            (so don't trim trailing blanks!)
 562     my $line_of_tokens = {
 563         _line_type                 => 'EOF',
 564         _line_text                 => $input_line,
 565         _line_number               => $input_line_number,
 566         _rtoken_type               => undef,
 567         _rtokens                   => undef,
 568         _rlevels                   => undef,
 569         _rslevels                  => undef,
 570         _rblock_type               => undef,
 571         _rcontainer_type           => undef,
 572         _rcontainer_environment    => undef,
 573         _rtype_sequence            => undef,
 574         _rnesting_tokens           => undef,
 575         _rci_levels                => undef,
 576         _rnesting_blocks           => undef,
 577         _guessed_indentation_level => 0,
 578         _starting_in_quote    => 0,                    # to be set by subroutine
 579         _ending_in_quote      => 0,
 580         _curly_brace_depth    => $brace_depth,
 581         _square_bracket_depth => $square_bracket_depth,
 582         _paren_depth          => $paren_depth,
 583         _quote_character      => '',
 584     };
 585
 586     # must print line unchanged if we are in a here document
 587     if ( $tokenizer_self->{_in_here_doc} ) {
 588
 589         $line_of_tokens->{_line_type} = 'HERE';
 590         my $here_doc_target      = $tokenizer_self->{_here_doc_target};
 591         my $here_quote_character = $tokenizer_self->{_here_quote_character};
 592         my $candidate_target     = $input_line;
 593         chomp $candidate_target;
 594
 595         # Handle <<~ targets, which are indicated here by a leading space on
 596         # the here quote character
 597         if ( $here_quote_character =~ /^\s/ ) {
 598             $candidate_target =~ s/^\s*//;
 599         }
 600         if ( $candidate_target eq $here_doc_target ) {
 601             $tokenizer_self->{_nearly_matched_here_target_at} = undef;
 602             $line_of_tokens->{_line_type}                     = 'HERE_END';
 603             write_logfile_entry("Exiting HERE document $here_doc_target\n");
 604
 605             my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
 606             if ( @{$rhere_target_list} ) {  # there can be multiple here targets
 607                 ( $here_doc_target, $here_quote_character ) =
 608                   @{ shift @{$rhere_target_list} };
 609                 $tokenizer_self->{_here_doc_target} = $here_doc_target;
 610                 $tokenizer_self->{_here_quote_character} =
 611                   $here_quote_character;
 612                 write_logfile_entry(
 613                     "Entering HERE document $here_doc_target\n");
 614                 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
 615                 $tokenizer_self->{_started_looking_for_here_target_at} =
 616                   $input_line_number;
 617             }
 618             else {
 619                 $tokenizer_self->{_in_here_doc}          = 0;
 620                 $tokenizer_self->{_here_doc_target}      = "";
 621                 $tokenizer_self->{_here_quote_character} = "";
 622             }
 623         }
 624
 625         # check for error of extra whitespace
 626         # note for PERL6: leading whitespace is allowed
 627         else {
 628             $candidate_target =~ s/\s*$//;
 629             $candidate_target =~ s/^\s*//;
 630             if ( $candidate_target eq $here_doc_target ) {
 631                 $tokenizer_self->{_nearly_matched_here_target_at} =
 632                   $input_line_number;
 633             }
 634         }
 635         return $line_of_tokens;
 636     }
 637
 638     # must print line unchanged if we are in a format section
 639     elsif ( $tokenizer_self->{_in_format} ) {
 640
 641         if ( $input_line =~ /^\.[\s#]*$/ ) {
 642             write_logfile_entry("Exiting format section\n");
 643             $tokenizer_self->{_in_format} = 0;
 644             $line_of_tokens->{_line_type} = 'FORMAT_END';
 645         }
 646         else {
 647             $line_of_tokens->{_line_type} = 'FORMAT';
 648         }
 649         return $line_of_tokens;
 650     }
 651
 652     # must print line unchanged if we are in pod documentation
 653     elsif ( $tokenizer_self->{_in_pod} ) {
 654
 655         $line_of_tokens->{_line_type} = 'POD';
 656         if ( $input_line =~ /^=cut/ ) {
 657             $line_of_tokens->{_line_type} = 'POD_END';
 658             write_logfile_entry("Exiting POD section\n");
 659             $tokenizer_self->{_in_pod} = 0;
 660         }
 661         if ( $input_line =~ /^\#\!.*perl\b/ ) {
 662             warning(
 663                 "Hash-bang in pod can cause older versions of perl to fail! \n"
 664             );
 665         }
 666
 667         return $line_of_tokens;
 668     }
 669
 670     # must print line unchanged if we have seen a severe error (i.e., we
 671     # are seeing illegal tokens and cannot continue.  Syntax errors do
 672     # not pass this route).  Calling routine can decide what to do, but
 673     # the default can be to just pass all lines as if they were after __END__
 674     elsif ( $tokenizer_self->{_in_error} ) {
 675         $line_of_tokens->{_line_type} = 'ERROR';
 676         return $line_of_tokens;
 677     }
 678
 679     # print line unchanged if we are __DATA__ section
 680     elsif ( $tokenizer_self->{_in_data} ) {
 681
 682         # ...but look for POD
 683         # Note that the _in_data and _in_end flags remain set
 684         # so that we return to that state after seeing the
 685         # end of a pod section
 686         if ( $input_line =~ /^=(?!cut)/ ) {
 687             $line_of_tokens->{_line_type} = 'POD_START';
 688             write_logfile_entry("Entering POD section\n");
 689             $tokenizer_self->{_in_pod} = 1;
 690             return $line_of_tokens;
 691         }
 692         else {
 693             $line_of_tokens->{_line_type} = 'DATA';
 694             return $line_of_tokens;
 695         }
 696     }
 697
 698     # print line unchanged if we are in __END__ section
 699     elsif ( $tokenizer_self->{_in_end} ) {
 700
 701         # ...but look for POD
 702         # Note that the _in_data and _in_end flags remain set
 703         # so that we return to that state after seeing the
 704         # end of a pod section
 705         if ( $input_line =~ /^=(?!cut)/ ) {
 706             $line_of_tokens->{_line_type} = 'POD_START';
 707             write_logfile_entry("Entering POD section\n");
 708             $tokenizer_self->{_in_pod} = 1;
 709             return $line_of_tokens;
 710         }
 711         else {
 712             $line_of_tokens->{_line_type} = 'END';
 713             return $line_of_tokens;
 714         }
 715     }
 716
 717     # check for a hash-bang line if we haven't seen one
 718     if ( !$tokenizer_self->{_saw_hash_bang} ) {
 719         if ( $input_line =~ /^\#\!.*perl\b/ ) {
 720             $tokenizer_self->{_saw_hash_bang} = $input_line_number;
 721
 722             # check for -w and -P flags
 723             if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
 724                 $tokenizer_self->{_saw_perl_dash_P} = 1;
 725             }
 726
 727             if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
 728                 $tokenizer_self->{_saw_perl_dash_w} = 1;
 729             }
 730
 731             if (
 732                 ( $input_line_number > 1 )
 733
 734                 # leave any hash bang in a BEGIN block alone
 735                 # i.e. see 'debugger-duck_type.t'
 736                 && !(
 737                        $last_nonblank_block_type
 738                     && $last_nonblank_block_type eq 'BEGIN'
 739                 )
 740                 && ( !$tokenizer_self->{_look_for_hash_bang} )
 741               )
 742             {
 743
 744                 # this is helpful for VMS systems; we may have accidentally
 745                 # tokenized some DCL commands
 746                 if ( $tokenizer_self->{_started_tokenizing} ) {
 747                     warning(
 748 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
 749                     );
 750                 }
 751                 else {
 752                     complain("Useless hash-bang after line 1\n");
 753                 }
 754             }
 755
 756             # Report the leading hash-bang as a system line
 757             # This will prevent -dac from deleting it
 758             else {
 759                 $line_of_tokens->{_line_type} = 'SYSTEM';
 760                 return $line_of_tokens;
 761             }
 762         }
 763     }
 764
 765     # wait for a hash-bang before parsing if the user invoked us with -x
 766     if ( $tokenizer_self->{_look_for_hash_bang}
 767         && !$tokenizer_self->{_saw_hash_bang} )
 768     {
 769         $line_of_tokens->{_line_type} = 'SYSTEM';
 770         return $line_of_tokens;
 771     }
 772
 773     # a first line of the form ': #' will be marked as SYSTEM
 774     # since lines of this form may be used by tcsh
 775     if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
 776         $line_of_tokens->{_line_type} = 'SYSTEM';
 777         return $line_of_tokens;
 778     }
 779
 780     # now we know that it is ok to tokenize the line...
 781     # the line tokenizer will modify any of these private variables:
 782     #        _rhere_target_list
 783     #        _in_data
 784     #        _in_end
 785     #        _in_format
 786     #        _in_error
 787     #        _in_pod
 788     #        _in_quote
 789     my $ending_in_quote_last = $tokenizer_self->{_in_quote};
 790     tokenize_this_line($line_of_tokens);
 791
 792     # Now finish defining the return structure and return it
 793     $line_of_tokens->{_ending_in_quote} = $tokenizer_self->{_in_quote};
 794
 795     # handle severe error (binary data in script)
 796     if ( $tokenizer_self->{_in_error} ) {
 797         $tokenizer_self->{_in_quote} = 0;    # to avoid any more messages
 798         warning("Giving up after error\n");
 799         $line_of_tokens->{_line_type} = 'ERROR';
 800         reset_indentation_level(0);          # avoid error messages
 801         return $line_of_tokens;
 802     }
 803
 804     # handle start of pod documentation
 805     if ( $tokenizer_self->{_in_pod} ) {
 806
 807         # This gets tricky..above a __DATA__ or __END__ section, perl
 808         # accepts '=cut' as the start of pod section. But afterwards,
 809         # only pod utilities see it and they may ignore an =cut without
 810         # leading =head.  In any case, this isn't good.
 811         if ( $input_line =~ /^=cut\b/ ) {
 812             if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
 813                 complain("=cut while not in pod ignored\n");
 814                 $tokenizer_self->{_in_pod}    = 0;
 815                 $line_of_tokens->{_line_type} = 'POD_END';
 816             }
 817             else {
 818                 $line_of_tokens->{_line_type} = 'POD_START';
 819                 complain(
 820 "=cut starts a pod section .. this can fool pod utilities.\n"
 821                 );
 822                 write_logfile_entry("Entering POD section\n");
 823             }
 824         }
 825
 826         else {
 827             $line_of_tokens->{_line_type} = 'POD_START';
 828             write_logfile_entry("Entering POD section\n");
 829         }
 830
 831         return $line_of_tokens;
 832     }
 833
 834     # update indentation levels for log messages
 835     if ( $input_line !~ /^\s*$/ ) {
 836         my $rlevels = $line_of_tokens->{_rlevels};
 837         $line_of_tokens->{_guessed_indentation_level} =
 838           guess_old_indentation_level($input_line);
 839     }
 840
 841     # see if this line contains here doc targets
 842     my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
 843     if ( @{$rhere_target_list} ) {
 844
 845         my ( $here_doc_target, $here_quote_character ) =
 846           @{ shift @{$rhere_target_list} };
 847         $tokenizer_self->{_in_here_doc}          = 1;
 848         $tokenizer_self->{_here_doc_target}      = $here_doc_target;
 849         $tokenizer_self->{_here_quote_character} = $here_quote_character;
 850         write_logfile_entry("Entering HERE document $here_doc_target\n");
 851         $tokenizer_self->{_started_looking_for_here_target_at} =
 852           $input_line_number;
 853     }
 854
 855     # NOTE: __END__ and __DATA__ statements are written unformatted
 856     # because they can theoretically contain additional characters
 857     # which are not tokenized (and cannot be read with <DATA> either!).
 858     if ( $tokenizer_self->{_in_data} ) {
 859         $line_of_tokens->{_line_type} = 'DATA_START';
 860         write_logfile_entry("Starting __DATA__ section\n");
 861         $tokenizer_self->{_saw_data} = 1;
 862
 863         # keep parsing after __DATA__ if use SelfLoader was seen
 864         if ( $tokenizer_self->{_saw_selfloader} ) {
 865             $tokenizer_self->{_in_data} = 0;
 866             write_logfile_entry(
 867                 "SelfLoader seen, continuing; -nlsl deactivates\n");
 868         }
 869
 870         return $line_of_tokens;
 871     }
 872
 873     elsif ( $tokenizer_self->{_in_end} ) {
 874         $line_of_tokens->{_line_type} = 'END_START';
 875         write_logfile_entry("Starting __END__ section\n");
 876         $tokenizer_self->{_saw_end} = 1;
 877
 878         # keep parsing after __END__ if use AutoLoader was seen
 879         if ( $tokenizer_self->{_saw_autoloader} ) {
 880             $tokenizer_self->{_in_end} = 0;
 881             write_logfile_entry(
 882                 "AutoLoader seen, continuing; -nlal deactivates\n");
 883         }
 884         return $line_of_tokens;
 885     }
 886
 887     # now, finally, we know that this line is type 'CODE'
 888     $line_of_tokens->{_line_type} = 'CODE';
 889
 890     # remember if we have seen any real code
 891     if (  !$tokenizer_self->{_started_tokenizing}
 892         && $input_line !~ /^\s*$/
 893         && $input_line !~ /^\s*#/ )
 894     {
 895         $tokenizer_self->{_started_tokenizing} = 1;
 896     }
 897
 898     if ( $tokenizer_self->{_debugger_object} ) {
 899         $tokenizer_self->{_debugger_object}->write_debug_entry($line_of_tokens);
 900     }
 901
 902     # Note: if keyword 'format' occurs in this line code, it is still CODE
 903     # (keyword 'format' need not start a line)
 904     if ( $tokenizer_self->{_in_format} ) {
 905         write_logfile_entry("Entering format section\n");
 906     }
 907
 908     if ( $tokenizer_self->{_in_quote}
 909         and ( $tokenizer_self->{_line_start_quote} < 0 ) )
 910     {
 911
 912         #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
 913         if (
 914             ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ )
 915         {
 916             $tokenizer_self->{_line_start_quote} = $input_line_number;
 917             write_logfile_entry(
 918                 "Start multi-line quote or pattern ending in $quote_target\n");
 919         }
 920     }
 921     elsif ( ( $tokenizer_self->{_line_start_quote} >= 0 )
 922         && !$tokenizer_self->{_in_quote} )
 923     {
 924         $tokenizer_self->{_line_start_quote} = -1;
 925         write_logfile_entry("End of multi-line quote or pattern\n");
 926     }
 927
 928     # we are returning a line of CODE
 929     return $line_of_tokens;
 930 }
 931
 932 sub find_starting_indentation_level {
 933
 934     # We need to find the indentation level of the first line of the
 935     # script being formatted.  Often it will be zero for an entire file,
 936     # but if we are formatting a local block of code (within an editor for
 937     # example) it may not be zero.  The user may specify this with the
 938     # -sil=n parameter but normally doesn't so we have to guess.
 939     #
 940     # USES GLOBAL VARIABLES: $tokenizer_self
 941     my $starting_level = 0;
 942
 943     # use value if given as parameter
 944     if ( $tokenizer_self->{_know_starting_level} ) {
 945         $starting_level = $tokenizer_self->{_starting_level};
 946     }
 947
 948     # if we know there is a hash_bang line, the level must be zero
 949     elsif ( $tokenizer_self->{_look_for_hash_bang} ) {
 950         $tokenizer_self->{_know_starting_level} = 1;
 951     }
 952
 953     # otherwise figure it out from the input file
 954     else {
 955         my $line;
 956         my $i = 0;
 957
 958         # keep looking at lines until we find a hash bang or piece of code
 959         my $msg = "";
 960         while ( $line =
 961             $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
 962         {
 963
 964             # if first line is #! then assume starting level is zero
 965             if ( $i == 1 && $line =~ /^\#\!/ ) {
 966                 $starting_level = 0;
 967                 last;
 968             }
 969             next if ( $line =~ /^\s*#/ );    # skip past comments
 970             next if ( $line =~ /^\s*$/ );    # skip past blank lines
 971             $starting_level = guess_old_indentation_level($line);
 972             last;
 973         }
 974         $msg = "Line $i implies starting-indentation-level = $starting_level\n";
 975         write_logfile_entry("$msg");
 976     }
 977     $tokenizer_self->{_starting_level} = $starting_level;
 978     reset_indentation_level($starting_level);
 979     return;
 980 }
 981
 982 sub guess_old_indentation_level {
 983     my ($line) = @_;
 984
 985     # Guess the indentation level of an input line.
 986     #
 987     # For the first line of code this result will define the starting
 988     # indentation level.  It will mainly be non-zero when perltidy is applied
 989     # within an editor to a local block of code.
 990     #
 991     # This is an impossible task in general because we can't know what tabs
 992     # meant for the old script and how many spaces were used for one
 993     # indentation level in the given input script.  For example it may have
 994     # been previously formatted with -i=7 -et=3.  But we can at least try to
 995     # make sure that perltidy guesses correctly if it is applied repeatedly to
 996     # a block of code within an editor, so that the block stays at the same
 997     # level when perltidy is applied repeatedly.
 998     #
 999     # USES GLOBAL VARIABLES: $tokenizer_self
1000     my $level = 0;
1001
1002     # find leading tabs, spaces, and any statement label
1003     my $spaces = 0;
1004     if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
1005
1006         # If there are leading tabs, we use the tab scheme for this run, if
1007         # any, so that the code will remain stable when editing.
1008         if ($1) { $spaces += length($1) * $tokenizer_self->{_tabsize} }
1009
1010         if ($2) { $spaces += length($2) }
1011
1012         # correct for outdented labels
1013         if ( $3 && $tokenizer_self->{'_outdent_labels'} ) {
1014             $spaces += $tokenizer_self->{_continuation_indentation};
1015         }
1016     }
1017
1018     # compute indentation using the value of -i for this run.
1019     # If -i=0 is used for this run (which is possible) it doesn't matter
1020     # what we do here but we'll guess that the old run used 4 spaces per level.
1021     my $indent_columns = $tokenizer_self->{_indent_columns};
1022     $indent_columns = 4 if ( !$indent_columns );
1023     $level          = int( $spaces / $indent_columns );
1024     return ($level);
1025 }
1026
1027 # This is a currently unused debug routine
1028 sub dump_functions {
1029
1030     my $fh = *STDOUT;
1031     foreach my $pkg ( keys %is_user_function ) {
1032         print $fh "\nnon-constant subs in package $pkg\n";
1033
1034         foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
1035             my $msg = "";
1036             if ( $is_block_list_function{$pkg}{$sub} ) {
1037                 $msg = 'block_list';
1038             }
1039
1040             if ( $is_block_function{$pkg}{$sub} ) {
1041                 $msg = 'block';
1042             }
1043             print $fh "$sub $msg\n";
1044         }
1045     }
1046
1047     foreach my $pkg ( keys %is_constant ) {
1048         print $fh "\nconstants and constant subs in package $pkg\n";
1049
1050         foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
1051             print $fh "$sub\n";
1052         }
1053     }
1054     return;
1055 }
1056
1057 sub ones_count {
1058
1059     # count number of 1's in a string of 1's and 0's
1060     # example: ones_count("010101010101") gives 6
1061     my $str = shift;
1062     return $str =~ tr/1/0/;
1063 }
1064
1065 sub prepare_for_a_new_file {
1066
1067     # previous tokens needed to determine what to expect next
1068     $last_nonblank_token      = ';';    # the only possible starting state which
1069     $last_nonblank_type       = ';';    # will make a leading brace a code block
1070     $last_nonblank_block_type = '';
1071
1072     # scalars for remembering statement types across multiple lines
1073     $statement_type    = '';            # '' or 'use' or 'sub..' or 'case..'
1074     $in_attribute_list = 0;
1075
1076     # scalars for remembering where we are in the file
1077     $current_package = "main";
1078     $context         = UNKNOWN_CONTEXT;
1079
1080     # hashes used to remember function information
1081     %is_constant             = ();      # user-defined constants
1082     %is_user_function        = ();      # user-defined functions
1083     %user_function_prototype = ();      # their prototypes
1084     %is_block_function       = ();
1085     %is_block_list_function  = ();
1086     %saw_function_definition = ();
1087
1088     # variables used to track depths of various containers
1089     # and report nesting errors
1090     $paren_depth                         = 0;
1091     $brace_depth                         = 0;
1092     $square_bracket_depth                = 0;
1093     @current_depth                       = (0) x scalar @closing_brace_names;
1094     $total_depth                         = 0;
1095     @total_depth                         = ();
1096     @nesting_sequence_number             = ( 0 .. @closing_brace_names - 1 );
1097     @current_sequence_number             = ();
1098     $paren_type[$paren_depth]            = '';
1099     $paren_semicolon_count[$paren_depth] = 0;
1100     $paren_structural_type[$brace_depth] = '';
1101     $brace_type[$brace_depth] = ';';    # identify opening brace as code block
1102     $brace_structural_type[$brace_depth]                   = '';
1103     $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
1104     $brace_package[$paren_depth]                           = $current_package;
1105     $square_bracket_type[$square_bracket_depth]            = '';
1106     $square_bracket_structural_type[$square_bracket_depth] = '';
1107
1108     initialize_tokenizer_state();
1109     return;
1110 }
1111
1112 {                                       # begin tokenize_this_line
1113
1114     use constant BRACE          => 0;
1115     use constant SQUARE_BRACKET => 1;
1116     use constant PAREN          => 2;
1117     use constant QUESTION_COLON => 3;
1118
1119     # TV1: scalars for processing one LINE.
1120     # Re-initialized on each entry to sub tokenize_this_line.
1121     my (
1122         $block_type,        $container_type,    $expecting,
1123         $i,                 $i_tok,             $input_line,
1124         $input_line_number, $last_nonblank_i,   $max_token_index,
1125         $next_tok,          $next_type,         $peeked_ahead,
1126         $prototype,         $rhere_target_list, $rtoken_map,
1127         $rtoken_type,       $rtokens,           $tok,
1128         $type,              $type_sequence,     $indent_flag,
1129     );
1130
1131     # TV2: refs to ARRAYS for processing one LINE
1132     # Re-initialized on each call.
1133     my $routput_token_list     = [];    # stack of output token indexes
1134     my $routput_token_type     = [];    # token types
1135     my $routput_block_type     = [];    # types of code block
1136     my $routput_container_type = [];    # paren types, such as if, elsif, ..
1137     my $routput_type_sequence  = [];    # nesting sequential number
1138     my $routput_indent_flag    = [];    #
1139
1140     # TV3: SCALARS for quote variables.  These are initialized with a
1141     # subroutine call and continually updated as lines are processed.
1142     my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1143         $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
1144
1145     # TV4: SCALARS for multi-line identifiers and
1146     # statements. These are initialized with a subroutine call
1147     # and continually updated as lines are processed.
1148     my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
1149
1150     # TV5: SCALARS for tracking indentation level.
1151     # Initialized once and continually updated as lines are
1152     # processed.
1153     my (
1154         $nesting_token_string,      $nesting_type_string,
1155         $nesting_block_string,      $nesting_block_flag,
1156         $nesting_list_string,       $nesting_list_flag,
1157         $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1158         $in_statement_continuation, $level_in_tokenizer,
1159         $slevel_in_tokenizer,       $rslevel_stack,
1160     );
1161
1162     # TV6: SCALARS for remembering several previous
1163     # tokens. Initialized once and continually updated as
1164     # lines are processed.
1165     my (
1166         $last_nonblank_container_type,     $last_nonblank_type_sequence,
1167         $last_last_nonblank_token,         $last_last_nonblank_type,
1168         $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
1169         $last_last_nonblank_type_sequence, $last_nonblank_prototype,
1170     );
1171
1172     # ----------------------------------------------------------------
1173     # beginning of tokenizer variable access and manipulation routines
1174     # ----------------------------------------------------------------
1175
1176     sub initialize_tokenizer_state {
1177
1178         # TV1: initialized on each call
1179         # TV2: initialized on each call
1180         # TV3:
1181         $in_quote                = 0;
1182         $quote_type              = 'Q';
1183         $quote_character         = "";
1184         $quote_pos               = 0;
1185         $quote_depth             = 0;
1186         $quoted_string_1         = "";
1187         $quoted_string_2         = "";
1188         $allowed_quote_modifiers = "";
1189
1190         # TV4:
1191         $id_scan_state     = '';
1192         $identifier        = '';
1193         $want_paren        = "";
1194         $indented_if_level = 0;
1195
1196         # TV5:
1197         $nesting_token_string             = "";
1198         $nesting_type_string              = "";
1199         $nesting_block_string             = '1';    # initially in a block
1200         $nesting_block_flag               = 1;
1201         $nesting_list_string              = '0';    # initially not in a list
1202         $nesting_list_flag                = 0;      # initially not in a list
1203         $ci_string_in_tokenizer           = "";
1204         $continuation_string_in_tokenizer = "0";
1205         $in_statement_continuation        = 0;
1206         $level_in_tokenizer               = 0;
1207         $slevel_in_tokenizer              = 0;
1208         $rslevel_stack                    = [];
1209
1210         # TV6:
1211         $last_nonblank_container_type      = '';
1212         $last_nonblank_type_sequence       = '';
1213         $last_last_nonblank_token          = ';';
1214         $last_last_nonblank_type           = ';';
1215         $last_last_nonblank_block_type     = '';
1216         $last_last_nonblank_container_type = '';
1217         $last_last_nonblank_type_sequence  = '';
1218         $last_nonblank_prototype           = "";
1219         return;
1220     }
1221
1222     sub save_tokenizer_state {
1223
1224         my $rTV1 = [
1225             $block_type,        $container_type,    $expecting,
1226             $i,                 $i_tok,             $input_line,
1227             $input_line_number, $last_nonblank_i,   $max_token_index,
1228             $next_tok,          $next_type,         $peeked_ahead,
1229             $prototype,         $rhere_target_list, $rtoken_map,
1230             $rtoken_type,       $rtokens,           $tok,
1231             $type,              $type_sequence,     $indent_flag,
1232         ];
1233
1234         my $rTV2 = [
1235             $routput_token_list,    $routput_token_type,
1236             $routput_block_type,    $routput_container_type,
1237             $routput_type_sequence, $routput_indent_flag,
1238         ];
1239
1240         my $rTV3 = [
1241             $in_quote,        $quote_type,
1242             $quote_character, $quote_pos,
1243             $quote_depth,     $quoted_string_1,
1244             $quoted_string_2, $allowed_quote_modifiers,
1245         ];
1246
1247         my $rTV4 =
1248           [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
1249
1250         my $rTV5 = [
1251             $nesting_token_string,      $nesting_type_string,
1252             $nesting_block_string,      $nesting_block_flag,
1253             $nesting_list_string,       $nesting_list_flag,
1254             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1255             $in_statement_continuation, $level_in_tokenizer,
1256             $slevel_in_tokenizer,       $rslevel_stack,
1257         ];
1258
1259         my $rTV6 = [
1260             $last_nonblank_container_type,
1261             $last_nonblank_type_sequence,
1262             $last_last_nonblank_token,
1263             $last_last_nonblank_type,
1264             $last_last_nonblank_block_type,
1265             $last_last_nonblank_container_type,
1266             $last_last_nonblank_type_sequence,
1267             $last_nonblank_prototype,
1268         ];
1269         return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
1270     }
1271
1272     sub restore_tokenizer_state {
1273         my ($rstate) = @_;
1274         my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
1275         (
1276             $block_type,        $container_type,    $expecting,
1277             $i,                 $i_tok,             $input_line,
1278             $input_line_number, $last_nonblank_i,   $max_token_index,
1279             $next_tok,          $next_type,         $peeked_ahead,
1280             $prototype,         $rhere_target_list, $rtoken_map,
1281             $rtoken_type,       $rtokens,           $tok,
1282             $type,              $type_sequence,     $indent_flag,
1283         ) = @{$rTV1};
1284
1285         (
1286             $routput_token_list,    $routput_token_type,
1287             $routput_block_type,    $routput_container_type,
1288             $routput_type_sequence, $routput_type_sequence,
1289         ) = @{$rTV2};
1290
1291         (
1292             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
1293             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
1294         ) = @{$rTV3};
1295
1296         ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
1297           @{$rTV4};
1298
1299         (
1300             $nesting_token_string,      $nesting_type_string,
1301             $nesting_block_string,      $nesting_block_flag,
1302             $nesting_list_string,       $nesting_list_flag,
1303             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
1304             $in_statement_continuation, $level_in_tokenizer,
1305             $slevel_in_tokenizer,       $rslevel_stack,
1306         ) = @{$rTV5};
1307
1308         (
1309             $last_nonblank_container_type,
1310             $last_nonblank_type_sequence,
1311             $last_last_nonblank_token,
1312             $last_last_nonblank_type,
1313             $last_last_nonblank_block_type,
1314             $last_last_nonblank_container_type,
1315             $last_last_nonblank_type_sequence,
1316             $last_nonblank_prototype,
1317         ) = @{$rTV6};
1318         return;
1319     }
1320
1321     sub get_indentation_level {
1322
1323         # patch to avoid reporting error if indented if is not terminated
1324         if ($indented_if_level) { return $level_in_tokenizer - 1 }
1325         return $level_in_tokenizer;
1326     }
1327
1328     sub reset_indentation_level {
1329         $level_in_tokenizer = $slevel_in_tokenizer = shift;
1330         push @{$rslevel_stack}, $slevel_in_tokenizer;
1331         return;
1332     }
1333
1334     sub peeked_ahead {
1335         my $flag = shift;
1336         $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
1337         return $peeked_ahead;
1338     }
1339
1340     # ------------------------------------------------------------
1341     # end of tokenizer variable access and manipulation routines
1342     # ------------------------------------------------------------
1343
1344     # ------------------------------------------------------------
1345     # beginning of various scanner interface routines
1346     # ------------------------------------------------------------
1347     sub scan_replacement_text {
1348
1349         # check for here-docs in replacement text invoked by
1350         # a substitution operator with executable modifier 'e'.
1351         #
1352         # given:
1353         #  $replacement_text
1354         # return:
1355         #  $rht = reference to any here-doc targets
1356         my ($replacement_text) = @_;
1357
1358         # quick check
1359         return unless ( $replacement_text =~ /<</ );
1360
1361         write_logfile_entry("scanning replacement text for here-doc targets\n");
1362
1363         # save the logger object for error messages
1364         my $logger_object = $tokenizer_self->{_logger_object};
1365
1366         # localize all package variables
1367         local (
1368             $tokenizer_self,                 $last_nonblank_token,
1369             $last_nonblank_type,             $last_nonblank_block_type,
1370             $statement_type,                 $in_attribute_list,
1371             $current_package,                $context,
1372             %is_constant,                    %is_user_function,
1373             %user_function_prototype,        %is_block_function,
1374             %is_block_list_function,         %saw_function_definition,
1375             $brace_depth,                    $paren_depth,
1376             $square_bracket_depth,           @current_depth,
1377             @total_depth,                    $total_depth,
1378             @nesting_sequence_number,        @current_sequence_number,
1379             @paren_type,                     @paren_semicolon_count,
1380             @paren_structural_type,          @brace_type,
1381             @brace_structural_type,          @brace_context,
1382             @brace_package,                  @square_bracket_type,
1383             @square_bracket_structural_type, @depth_array,
1384             @starting_line_of_current_depth, @nested_ternary_flag,
1385             @nested_statement_type,
1386         );
1387
1388         # save all lexical variables
1389         my $rstate = save_tokenizer_state();
1390         _decrement_count();    # avoid error check for multiple tokenizers
1391
1392         # make a new tokenizer
1393         my $rOpts = {};
1394         my $rpending_logfile_message;
1395         my $source_object =
1396           Perl::Tidy::LineSource->new( \$replacement_text, $rOpts,
1397             $rpending_logfile_message );
1398         my $tokenizer = Perl::Tidy::Tokenizer->new(
1399             source_object        => $source_object,
1400             logger_object        => $logger_object,
1401             starting_line_number => $input_line_number,
1402         );
1403
1404         # scan the replacement text
1405         1 while ( $tokenizer->get_line() );
1406
1407         # remove any here doc targets
1408         my $rht = undef;
1409         if ( $tokenizer_self->{_in_here_doc} ) {
1410             $rht = [];
1411             push @{$rht},
1412               [
1413                 $tokenizer_self->{_here_doc_target},
1414                 $tokenizer_self->{_here_quote_character}
1415               ];
1416             if ( $tokenizer_self->{_rhere_target_list} ) {
1417                 push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} };
1418                 $tokenizer_self->{_rhere_target_list} = undef;
1419             }
1420             $tokenizer_self->{_in_here_doc} = undef;
1421         }
1422
1423         # now its safe to report errors
1424         my $severe_error = $tokenizer->report_tokenization_errors();
1425
1426         # TODO: Could propagate a severe error up
1427
1428         # restore all tokenizer lexical variables
1429         restore_tokenizer_state($rstate);
1430
1431         # return the here doc targets
1432         return $rht;
1433     }
1434
1435     sub scan_bare_identifier {
1436         ( $i, $tok, $type, $prototype ) =
1437           scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
1438             $rtoken_map, $max_token_index );
1439         return;
1440     }
1441
1442     sub scan_identifier {
1443         ( $i, $tok, $type, $id_scan_state, $identifier ) =
1444           scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
1445             $max_token_index, $expecting, $paren_type[$paren_depth] );
1446         return;
1447     }
1448
1449     sub scan_id {
1450         ( $i, $tok, $type, $id_scan_state ) =
1451           scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
1452             $id_scan_state, $max_token_index );
1453         return;
1454     }
1455
1456     sub scan_number {
1457         my $number;
1458         ( $i, $type, $number ) =
1459           scan_number_do( $input_line, $i, $rtoken_map, $type,
1460             $max_token_index );
1461         return $number;
1462     }
1463
1464     # a sub to warn if token found where term expected
1465     sub error_if_expecting_TERM {
1466         if ( $expecting == TERM ) {
1467             if ( $really_want_term{$last_nonblank_type} ) {
1468                 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
1469                     $rtoken_map, $rtoken_type, $input_line );
1470                 return 1;
1471             }
1472         }
1473         return;
1474     }
1475
1476     # a sub to warn if token found where operator expected
1477     sub error_if_expecting_OPERATOR {
1478         my $thing = shift;
1479         if ( $expecting == OPERATOR ) {
1480             if ( !defined($thing) ) { $thing = $tok }
1481             report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
1482                 $rtoken_map, $rtoken_type, $input_line );
1483             if ( $i_tok == 0 ) {
1484                 interrupt_logfile();
1485                 warning("Missing ';' above?\n");
1486                 resume_logfile();
1487             }
1488             return 1;
1489         }
1490         return;
1491     }
1492
1493     # ------------------------------------------------------------
1494     # end scanner interfaces
1495     # ------------------------------------------------------------
1496
1497     my %is_for_foreach;
1498     @_ = qw(for foreach);
1499     @is_for_foreach{@_} = (1) x scalar(@_);
1500
1501     my %is_my_our;
1502     @_ = qw(my our);
1503     @is_my_our{@_} = (1) x scalar(@_);
1504
1505     # These keywords may introduce blocks after parenthesized expressions,
1506     # in the form:
1507     # keyword ( .... ) { BLOCK }
1508     # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
1509     my %is_blocktype_with_paren;
1510     @_ =
1511       qw(if elsif unless while until for foreach switch case given when catch);
1512     @is_blocktype_with_paren{@_} = (1) x scalar(@_);
1513
1514     # ------------------------------------------------------------
1515     # begin hash of code for handling most token types
1516     # ------------------------------------------------------------
1517     my $tokenization_code = {
1518
1519         # no special code for these types yet, but syntax checks
1520         # could be added
1521
1522 ##      '!'   => undef,
1523 ##      '!='  => undef,
1524 ##      '!~'  => undef,
1525 ##      '%='  => undef,
1526 ##      '&&=' => undef,
1527 ##      '&='  => undef,
1528 ##      '+='  => undef,
1529 ##      '-='  => undef,
1530 ##      '..'  => undef,
1531 ##      '..'  => undef,
1532 ##      '...' => undef,
1533 ##      '.='  => undef,
1534 ##      '<<=' => undef,
1535 ##      '<='  => undef,
1536 ##      '<=>' => undef,
1537 ##      '<>'  => undef,
1538 ##      '='   => undef,
1539 ##      '=='  => undef,
1540 ##      '=~'  => undef,
1541 ##      '>='  => undef,
1542 ##      '>>'  => undef,
1543 ##      '>>=' => undef,
1544 ##      '\\'  => undef,
1545 ##      '^='  => undef,
1546 ##      '|='  => undef,
1547 ##      '||=' => undef,
1548 ##      '//=' => undef,
1549 ##      '~'   => undef,
1550 ##      '~~'  => undef,
1551 ##      '!~~'  => undef,
1552
1553         '>' => sub {
1554             error_if_expecting_TERM()
1555               if ( $expecting == TERM );
1556         },
1557         '|' => sub {
1558             error_if_expecting_TERM()
1559               if ( $expecting == TERM );
1560         },
1561         '$' => sub {
1562
1563             # start looking for a scalar
1564             error_if_expecting_OPERATOR("Scalar")
1565               if ( $expecting == OPERATOR );
1566             scan_identifier();
1567
1568             if ( $identifier eq '$^W' ) {
1569                 $tokenizer_self->{_saw_perl_dash_w} = 1;
1570             }
1571
1572             # Check for identifier in indirect object slot
1573             # (vorboard.pl, sort.t).  Something like:
1574             #   /^(print|printf|sort|exec|system)$/
1575             if (
1576                 $is_indirect_object_taker{$last_nonblank_token}
1577
1578                 || ( ( $last_nonblank_token eq '(' )
1579                     && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
1580                 || ( $last_nonblank_type =~ /^[Uw]$/ )    # possible object
1581               )
1582             {
1583                 $type = 'Z';
1584             }
1585         },
1586         '(' => sub {
1587
1588             ++$paren_depth;
1589             $paren_semicolon_count[$paren_depth] = 0;
1590             if ($want_paren) {
1591                 $container_type = $want_paren;
1592                 $want_paren     = "";
1593             }
1594             elsif ( $statement_type =~ /^sub\b/ ) {
1595                 $container_type = $statement_type;
1596             }
1597             else {
1598                 $container_type = $last_nonblank_token;
1599
1600                 # We can check for a syntax error here of unexpected '(',
1601                 # but this is going to get messy...
1602                 if (
1603                     $expecting == OPERATOR
1604
1605                     # be sure this is not a method call of the form
1606                     # &method(...), $method->(..), &{method}(...),
1607                     # $ref[2](list) is ok & short for $ref[2]->(list)
1608                     # NOTE: at present, braces in something like &{ xxx }
1609                     # are not marked as a block, we might have a method call
1610                     && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/
1611
1612                   )
1613                 {
1614
1615                     # ref: camel 3 p 703.
1616                     if ( $last_last_nonblank_token eq 'do' ) {
1617                         complain(
1618 "do SUBROUTINE is deprecated; consider & or -> notation\n"
1619                         );
1620                     }
1621                     else {
1622
1623                         # if this is an empty list, (), then it is not an
1624                         # error; for example, we might have a constant pi and
1625                         # invoke it with pi() or just pi;
1626                         my ( $next_nonblank_token, $i_next ) =
1627                           find_next_nonblank_token( $i, $rtokens,
1628                             $max_token_index );
1629                         if ( $next_nonblank_token ne ')' ) {
1630                             my $hint;
1631                             error_if_expecting_OPERATOR('(');
1632
1633                             if ( $last_nonblank_type eq 'C' ) {
1634                                 $hint =
1635                                   "$last_nonblank_token has a void prototype\n";
1636                             }
1637                             elsif ( $last_nonblank_type eq 'i' ) {
1638                                 if (   $i_tok > 0
1639                                     && $last_nonblank_token =~ /^\$/ )
1640                                 {
1641                                     $hint =
1642 "Do you mean '$last_nonblank_token->(' ?\n";
1643                                 }
1644                             }
1645                             if ($hint) {
1646                                 interrupt_logfile();
1647                                 warning($hint);
1648                                 resume_logfile();
1649                             }
1650                         } ## end if ( $next_nonblank_token...
1651                     } ## end else [ if ( $last_last_nonblank_token...
1652                 } ## end if ( $expecting == OPERATOR...
1653             }
1654             $paren_type[$paren_depth] = $container_type;
1655             ( $type_sequence, $indent_flag ) =
1656               increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1657
1658             # propagate types down through nested parens
1659             # for example: the second paren in 'if ((' would be structural
1660             # since the first is.
1661
1662             if ( $last_nonblank_token eq '(' ) {
1663                 $type = $last_nonblank_type;
1664             }
1665
1666             #     We exclude parens as structural after a ',' because it
1667             #     causes subtle problems with continuation indentation for
1668             #     something like this, where the first 'or' will not get
1669             #     indented.
1670             #
1671             #         assert(
1672             #             __LINE__,
1673             #             ( not defined $check )
1674             #               or ref $check
1675             #               or $check eq "new"
1676             #               or $check eq "old",
1677             #         );
1678             #
1679             #     Likewise, we exclude parens where a statement can start
1680             #     because of problems with continuation indentation, like
1681             #     these:
1682             #
1683             #         ($firstline =~ /^#\!.*perl/)
1684             #         and (print $File::Find::name, "\n")
1685             #           and (return 1);
1686             #
1687             #         (ref($usage_fref) =~ /CODE/)
1688             #         ? &$usage_fref
1689             #           : (&blast_usage, &blast_params, &blast_general_params);
1690
1691             else {
1692                 $type = '{';
1693             }
1694
1695             if ( $last_nonblank_type eq ')' ) {
1696                 warning(
1697                     "Syntax error? found token '$last_nonblank_type' then '('\n"
1698                 );
1699             }
1700             $paren_structural_type[$paren_depth] = $type;
1701
1702         },
1703         ')' => sub {
1704             ( $type_sequence, $indent_flag ) =
1705               decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
1706
1707             if ( $paren_structural_type[$paren_depth] eq '{' ) {
1708                 $type = '}';
1709             }
1710
1711             $container_type = $paren_type[$paren_depth];
1712
1713             # restore statement type as 'sub' at closing paren of a signature
1714             # so that a subsequent ':' is identified as an attribute
1715             if ( $container_type =~ /^sub\b/ ) {
1716                 $statement_type = $container_type;
1717             }
1718
1719             #    /^(for|foreach)$/
1720             if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
1721                 my $num_sc = $paren_semicolon_count[$paren_depth];
1722                 if ( $num_sc > 0 && $num_sc != 2 ) {
1723                     warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
1724                 }
1725             }
1726
1727             if ( $paren_depth > 0 ) { $paren_depth-- }
1728         },
1729         ',' => sub {
1730             if ( $last_nonblank_type eq ',' ) {
1731                 complain("Repeated ','s \n");
1732             }
1733
1734             # patch for operator_expected: note if we are in the list (use.t)
1735             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
1736 ##                FIXME: need to move this elsewhere, perhaps check after a '('
1737 ##                elsif ($last_nonblank_token eq '(') {
1738 ##                    warning("Leading ','s illegal in some versions of perl\n");
1739 ##                }
1740         },
1741         ';' => sub {
1742             $context        = UNKNOWN_CONTEXT;
1743             $statement_type = '';
1744             $want_paren     = "";
1745
1746             #    /^(for|foreach)$/
1747             if ( $is_for_foreach{ $paren_type[$paren_depth] } )
1748             {    # mark ; in for loop
1749
1750                 # Be careful: we do not want a semicolon such as the
1751                 # following to be included:
1752                 #
1753                 #    for (sort {strcoll($a,$b);} keys %investments) {
1754
1755                 if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
1756                     && $square_bracket_depth ==
1757                     $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
1758                 {
1759
1760                     $type = 'f';
1761                     $paren_semicolon_count[$paren_depth]++;
1762                 }
1763             }
1764
1765         },
1766         '"' => sub {
1767             error_if_expecting_OPERATOR("String")
1768               if ( $expecting == OPERATOR );
1769             $in_quote                = 1;
1770             $type                    = 'Q';
1771             $allowed_quote_modifiers = "";
1772         },
1773         "'" => sub {
1774             error_if_expecting_OPERATOR("String")
1775               if ( $expecting == OPERATOR );
1776             $in_quote                = 1;
1777             $type                    = 'Q';
1778             $allowed_quote_modifiers = "";
1779         },
1780         '`' => sub {
1781             error_if_expecting_OPERATOR("String")
1782               if ( $expecting == OPERATOR );
1783             $in_quote                = 1;
1784             $type                    = 'Q';
1785             $allowed_quote_modifiers = "";
1786         },
1787         '/' => sub {
1788             my $is_pattern;
1789
1790             # a pattern cannot follow certain keywords which take optional
1791             # arguments, like 'shift' and 'pop'. See also '?'.
1792             if (   $last_nonblank_type eq 'k'
1793                 && $is_keyword_taking_optional_args{$last_nonblank_token} )
1794             {
1795                 $is_pattern = 0;
1796             }
1797             elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
1798                 my $msg;
1799                 ( $is_pattern, $msg ) =
1800                   guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
1801                     $max_token_index );
1802
1803                 if ($msg) {
1804                     write_diagnostics("DIVIDE:$msg\n");
1805                     write_logfile_entry($msg);
1806                 }
1807             }
1808             else { $is_pattern = ( $expecting == TERM ) }
1809
1810             if ($is_pattern) {
1811                 $in_quote                = 1;
1812                 $type                    = 'Q';
1813                 $allowed_quote_modifiers = '[msixpodualngc]';
1814             }
1815             else {    # not a pattern; check for a /= token
1816
1817                 if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
1818                     $i++;
1819                     $tok  = '/=';
1820                     $type = $tok;
1821                 }
1822
1823               #DEBUG - collecting info on what tokens follow a divide
1824               # for development of guessing algorithm
1825               #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
1826               #    #write_diagnostics( "DIVIDE? $input_line\n" );
1827               #}
1828             }
1829         },
1830         '{' => sub {
1831
1832             # if we just saw a ')', we will label this block with
1833             # its type.  We need to do this to allow sub
1834             # code_block_type to determine if this brace starts a
1835             # code block or anonymous hash.  (The type of a paren
1836             # pair is the preceding token, such as 'if', 'else',
1837             # etc).
1838             $container_type = "";
1839
1840             # ATTRS: for a '{' following an attribute list, reset
1841             # things to look like we just saw the sub name
1842             if ( $statement_type =~ /^sub/ ) {
1843                 $last_nonblank_token = $statement_type;
1844                 $last_nonblank_type  = 'i';
1845                 $statement_type      = "";
1846             }
1847
1848             # patch for SWITCH/CASE: hide these keywords from an immediately
1849             # following opening brace
1850             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
1851                 && $statement_type eq $last_nonblank_token )
1852             {
1853                 $last_nonblank_token = ";";
1854             }
1855
1856             elsif ( $last_nonblank_token eq ')' ) {
1857                 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
1858
1859                 # defensive move in case of a nesting error (pbug.t)
1860                 # in which this ')' had no previous '('
1861                 # this nesting error will have been caught
1862                 if ( !defined($last_nonblank_token) ) {
1863                     $last_nonblank_token = 'if';
1864                 }
1865
1866                 # check for syntax error here;
1867                 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
1868                     if ( $tokenizer_self->{'_extended_syntax'} ) {
1869
1870                         # we append a trailing () to mark this as an unknown
1871                         # block type.  This allows perltidy to format some
1872                         # common extensions of perl syntax.
1873                         # This is used by sub code_block_type
1874                         $last_nonblank_token .= '()';
1875                     }
1876                     else {
1877                         my $list =
1878                           join( ' ', sort keys %is_blocktype_with_paren );
1879                         warning(
1880 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
1881                         );
1882                     }
1883                 }
1884             }
1885
1886             # patch for paren-less for/foreach glitch, part 2.
1887             # see note below under 'qw'
1888             elsif ($last_nonblank_token eq 'qw'
1889                 && $is_for_foreach{$want_paren} )
1890             {
1891                 $last_nonblank_token = $want_paren;
1892                 if ( $last_last_nonblank_token eq $want_paren ) {
1893                     warning(
1894 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
1895                     );
1896
1897                 }
1898                 $want_paren = "";
1899             }
1900
1901             # now identify which of the three possible types of
1902             # curly braces we have: hash index container, anonymous
1903             # hash reference, or code block.
1904
1905             # non-structural (hash index) curly brace pair
1906             # get marked 'L' and 'R'
1907             if ( is_non_structural_brace() ) {
1908                 $type = 'L';
1909
1910                 # patch for SWITCH/CASE:
1911                 # allow paren-less identifier after 'when'
1912                 # if the brace is preceded by a space
1913                 if (   $statement_type eq 'when'
1914                     && $last_nonblank_type eq 'i'
1915                     && $last_last_nonblank_type eq 'k'
1916                     && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
1917                 {
1918                     $type       = '{';
1919                     $block_type = $statement_type;
1920                 }
1921             }
1922
1923             # code and anonymous hash have the same type, '{', but are
1924             # distinguished by 'block_type',
1925             # which will be blank for an anonymous hash
1926             else {
1927
1928                 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
1929                     $max_token_index );
1930
1931                 # patch to promote bareword type to function taking block
1932                 if (   $block_type
1933                     && $last_nonblank_type eq 'w'
1934                     && $last_nonblank_i >= 0 )
1935                 {
1936                     if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
1937                         $routput_token_type->[$last_nonblank_i] = 'G';
1938                     }
1939                 }
1940
1941                 # patch for SWITCH/CASE: if we find a stray opening block brace
1942                 # where we might accept a 'case' or 'when' block, then take it
1943                 if (   $statement_type eq 'case'
1944                     || $statement_type eq 'when' )
1945                 {
1946                     if ( !$block_type || $block_type eq '}' ) {
1947                         $block_type = $statement_type;
1948                     }
1949                 }
1950             }
1951
1952             $brace_type[ ++$brace_depth ]        = $block_type;
1953             $brace_package[$brace_depth]         = $current_package;
1954             $brace_structural_type[$brace_depth] = $type;
1955             $brace_context[$brace_depth]         = $context;
1956             ( $type_sequence, $indent_flag ) =
1957               increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
1958         },
1959         '}' => sub {
1960             $block_type = $brace_type[$brace_depth];
1961             if ($block_type) { $statement_type = '' }
1962             if ( defined( $brace_package[$brace_depth] ) ) {
1963                 $current_package = $brace_package[$brace_depth];
1964             }
1965
1966             # can happen on brace error (caught elsewhere)
1967             else {
1968             }
1969             ( $type_sequence, $indent_flag ) =
1970               decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
1971
1972             if ( $brace_structural_type[$brace_depth] eq 'L' ) {
1973                 $type = 'R';
1974             }
1975
1976             # propagate type information for 'do' and 'eval' blocks, and also
1977             # for smartmatch operator.  This is necessary to enable us to know
1978             # if an operator or term is expected next.
1979             if ( $is_block_operator{$block_type} ) {
1980                 $tok = $block_type;
1981             }
1982
1983             $context = $brace_context[$brace_depth];
1984             if ( $brace_depth > 0 ) { $brace_depth--; }
1985         },
1986         '&' => sub {    # maybe sub call? start looking
1987
1988             # We have to check for sub call unless we are sure we
1989             # are expecting an operator.  This example from s2p
1990             # got mistaken as a q operator in an early version:
1991             #   print BODY &q(<<'EOT');
1992             if ( $expecting != OPERATOR ) {
1993
1994                 # But only look for a sub call if we are expecting a term or
1995                 # if there is no existing space after the &.
1996                 # For example we probably don't want & as sub call here:
1997                 #    Fcntl::S_IRUSR & $mode;
1998                 if ( $expecting == TERM || $next_type ne 'b' ) {
1999                     scan_identifier();
2000                 }
2001             }
2002             else {
2003             }
2004         },
2005         '<' => sub {    # angle operator or less than?
2006
2007             if ( $expecting != OPERATOR ) {
2008                 ( $i, $type ) =
2009                   find_angle_operator_termination( $input_line, $i, $rtoken_map,
2010                     $expecting, $max_token_index );
2011
2012                 if ( $type eq '<' && $expecting == TERM ) {
2013                     error_if_expecting_TERM();
2014                     interrupt_logfile();
2015                     warning("Unterminated <> operator?\n");
2016                     resume_logfile();
2017                 }
2018             }
2019             else {
2020             }
2021         },
2022         '?' => sub {    # ?: conditional or starting pattern?
2023
2024             my $is_pattern;
2025
2026             # Patch for rt #126965
2027             # a pattern cannot follow certain keywords which take optional
2028             # arguments, like 'shift' and 'pop'. See also '/'.
2029             if (   $last_nonblank_type eq 'k'
2030                 && $is_keyword_taking_optional_args{$last_nonblank_token} )
2031             {
2032                 $is_pattern = 0;
2033             }
2034             elsif ( $expecting == UNKNOWN ) {
2035
2036                 my $msg;
2037                 ( $is_pattern, $msg ) =
2038                   guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
2039                     $max_token_index );
2040
2041                 if ($msg) { write_logfile_entry($msg) }
2042             }
2043             else { $is_pattern = ( $expecting == TERM ) }
2044
2045             if ($is_pattern) {
2046                 $in_quote                = 1;
2047                 $type                    = 'Q';
2048                 $allowed_quote_modifiers = '[msixpodualngc]';
2049             }
2050             else {
2051                 ( $type_sequence, $indent_flag ) =
2052                   increase_nesting_depth( QUESTION_COLON,
2053                     $rtoken_map->[$i_tok] );
2054             }
2055         },
2056         '*' => sub {    # typeglob, or multiply?
2057
2058             if ( $expecting == TERM ) {
2059                 scan_identifier();
2060             }
2061             else {
2062
2063                 if ( $rtokens->[ $i + 1 ] eq '=' ) {
2064                     $tok  = '*=';
2065                     $type = $tok;
2066                     $i++;
2067                 }
2068                 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
2069                     $tok  = '**';
2070                     $type = $tok;
2071                     $i++;
2072                     if ( $rtokens->[ $i + 1 ] eq '=' ) {
2073                         $tok  = '**=';
2074                         $type = $tok;
2075                         $i++;
2076                     }
2077                 }
2078             }
2079         },
2080         '.' => sub {    # what kind of . ?
2081
2082             if ( $expecting != OPERATOR ) {
2083                 scan_number();
2084                 if ( $type eq '.' ) {
2085                     error_if_expecting_TERM()
2086                       if ( $expecting == TERM );
2087                 }
2088             }
2089             else {
2090             }
2091         },
2092         ':' => sub {
2093
2094             # if this is the first nonblank character, call it a label
2095             # since perl seems to just swallow it
2096             if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
2097                 $type = 'J';
2098             }
2099
2100             # ATTRS: check for a ':' which introduces an attribute list
2101             # (this might eventually get its own token type)
2102             elsif ( $statement_type =~ /^sub\b/ ) {
2103                 $type              = 'A';
2104                 $in_attribute_list = 1;
2105             }
2106
2107             # check for scalar attribute, such as
2108             # my $foo : shared = 1;
2109             elsif ($is_my_our{$statement_type}
2110                 && $current_depth[QUESTION_COLON] == 0 )
2111             {
2112                 $type              = 'A';
2113                 $in_attribute_list = 1;
2114             }
2115
2116             # otherwise, it should be part of a ?/: operator
2117             else {
2118                 ( $type_sequence, $indent_flag ) =
2119                   decrease_nesting_depth( QUESTION_COLON,
2120                     $rtoken_map->[$i_tok] );
2121                 if ( $last_nonblank_token eq '?' ) {
2122                     warning("Syntax error near ? :\n");
2123                 }
2124             }
2125         },
2126         '+' => sub {    # what kind of plus?
2127
2128             if ( $expecting == TERM ) {
2129                 my $number = scan_number();
2130
2131                 # unary plus is safest assumption if not a number
2132                 if ( !defined($number) ) { $type = 'p'; }
2133             }
2134             elsif ( $expecting == OPERATOR ) {
2135             }
2136             else {
2137                 if ( $next_type eq 'w' ) { $type = 'p' }
2138             }
2139         },
2140         '@' => sub {
2141
2142             error_if_expecting_OPERATOR("Array")
2143               if ( $expecting == OPERATOR );
2144             scan_identifier();
2145         },
2146         '%' => sub {    # hash or modulo?
2147
2148             # first guess is hash if no following blank
2149             if ( $expecting == UNKNOWN ) {
2150                 if ( $next_type ne 'b' ) { $expecting = TERM }
2151             }
2152             if ( $expecting == TERM ) {
2153                 scan_identifier();
2154             }
2155         },
2156         '[' => sub {
2157             $square_bracket_type[ ++$square_bracket_depth ] =
2158               $last_nonblank_token;
2159             ( $type_sequence, $indent_flag ) =
2160               increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2161
2162             # It may seem odd, but structural square brackets have
2163             # type '{' and '}'.  This simplifies the indentation logic.
2164             if ( !is_non_structural_brace() ) {
2165                 $type = '{';
2166             }
2167             $square_bracket_structural_type[$square_bracket_depth] = $type;
2168         },
2169         ']' => sub {
2170             ( $type_sequence, $indent_flag ) =
2171               decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
2172
2173             if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
2174             {
2175                 $type = '}';
2176             }
2177
2178             # propagate type information for smartmatch operator.  This is
2179             # necessary to enable us to know if an operator or term is expected
2180             # next.
2181             if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
2182                 $tok = $square_bracket_type[$square_bracket_depth];
2183             }
2184
2185             if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
2186         },
2187         '-' => sub {    # what kind of minus?
2188
2189             if ( ( $expecting != OPERATOR )
2190                 && $is_file_test_operator{$next_tok} )
2191             {
2192                 my ( $next_nonblank_token, $i_next ) =
2193                   find_next_nonblank_token( $i + 1, $rtokens,
2194                     $max_token_index );
2195
2196                 # check for a quoted word like "-w=>xx";
2197                 # it is sufficient to just check for a following '='
2198                 if ( $next_nonblank_token eq '=' ) {
2199                     $type = 'm';
2200                 }
2201                 else {
2202                     $i++;
2203                     $tok .= $next_tok;
2204                     $type = 'F';
2205                 }
2206             }
2207             elsif ( $expecting == TERM ) {
2208                 my $number = scan_number();
2209
2210                 # maybe part of bareword token? unary is safest
2211                 if ( !defined($number) ) { $type = 'm'; }
2212
2213             }
2214             elsif ( $expecting == OPERATOR ) {
2215             }
2216             else {
2217
2218                 if ( $next_type eq 'w' ) {
2219                     $type = 'm';
2220                 }
2221             }
2222         },
2223
2224         '^' => sub {
2225
2226             # check for special variables like ${^WARNING_BITS}
2227             if ( $expecting == TERM ) {
2228
2229                 # FIXME: this should work but will not catch errors
2230                 # because we also have to be sure that previous token is
2231                 # a type character ($,@,%).
2232                 if ( $last_nonblank_token eq '{'
2233                     && ( $next_tok =~ /^[A-Za-z_]/ ) )
2234                 {
2235
2236                     if ( $next_tok eq 'W' ) {
2237                         $tokenizer_self->{_saw_perl_dash_w} = 1;
2238                     }
2239                     $tok  = $tok . $next_tok;
2240                     $i    = $i + 1;
2241                     $type = 'w';
2242                 }
2243
2244                 else {
2245                     unless ( error_if_expecting_TERM() ) {
2246
2247                         # Something like this is valid but strange:
2248                         # undef ^I;
2249                         complain("The '^' seems unusual here\n");
2250                     }
2251                 }
2252             }
2253         },
2254
2255         '::' => sub {    # probably a sub call
2256             scan_bare_identifier();
2257         },
2258         '<<' => sub {    # maybe a here-doc?
2259             return
2260               unless ( $i < $max_token_index )
2261               ;          # here-doc not possible if end of line
2262
2263             if ( $expecting != OPERATOR ) {
2264                 my ( $found_target, $here_doc_target, $here_quote_character,
2265                     $saw_error );
2266                 (
2267                     $found_target, $here_doc_target, $here_quote_character, $i,
2268                     $saw_error
2269                   )
2270                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2271                     $max_token_index );
2272
2273                 if ($found_target) {
2274                     push @{$rhere_target_list},
2275                       [ $here_doc_target, $here_quote_character ];
2276                     $type = 'h';
2277                     if ( length($here_doc_target) > 80 ) {
2278                         my $truncated = substr( $here_doc_target, 0, 80 );
2279                         complain("Long here-target: '$truncated' ...\n");
2280                     }
2281                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2282                         complain(
2283                             "Unconventional here-target: '$here_doc_target'\n");
2284                     }
2285                 }
2286                 elsif ( $expecting == TERM ) {
2287                     unless ($saw_error) {
2288
2289                         # shouldn't happen..
2290                         warning("Program bug; didn't find here doc target\n");
2291                         report_definite_bug();
2292                     }
2293                 }
2294             }
2295             else {
2296             }
2297         },
2298         '<<~' => sub {    # a here-doc, new type added in v26
2299             return
2300               unless ( $i < $max_token_index )
2301               ;           # here-doc not possible if end of line
2302             if ( $expecting != OPERATOR ) {
2303                 my ( $found_target, $here_doc_target, $here_quote_character,
2304                     $saw_error );
2305                 (
2306                     $found_target, $here_doc_target, $here_quote_character, $i,
2307                     $saw_error
2308                   )
2309                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
2310                     $max_token_index );
2311
2312                 if ($found_target) {
2313
2314                     if ( length($here_doc_target) > 80 ) {
2315                         my $truncated = substr( $here_doc_target, 0, 80 );
2316                         complain("Long here-target: '$truncated' ...\n");
2317                     }
2318                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
2319                         complain(
2320                             "Unconventional here-target: '$here_doc_target'\n");
2321                     }
2322
2323                     # Note that we put a leading space on the here quote
2324                     # character indicate that it may be preceded by spaces
2325                     $here_quote_character = " " . $here_quote_character;
2326                     push @{$rhere_target_list},
2327                       [ $here_doc_target, $here_quote_character ];
2328                     $type = 'h';
2329                 }
2330                 elsif ( $expecting == TERM ) {
2331                     unless ($saw_error) {
2332
2333                         # shouldn't happen..
2334                         warning("Program bug; didn't find here doc target\n");
2335                         report_definite_bug();
2336                     }
2337                 }
2338             }
2339             else {
2340             }
2341         },
2342         '->' => sub {
2343
2344             # if -> points to a bare word, we must scan for an identifier,
2345             # otherwise something like ->y would look like the y operator
2346             scan_identifier();
2347         },
2348
2349         # type = 'pp' for pre-increment, '++' for post-increment
2350         '++' => sub {
2351             if ( $expecting == TERM ) { $type = 'pp' }
2352             elsif ( $expecting == UNKNOWN ) {
2353                 my ( $next_nonblank_token, $i_next ) =
2354                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
2355                 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
2356             }
2357         },
2358
2359         '=>' => sub {
2360             if ( $last_nonblank_type eq $tok ) {
2361                 complain("Repeated '=>'s \n");
2362             }
2363
2364             # patch for operator_expected: note if we are in the list (use.t)
2365             # TODO: make version numbers a new token type
2366             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
2367         },
2368
2369         # type = 'mm' for pre-decrement, '--' for post-decrement
2370         '--' => sub {
2371
2372             if ( $expecting == TERM ) { $type = 'mm' }
2373             elsif ( $expecting == UNKNOWN ) {
2374                 my ( $next_nonblank_token, $i_next ) =
2375                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
2376                 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
2377             }
2378         },
2379
2380         '&&' => sub {
2381             error_if_expecting_TERM()
2382               if ( $expecting == TERM );
2383         },
2384
2385         '||' => sub {
2386             error_if_expecting_TERM()
2387               if ( $expecting == TERM );
2388         },
2389
2390         '//' => sub {
2391             error_if_expecting_TERM()
2392               if ( $expecting == TERM );
2393         },
2394     };
2395
2396     # ------------------------------------------------------------
2397     # end hash of code for handling individual token types
2398     # ------------------------------------------------------------
2399
2400     my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
2401
2402     # These block types terminate statements and do not need a trailing
2403     # semicolon
2404     # patched for SWITCH/CASE/
2405     my %is_zero_continuation_block_type;
2406     @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
2407       if elsif else unless while until for foreach switch case given when);
2408     @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
2409
2410     my %is_not_zero_continuation_block_type;
2411     @_ = qw(sort grep map do eval);
2412     @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
2413
2414     my %is_logical_container;
2415     @_ = qw(if elsif unless while and or err not && !  || for foreach);
2416     @is_logical_container{@_} = (1) x scalar(@_);
2417
2418     my %is_binary_type;
2419     @_ = qw(|| &&);
2420     @is_binary_type{@_} = (1) x scalar(@_);
2421
2422     my %is_binary_keyword;
2423     @_ = qw(and or err eq ne cmp);
2424     @is_binary_keyword{@_} = (1) x scalar(@_);
2425
2426     # 'L' is token for opening { at hash key
2427     my %is_opening_type;
2428     @_ = qw< L { ( [ >;
2429     @is_opening_type{@_} = (1) x scalar(@_);
2430
2431     # 'R' is token for closing } at hash key
2432     my %is_closing_type;
2433     @_ = qw< R } ) ] >;
2434     @is_closing_type{@_} = (1) x scalar(@_);
2435
2436     my %is_redo_last_next_goto;
2437     @_ = qw(redo last next goto);
2438     @is_redo_last_next_goto{@_} = (1) x scalar(@_);
2439
2440     my %is_use_require;
2441     @_ = qw(use require);
2442     @is_use_require{@_} = (1) x scalar(@_);
2443
2444     my %is_sub_package;
2445     @_ = qw(sub package);
2446     @is_sub_package{@_} = (1) x scalar(@_);
2447
2448     # This hash holds the hash key in $tokenizer_self for these keywords:
2449     my %is_format_END_DATA = (
2450         'format'   => '_in_format',
2451         '__END__'  => '_in_end',
2452         '__DATA__' => '_in_data',
2453     );
2454
2455     # original ref: camel 3 p 147,
2456     # but perl may accept undocumented flags
2457     # perl 5.10 adds 'p' (preserve)
2458     # Perl version 5.22 added 'n'
2459     # From http://perldoc.perl.org/perlop.html we have
2460     # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
2461     # s/PATTERN/REPLACEMENT/msixpodualngcer
2462     # y/SEARCHLIST/REPLACEMENTLIST/cdsr
2463     # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
2464     # qr/STRING/msixpodualn
2465     my %quote_modifiers = (
2466         's'  => '[msixpodualngcer]',
2467         'y'  => '[cdsr]',
2468         'tr' => '[cdsr]',
2469         'm'  => '[msixpodualngc]',
2470         'qr' => '[msixpodualn]',
2471         'q'  => "",
2472         'qq' => "",
2473         'qw' => "",
2474         'qx' => "",
2475     );
2476
2477     # table showing how many quoted things to look for after quote operator..
2478     # s, y, tr have 2 (pattern and replacement)
2479     # others have 1 (pattern only)
2480     my %quote_items = (
2481         's'  => 2,
2482         'y'  => 2,
2483         'tr' => 2,
2484         'm'  => 1,
2485         'qr' => 1,
2486         'q'  => 1,
2487         'qq' => 1,
2488         'qw' => 1,
2489         'qx' => 1,
2490     );
2491
2492     sub tokenize_this_line {
2493
2494   # This routine breaks a line of perl code into tokens which are of use in
2495   # indentation and reformatting.  One of my goals has been to define tokens
2496   # such that a newline may be inserted between any pair of tokens without
2497   # changing or invalidating the program. This version comes close to this,
2498   # although there are necessarily a few exceptions which must be caught by
2499   # the formatter.  Many of these involve the treatment of bare words.
2500   #
2501   # The tokens and their types are returned in arrays.  See previous
2502   # routine for their names.
2503   #
2504   # See also the array "valid_token_types" in the BEGIN section for an
2505   # up-to-date list.
2506   #
2507   # To simplify things, token types are either a single character, or they
2508   # are identical to the tokens themselves.
2509   #
2510   # As a debugging aid, the -D flag creates a file containing a side-by-side
2511   # comparison of the input string and its tokenization for each line of a file.
2512   # This is an invaluable debugging aid.
2513   #
2514   # In addition to tokens, and some associated quantities, the tokenizer
2515   # also returns flags indication any special line types.  These include
2516   # quotes, here_docs, formats.
2517   #
2518   # -----------------------------------------------------------------------
2519   #
2520   # How to add NEW_TOKENS:
2521   #
2522   # New token types will undoubtedly be needed in the future both to keep up
2523   # with changes in perl and to help adapt the tokenizer to other applications.
2524   #
2525   # Here are some notes on the minimal steps.  I wrote these notes while
2526   # adding the 'v' token type for v-strings, which are things like version
2527   # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
2528   # can use your editor to search for the string "NEW_TOKENS" to find the
2529   # appropriate sections to change):
2530   #
2531   # *. Try to talk somebody else into doing it!  If not, ..
2532   #
2533   # *. Make a backup of your current version in case things don't work out!
2534   #
2535   # *. Think of a new, unused character for the token type, and add to
2536   # the array @valid_token_types in the BEGIN section of this package.
2537   # For example, I used 'v' for v-strings.
2538   #
2539   # *. Implement coding to recognize the $type of the token in this routine.
2540   # This is the hardest part, and is best done by imitating or modifying
2541   # some of the existing coding.  For example, to recognize v-strings, I
2542   # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
2543   # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
2544   #
2545   # *. Update sub operator_expected.  This update is critically important but
2546   # the coding is trivial.  Look at the comments in that routine for help.
2547   # For v-strings, which should behave like numbers, I just added 'v' to the
2548   # regex used to handle numbers and strings (types 'n' and 'Q').
2549   #
2550   # *. Implement a 'bond strength' rule in sub set_bond_strengths in
2551   # Perl::Tidy::Formatter for breaking lines around this token type.  You can
2552   # skip this step and take the default at first, then adjust later to get
2553   # desired results.  For adding type 'v', I looked at sub bond_strength and
2554   # saw that number type 'n' was using default strengths, so I didn't do
2555   # anything.  I may tune it up someday if I don't like the way line
2556   # breaks with v-strings look.
2557   #
2558   # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
2559   # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
2560   # and saw that type 'n' used spaces on both sides, so I just added 'v'
2561   # to the array @spaces_both_sides.
2562   #
2563   # *. Update HtmlWriter package so that users can colorize the token as
2564   # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
2565   # that package.  For v-strings, I initially chose to use a default color
2566   # equal to the default for numbers, but it might be nice to change that
2567   # eventually.
2568   #
2569   # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
2570   #
2571   # *. Run lots and lots of debug tests.  Start with special files designed
2572   # to test the new token type.  Run with the -D flag to create a .DEBUG
2573   # file which shows the tokenization.  When these work ok, test as many old
2574   # scripts as possible.  Start with all of the '.t' files in the 'test'
2575   # directory of the distribution file.  Compare .tdy output with previous
2576   # version and updated version to see the differences.  Then include as
2577   # many more files as possible. My own technique has been to collect a huge
2578   # number of perl scripts (thousands!) into one directory and run perltidy
2579   # *, then run diff between the output of the previous version and the
2580   # current version.
2581   #
2582   # *. For another example, search for the smartmatch operator '~~'
2583   # with your editor to see where updates were made for it.
2584   #
2585   # -----------------------------------------------------------------------
2586
2587         my $line_of_tokens = shift;
2588         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
2589
2590         # patch while coding change is underway
2591         # make callers private data to allow access
2592         # $tokenizer_self = $caller_tokenizer_self;
2593
2594         # extract line number for use in error messages
2595         $input_line_number = $line_of_tokens->{_line_number};
2596
2597         # reinitialize for multi-line quote
2598         $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
2599
2600         # check for pod documentation
2601         if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
2602
2603             # must not be in multi-line quote
2604             # and must not be in an equation
2605             if ( !$in_quote && ( operator_expected( 'b', '=', 'b' ) == TERM ) )
2606             {
2607                 $tokenizer_self->{_in_pod} = 1;
2608                 return;
2609             }
2610         }
2611
2612         $input_line = $untrimmed_input_line;
2613
2614         chomp $input_line;
2615
2616         # trim start of this line unless we are continuing a quoted line
2617         # do not trim end because we might end in a quote (test: deken4.pl)
2618         # Perl::Tidy::Formatter will delete needless trailing blanks
2619         unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
2620             $input_line =~ s/^\s*//;    # trim left end
2621         }
2622
2623         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
2624         # This will be used below to avoid quoting a bare word followed by
2625         # a fat comma.
2626         my $is_END_or_DATA = $input_line =~ /^\s*__(END|DATA)__\s*$/;
2627
2628         # update the copy of the line for use in error messages
2629         # This must be exactly what we give the pre_tokenizer
2630         $tokenizer_self->{_line_text} = $input_line;
2631
2632         # re-initialize for the main loop
2633         $routput_token_list     = [];    # stack of output token indexes
2634         $routput_token_type     = [];    # token types
2635         $routput_block_type     = [];    # types of code block
2636         $routput_container_type = [];    # paren types, such as if, elsif, ..
2637         $routput_type_sequence  = [];    # nesting sequential number
2638
2639         $rhere_target_list = [];
2640
2641         $tok             = $last_nonblank_token;
2642         $type            = $last_nonblank_type;
2643         $prototype       = $last_nonblank_prototype;
2644         $last_nonblank_i = -1;
2645         $block_type      = $last_nonblank_block_type;
2646         $container_type  = $last_nonblank_container_type;
2647         $type_sequence   = $last_nonblank_type_sequence;
2648         $indent_flag     = 0;
2649         $peeked_ahead    = 0;
2650
2651         # tokenization is done in two stages..
2652         # stage 1 is a very simple pre-tokenization
2653         my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
2654
2655         # a little optimization for a full-line comment
2656         if ( !$in_quote && ( $input_line =~ /^#/ ) ) {
2657             $max_tokens_wanted = 1    # no use tokenizing a comment
2658         }
2659
2660         # start by breaking the line into pre-tokens
2661         ( $rtokens, $rtoken_map, $rtoken_type ) =
2662           pre_tokenize( $input_line, $max_tokens_wanted );
2663
2664         $max_token_index = scalar( @{$rtokens} ) - 1;
2665         push( @{$rtokens}, ' ', ' ', ' ' );  # extra whitespace simplifies logic
2666         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
2667         push( @{$rtoken_type}, 'b', 'b', 'b' );
2668
2669         # initialize for main loop
2670         foreach my $ii ( 0 .. $max_token_index + 3 ) {
2671             $routput_token_type->[$ii]     = "";
2672             $routput_block_type->[$ii]     = "";
2673             $routput_container_type->[$ii] = "";
2674             $routput_type_sequence->[$ii]  = "";
2675             $routput_indent_flag->[$ii]    = 0;
2676         }
2677         $i     = -1;
2678         $i_tok = -1;
2679
2680         # ------------------------------------------------------------
2681         # begin main tokenization loop
2682         # ------------------------------------------------------------
2683
2684         # we are looking at each pre-token of one line and combining them
2685         # into tokens
2686         while ( ++$i <= $max_token_index ) {
2687
2688             if ($in_quote) {    # continue looking for end of a quote
2689                 $type = $quote_type;
2690
2691                 unless ( @{$routput_token_list} )
2692                 {               # initialize if continuation line
2693                     push( @{$routput_token_list}, $i );
2694                     $routput_token_type->[$i] = $type;
2695
2696                 }
2697                 $tok = $quote_character unless ( $quote_character =~ /^\s*$/ );
2698
2699                 # scan for the end of the quote or pattern
2700                 (
2701                     $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
2702                     $quoted_string_1, $quoted_string_2
2703                   )
2704                   = do_quote(
2705                     $i,               $in_quote,    $quote_character,
2706                     $quote_pos,       $quote_depth, $quoted_string_1,
2707                     $quoted_string_2, $rtokens,     $rtoken_map,
2708                     $max_token_index
2709                   );
2710
2711                 # all done if we didn't find it
2712                 last if ($in_quote);
2713
2714                 # save pattern and replacement text for rescanning
2715                 my $qs1 = $quoted_string_1;
2716                 my $qs2 = $quoted_string_2;
2717
2718                 # re-initialize for next search
2719                 $quote_character = '';
2720                 $quote_pos       = 0;
2721                 $quote_type      = 'Q';
2722                 $quoted_string_1 = "";
2723                 $quoted_string_2 = "";
2724                 last if ( ++$i > $max_token_index );
2725
2726                 # look for any modifiers
2727                 if ($allowed_quote_modifiers) {
2728
2729                     # check for exact quote modifiers
2730                     if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
2731                         my $str = $rtokens->[$i];
2732                         my $saw_modifier_e;
2733                         while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
2734                             my $pos  = pos($str);
2735                             my $char = substr( $str, $pos - 1, 1 );
2736                             $saw_modifier_e ||= ( $char eq 'e' );
2737                         }
2738
2739                         # For an 'e' quote modifier we must scan the replacement
2740                         # text for here-doc targets.
2741                         if ($saw_modifier_e) {
2742
2743                             my $rht = scan_replacement_text($qs1);
2744
2745                             # Change type from 'Q' to 'h' for quotes with
2746                             # here-doc targets so that the formatter (see sub
2747                             # print_line_of_tokens) will not make any line
2748                             # breaks after this point.
2749                             if ($rht) {
2750                                 push @{$rhere_target_list}, @{$rht};
2751                                 $type = 'h';
2752                                 if ( $i_tok < 0 ) {
2753                                     my $ilast = $routput_token_list->[-1];
2754                                     $routput_token_type->[$ilast] = $type;
2755                                 }
2756                             }
2757                         }
2758
2759                         if ( defined( pos($str) ) ) {
2760
2761                             # matched
2762                             if ( pos($str) == length($str) ) {
2763                                 last if ( ++$i > $max_token_index );
2764                             }
2765
2766                             # Looks like a joined quote modifier
2767                             # and keyword, maybe something like
2768                             # s/xxx/yyy/gefor @k=...
2769                             # Example is "galgen.pl".  Would have to split
2770                             # the word and insert a new token in the
2771                             # pre-token list.  This is so rare that I haven't
2772                             # done it.  Will just issue a warning citation.
2773
2774                             # This error might also be triggered if my quote
2775                             # modifier characters are incomplete
2776                             else {
2777                                 warning(<<EOM);
2778
2779 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
2780 Please put a space between quote modifiers and trailing keywords.
2781 EOM
2782
2783                          # print "token $rtokens->[$i]\n";
2784                          # my $num = length($str) - pos($str);
2785                          # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
2786                          # print "continuing with new token $rtokens->[$i]\n";
2787
2788                                 # skipping past this token does least damage
2789                                 last if ( ++$i > $max_token_index );
2790                             }
2791                         }
2792                         else {
2793
2794                             # example file: rokicki4.pl
2795                             # This error might also be triggered if my quote
2796                             # modifier characters are incomplete
2797                             write_logfile_entry(
2798 "Note: found word $str at quote modifier location\n"
2799                             );
2800                         }
2801                     }
2802
2803                     # re-initialize
2804                     $allowed_quote_modifiers = "";
2805                 }
2806             }
2807
2808             unless ( $tok =~ /^\s*$/ || $tok eq 'CORE::' ) {
2809
2810                 # try to catch some common errors
2811                 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
2812
2813                     if ( $last_nonblank_token eq 'eq' ) {
2814                         complain("Should 'eq' be '==' here ?\n");
2815                     }
2816                     elsif ( $last_nonblank_token eq 'ne' ) {
2817                         complain("Should 'ne' be '!=' here ?\n");
2818                     }
2819                 }
2820
2821                 $last_last_nonblank_token      = $last_nonblank_token;
2822                 $last_last_nonblank_type       = $last_nonblank_type;
2823                 $last_last_nonblank_block_type = $last_nonblank_block_type;
2824                 $last_last_nonblank_container_type =
2825                   $last_nonblank_container_type;
2826                 $last_last_nonblank_type_sequence =
2827                   $last_nonblank_type_sequence;
2828                 $last_nonblank_token          = $tok;
2829                 $last_nonblank_type           = $type;
2830                 $last_nonblank_prototype      = $prototype;
2831                 $last_nonblank_block_type     = $block_type;
2832                 $last_nonblank_container_type = $container_type;
2833                 $last_nonblank_type_sequence  = $type_sequence;
2834                 $last_nonblank_i              = $i_tok;
2835             }
2836
2837             # store previous token type
2838             if ( $i_tok >= 0 ) {
2839                 $routput_token_type->[$i_tok]     = $type;
2840                 $routput_block_type->[$i_tok]     = $block_type;
2841                 $routput_container_type->[$i_tok] = $container_type;
2842                 $routput_type_sequence->[$i_tok]  = $type_sequence;
2843                 $routput_indent_flag->[$i_tok]    = $indent_flag;
2844             }
2845             my $pre_tok  = $rtokens->[$i];        # get the next pre-token
2846             my $pre_type = $rtoken_type->[$i];    # and type
2847             $tok  = $pre_tok;
2848             $type = $pre_type;                    # to be modified as necessary
2849             $block_type = "";    # blank for all tokens except code block braces
2850             $container_type = "";    # blank for all tokens except some parens
2851             $type_sequence  = "";    # blank for all tokens except ?/:
2852             $indent_flag    = 0;
2853             $prototype = "";    # blank for all tokens except user defined subs
2854             $i_tok     = $i;
2855
2856             # this pre-token will start an output token
2857             push( @{$routput_token_list}, $i_tok );
2858
2859             # continue gathering identifier if necessary
2860             # but do not start on blanks and comments
2861             if ( $id_scan_state && $pre_type !~ /[b#]/ ) {
2862
2863                 if ( $id_scan_state =~ /^(sub|package)/ ) {
2864                     scan_id();
2865                 }
2866                 else {
2867                     scan_identifier();
2868                 }
2869
2870                 last if ($id_scan_state);
2871                 next if ( ( $i > 0 ) || $type );
2872
2873                 # didn't find any token; start over
2874                 $type = $pre_type;
2875                 $tok  = $pre_tok;
2876             }
2877
2878             # handle whitespace tokens..
2879             next if ( $type eq 'b' );
2880             my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : ' ';
2881             my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
2882
2883             # Build larger tokens where possible, since we are not in a quote.
2884             #
2885             # First try to assemble digraphs.  The following tokens are
2886             # excluded and handled specially:
2887             # '/=' is excluded because the / might start a pattern.
2888             # 'x=' is excluded since it might be $x=, with $ on previous line
2889             # '**' and *= might be typeglobs of punctuation variables
2890             # I have allowed tokens starting with <, such as <=,
2891             # because I don't think these could be valid angle operators.
2892             # test file: storrs4.pl
2893             my $test_tok   = $tok . $rtokens->[ $i + 1 ];
2894             my $combine_ok = $is_digraph{$test_tok};
2895
2896             # check for special cases which cannot be combined
2897             if ($combine_ok) {
2898
2899                 # '//' must be defined_or operator if an operator is expected.
2900                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
2901                 # could be migrated here for clarity
2902
2903               # Patch for RT#102371, misparsing a // in the following snippet:
2904               #     state $b //= ccc();
2905               # The solution is to always accept the digraph (or trigraph) after
2906               # token type 'Z' (possible file handle).  The reason is that
2907               # sub operator_expected gives TERM expected here, which is
2908               # wrong in this case.
2909                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
2910                     my $next_type = $rtokens->[ $i + 1 ];
2911                     my $expecting =
2912                       operator_expected( $prev_type, $tok, $next_type );
2913
2914                     # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
2915                     $combine_ok = 0 if ( $expecting == TERM );
2916                 }
2917
2918                 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
2919                 # Accept the digraphs '**' only after type 'Z'
2920                 # Otherwise postpone the decision.
2921                 if ( $test_tok eq '**' ) {
2922                     if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
2923                 }
2924             }
2925
2926             if (
2927                 $combine_ok
2928
2929                 && ( $test_tok ne '/=' )    # might be pattern
2930                 && ( $test_tok ne 'x=' )    # might be $x
2931                 && ( $test_tok ne '*=' )    # typeglob?
2932
2933                 # Moved above as part of fix for
2934                 # RT #114359: Missparsing of "print $x ** 0.5;
2935                 # && ( $test_tok ne '**' )    # typeglob?
2936               )
2937             {
2938                 $tok = $test_tok;
2939                 $i++;
2940
2941                 # Now try to assemble trigraphs.  Note that all possible
2942                 # perl trigraphs can be constructed by appending a character
2943                 # to a digraph.
2944                 $test_tok = $tok . $rtokens->[ $i + 1 ];
2945
2946                 if ( $is_trigraph{$test_tok} ) {
2947                     $tok = $test_tok;
2948                     $i++;
2949                 }
2950
2951                 # The only current tetragraph is the double diamond operator
2952                 # and its first three characters are not a trigraph, so
2953                 # we do can do a special test for it
2954                 elsif ( $test_tok eq '<<>' ) {
2955                     $test_tok .= $rtokens->[ $i + 2 ];
2956                     if ( $is_tetragraph{$test_tok} ) {
2957                         $tok = $test_tok;
2958                         $i += 2;
2959                     }
2960                 }
2961             }
2962
2963             $type      = $tok;
2964             $next_tok  = $rtokens->[ $i + 1 ];
2965             $next_type = $rtoken_type->[ $i + 1 ];
2966
2967             TOKENIZER_DEBUG_FLAG_TOKENIZE && do {
2968                 local $" = ')(';
2969                 my @debug_list = (
2970                     $last_nonblank_token,      $tok,
2971                     $next_tok,                 $brace_depth,
2972                     $brace_type[$brace_depth], $paren_depth,
2973                     $paren_type[$paren_depth]
2974                 );
2975                 print STDOUT "TOKENIZE:(@debug_list)\n";
2976             };
2977
2978             # turn off attribute list on first non-blank, non-bareword
2979             if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }
2980
2981             ###############################################################
2982             # We have the next token, $tok.
2983             # Now we have to examine this token and decide what it is
2984             # and define its $type
2985             #
2986             # section 1: bare words
2987             ###############################################################
2988
2989             if ( $pre_type eq 'w' ) {
2990                 $expecting = operator_expected( $prev_type, $tok, $next_type );
2991                 my ( $next_nonblank_token, $i_next ) =
2992                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
2993
2994                 # ATTRS: handle sub and variable attributes
2995                 if ($in_attribute_list) {
2996
2997                     # treat bare word followed by open paren like qw(
2998                     if ( $next_nonblank_token eq '(' ) {
2999                         $in_quote                = $quote_items{'q'};
3000                         $allowed_quote_modifiers = $quote_modifiers{'q'};
3001                         $type                    = 'q';
3002                         $quote_type              = 'q';
3003                         next;
3004                     }
3005
3006                     # handle bareword not followed by open paren
3007                     else {
3008                         $type = 'w';
3009                         next;
3010                     }
3011                 }
3012
3013                 # quote a word followed by => operator
3014                 # unless the word __END__ or __DATA__ and the only word on
3015                 # the line.
3016                 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
3017
3018                     if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
3019                         if ( $is_constant{$current_package}{$tok} ) {
3020                             $type = 'C';
3021                         }
3022                         elsif ( $is_user_function{$current_package}{$tok} ) {
3023                             $type = 'U';
3024                             $prototype =
3025                               $user_function_prototype{$current_package}{$tok};
3026                         }
3027                         elsif ( $tok =~ /^v\d+$/ ) {
3028                             $type = 'v';
3029                             report_v_string($tok);
3030                         }
3031                         else { $type = 'w' }
3032
3033                         next;
3034                     }
3035                 }
3036
3037      # quote a bare word within braces..like xxx->{s}; note that we
3038      # must be sure this is not a structural brace, to avoid
3039      # mistaking {s} in the following for a quoted bare word:
3040      #     for(@[){s}bla}BLA}
3041      # Also treat q in something like var{-q} as a bare word, not qoute operator
3042                 if (
3043                     $next_nonblank_token eq '}'
3044                     && (
3045                         $last_nonblank_type eq 'L'
3046                         || (   $last_nonblank_type eq 'm'
3047                             && $last_last_nonblank_type eq 'L' )
3048                     )
3049                   )
3050                 {
3051                     $type = 'w';
3052                     next;
3053                 }
3054
3055                 # a bare word immediately followed by :: is not a keyword;
3056                 # use $tok_kw when testing for keywords to avoid a mistake
3057                 my $tok_kw = $tok;
3058                 if (   $rtokens->[ $i + 1 ] eq ':'
3059                     && $rtokens->[ $i + 2 ] eq ':' )
3060                 {
3061                     $tok_kw .= '::';
3062                 }
3063
3064                 # handle operator x (now we know it isn't $x=)
3065                 if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) {
3066                     if ( $tok eq 'x' ) {
3067
3068                         if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
3069                             $tok  = 'x=';
3070                             $type = $tok;
3071                             $i++;
3072                         }
3073                         else {
3074                             $type = 'x';
3075                         }
3076                     }
3077
3078                     # FIXME: Patch: mark something like x4 as an integer for now
3079                     # It gets fixed downstream.  This is easier than
3080                     # splitting the pretoken.
3081                     else {
3082                         $type = 'n';
3083                     }
3084                 }
3085                 elsif ( $tok_kw eq 'CORE::' ) {
3086                     $type = $tok = $tok_kw;
3087                     $i += 2;
3088                 }
3089                 elsif ( ( $tok eq 'strict' )
3090                     and ( $last_nonblank_token eq 'use' ) )
3091                 {
3092                     $tokenizer_self->{_saw_use_strict} = 1;
3093                     scan_bare_identifier();
3094                 }
3095
3096                 elsif ( ( $tok eq 'warnings' )
3097                     and ( $last_nonblank_token eq 'use' ) )
3098                 {
3099                     $tokenizer_self->{_saw_perl_dash_w} = 1;
3100
3101                     # scan as identifier, so that we pick up something like:
3102                     # use warnings::register
3103                     scan_bare_identifier();
3104                 }
3105
3106                 elsif (
3107                        $tok eq 'AutoLoader'
3108                     && $tokenizer_self->{_look_for_autoloader}
3109                     && (
3110                         $last_nonblank_token eq 'use'
3111
3112                         # these regexes are from AutoSplit.pm, which we want
3113                         # to mimic
3114                         || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
3115                         || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
3116                     )
3117                   )
3118                 {
3119                     write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
3120                     $tokenizer_self->{_saw_autoloader}      = 1;
3121                     $tokenizer_self->{_look_for_autoloader} = 0;
3122                     scan_bare_identifier();
3123                 }
3124
3125                 elsif (
3126                        $tok eq 'SelfLoader'
3127                     && $tokenizer_self->{_look_for_selfloader}
3128                     && (   $last_nonblank_token eq 'use'
3129                         || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
3130                         || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
3131                   )
3132                 {
3133                     write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
3134                     $tokenizer_self->{_saw_selfloader}      = 1;
3135                     $tokenizer_self->{_look_for_selfloader} = 0;
3136                     scan_bare_identifier();
3137                 }
3138
3139                 elsif ( ( $tok eq 'constant' )
3140                     and ( $last_nonblank_token eq 'use' ) )
3141                 {
3142                     scan_bare_identifier();
3143                     my ( $next_nonblank_token, $i_next ) =
3144                       find_next_nonblank_token( $i, $rtokens,
3145                         $max_token_index );
3146
3147                     if ($next_nonblank_token) {
3148
3149                         if ( $is_keyword{$next_nonblank_token} ) {
3150
3151                             # Assume qw is used as a quote and okay, as in:
3152                             #  use constant qw{ DEBUG 0 };
3153                             # Not worth trying to parse for just a warning
3154
3155                             # NOTE: This warning is deactivated because recent
3156                             # versions of perl do not complain here, but
3157                             # the coding is retained for reference.
3158                             if ( 0 && $next_nonblank_token ne 'qw' ) {
3159                                 warning(
3160 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
3161                                 );
3162                             }
3163                         }
3164
3165                         # FIXME: could check for error in which next token is
3166                         # not a word (number, punctuation, ..)
3167                         else {
3168                             $is_constant{$current_package}{$next_nonblank_token}
3169                               = 1;
3170                         }
3171                     }
3172                 }
3173
3174                 # various quote operators
3175                 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
3176 ##NICOL PATCH
3177                     if ( $expecting == OPERATOR ) {
3178
3179                         # Be careful not to call an error for a qw quote
3180                         # where a parenthesized list is allowed.  For example,
3181                         # it could also be a for/foreach construct such as
3182                         #
3183                         #    foreach my $key qw\Uno Due Tres Quadro\ {
3184                         #        print "Set $key\n";
3185                         #    }
3186                         #
3187
3188                         # Or it could be a function call.
3189                         # NOTE: Braces in something like &{ xxx } are not
3190                         # marked as a block, we might have a method call.
3191                         # &method(...), $method->(..), &{method}(...),
3192                         # $ref[2](list) is ok & short for $ref[2]->(list)
3193                         #
3194                         # See notes in 'sub code_block_type' and
3195                         # 'sub is_non_structural_brace'
3196
3197                         unless (
3198                             $tok eq 'qw'
3199                             && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
3200                                 || $is_for_foreach{$want_paren} )
3201                           )
3202                         {
3203                             error_if_expecting_OPERATOR();
3204                         }
3205                     }
3206                     $in_quote                = $quote_items{$tok};
3207                     $allowed_quote_modifiers = $quote_modifiers{$tok};
3208
3209                    # All quote types are 'Q' except possibly qw quotes.
3210                    # qw quotes are special in that they may generally be trimmed
3211                    # of leading and trailing whitespace.  So they are given a
3212                    # separate type, 'q', unless requested otherwise.
3213                     $type =
3214                       ( $tok eq 'qw' && $tokenizer_self->{_trim_qw} )
3215                       ? 'q'
3216                       : 'Q';
3217                     $quote_type = $type;
3218                 }
3219
3220                 # check for a statement label
3221                 elsif (
3222                        ( $next_nonblank_token eq ':' )
3223                     && ( $rtokens->[ $i_next + 1 ] ne ':' )
3224                     && ( $i_next <= $max_token_index )      # colon on same line
3225                     && label_ok()
3226                   )
3227                 {
3228                     if ( $tok !~ /[A-Z]/ ) {
3229                         push @{ $tokenizer_self->{_rlower_case_labels_at} },
3230                           $input_line_number;
3231                     }
3232                     $type = 'J';
3233                     $tok .= ':';
3234                     $i = $i_next;
3235                     next;
3236                 }
3237
3238                 #      'sub' || 'package'
3239                 elsif ( $is_sub_package{$tok_kw} ) {
3240                     error_if_expecting_OPERATOR()
3241                       if ( $expecting == OPERATOR );
3242                     scan_id();
3243                 }
3244
3245                 # Note on token types for format, __DATA__, __END__:
3246                 # It simplifies things to give these type ';', so that when we
3247                 # start rescanning we will be expecting a token of type TERM.
3248                 # We will switch to type 'k' before outputting the tokens.
3249                 elsif ( $is_format_END_DATA{$tok_kw} ) {
3250                     $type = ';';    # make tokenizer look for TERM next
3251                     $tokenizer_self->{ $is_format_END_DATA{$tok_kw} } = 1;
3252                     last;
3253                 }
3254
3255                 elsif ( $is_keyword{$tok_kw} ) {
3256                     $type = 'k';
3257
3258                     # Since for and foreach may not be followed immediately
3259                     # by an opening paren, we have to remember which keyword
3260                     # is associated with the next '('
3261                     if ( $is_for_foreach{$tok} ) {
3262                         if ( new_statement_ok() ) {
3263                             $want_paren = $tok;
3264                         }
3265                     }
3266
3267                     # recognize 'use' statements, which are special
3268                     elsif ( $is_use_require{$tok} ) {
3269                         $statement_type = $tok;
3270                         error_if_expecting_OPERATOR()
3271                           if ( $expecting == OPERATOR );
3272                     }
3273
3274                     # remember my and our to check for trailing ": shared"
3275                     elsif ( $is_my_our{$tok} ) {
3276                         $statement_type = $tok;
3277                     }
3278
3279                     # Check for misplaced 'elsif' and 'else', but allow isolated
3280                     # else or elsif blocks to be formatted.  This is indicated
3281                     # by a last noblank token of ';'
3282                     elsif ( $tok eq 'elsif' ) {
3283                         if (   $last_nonblank_token ne ';'
3284                             && $last_nonblank_block_type !~
3285                             /^(if|elsif|unless)$/ )
3286                         {
3287                             warning(
3288 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
3289                             );
3290                         }
3291                     }
3292                     elsif ( $tok eq 'else' ) {
3293
3294                         # patched for SWITCH/CASE
3295                         if (
3296                                $last_nonblank_token ne ';'
3297                             && $last_nonblank_block_type !~
3298                             /^(if|elsif|unless|case|when)$/
3299
3300                             # patch to avoid an unwanted error message for
3301                             # the case of a parenless 'case' (RT 105484):
3302                             # switch ( 1 ) { case x { 2 } else { } }
3303                             && $statement_type !~
3304                             /^(if|elsif|unless|case|when)$/
3305                           )
3306                         {
3307                             warning(
3308 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
3309                             );
3310                         }
3311                     }
3312                     elsif ( $tok eq 'continue' ) {
3313                         if (   $last_nonblank_token ne ';'
3314                             && $last_nonblank_block_type !~
3315                             /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
3316                         {
3317
3318                             # note: ';' '{' and '}' in list above
3319                             # because continues can follow bare blocks;
3320                             # ':' is labeled block
3321                             #
3322                             ############################################
3323                             # NOTE: This check has been deactivated because
3324                             # continue has an alternative usage for given/when
3325                             # blocks in perl 5.10
3326                             ## warning("'$tok' should follow a block\n");
3327                             ############################################
3328                         }
3329                     }
3330
3331                     # patch for SWITCH/CASE if 'case' and 'when are
3332                     # treated as keywords.
3333                     elsif ( $tok eq 'when' || $tok eq 'case' ) {
3334                         $statement_type = $tok;    # next '{' is block
3335                     }
3336
3337                     #
3338                     # indent trailing if/unless/while/until
3339                     # outdenting will be handled by later indentation loop
3340 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
3341 ##$opt_o = 1
3342 ##  if !(
3343 ##             $opt_b
3344 ##          || $opt_c
3345 ##          || $opt_d
3346 ##          || $opt_f
3347 ##          || $opt_i
3348 ##          || $opt_l
3349 ##          || $opt_o
3350 ##          || $opt_x
3351 ##  );
3352 ##                    if (   $tok =~ /^(if|unless|while|until)$/
3353 ##                        && $next_nonblank_token ne '(' )
3354 ##                    {
3355 ##                        $indent_flag = 1;
3356 ##                    }
3357                 }
3358
3359                 # check for inline label following
3360                 #         /^(redo|last|next|goto)$/
3361                 elsif (( $last_nonblank_type eq 'k' )
3362                     && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
3363                 {
3364                     $type = 'j';
3365                     next;
3366                 }
3367
3368                 # something else --
3369                 else {
3370
3371                     scan_bare_identifier();
3372                     if ( $type eq 'w' ) {
3373
3374                         if ( $expecting == OPERATOR ) {
3375
3376                             # don't complain about possible indirect object
3377                             # notation.
3378                             # For example:
3379                             #   package main;
3380                             #   sub new($) { ... }
3381                             #   $b = new A::;  # calls A::new
3382                             #   $c = new A;    # same thing but suspicious
3383                             # This will call A::new but we have a 'new' in
3384                             # main:: which looks like a constant.
3385                             #
3386                             if ( $last_nonblank_type eq 'C' ) {
3387                                 if ( $tok !~ /::$/ ) {
3388                                     complain(<<EOM);
3389 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
3390        Maybe indirectet object notation?
3391 EOM
3392                                 }
3393                             }
3394                             else {
3395                                 error_if_expecting_OPERATOR("bareword");
3396                             }
3397                         }
3398
3399                         # mark bare words immediately followed by a paren as
3400                         # functions
3401                         $next_tok = $rtokens->[ $i + 1 ];
3402                         if ( $next_tok eq '(' ) {
3403                             $type = 'U';
3404                         }
3405
3406                         # underscore after file test operator is file handle
3407                         if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
3408                             $type = 'Z';
3409                         }
3410
3411                         # patch for SWITCH/CASE if 'case' and 'when are
3412                         # not treated as keywords:
3413                         if (
3414                             (
3415                                    $tok eq 'case'
3416                                 && $brace_type[$brace_depth] eq 'switch'
3417                             )
3418                             || (   $tok eq 'when'
3419                                 && $brace_type[$brace_depth] eq 'given' )
3420                           )
3421                         {
3422                             $statement_type = $tok;    # next '{' is block
3423                             $type = 'k';    # for keyword syntax coloring
3424                         }
3425
3426                         # patch for SWITCH/CASE if switch and given not keywords
3427                         # Switch is not a perl 5 keyword, but we will gamble
3428                         # and mark switch followed by paren as a keyword.  This
3429                         # is only necessary to get html syntax coloring nice,
3430                         # and does not commit this as being a switch/case.
3431                         if ( $next_nonblank_token eq '('
3432                             && ( $tok eq 'switch' || $tok eq 'given' ) )
3433                         {
3434                             $type = 'k';    # for keyword syntax coloring
3435                         }
3436                     }
3437                 }
3438             }
3439
3440             ###############################################################
3441             # section 2: strings of digits
3442             ###############################################################
3443             elsif ( $pre_type eq 'd' ) {
3444                 $expecting = operator_expected( $prev_type, $tok, $next_type );
3445                 error_if_expecting_OPERATOR("Number")
3446                   if ( $expecting == OPERATOR );
3447                 my $number = scan_number();
3448                 if ( !defined($number) ) {
3449
3450                     # shouldn't happen - we should always get a number
3451                     warning("non-number beginning with digit--program bug\n");
3452                     report_definite_bug();
3453                 }
3454             }
3455
3456             ###############################################################
3457             # section 3: all other tokens
3458             ###############################################################
3459
3460             else {
3461                 last if ( $tok eq '#' );
3462                 my $code = $tokenization_code->{$tok};
3463                 if ($code) {
3464                     $expecting =
3465                       operator_expected( $prev_type, $tok, $next_type );
3466                     $code->();
3467                     redo if $in_quote;
3468                 }
3469             }
3470         }
3471
3472         # -----------------------------
3473         # end of main tokenization loop
3474         # -----------------------------
3475
3476         if ( $i_tok >= 0 ) {
3477             $routput_token_type->[$i_tok]     = $type;
3478             $routput_block_type->[$i_tok]     = $block_type;
3479             $routput_container_type->[$i_tok] = $container_type;
3480             $routput_type_sequence->[$i_tok]  = $type_sequence;
3481             $routput_indent_flag->[$i_tok]    = $indent_flag;
3482         }
3483
3484         unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
3485             $last_last_nonblank_token          = $last_nonblank_token;
3486             $last_last_nonblank_type           = $last_nonblank_type;
3487             $last_last_nonblank_block_type     = $last_nonblank_block_type;
3488             $last_last_nonblank_container_type = $last_nonblank_container_type;
3489             $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
3490             $last_nonblank_token               = $tok;
3491             $last_nonblank_type                = $type;
3492             $last_nonblank_block_type          = $block_type;
3493             $last_nonblank_container_type      = $container_type;
3494             $last_nonblank_type_sequence       = $type_sequence;
3495             $last_nonblank_prototype           = $prototype;
3496         }
3497
3498         # reset indentation level if necessary at a sub or package
3499         # in an attempt to recover from a nesting error
3500         if ( $level_in_tokenizer < 0 ) {
3501             if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
3502                 reset_indentation_level(0);
3503                 brace_warning("resetting level to 0 at $1 $2\n");
3504             }
3505         }
3506
3507         # all done tokenizing this line ...
3508         # now prepare the final list of tokens and types
3509
3510         my @token_type     = ();   # stack of output token types
3511         my @block_type     = ();   # stack of output code block types
3512         my @container_type = ();   # stack of output code container types
3513         my @type_sequence  = ();   # stack of output type sequence numbers
3514         my @tokens         = ();   # output tokens
3515         my @levels         = ();   # structural brace levels of output tokens
3516         my @slevels        = ();   # secondary nesting levels of output tokens
3517         my @nesting_tokens = ();   # string of tokens leading to this depth
3518         my @nesting_types  = ();   # string of token types leading to this depth
3519         my @nesting_blocks = ();   # string of block types leading to this depth
3520         my @nesting_lists  = ();   # string of list types leading to this depth
3521         my @ci_string = ();  # string needed to compute continuation indentation
3522         my @container_environment = ();    # BLOCK or LIST
3523         my $container_environment = '';
3524         my $im                    = -1;    # previous $i value
3525         my $num;
3526         my $ci_string_sum = ones_count($ci_string_in_tokenizer);
3527
3528 # Computing Token Indentation
3529 #
3530 #     The final section of the tokenizer forms tokens and also computes
3531 #     parameters needed to find indentation.  It is much easier to do it
3532 #     in the tokenizer than elsewhere.  Here is a brief description of how
3533 #     indentation is computed.  Perl::Tidy computes indentation as the sum
3534 #     of 2 terms:
3535 #
3536 #     (1) structural indentation, such as if/else/elsif blocks
3537 #     (2) continuation indentation, such as long parameter call lists.
3538 #
3539 #     These are occasionally called primary and secondary indentation.
3540 #
3541 #     Structural indentation is introduced by tokens of type '{', although
3542 #     the actual tokens might be '{', '(', or '['.  Structural indentation
3543 #     is of two types: BLOCK and non-BLOCK.  Default structural indentation
3544 #     is 4 characters if the standard indentation scheme is used.
3545 #
3546 #     Continuation indentation is introduced whenever a line at BLOCK level
3547 #     is broken before its termination.  Default continuation indentation
3548 #     is 2 characters in the standard indentation scheme.
3549 #
3550 #     Both types of indentation may be nested arbitrarily deep and
3551 #     interlaced.  The distinction between the two is somewhat arbitrary.
3552 #
3553 #     For each token, we will define two variables which would apply if
3554 #     the current statement were broken just before that token, so that
3555 #     that token started a new line:
3556 #
3557 #     $level = the structural indentation level,
3558 #     $ci_level = the continuation indentation level
3559 #
3560 #     The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
3561 #     assuming defaults.  However, in some special cases it is customary
3562 #     to modify $ci_level from this strict value.
3563 #
3564 #     The total structural indentation is easy to compute by adding and
3565 #     subtracting 1 from a saved value as types '{' and '}' are seen.  The
3566 #     running value of this variable is $level_in_tokenizer.
3567 #
3568 #     The total continuation is much more difficult to compute, and requires
3569 #     several variables.  These variables are:
3570 #
3571 #     $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
3572 #       each indentation level, if there are intervening open secondary
3573 #       structures just prior to that level.
3574 #     $continuation_string_in_tokenizer = a string of 1's and 0's indicating
3575 #       if the last token at that level is "continued", meaning that it
3576 #       is not the first token of an expression.
3577 #     $nesting_block_string = a string of 1's and 0's indicating, for each
3578 #       indentation level, if the level is of type BLOCK or not.
3579 #     $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
3580 #     $nesting_list_string = a string of 1's and 0's indicating, for each
3581 #       indentation level, if it is appropriate for list formatting.
3582 #       If so, continuation indentation is used to indent long list items.
3583 #     $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
3584 #     @{$rslevel_stack} = a stack of total nesting depths at each
3585 #       structural indentation level, where "total nesting depth" means
3586 #       the nesting depth that would occur if every nesting token -- '{', '[',
3587 #       and '(' -- , regardless of context, is used to compute a nesting
3588 #       depth.
3589
3590         #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
3591         #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
3592
3593         my ( $ci_string_i, $level_i, $nesting_block_string_i,
3594             $nesting_list_string_i, $nesting_token_string_i,
3595             $nesting_type_string_i, );
3596
3597         foreach my $i ( @{$routput_token_list} )
3598         {    # scan the list of pre-tokens indexes
3599
3600             # self-checking for valid token types
3601             my $type                    = $routput_token_type->[$i];
3602             my $forced_indentation_flag = $routput_indent_flag->[$i];
3603
3604             # See if we should undo the $forced_indentation_flag.
3605             # Forced indentation after 'if', 'unless', 'while' and 'until'
3606             # expressions without trailing parens is optional and doesn't
3607             # always look good.  It is usually okay for a trailing logical
3608             # expression, but if the expression is a function call, code block,
3609             # or some kind of list it puts in an unwanted extra indentation
3610             # level which is hard to remove.
3611             #
3612             # Example where extra indentation looks ok:
3613             # return 1
3614             #   if $det_a < 0 and $det_b > 0
3615             #       or $det_a > 0 and $det_b < 0;
3616             #
3617             # Example where extra indentation is not needed because
3618             # the eval brace also provides indentation:
3619             # print "not " if defined eval {
3620             #     reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
3621             # };
3622             #
3623             # The following rule works fairly well:
3624             #   Undo the flag if the end of this line, or start of the next
3625             #   line, is an opening container token or a comma.
3626             # This almost always works, but if not after another pass it will
3627             # be stable.
3628             if ( $forced_indentation_flag && $type eq 'k' ) {
3629                 my $ixlast  = -1;
3630                 my $ilast   = $routput_token_list->[$ixlast];
3631                 my $toklast = $routput_token_type->[$ilast];
3632                 if ( $toklast eq '#' ) {
3633                     $ixlast--;
3634                     $ilast   = $routput_token_list->[$ixlast];
3635                     $toklast = $routput_token_type->[$ilast];
3636                 }
3637                 if ( $toklast eq 'b' ) {
3638                     $ixlast--;
3639                     $ilast   = $routput_token_list->[$ixlast];
3640                     $toklast = $routput_token_type->[$ilast];
3641                 }
3642                 if ( $toklast =~ /^[\{,]$/ ) {
3643                     $forced_indentation_flag = 0;
3644                 }
3645                 else {
3646                     ( $toklast, my $i_next ) =
3647                       find_next_nonblank_token( $max_token_index, $rtokens,
3648                         $max_token_index );
3649                     if ( $toklast =~ /^[\{,]$/ ) {
3650                         $forced_indentation_flag = 0;
3651                     }
3652                 }
3653             }
3654
3655             # if we are already in an indented if, see if we should outdent
3656             if ($indented_if_level) {
3657
3658                 # don't try to nest trailing if's - shouldn't happen
3659                 if ( $type eq 'k' ) {
3660                     $forced_indentation_flag = 0;
3661                 }
3662
3663                 # check for the normal case - outdenting at next ';'
3664                 elsif ( $type eq ';' ) {
3665                     if ( $level_in_tokenizer == $indented_if_level ) {
3666                         $forced_indentation_flag = -1;
3667                         $indented_if_level       = 0;
3668                     }
3669                 }
3670
3671                 # handle case of missing semicolon
3672                 elsif ( $type eq '}' ) {
3673                     if ( $level_in_tokenizer == $indented_if_level ) {
3674                         $indented_if_level = 0;
3675
3676                         # TBD: This could be a subroutine call
3677                         $level_in_tokenizer--;
3678                         if ( @{$rslevel_stack} > 1 ) {
3679                             pop( @{$rslevel_stack} );
3680                         }
3681                         if ( length($nesting_block_string) > 1 )
3682                         {    # true for valid script
3683                             chop $nesting_block_string;
3684                             chop $nesting_list_string;
3685                         }
3686
3687                     }
3688                 }
3689             }
3690
3691             my $tok = $rtokens->[$i];  # the token, but ONLY if same as pretoken
3692             $level_i = $level_in_tokenizer;
3693
3694             # This can happen by running perltidy on non-scripts
3695             # although it could also be bug introduced by programming change.
3696             # Perl silently accepts a 032 (^Z) and takes it as the end
3697             if ( !$is_valid_token_type{$type} ) {
3698                 my $val = ord($type);
3699                 warning(
3700                     "unexpected character decimal $val ($type) in script\n");
3701                 $tokenizer_self->{_in_error} = 1;
3702             }
3703
3704             # ----------------------------------------------------------------
3705             # TOKEN TYPE PATCHES
3706             #  output __END__, __DATA__, and format as type 'k' instead of ';'
3707             # to make html colors correct, etc.
3708             my $fix_type = $type;
3709             if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
3710
3711             # output anonymous 'sub' as keyword
3712             if ( $type eq 't' && $tok eq 'sub' ) { $fix_type = 'k' }
3713
3714             # -----------------------------------------------------------------
3715
3716             $nesting_token_string_i = $nesting_token_string;
3717             $nesting_type_string_i  = $nesting_type_string;
3718             $nesting_block_string_i = $nesting_block_string;
3719             $nesting_list_string_i  = $nesting_list_string;
3720
3721             # set primary indentation levels based on structural braces
3722             # Note: these are set so that the leading braces have a HIGHER
3723             # level than their CONTENTS, which is convenient for indentation
3724             # Also, define continuation indentation for each token.
3725             if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
3726             {
3727
3728                 # use environment before updating
3729                 $container_environment =
3730                     $nesting_block_flag ? 'BLOCK'
3731                   : $nesting_list_flag  ? 'LIST'
3732                   :                       "";
3733
3734                 # if the difference between total nesting levels is not 1,
3735                 # there are intervening non-structural nesting types between
3736                 # this '{' and the previous unclosed '{'
3737                 my $intervening_secondary_structure = 0;
3738                 if ( @{$rslevel_stack} ) {
3739                     $intervening_secondary_structure =
3740                       $slevel_in_tokenizer - $rslevel_stack->[-1];
3741                 }
3742
3743      # Continuation Indentation
3744      #
3745      # Having tried setting continuation indentation both in the formatter and
3746      # in the tokenizer, I can say that setting it in the tokenizer is much,
3747      # much easier.  The formatter already has too much to do, and can't
3748      # make decisions on line breaks without knowing what 'ci' will be at
3749      # arbitrary locations.
3750      #
3751      # But a problem with setting the continuation indentation (ci) here
3752      # in the tokenizer is that we do not know where line breaks will actually
3753      # be.  As a result, we don't know if we should propagate continuation
3754      # indentation to higher levels of structure.
3755      #
3756      # For nesting of only structural indentation, we never need to do this.
3757      # For example, in a long if statement, like this
3758      #
3759      #   if ( !$output_block_type[$i]
3760      #     && ($in_statement_continuation) )
3761      #   {           <--outdented
3762      #       do_something();
3763      #   }
3764      #
3765      # the second line has ci but we do normally give the lines within the BLOCK
3766      # any ci.  This would be true if we had blocks nested arbitrarily deeply.
3767      #
3768      # But consider something like this, where we have created a break after
3769      # an opening paren on line 1, and the paren is not (currently) a
3770      # structural indentation token:
3771      #
3772      # my $file = $menubar->Menubutton(
3773      #   qw/-text File -underline 0 -menuitems/ => [
3774      #       [
3775      #           Cascade    => '~View',
3776      #           -menuitems => [
3777      #           ...
3778      #
3779      # The second line has ci, so it would seem reasonable to propagate it
3780      # down, giving the third line 1 ci + 1 indentation.  This suggests the
3781      # following rule, which is currently used to propagating ci down: if there
3782      # are any non-structural opening parens (or brackets, or braces), before
3783      # an opening structural brace, then ci is propagated down, and otherwise
3784      # not.  The variable $intervening_secondary_structure contains this
3785      # information for the current token, and the string
3786      # "$ci_string_in_tokenizer" is a stack of previous values of this
3787      # variable.
3788
3789                 # save the current states
3790                 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
3791                 $level_in_tokenizer++;
3792
3793                 if ($forced_indentation_flag) {
3794
3795                     # break BEFORE '?' when there is forced indentation
3796                     if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
3797                     if ( $type eq 'k' ) {
3798                         $indented_if_level = $level_in_tokenizer;
3799                     }
3800
3801                     # do not change container environment here if we are not
3802                     # at a real list. Adding this check prevents "blinkers"
3803                     # often near 'unless" clauses, such as in the following
3804                     # code:
3805 ##          next
3806 ##            unless -e (
3807 ##                    $archive =
3808 ##                      File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
3809 ##            );
3810
3811                     $nesting_block_string .= "$nesting_block_flag";
3812                 }
3813                 else {
3814
3815                     if ( $routput_block_type->[$i] ) {
3816                         $nesting_block_flag = 1;
3817                         $nesting_block_string .= '1';
3818                     }
3819                     else {
3820                         $nesting_block_flag = 0;
3821                         $nesting_block_string .= '0';
3822                     }
3823                 }
3824
3825                 # we will use continuation indentation within containers
3826                 # which are not blocks and not logical expressions
3827                 my $bit = 0;
3828                 if ( !$routput_block_type->[$i] ) {
3829
3830                     # propagate flag down at nested open parens
3831                     if ( $routput_container_type->[$i] eq '(' ) {
3832                         $bit = 1 if $nesting_list_flag;
3833                     }
3834
3835                   # use list continuation if not a logical grouping
3836                   # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
3837                     else {
3838                         $bit = 1
3839                           unless
3840                           $is_logical_container{ $routput_container_type->[$i]
3841                           };
3842                     }
3843                 }
3844                 $nesting_list_string .= $bit;
3845                 $nesting_list_flag = $bit;
3846
3847                 $ci_string_in_tokenizer .=
3848                   ( $intervening_secondary_structure != 0 ) ? '1' : '0';
3849                 $ci_string_sum = ones_count($ci_string_in_tokenizer);
3850                 $continuation_string_in_tokenizer .=
3851                   ( $in_statement_continuation > 0 ) ? '1' : '0';
3852
3853    #  Sometimes we want to give an opening brace continuation indentation,
3854    #  and sometimes not.  For code blocks, we don't do it, so that the leading
3855    #  '{' gets outdented, like this:
3856    #
3857    #   if ( !$output_block_type[$i]
3858    #     && ($in_statement_continuation) )
3859    #   {           <--outdented
3860    #
3861    #  For other types, we will give them continuation indentation.  For example,
3862    #  here is how a list looks with the opening paren indented:
3863    #
3864    #     @LoL =
3865    #       ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
3866    #         [ "homer", "marge", "bart" ], );
3867    #
3868    #  This looks best when 'ci' is one-half of the indentation  (i.e., 2 and 4)
3869
3870                 my $total_ci = $ci_string_sum;
3871                 if (
3872                     !$routput_block_type->[$i]    # patch: skip for BLOCK
3873                     && ($in_statement_continuation)
3874                     && !( $forced_indentation_flag && $type eq ':' )
3875                   )
3876                 {
3877                     $total_ci += $in_statement_continuation
3878                       unless ( $ci_string_in_tokenizer =~ /1$/ );
3879                 }
3880
3881                 $ci_string_i               = $total_ci;
3882                 $in_statement_continuation = 0;
3883             }
3884
3885             elsif ($type eq '}'
3886                 || $type eq 'R'
3887                 || $forced_indentation_flag < 0 )
3888             {
3889
3890                 # only a nesting error in the script would prevent popping here
3891                 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
3892
3893                 $level_i = --$level_in_tokenizer;
3894
3895                 # restore previous level values
3896                 if ( length($nesting_block_string) > 1 )
3897                 {    # true for valid script
3898                     chop $nesting_block_string;
3899                     $nesting_block_flag = ( $nesting_block_string =~ /1$/ );
3900                     chop $nesting_list_string;
3901                     $nesting_list_flag = ( $nesting_list_string =~ /1$/ );
3902
3903                     chop $ci_string_in_tokenizer;
3904                     $ci_string_sum = ones_count($ci_string_in_tokenizer);
3905
3906                     $in_statement_continuation =
3907                       chop $continuation_string_in_tokenizer;
3908
3909                     # zero continuation flag at terminal BLOCK '}' which
3910                     # ends a statement.
3911                     if ( $routput_block_type->[$i] ) {
3912
3913                         # ...These include non-anonymous subs
3914                         # note: could be sub ::abc { or sub 'abc
3915                         if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
3916
3917                          # note: older versions of perl require the /gc modifier
3918                          # here or else the \G does not work.
3919                             if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
3920                             {
3921                                 $in_statement_continuation = 0;
3922                             }
3923                         }
3924
3925 # ...and include all block types except user subs with
3926 # block prototypes and these: (sort|grep|map|do|eval)
3927 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
3928                         elsif (
3929                             $is_zero_continuation_block_type{
3930                                 $routput_block_type->[$i]
3931                             } )
3932                         {
3933                             $in_statement_continuation = 0;
3934                         }
3935
3936                         # ..but these are not terminal types:
3937                         #     /^(sort|grep|map|do|eval)$/ )
3938                         elsif (
3939                             $is_not_zero_continuation_block_type{
3940                                 $routput_block_type->[$i]
3941                             } )
3942                         {
3943                         }
3944
3945                         # ..and a block introduced by a label
3946                         # /^\w+\s*:$/gc ) {
3947                         elsif ( $routput_block_type->[$i] =~ /:$/ ) {
3948                             $in_statement_continuation = 0;
3949                         }
3950
3951                         # user function with block prototype
3952                         else {
3953                             $in_statement_continuation = 0;
3954                         }
3955                     }
3956
3957                     # If we are in a list, then
3958                     # we must set continuation indentation at the closing
3959                     # paren of something like this (paren after $check):
3960                     #     assert(
3961                     #         __LINE__,
3962                     #         ( not defined $check )
3963                     #           or ref $check
3964                     #           or $check eq "new"
3965                     #           or $check eq "old",
3966                     #     );
3967                     elsif ( $tok eq ')' ) {
3968                         $in_statement_continuation = 1
3969                           if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
3970                     }
3971
3972                     elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
3973                 }
3974
3975                 # use environment after updating
3976                 $container_environment =
3977                     $nesting_block_flag ? 'BLOCK'
3978                   : $nesting_list_flag  ? 'LIST'
3979                   :                       "";
3980                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
3981                 $nesting_block_string_i = $nesting_block_string;
3982                 $nesting_list_string_i  = $nesting_list_string;
3983             }
3984
3985             # not a structural indentation type..
3986             else {
3987
3988                 $container_environment =
3989                     $nesting_block_flag ? 'BLOCK'
3990                   : $nesting_list_flag  ? 'LIST'
3991                   :                       "";
3992
3993                 # zero the continuation indentation at certain tokens so
3994                 # that they will be at the same level as its container.  For
3995                 # commas, this simplifies the -lp indentation logic, which
3996                 # counts commas.  For ?: it makes them stand out.
3997                 if ($nesting_list_flag) {
3998                     if ( $type =~ /^[,\?\:]$/ ) {
3999                         $in_statement_continuation = 0;
4000                     }
4001                 }
4002
4003                 # be sure binary operators get continuation indentation
4004                 if (
4005                     $container_environment
4006                     && (   $type eq 'k' && $is_binary_keyword{$tok}
4007                         || $is_binary_type{$type} )
4008                   )
4009                 {
4010                     $in_statement_continuation = 1;
4011                 }
4012
4013                 # continuation indentation is sum of any open ci from previous
4014                 # levels plus the current level
4015                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
4016
4017                 # update continuation flag ...
4018                 # if this isn't a blank or comment..
4019                 if ( $type ne 'b' && $type ne '#' ) {
4020
4021                     # and we are in a BLOCK
4022                     if ($nesting_block_flag) {
4023
4024                         # the next token after a ';' and label starts a new stmt
4025                         if ( $type eq ';' || $type eq 'J' ) {
4026                             $in_statement_continuation = 0;
4027                         }
4028
4029                         # otherwise, we are continuing the current statement
4030                         else {
4031                             $in_statement_continuation = 1;
4032                         }
4033                     }
4034
4035                     # if we are not in a BLOCK..
4036                     else {
4037
4038                         # do not use continuation indentation if not list
4039                         # environment (could be within if/elsif clause)
4040                         if ( !$nesting_list_flag ) {
4041                             $in_statement_continuation = 0;
4042                         }
4043
4044                         # otherwise, the token after a ',' starts a new term
4045
4046                         # Patch FOR RT#99961; no continuation after a ';'
4047                         # This is needed because perltidy currently marks
4048                         # a block preceded by a type character like % or @
4049                         # as a non block, to simplify formatting. But these
4050                         # are actually blocks and can have semicolons.
4051                         # See code_block_type() and is_non_structural_brace().
4052                         elsif ( $type eq ',' || $type eq ';' ) {
4053                             $in_statement_continuation = 0;
4054                         }
4055
4056                         # otherwise, we are continuing the current term
4057                         else {
4058                             $in_statement_continuation = 1;
4059                         }
4060                     }
4061                 }
4062             }
4063
4064             if ( $level_in_tokenizer < 0 ) {
4065                 unless ( $tokenizer_self->{_saw_negative_indentation} ) {
4066                     $tokenizer_self->{_saw_negative_indentation} = 1;
4067                     warning("Starting negative indentation\n");
4068                 }
4069             }
4070
4071             # set secondary nesting levels based on all containment token types
4072             # Note: these are set so that the nesting depth is the depth
4073             # of the PREVIOUS TOKEN, which is convenient for setting
4074             # the strength of token bonds
4075             my $slevel_i = $slevel_in_tokenizer;
4076
4077             #    /^[L\{\(\[]$/
4078             if ( $is_opening_type{$type} ) {
4079                 $slevel_in_tokenizer++;
4080                 $nesting_token_string .= $tok;
4081                 $nesting_type_string  .= $type;
4082             }
4083
4084             #       /^[R\}\)\]]$/
4085             elsif ( $is_closing_type{$type} ) {
4086                 $slevel_in_tokenizer--;
4087                 my $char = chop $nesting_token_string;
4088
4089                 if ( $char ne $matching_start_token{$tok} ) {
4090                     $nesting_token_string .= $char . $tok;
4091                     $nesting_type_string  .= $type;
4092                 }
4093                 else {
4094                     chop $nesting_type_string;
4095                 }
4096             }
4097
4098             push( @block_type,            $routput_block_type->[$i] );
4099             push( @ci_string,             $ci_string_i );
4100             push( @container_environment, $container_environment );
4101             push( @container_type,        $routput_container_type->[$i] );
4102             push( @levels,                $level_i );
4103             push( @nesting_tokens,        $nesting_token_string_i );
4104             push( @nesting_types,         $nesting_type_string_i );
4105             push( @slevels,               $slevel_i );
4106             push( @token_type,            $fix_type );
4107             push( @type_sequence,         $routput_type_sequence->[$i] );
4108             push( @nesting_blocks,        $nesting_block_string );
4109             push( @nesting_lists,         $nesting_list_string );
4110
4111             # now form the previous token
4112             if ( $im >= 0 ) {
4113                 $num =
4114                   $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
4115
4116                 if ( $num > 0 ) {
4117                     push( @tokens,
4118                         substr( $input_line, $rtoken_map->[$im], $num ) );
4119                 }
4120             }
4121             $im = $i;
4122         }
4123
4124         $num = length($input_line) - $rtoken_map->[$im];   # make the last token
4125         if ( $num > 0 ) {
4126             push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
4127         }
4128
4129         $tokenizer_self->{_in_attribute_list} = $in_attribute_list;
4130         $tokenizer_self->{_in_quote}          = $in_quote;
4131         $tokenizer_self->{_quote_target} =
4132           $in_quote ? matching_end_token($quote_character) : "";
4133         $tokenizer_self->{_rhere_target_list} = $rhere_target_list;
4134
4135         $line_of_tokens->{_rtoken_type}            = \@token_type;
4136         $line_of_tokens->{_rtokens}                = \@tokens;
4137         $line_of_tokens->{_rblock_type}            = \@block_type;
4138         $line_of_tokens->{_rcontainer_type}        = \@container_type;
4139         $line_of_tokens->{_rcontainer_environment} = \@container_environment;
4140         $line_of_tokens->{_rtype_sequence}         = \@type_sequence;
4141         $line_of_tokens->{_rlevels}                = \@levels;
4142         $line_of_tokens->{_rslevels}               = \@slevels;
4143         $line_of_tokens->{_rnesting_tokens}        = \@nesting_tokens;
4144         $line_of_tokens->{_rci_levels}             = \@ci_string;
4145         $line_of_tokens->{_rnesting_blocks}        = \@nesting_blocks;
4146
4147         return;
4148     }
4149 }    # end tokenize_this_line
4150
4151 #########i#############################################################
4152 # Tokenizer routines which assist in identifying token types
4153 #######################################################################
4154
4155 sub operator_expected {
4156
4157     # Many perl symbols have two or more meanings.  For example, '<<'
4158     # can be a shift operator or a here-doc operator.  The
4159     # interpretation of these symbols depends on the current state of
4160     # the tokenizer, which may either be expecting a term or an
4161     # operator.  For this example, a << would be a shift if an operator
4162     # is expected, and a here-doc if a term is expected.  This routine
4163     # is called to make this decision for any current token.  It returns
4164     # one of three possible values:
4165     #
4166     #     OPERATOR - operator expected (or at least, not a term)
4167     #     UNKNOWN  - can't tell
4168     #     TERM     - a term is expected (or at least, not an operator)
4169     #
4170     # The decision is based on what has been seen so far.  This
4171     # information is stored in the "$last_nonblank_type" and
4172     # "$last_nonblank_token" variables.  For example, if the
4173     # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
4174     # if $last_nonblank_type is 'n' (numeric), we are expecting an
4175     # OPERATOR.
4176     #
4177     # If a UNKNOWN is returned, the calling routine must guess. A major
4178     # goal of this tokenizer is to minimize the possibility of returning
4179     # UNKNOWN, because a wrong guess can spoil the formatting of a
4180     # script.
4181     #
4182     # adding NEW_TOKENS: it is critically important that this routine be
4183     # updated to allow it to determine if an operator or term is to be
4184     # expected after the new token.  Doing this simply involves adding
4185     # the new token character to one of the regexes in this routine or
4186     # to one of the hash lists
4187     # that it uses, which are initialized in the BEGIN section.
4188     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
4189     # $statement_type
4190
4191     my ( $prev_type, $tok, $next_type ) = @_;
4192
4193     my $op_expected = UNKNOWN;
4194
4195 ##print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
4196
4197 # Note: function prototype is available for token type 'U' for future
4198 # program development.  It contains the leading and trailing parens,
4199 # and no blanks.  It might be used to eliminate token type 'C', for
4200 # example (prototype = '()'). Thus:
4201 # if ($last_nonblank_type eq 'U') {
4202 #     print "previous token=$last_nonblank_token  type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
4203 # }
4204
4205     # A possible filehandle (or object) requires some care...
4206     if ( $last_nonblank_type eq 'Z' ) {
4207
4208         # angle.t
4209         if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
4210             $op_expected = UNKNOWN;
4211         }
4212
4213         # For possible file handle like "$a", Perl uses weird parsing rules.
4214         # For example:
4215         # print $a/2,"/hi";   - division
4216         # print $a / 2,"/hi"; - division
4217         # print $a/ 2,"/hi";  - division
4218         # print $a /2,"/hi";  - pattern (and error)!
4219         elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
4220             $op_expected = TERM;
4221         }
4222
4223         # Note when an operation is being done where a
4224         # filehandle might be expected, since a change in whitespace
4225         # could change the interpretation of the statement.
4226         else {
4227             if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
4228                 complain("operator in print statement not recommended\n");
4229                 $op_expected = OPERATOR;
4230             }
4231         }
4232     }
4233
4234     # Check for smartmatch operator before preceding brace or square bracket.
4235     # For example, at the ? after the ] in the following expressions we are
4236     # expecting an operator:
4237     #
4238     # qr/3/ ~~ ['1234'] ? 1 : 0;
4239     # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
4240     elsif ( $last_nonblank_type eq '}' && $last_nonblank_token eq '~~' ) {
4241         $op_expected = OPERATOR;
4242     }
4243
4244     # handle something after 'do' and 'eval'
4245     elsif ( $is_block_operator{$last_nonblank_token} ) {
4246
4247         # something like $a = eval "expression";
4248         #                          ^
4249         if ( $last_nonblank_type eq 'k' ) {
4250             $op_expected = TERM;    # expression or list mode following keyword
4251         }
4252
4253         # something like $a = do { BLOCK } / 2;
4254         # or this ? after a smartmatch anonynmous hash or array reference:
4255         #   qr/3/ ~~ ['1234'] ? 1 : 0;
4256         #                                  ^
4257         else {
4258             $op_expected = OPERATOR;    # block mode following }
4259         }
4260     }
4261
4262     # handle bare word..
4263     elsif ( $last_nonblank_type eq 'w' ) {
4264
4265         # unfortunately, we can't tell what type of token to expect next
4266         # after most bare words
4267         $op_expected = UNKNOWN;
4268     }
4269
4270     # operator, but not term possible after these types
4271     # Note: moved ')' from type to token because parens in list context
4272     # get marked as '{' '}' now.  This is a minor glitch in the following:
4273     #    my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
4274     #
4275     elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
4276         || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
4277     {
4278         $op_expected = OPERATOR;
4279
4280         # in a 'use' statement, numbers and v-strings are not true
4281         # numbers, so to avoid incorrect error messages, we will
4282         # mark them as unknown for now (use.t)
4283         # TODO: it would be much nicer to create a new token V for VERSION
4284         # number in a use statement.  Then this could be a check on type V
4285         # and related patches which change $statement_type for '=>'
4286         # and ',' could be removed.  Further, it would clean things up to
4287         # scan the 'use' statement with a separate subroutine.
4288         if (   ( $statement_type eq 'use' )
4289             && ( $last_nonblank_type =~ /^[nv]$/ ) )
4290         {
4291             $op_expected = UNKNOWN;
4292         }
4293
4294         # expecting VERSION or {} after package NAMESPACE
4295         elsif ($statement_type =~ /^package\b/
4296             && $last_nonblank_token =~ /^package\b/ )
4297         {
4298             $op_expected = TERM;
4299         }
4300     }
4301
4302     # no operator after many keywords, such as "die", "warn", etc
4303     elsif ( $expecting_term_token{$last_nonblank_token} ) {
4304
4305         # // may follow perl functions which may be unary operators
4306         # see test file dor.t (defined or);
4307         if (   $tok eq '/'
4308             && $next_type eq '/'
4309             && $last_nonblank_type eq 'k'
4310             && $is_keyword_taking_optional_args{$last_nonblank_token} )
4311         {
4312             $op_expected = OPERATOR;
4313         }
4314         else {
4315             $op_expected = TERM;
4316         }
4317     }
4318
4319     # no operator after things like + - **  (i.e., other operators)
4320     elsif ( $expecting_term_types{$last_nonblank_type} ) {
4321         $op_expected = TERM;
4322     }
4323
4324     # a few operators, like "time", have an empty prototype () and so
4325     # take no parameters but produce a value to operate on
4326     elsif ( $expecting_operator_token{$last_nonblank_token} ) {
4327         $op_expected = OPERATOR;
4328     }
4329
4330     # post-increment and decrement produce values to be operated on
4331     elsif ( $expecting_operator_types{$last_nonblank_type} ) {
4332         $op_expected = OPERATOR;
4333     }
4334
4335     # no value to operate on after sub block
4336     elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
4337
4338     # a right brace here indicates the end of a simple block.
4339     # all non-structural right braces have type 'R'
4340     # all braces associated with block operator keywords have been given those
4341     # keywords as "last_nonblank_token" and caught above.
4342     # (This statement is order dependent, and must come after checking
4343     # $last_nonblank_token).
4344     elsif ( $last_nonblank_type eq '}' ) {
4345
4346         # patch for dor.t (defined or).
4347         if (   $tok eq '/'
4348             && $next_type eq '/'
4349             && $last_nonblank_token eq ']' )
4350         {
4351             $op_expected = OPERATOR;
4352         }
4353
4354         # Patch for RT #116344: misparse a ternary operator after an anonymous
4355         # hash, like this:
4356         #   return ref {} ? 1 : 0;
4357         # The right brace should really be marked type 'R' in this case, and
4358         # it is safest to return an UNKNOWN here. Expecting a TERM will
4359         # cause the '?' to always be interpreted as a pattern delimiter
4360         # rather than introducing a ternary operator.
4361         elsif ( $tok eq '?' ) {
4362             $op_expected = UNKNOWN;
4363         }
4364         else {
4365             $op_expected = TERM;
4366         }
4367     }
4368
4369     # something else..what did I forget?
4370     else {
4371
4372         # collecting diagnostics on unknown operator types..see what was missed
4373         $op_expected = UNKNOWN;
4374         write_diagnostics(
4375 "OP: unknown after type=$last_nonblank_type  token=$last_nonblank_token\n"
4376         );
4377     }
4378
4379     TOKENIZER_DEBUG_FLAG_EXPECT && do {
4380         print STDOUT
4381 "EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
4382     };
4383     return $op_expected;
4384 }
4385
4386 sub new_statement_ok {
4387
4388     # return true if the current token can start a new statement
4389     # USES GLOBAL VARIABLES: $last_nonblank_type
4390
4391     return label_ok()    # a label would be ok here
4392
4393       || $last_nonblank_type eq 'J';    # or we follow a label
4394
4395 }
4396
4397 sub label_ok {
4398
4399     # Decide if a bare word followed by a colon here is a label
4400     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4401     # $brace_depth, @brace_type
4402
4403     # if it follows an opening or closing code block curly brace..
4404     if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
4405         && $last_nonblank_type eq $last_nonblank_token )
4406     {
4407
4408         # it is a label if and only if the curly encloses a code block
4409         return $brace_type[$brace_depth];
4410     }
4411
4412     # otherwise, it is a label if and only if it follows a ';' (real or fake)
4413     # or another label
4414     else {
4415         return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
4416     }
4417 }
4418
4419 sub code_block_type {
4420
4421     # Decide if this is a block of code, and its type.
4422     # Must be called only when $type = $token = '{'
4423     # The problem is to distinguish between the start of a block of code
4424     # and the start of an anonymous hash reference
4425     # Returns "" if not code block, otherwise returns 'last_nonblank_token'
4426     # to indicate the type of code block.  (For example, 'last_nonblank_token'
4427     # might be 'if' for an if block, 'else' for an else block, etc).
4428     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
4429     # $last_nonblank_block_type, $brace_depth, @brace_type
4430
4431     # handle case of multiple '{'s
4432
4433 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
4434
4435     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4436     if (   $last_nonblank_token eq '{'
4437         && $last_nonblank_type eq $last_nonblank_token )
4438     {
4439
4440         # opening brace where a statement may appear is probably
4441         # a code block but might be and anonymous hash reference
4442         if ( $brace_type[$brace_depth] ) {
4443             return decide_if_code_block( $i, $rtokens, $rtoken_type,
4444                 $max_token_index );
4445         }
4446
4447         # cannot start a code block within an anonymous hash
4448         else {
4449             return "";
4450         }
4451     }
4452
4453     elsif ( $last_nonblank_token eq ';' ) {
4454
4455         # an opening brace where a statement may appear is probably
4456         # a code block but might be and anonymous hash reference
4457         return decide_if_code_block( $i, $rtokens, $rtoken_type,
4458             $max_token_index );
4459     }
4460
4461     # handle case of '}{'
4462     elsif ($last_nonblank_token eq '}'
4463         && $last_nonblank_type eq $last_nonblank_token )
4464     {
4465
4466         # a } { situation ...
4467         # could be hash reference after code block..(blktype1.t)
4468         if ($last_nonblank_block_type) {
4469             return decide_if_code_block( $i, $rtokens, $rtoken_type,
4470                 $max_token_index );
4471         }
4472
4473         # must be a block if it follows a closing hash reference
4474         else {
4475             return $last_nonblank_token;
4476         }
4477     }
4478
4479     ################################################################
4480     # NOTE: braces after type characters start code blocks, but for
4481     # simplicity these are not identified as such.  See also
4482     # sub is_non_structural_brace.
4483     ################################################################
4484
4485 ##    elsif ( $last_nonblank_type eq 't' ) {
4486 ##       return $last_nonblank_token;
4487 ##    }
4488
4489     # brace after label:
4490     elsif ( $last_nonblank_type eq 'J' ) {
4491         return $last_nonblank_token;
4492     }
4493
4494 # otherwise, look at previous token.  This must be a code block if
4495 # it follows any of these:
4496 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
4497     elsif ( $is_code_block_token{$last_nonblank_token} ) {
4498
4499         # Bug Patch: Note that the opening brace after the 'if' in the following
4500         # snippet is an anonymous hash ref and not a code block!
4501         #   print 'hi' if { x => 1, }->{x};
4502         # We can identify this situation because the last nonblank type
4503         # will be a keyword (instead of a closing peren)
4504         if (   $last_nonblank_token =~ /^(if|unless)$/
4505             && $last_nonblank_type eq 'k' )
4506         {
4507             return "";
4508         }
4509         else {
4510             return $last_nonblank_token;
4511         }
4512     }
4513
4514     # or a sub or package BLOCK
4515     elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
4516         && $last_nonblank_token =~ /^(sub|package)\b/ )
4517     {
4518         return $last_nonblank_token;
4519     }
4520
4521     elsif ( $statement_type =~ /^(sub|package)\b/ ) {
4522         return $statement_type;
4523     }
4524
4525     # user-defined subs with block parameters (like grep/map/eval)
4526     elsif ( $last_nonblank_type eq 'G' ) {
4527         return $last_nonblank_token;
4528     }
4529
4530     # check bareword
4531     elsif ( $last_nonblank_type eq 'w' ) {
4532         return decide_if_code_block( $i, $rtokens, $rtoken_type,
4533             $max_token_index );
4534     }
4535
4536     # Patch for bug # RT #94338 reported by Daniel Trizen
4537     # for-loop in a parenthesized block-map triggering an error message:
4538     #    map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
4539     # Check for a code block within a parenthesized function call
4540     elsif ( $last_nonblank_token eq '(' ) {
4541         my $paren_type = $paren_type[$paren_depth];
4542         if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
4543
4544             # We will mark this as a code block but use type 't' instead
4545             # of the name of the contining function.  This will allow for
4546             # correct parsing but will usually produce better formatting.
4547             # Braces with block type 't' are not broken open automatically
4548             # in the formatter as are other code block types, and this usually
4549             # works best.
4550             return 't';    # (Not $paren_type)
4551         }
4552         else {
4553             return "";
4554         }
4555     }
4556
4557     # handle unknown syntax ') {'
4558     # we previously appended a '()' to mark this case
4559     elsif ( $last_nonblank_token =~ /\(\)$/ ) {
4560         return $last_nonblank_token;
4561     }
4562
4563     # anything else must be anonymous hash reference
4564     else {
4565         return "";
4566     }
4567 }
4568
4569 sub decide_if_code_block {
4570
4571     # USES GLOBAL VARIABLES: $last_nonblank_token
4572     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
4573
4574     my ( $next_nonblank_token, $i_next ) =
4575       find_next_nonblank_token( $i, $rtokens, $max_token_index );
4576
4577     # we are at a '{' where a statement may appear.
4578     # We must decide if this brace starts an anonymous hash or a code
4579     # block.
4580     # return "" if anonymous hash, and $last_nonblank_token otherwise
4581
4582     # initialize to be code BLOCK
4583     my $code_block_type = $last_nonblank_token;
4584
4585     # Check for the common case of an empty anonymous hash reference:
4586     # Maybe something like sub { { } }
4587     if ( $next_nonblank_token eq '}' ) {
4588         $code_block_type = "";
4589     }
4590
4591     else {
4592
4593         # To guess if this '{' is an anonymous hash reference, look ahead
4594         # and test as follows:
4595         #
4596         # it is a hash reference if next come:
4597         #   - a string or digit followed by a comma or =>
4598         #   - bareword followed by =>
4599         # otherwise it is a code block
4600         #
4601         # Examples of anonymous hash ref:
4602         # {'aa',};
4603         # {1,2}
4604         #
4605         # Examples of code blocks:
4606         # {1; print "hello\n", 1;}
4607         # {$a,1};
4608
4609         # We are only going to look ahead one more (nonblank/comment) line.
4610         # Strange formatting could cause a bad guess, but that's unlikely.
4611         my @pre_types;
4612         my @pre_tokens;
4613
4614         # Ignore the rest of this line if it is a side comment
4615         if ( $next_nonblank_token ne '#' ) {
4616             @pre_types  = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
4617             @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
4618         }
4619         my ( $rpre_tokens, $rpre_types ) =
4620           peek_ahead_for_n_nonblank_pre_tokens(20);    # 20 is arbitrary but
4621                                                        # generous, and prevents
4622                                                        # wasting lots of
4623                                                        # time in mangled files
4624         if ( defined($rpre_types) && @{$rpre_types} ) {
4625             push @pre_types,  @{$rpre_types};
4626             push @pre_tokens, @{$rpre_tokens};
4627         }
4628
4629         # put a sentinel token to simplify stopping the search
4630         push @pre_types, '}';
4631         push @pre_types, '}';
4632
4633         my $jbeg = 0;
4634         $jbeg = 1 if $pre_types[0] eq 'b';
4635
4636         # first look for one of these
4637         #  - bareword
4638         #  - bareword with leading -
4639         #  - digit
4640         #  - quoted string
4641         my $j = $jbeg;
4642         if ( $pre_types[$j] =~ /^[\'\"]/ ) {
4643
4644             # find the closing quote; don't worry about escapes
4645             my $quote_mark = $pre_types[$j];
4646             foreach my $k ( $j + 1 .. @pre_types - 2 ) {
4647                 if ( $pre_types[$k] eq $quote_mark ) {
4648                     $j = $k + 1;
4649                     my $next = $pre_types[$j];
4650                     last;
4651                 }
4652             }
4653         }
4654         elsif ( $pre_types[$j] eq 'd' ) {
4655             $j++;
4656         }
4657         elsif ( $pre_types[$j] eq 'w' ) {
4658             $j++;
4659         }
4660         elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
4661             $j++;
4662         }
4663         if ( $j > $jbeg ) {
4664
4665             $j++ if $pre_types[$j] eq 'b';
4666
4667             # Patched for RT #95708
4668             if (
4669
4670                 # it is a comma which is not a pattern delimeter except for qw
4671                 (
4672                        $pre_types[$j] eq ','
4673                     && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
4674                 )
4675
4676                 # or a =>
4677                 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
4678               )
4679             {
4680                 $code_block_type = "";
4681             }
4682         }
4683     }
4684
4685     return $code_block_type;
4686 }
4687
4688 sub report_unexpected {
4689
4690     # report unexpected token type and show where it is
4691     # USES GLOBAL VARIABLES: $tokenizer_self
4692     my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
4693         $rpretoken_type, $input_line )
4694       = @_;
4695
4696     if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) {
4697         my $msg = "found $found where $expecting expected";
4698         my $pos = $rpretoken_map->[$i_tok];
4699         interrupt_logfile();
4700         my $input_line_number = $tokenizer_self->{_last_line_number};
4701         my ( $offset, $numbered_line, $underline ) =
4702           make_numbered_line( $input_line_number, $input_line, $pos );
4703         $underline = write_on_underline( $underline, $pos - $offset, '^' );
4704
4705         my $trailer = "";
4706         if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
4707             my $pos_prev = $rpretoken_map->[$last_nonblank_i];
4708             my $num;
4709             if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
4710                 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
4711             }
4712             else {
4713                 $num = $pos - $pos_prev;
4714             }
4715             if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
4716
4717             $underline =
4718               write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
4719             $trailer = " (previous token underlined)";
4720         }
4721         warning( $numbered_line . "\n" );
4722         warning( $underline . "\n" );
4723         warning( $msg . $trailer . "\n" );
4724         resume_logfile();
4725     }
4726     return;
4727 }
4728
4729 sub is_non_structural_brace {
4730
4731     # Decide if a brace or bracket is structural or non-structural
4732     # by looking at the previous token and type
4733     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
4734
4735     # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
4736     # Tentatively deactivated because it caused the wrong operator expectation
4737     # for this code:
4738     #      $user = @vars[1] / 100;
4739     # Must update sub operator_expected before re-implementing.
4740     # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
4741     #    return 0;
4742     # }
4743
4744     ################################################################
4745     # NOTE: braces after type characters start code blocks, but for
4746     # simplicity these are not identified as such.  See also
4747     # sub code_block_type
4748     ################################################################
4749
4750     ##if ($last_nonblank_type eq 't') {return 0}
4751
4752     # otherwise, it is non-structural if it is decorated
4753     # by type information.
4754     # For example, the '{' here is non-structural:   ${xxx}
4755     return (
4756         $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
4757
4758           # or if we follow a hash or array closing curly brace or bracket
4759           # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
4760           # because the first '}' would have been given type 'R'
4761           || $last_nonblank_type =~ /^([R\]])$/
4762     );
4763 }
4764
4765 #########i#############################################################
4766 # Tokenizer routines for tracking container nesting depths
4767 #######################################################################
4768
4769 # The following routines keep track of nesting depths of the nesting
4770 # types, ( [ { and ?.  This is necessary for determining the indentation
4771 # level, and also for debugging programs.  Not only do they keep track of
4772 # nesting depths of the individual brace types, but they check that each
4773 # of the other brace types is balanced within matching pairs.  For
4774 # example, if the program sees this sequence:
4775 #
4776 #         {  ( ( ) }
4777 #
4778 # then it can determine that there is an extra left paren somewhere
4779 # between the { and the }.  And so on with every other possible
4780 # combination of outer and inner brace types.  For another
4781 # example:
4782 #
4783 #         ( [ ..... ]  ] )
4784 #
4785 # which has an extra ] within the parens.
4786 #
4787 # The brace types have indexes 0 .. 3 which are indexes into
4788 # the matrices.
4789 #
4790 # The pair ? : are treated as just another nesting type, with ? acting
4791 # as the opening brace and : acting as the closing brace.
4792 #
4793 # The matrix
4794 #
4795 #         $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
4796 #
4797 # saves the nesting depth of brace type $b (where $b is either of the other
4798 # nesting types) when brace type $a enters a new depth.  When this depth
4799 # decreases, a check is made that the current depth of brace types $b is
4800 # unchanged, or otherwise there must have been an error.  This can
4801 # be very useful for localizing errors, particularly when perl runs to
4802 # the end of a large file (such as this one) and announces that there
4803 # is a problem somewhere.
4804 #
4805 # A numerical sequence number is maintained for every nesting type,
4806 # so that each matching pair can be uniquely identified in a simple
4807 # way.
4808
4809 sub increase_nesting_depth {
4810     my ( $aa, $pos ) = @_;
4811
4812     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4813     # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
4814     # $statement_type
4815     $current_depth[$aa]++;
4816     $total_depth++;
4817     $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
4818     my $input_line_number = $tokenizer_self->{_last_line_number};
4819     my $input_line        = $tokenizer_self->{_line_text};
4820
4821     # Sequence numbers increment by number of items.  This keeps
4822     # a unique set of numbers but still allows the relative location
4823     # of any type to be determined.
4824     $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
4825     my $seqno = $nesting_sequence_number[$aa];
4826     $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
4827
4828     $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
4829       [ $input_line_number, $input_line, $pos ];
4830
4831     for my $bb ( 0 .. @closing_brace_names - 1 ) {
4832         next if ( $bb == $aa );
4833         $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
4834     }
4835
4836     # set a flag for indenting a nested ternary statement
4837     my $indent = 0;
4838     if ( $aa == QUESTION_COLON ) {
4839         $nested_ternary_flag[ $current_depth[$aa] ] = 0;
4840         if ( $current_depth[$aa] > 1 ) {
4841             if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
4842                 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
4843                 if ( $pdepth == $total_depth - 1 ) {
4844                     $indent = 1;
4845                     $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
4846                 }
4847             }
4848         }
4849     }
4850     $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
4851     $statement_type = "";
4852     return ( $seqno, $indent );
4853 }
4854
4855 sub decrease_nesting_depth {
4856
4857     my ( $aa, $pos ) = @_;
4858
4859     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
4860     # @current_sequence_number, @depth_array, @starting_line_of_current_depth
4861     # $statement_type
4862     my $seqno             = 0;
4863     my $input_line_number = $tokenizer_self->{_last_line_number};
4864     my $input_line        = $tokenizer_self->{_line_text};
4865
4866     my $outdent = 0;
4867     $total_depth--;
4868     if ( $current_depth[$aa] > 0 ) {
4869
4870         # set a flag for un-indenting after seeing a nested ternary statement
4871         $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
4872         if ( $aa == QUESTION_COLON ) {
4873             $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
4874         }
4875         $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
4876
4877         # check that any brace types $bb contained within are balanced
4878         for my $bb ( 0 .. @closing_brace_names - 1 ) {
4879             next if ( $bb == $aa );
4880
4881             unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
4882                 $current_depth[$bb] )
4883             {
4884                 my $diff =
4885                   $current_depth[$bb] -
4886                   $depth_array[$aa][$bb][ $current_depth[$aa] ];
4887
4888                 # don't whine too many times
4889                 my $saw_brace_error = get_saw_brace_error();
4890                 if (
4891                     $saw_brace_error <= MAX_NAG_MESSAGES
4892
4893                     # if too many closing types have occurred, we probably
4894                     # already caught this error
4895                     && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
4896                   )
4897                 {
4898                     interrupt_logfile();
4899                     my $rsl =
4900                       $starting_line_of_current_depth[$aa]
4901                       [ $current_depth[$aa] ];
4902                     my $sl  = $rsl->[0];
4903                     my $rel = [ $input_line_number, $input_line, $pos ];
4904                     my $el  = $rel->[0];
4905                     my ($ess);
4906
4907                     if ( $diff == 1 || $diff == -1 ) {
4908                         $ess = '';
4909                     }
4910                     else {
4911                         $ess = 's';
4912                     }
4913                     my $bname =
4914                       ( $diff > 0 )
4915                       ? $opening_brace_names[$bb]
4916                       : $closing_brace_names[$bb];
4917                     write_error_indicator_pair( @{$rsl}, '^' );
4918                     my $msg = <<"EOM";
4919 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
4920 EOM
4921
4922                     if ( $diff > 0 ) {
4923                         my $rml =
4924                           $starting_line_of_current_depth[$bb]
4925                           [ $current_depth[$bb] ];
4926                         my $ml = $rml->[0];
4927                         $msg .=
4928 "    The most recent un-matched $bname is on line $ml\n";
4929                         write_error_indicator_pair( @{$rml}, '^' );
4930                     }
4931                     write_error_indicator_pair( @{$rel}, '^' );
4932                     warning($msg);
4933                     resume_logfile();
4934                 }
4935                 increment_brace_error();
4936             }
4937         }
4938         $current_depth[$aa]--;
4939     }
4940     else {
4941
4942         my $saw_brace_error = get_saw_brace_error();
4943         if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
4944             my $msg = <<"EOM";
4945 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
4946 EOM
4947             indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
4948         }
4949         increment_brace_error();
4950     }
4951     return ( $seqno, $outdent );
4952 }
4953
4954 sub check_final_nesting_depths {
4955
4956     # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
4957
4958     for my $aa ( 0 .. @closing_brace_names - 1 ) {
4959
4960         if ( $current_depth[$aa] ) {
4961             my $rsl =
4962               $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
4963             my $sl  = $rsl->[0];
4964             my $msg = <<"EOM";
4965 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
4966 The most recent un-matched $opening_brace_names[$aa] is on line $sl
4967 EOM
4968             indicate_error( $msg, @{$rsl}, '^' );
4969             increment_brace_error();
4970         }
4971     }
4972     return;
4973 }
4974
4975 #########i#############################################################
4976 # Tokenizer routines for looking ahead in input stream
4977 #######################################################################
4978
4979 sub peek_ahead_for_n_nonblank_pre_tokens {
4980
4981     # returns next n pretokens if they exist
4982     # returns undef's if hits eof without seeing any pretokens
4983     # USES GLOBAL VARIABLES: $tokenizer_self
4984     my $max_pretokens = shift;
4985     my $line;
4986     my $i = 0;
4987     my ( $rpre_tokens, $rmap, $rpre_types );
4988
4989     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
4990     {
4991         $line =~ s/^\s*//;    # trim leading blanks
4992         next if ( length($line) <= 0 );    # skip blank
4993         next if ( $line =~ /^#/ );         # skip comment
4994         ( $rpre_tokens, $rmap, $rpre_types ) =
4995           pre_tokenize( $line, $max_pretokens );
4996         last;
4997     }
4998     return ( $rpre_tokens, $rpre_types );
4999 }
5000
5001 # look ahead for next non-blank, non-comment line of code
5002 sub peek_ahead_for_nonblank_token {
5003
5004     # USES GLOBAL VARIABLES: $tokenizer_self
5005     my ( $rtokens, $max_token_index ) = @_;
5006     my $line;
5007     my $i = 0;
5008
5009     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
5010     {
5011         $line =~ s/^\s*//;    # trim leading blanks
5012         next if ( length($line) <= 0 );    # skip blank
5013         next if ( $line =~ /^#/ );         # skip comment
5014         my ( $rtok, $rmap, $rtype ) =
5015           pre_tokenize( $line, 2 );        # only need 2 pre-tokens
5016         my $j = $max_token_index + 1;
5017
5018         foreach my $tok ( @{$rtok} ) {
5019             last if ( $tok =~ "\n" );
5020             $rtokens->[ ++$j ] = $tok;
5021         }
5022         last;
5023     }
5024     return $rtokens;
5025 }
5026
5027 #########i#############################################################
5028 # Tokenizer guessing routines for ambiguous situations
5029 #######################################################################
5030
5031 sub guess_if_pattern_or_conditional {
5032
5033     # this routine is called when we have encountered a ? following an
5034     # unknown bareword, and we must decide if it starts a pattern or not
5035     # input parameters:
5036     #   $i - token index of the ? starting possible pattern
5037     # output parameters:
5038     #   $is_pattern = 0 if probably not pattern,  =1 if probably a pattern
5039     #   msg = a warning or diagnostic message
5040     # USES GLOBAL VARIABLES: $last_nonblank_token
5041
5042     # FIXME: this needs to be rewritten
5043
5044     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5045     my $is_pattern = 0;
5046     my $msg        = "guessing that ? after $last_nonblank_token starts a ";
5047
5048     if ( $i >= $max_token_index ) {
5049         $msg .= "conditional (no end to pattern found on the line)\n";
5050     }
5051     else {
5052         my $ibeg = $i;
5053         $i = $ibeg + 1;
5054         my $next_token = $rtokens->[$i];    # first token after ?
5055
5056         # look for a possible ending ? on this line..
5057         my $in_quote        = 1;
5058         my $quote_depth     = 0;
5059         my $quote_character = '';
5060         my $quote_pos       = 0;
5061         my $quoted_string;
5062         (
5063             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5064             $quoted_string
5065           )
5066           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5067             $quote_pos, $quote_depth, $max_token_index );
5068
5069         if ($in_quote) {
5070
5071             # we didn't find an ending ? on this line,
5072             # so we bias towards conditional
5073             $is_pattern = 0;
5074             $msg .= "conditional (no ending ? on this line)\n";
5075
5076             # we found an ending ?, so we bias towards a pattern
5077         }
5078         else {
5079
5080             # Watch out for an ending ? in quotes, like this
5081             #    my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
5082             my $s_quote = 0;
5083             my $d_quote = 0;
5084             my $colons  = 0;
5085             foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
5086                 my $tok = $rtokens->[$ii];
5087                 if ( $tok eq ":" ) { $colons++ }
5088                 if ( $tok eq "'" ) { $s_quote++ }
5089                 if ( $tok eq '"' ) { $d_quote++ }
5090             }
5091             if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
5092                 $is_pattern = 0;
5093                 $msg .= "found ending ? but unbalanced quote chars\n";
5094             }
5095             elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5096                 $is_pattern = 1;
5097                 $msg .= "pattern (found ending ? and pattern expected)\n";
5098             }
5099             else {
5100                 $msg .= "pattern (uncertain, but found ending ?)\n";
5101             }
5102         }
5103     }
5104     return ( $is_pattern, $msg );
5105 }
5106
5107 sub guess_if_pattern_or_division {
5108
5109     # this routine is called when we have encountered a / following an
5110     # unknown bareword, and we must decide if it starts a pattern or is a
5111     # division
5112     # input parameters:
5113     #   $i - token index of the / starting possible pattern
5114     # output parameters:
5115     #   $is_pattern = 0 if probably division,  =1 if probably a pattern
5116     #   msg = a warning or diagnostic message
5117     # USES GLOBAL VARIABLES: $last_nonblank_token
5118     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
5119     my $is_pattern = 0;
5120     my $msg        = "guessing that / after $last_nonblank_token starts a ";
5121
5122     if ( $i >= $max_token_index ) {
5123         $msg .= "division (no end to pattern found on the line)\n";
5124     }
5125     else {
5126         my $ibeg = $i;
5127         my $divide_expected =
5128           numerator_expected( $i, $rtokens, $max_token_index );
5129         $i = $ibeg + 1;
5130         my $next_token = $rtokens->[$i];    # first token after slash
5131
5132         # look for a possible ending / on this line..
5133         my $in_quote        = 1;
5134         my $quote_depth     = 0;
5135         my $quote_character = '';
5136         my $quote_pos       = 0;
5137         my $quoted_string;
5138         (
5139             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
5140             $quoted_string
5141           )
5142           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
5143             $quote_pos, $quote_depth, $max_token_index );
5144
5145         if ($in_quote) {
5146
5147             # we didn't find an ending / on this line,
5148             # so we bias towards division
5149             if ( $divide_expected >= 0 ) {
5150                 $is_pattern = 0;
5151                 $msg .= "division (no ending / on this line)\n";
5152             }
5153             else {
5154                 $msg        = "multi-line pattern (division not possible)\n";
5155                 $is_pattern = 1;
5156             }
5157
5158         }
5159
5160         # we found an ending /, so we bias towards a pattern
5161         else {
5162
5163             if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
5164
5165                 if ( $divide_expected >= 0 ) {
5166
5167                     if ( $i - $ibeg > 60 ) {
5168                         $msg .= "division (matching / too distant)\n";
5169                         $is_pattern = 0;
5170                     }
5171                     else {
5172                         $msg .= "pattern (but division possible too)\n";
5173                         $is_pattern = 1;
5174                     }
5175                 }
5176                 else {
5177                     $is_pattern = 1;
5178                     $msg .= "pattern (division not possible)\n";
5179                 }
5180             }
5181             else {
5182
5183                 if ( $divide_expected >= 0 ) {
5184                     $is_pattern = 0;
5185                     $msg .= "division (pattern not possible)\n";
5186                 }
5187                 else {
5188                     $is_pattern = 1;
5189                     $msg .=
5190                       "pattern (uncertain, but division would not work here)\n";
5191                 }
5192             }
5193         }
5194     }
5195     return ( $is_pattern, $msg );
5196 }
5197
5198 # try to resolve here-doc vs. shift by looking ahead for
5199 # non-code or the end token (currently only looks for end token)
5200 # returns 1 if it is probably a here doc, 0 if not
5201 sub guess_if_here_doc {
5202
5203     # This is how many lines we will search for a target as part of the
5204     # guessing strategy.  It is a constant because there is probably
5205     # little reason to change it.
5206     # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
5207     # %is_constant,
5208     my $HERE_DOC_WINDOW = 40;
5209
5210     my $next_token        = shift;
5211     my $here_doc_expected = 0;
5212     my $line;
5213     my $k   = 0;
5214     my $msg = "checking <<";
5215
5216     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
5217     {
5218         chomp $line;
5219
5220         if ( $line =~ /^$next_token$/ ) {
5221             $msg .= " -- found target $next_token ahead $k lines\n";
5222             $here_doc_expected = 1;    # got it
5223             last;
5224         }
5225         last if ( $k >= $HERE_DOC_WINDOW );
5226     }
5227
5228     unless ($here_doc_expected) {
5229
5230         if ( !defined($line) ) {
5231             $here_doc_expected = -1;    # hit eof without seeing target
5232             $msg .= " -- must be shift; target $next_token not in file\n";
5233
5234         }
5235         else {                          # still unsure..taking a wild guess
5236
5237             if ( !$is_constant{$current_package}{$next_token} ) {
5238                 $here_doc_expected = 1;
5239                 $msg .=
5240                   " -- guessing it's a here-doc ($next_token not a constant)\n";
5241             }
5242             else {
5243                 $msg .=
5244                   " -- guessing it's a shift ($next_token is a constant)\n";
5245             }
5246         }
5247     }
5248     write_logfile_entry($msg);
5249     return $here_doc_expected;
5250 }
5251
5252 #########i#############################################################
5253 # Tokenizer Routines for scanning identifiers and related items
5254 #######################################################################
5255
5256 sub scan_bare_identifier_do {
5257
5258     # this routine is called to scan a token starting with an alphanumeric
5259     # variable or package separator, :: or '.
5260     # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
5261     # $last_nonblank_type,@paren_type, $paren_depth
5262
5263     my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
5264         $max_token_index )
5265       = @_;
5266     my $i_begin = $i;
5267     my $package = undef;
5268
5269     my $i_beg = $i;
5270
5271     # we have to back up one pretoken at a :: since each : is one pretoken
5272     if ( $tok eq '::' ) { $i_beg-- }
5273     if ( $tok eq '->' ) { $i_beg-- }
5274     my $pos_beg = $rtoken_map->[$i_beg];
5275     pos($input_line) = $pos_beg;
5276
5277     #  Examples:
5278     #   A::B::C
5279     #   A::
5280     #   ::A
5281     #   A'B
5282     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
5283
5284         my $pos  = pos($input_line);
5285         my $numc = $pos - $pos_beg;
5286         $tok = substr( $input_line, $pos_beg, $numc );
5287
5288         # type 'w' includes anything without leading type info
5289         # ($,%,@,*) including something like abc::def::ghi
5290         $type = 'w';
5291
5292         my $sub_name = "";
5293         if ( defined($2) ) { $sub_name = $2; }
5294         if ( defined($1) ) {
5295             $package = $1;
5296
5297             # patch: don't allow isolated package name which just ends
5298             # in the old style package separator (single quote).  Example:
5299             #   use CGI':all';
5300             if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
5301                 $pos--;
5302             }
5303
5304             $package =~ s/\'/::/g;
5305             if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5306             $package =~ s/::$//;
5307         }
5308         else {
5309             $package = $current_package;
5310
5311             if ( $is_keyword{$tok} ) {
5312                 $type = 'k';
5313             }
5314         }
5315
5316         # if it is a bareword..
5317         if ( $type eq 'w' ) {
5318
5319             # check for v-string with leading 'v' type character
5320             # (This seems to have precedence over filehandle, type 'Y')
5321             if ( $tok =~ /^v\d[_\d]*$/ ) {
5322
5323                 # we only have the first part - something like 'v101' -
5324                 # look for more
5325                 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
5326                     $pos  = pos($input_line);
5327                     $numc = $pos - $pos_beg;
5328                     $tok  = substr( $input_line, $pos_beg, $numc );
5329                 }
5330                 $type = 'v';
5331
5332                 # warn if this version can't handle v-strings
5333                 report_v_string($tok);
5334             }
5335
5336             elsif ( $is_constant{$package}{$sub_name} ) {
5337                 $type = 'C';
5338             }
5339
5340             # bareword after sort has implied empty prototype; for example:
5341             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
5342             # This has priority over whatever the user has specified.
5343             elsif ($last_nonblank_token eq 'sort'
5344                 && $last_nonblank_type eq 'k' )
5345             {
5346                 $type = 'Z';
5347             }
5348
5349             # Note: strangely, perl does not seem to really let you create
5350             # functions which act like eval and do, in the sense that eval
5351             # and do may have operators following the final }, but any operators
5352             # that you create with prototype (&) apparently do not allow
5353             # trailing operators, only terms.  This seems strange.
5354             # If this ever changes, here is the update
5355             # to make perltidy behave accordingly:
5356
5357             # elsif ( $is_block_function{$package}{$tok} ) {
5358             #    $tok='eval'; # patch to do braces like eval  - doesn't work
5359             #    $type = 'k';
5360             #}
5361             # FIXME: This could become a separate type to allow for different
5362             # future behavior:
5363             elsif ( $is_block_function{$package}{$sub_name} ) {
5364                 $type = 'G';
5365             }
5366
5367             elsif ( $is_block_list_function{$package}{$sub_name} ) {
5368                 $type = 'G';
5369             }
5370             elsif ( $is_user_function{$package}{$sub_name} ) {
5371                 $type      = 'U';
5372                 $prototype = $user_function_prototype{$package}{$sub_name};
5373             }
5374
5375             # check for indirect object
5376             elsif (
5377
5378                 # added 2001-03-27: must not be followed immediately by '('
5379                 # see fhandle.t
5380                 ( $input_line !~ m/\G\(/gc )
5381
5382                 # and
5383                 && (
5384
5385                     # preceded by keyword like 'print', 'printf' and friends
5386                     $is_indirect_object_taker{$last_nonblank_token}
5387
5388                     # or preceded by something like 'print(' or 'printf('
5389                     || (
5390                         ( $last_nonblank_token eq '(' )
5391                         && $is_indirect_object_taker{ $paren_type[$paren_depth]
5392                         }
5393
5394                     )
5395                 )
5396               )
5397             {
5398
5399                 # may not be indirect object unless followed by a space
5400                 if ( $input_line =~ m/\G\s+/gc ) {
5401                     $type = 'Y';
5402
5403                     # Abandon Hope ...
5404                     # Perl's indirect object notation is a very bad
5405                     # thing and can cause subtle bugs, especially for
5406                     # beginning programmers.  And I haven't even been
5407                     # able to figure out a sane warning scheme which
5408                     # doesn't get in the way of good scripts.
5409
5410                     # Complain if a filehandle has any lower case
5411                     # letters.  This is suggested good practice.
5412                     # Use 'sub_name' because something like
5413                     # main::MYHANDLE is ok for filehandle
5414                     if ( $sub_name =~ /[a-z]/ ) {
5415
5416                         # could be bug caused by older perltidy if
5417                         # followed by '('
5418                         if ( $input_line =~ m/\G\s*\(/gc ) {
5419                             complain(
5420 "Caution: unknown word '$tok' in indirect object slot\n"
5421                             );
5422                         }
5423                     }
5424                 }
5425
5426                 # bareword not followed by a space -- may not be filehandle
5427                 # (may be function call defined in a 'use' statement)
5428                 else {
5429                     $type = 'Z';
5430                 }
5431             }
5432         }
5433
5434         # Now we must convert back from character position
5435         # to pre_token index.
5436         # I don't think an error flag can occur here ..but who knows
5437         my $error;
5438         ( $i, $error ) =
5439           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5440         if ($error) {
5441             warning("scan_bare_identifier: Possibly invalid tokenization\n");
5442         }
5443     }
5444
5445     # no match but line not blank - could be syntax error
5446     # perl will take '::' alone without complaint
5447     else {
5448         $type = 'w';
5449
5450         # change this warning to log message if it becomes annoying
5451         warning("didn't find identifier after leading ::\n");
5452     }
5453     return ( $i, $tok, $type, $prototype );
5454 }
5455
5456 sub scan_id_do {
5457
5458 # This is the new scanner and will eventually replace scan_identifier.
5459 # Only type 'sub' and 'package' are implemented.
5460 # Token types $ * % @ & -> are not yet implemented.
5461 #
5462 # Scan identifier following a type token.
5463 # The type of call depends on $id_scan_state: $id_scan_state = ''
5464 # for starting call, in which case $tok must be the token defining
5465 # the type.
5466 #
5467 # If the type token is the last nonblank token on the line, a value
5468 # of $id_scan_state = $tok is returned, indicating that further
5469 # calls must be made to get the identifier.  If the type token is
5470 # not the last nonblank token on the line, the identifier is
5471 # scanned and handled and a value of '' is returned.
5472 # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token, $in_attribute_list,
5473 # $statement_type, $tokenizer_self
5474
5475     my ( $input_line, $i, $tok, $rtokens, $rtoken_map, $id_scan_state,
5476         $max_token_index )
5477       = @_;
5478     my $type = '';
5479     my ( $i_beg, $pos_beg );
5480
5481     #print "NSCAN:entering i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5482     #my ($a,$b,$c) = caller;
5483     #print "NSCAN: scan_id called with tok=$tok $a $b $c\n";
5484
5485     # on re-entry, start scanning at first token on the line
5486     if ($id_scan_state) {
5487         $i_beg = $i;
5488         $type  = '';
5489     }
5490
5491     # on initial entry, start scanning just after type token
5492     else {
5493         $i_beg         = $i + 1;
5494         $id_scan_state = $tok;
5495         $type          = 't';
5496     }
5497
5498     # find $i_beg = index of next nonblank token,
5499     # and handle empty lines
5500     my $blank_line          = 0;
5501     my $next_nonblank_token = $rtokens->[$i_beg];
5502     if ( $i_beg > $max_token_index ) {
5503         $blank_line = 1;
5504     }
5505     else {
5506
5507         # only a '#' immediately after a '$' is not a comment
5508         if ( $next_nonblank_token eq '#' ) {
5509             unless ( $tok eq '$' ) {
5510                 $blank_line = 1;
5511             }
5512         }
5513
5514         if ( $next_nonblank_token =~ /^\s/ ) {
5515             ( $next_nonblank_token, $i_beg ) =
5516               find_next_nonblank_token_on_this_line( $i_beg, $rtokens,
5517                 $max_token_index );
5518             if ( $next_nonblank_token =~ /(^#|^\s*$)/ ) {
5519                 $blank_line = 1;
5520             }
5521         }
5522     }
5523
5524     # handle non-blank line; identifier, if any, must follow
5525     unless ($blank_line) {
5526
5527         if ( $id_scan_state eq 'sub' ) {
5528             ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
5529                 $input_line, $i,             $i_beg,
5530                 $tok,        $type,          $rtokens,
5531                 $rtoken_map, $id_scan_state, $max_token_index
5532             );
5533         }
5534
5535         elsif ( $id_scan_state eq 'package' ) {
5536             ( $i, $tok, $type ) =
5537               do_scan_package( $input_line, $i, $i_beg, $tok, $type, $rtokens,
5538                 $rtoken_map, $max_token_index );
5539             $id_scan_state = '';
5540         }
5541
5542         else {
5543             warning("invalid token in scan_id: $tok\n");
5544             $id_scan_state = '';
5545         }
5546     }
5547
5548     if ( $id_scan_state && ( !defined($type) || !$type ) ) {
5549
5550         # shouldn't happen:
5551         warning(
5552 "Program bug in scan_id: undefined type but scan_state=$id_scan_state\n"
5553         );
5554         report_definite_bug();
5555     }
5556
5557     TOKENIZER_DEBUG_FLAG_NSCAN && do {
5558         print STDOUT
5559           "NSCAN: returns i=$i, tok=$tok, type=$type, state=$id_scan_state\n";
5560     };
5561     return ( $i, $tok, $type, $id_scan_state );
5562 }
5563
5564 sub check_prototype {
5565     my ( $proto, $package, $subname ) = @_;
5566     return unless ( defined($package) && defined($subname) );
5567     if ( defined($proto) ) {
5568         $proto =~ s/^\s*\(\s*//;
5569         $proto =~ s/\s*\)$//;
5570         if ($proto) {
5571             $is_user_function{$package}{$subname}        = 1;
5572             $user_function_prototype{$package}{$subname} = "($proto)";
5573
5574             # prototypes containing '&' must be treated specially..
5575             if ( $proto =~ /\&/ ) {
5576
5577                 # right curly braces of prototypes ending in
5578                 # '&' may be followed by an operator
5579                 if ( $proto =~ /\&$/ ) {
5580                     $is_block_function{$package}{$subname} = 1;
5581                 }
5582
5583                 # right curly braces of prototypes NOT ending in
5584                 # '&' may NOT be followed by an operator
5585                 elsif ( $proto !~ /\&$/ ) {
5586                     $is_block_list_function{$package}{$subname} = 1;
5587                 }
5588             }
5589         }
5590         else {
5591             $is_constant{$package}{$subname} = 1;
5592         }
5593     }
5594     else {
5595         $is_user_function{$package}{$subname} = 1;
5596     }
5597     return;
5598 }
5599
5600 sub do_scan_package {
5601
5602     # do_scan_package parses a package name
5603     # it is called with $i_beg equal to the index of the first nonblank
5604     # token following a 'package' token.
5605     # USES GLOBAL VARIABLES: $current_package,
5606
5607     # package NAMESPACE
5608     # package NAMESPACE VERSION
5609     # package NAMESPACE BLOCK
5610     # package NAMESPACE VERSION BLOCK
5611     #
5612     # If VERSION is provided, package sets the $VERSION variable in the given
5613     # namespace to a version object with the VERSION provided. VERSION must be
5614     # a "strict" style version number as defined by the version module: a
5615     # positive decimal number (integer or decimal-fraction) without
5616     # exponentiation or else a dotted-decimal v-string with a leading 'v'
5617     # character and at least three components.
5618     # reference http://perldoc.perl.org/functions/package.html
5619
5620     my ( $input_line, $i, $i_beg, $tok, $type, $rtokens, $rtoken_map,
5621         $max_token_index )
5622       = @_;
5623     my $package = undef;
5624     my $pos_beg = $rtoken_map->[$i_beg];
5625     pos($input_line) = $pos_beg;
5626
5627     # handle non-blank line; package name, if any, must follow
5628     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::))*\w+)/gc ) {
5629         $package = $1;
5630         $package = ( defined($1) && $1 ) ? $1 : 'main';
5631         $package =~ s/\'/::/g;
5632         if ( $package =~ /^\:/ ) { $package = 'main' . $package }
5633         $package =~ s/::$//;
5634         my $pos  = pos($input_line);
5635         my $numc = $pos - $pos_beg;
5636         $tok  = 'package ' . substr( $input_line, $pos_beg, $numc );
5637         $type = 'i';
5638
5639         # Now we must convert back from character position
5640         # to pre_token index.
5641         # I don't think an error flag can occur here ..but ?
5642         my $error;
5643         ( $i, $error ) =
5644           inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
5645         if ($error) { warning("Possibly invalid package\n") }
5646         $current_package = $package;
5647
5648         # we should now have package NAMESPACE
5649         # now expecting VERSION, BLOCK, or ; to follow ...
5650         # package NAMESPACE VERSION
5651         # package NAMESPACE BLOCK
5652         # package NAMESPACE VERSION BLOCK
5653         my ( $next_nonblank_token, $i_next ) =
5654           find_next_nonblank_token( $i, $rtokens, $max_token_index );
5655
5656         # check that something recognizable follows, but do not parse.
5657         # A VERSION number will be parsed later as a number or v-string in the
5658         # normal way.  What is important is to set the statement type if
5659         # everything looks okay so that the operator_expected() routine
5660         # knows that the number is in a package statement.
5661         # Examples of valid primitive tokens that might follow are:
5662         #  1235  . ; { } v3  v
5663         if ( $next_nonblank_token =~ /^([v\.\d;\{\}])|v\d|\d+$/ ) {
5664             $statement_type = $tok;
5665         }
5666         else {
5667             warning(
5668                 "Unexpected '$next_nonblank_token' after package name '$tok'\n"
5669             );
5670         }
5671     }
5672
5673     # no match but line not blank --
5674     # could be a label with name package, like package:  , for example.
5675     else {
5676         $type = 'k';
5677     }
5678
5679     return ( $i, $tok, $type );
5680 }
5681
5682 sub scan_identifier_do {
5683
5684     # This routine assembles tokens into identifiers.  It maintains a
5685     # scan state, id_scan_state.  It updates id_scan_state based upon
5686     # current id_scan_state and token, and returns an updated
5687     # id_scan_state and the next index after the identifier.
5688     # USES GLOBAL VARIABLES: $context, $last_nonblank_token,
5689     # $last_nonblank_type
5690
5691     my ( $i, $id_scan_state, $identifier, $rtokens, $max_token_index,
5692         $expecting, $container_type )
5693       = @_;
5694     my $i_begin   = $i;
5695     my $type      = '';
5696     my $tok_begin = $rtokens->[$i_begin];
5697     if ( $tok_begin eq ':' ) { $tok_begin = '::' }
5698     my $id_scan_state_begin = $id_scan_state;
5699     my $identifier_begin    = $identifier;
5700     my $tok                 = $tok_begin;
5701     my $message             = "";
5702
5703     my $in_prototype_or_signature = $container_type =~ /^sub/;
5704
5705     # these flags will be used to help figure out the type:
5706     my $saw_alpha = ( $tok =~ /^[A-Za-z_]/ );
5707     my $saw_type;
5708
5709     # allow old package separator (') except in 'use' statement
5710     my $allow_tick = ( $last_nonblank_token ne 'use' );
5711
5712     # get started by defining a type and a state if necessary
5713     unless ($id_scan_state) {
5714         $context = UNKNOWN_CONTEXT;
5715
5716         # fixup for digraph
5717         if ( $tok eq '>' ) {
5718             $tok       = '->';
5719             $tok_begin = $tok;
5720         }
5721         $identifier = $tok;
5722
5723         if ( $tok eq '$' || $tok eq '*' ) {
5724             $id_scan_state = '$';
5725             $context       = SCALAR_CONTEXT;
5726         }
5727         elsif ( $tok eq '%' || $tok eq '@' ) {
5728             $id_scan_state = '$';
5729             $context       = LIST_CONTEXT;
5730         }
5731         elsif ( $tok eq '&' ) {
5732             $id_scan_state = '&';
5733         }
5734         elsif ( $tok eq 'sub' or $tok eq 'package' ) {
5735             $saw_alpha     = 0;     # 'sub' is considered type info here
5736             $id_scan_state = '$';
5737             $identifier .= ' ';     # need a space to separate sub from sub name
5738         }
5739         elsif ( $tok eq '::' ) {
5740             $id_scan_state = 'A';
5741         }
5742         elsif ( $tok =~ /^[A-Za-z_]/ ) {
5743             $id_scan_state = ':';
5744         }
5745         elsif ( $tok eq '->' ) {
5746             $id_scan_state = '$';
5747         }
5748         else {
5749
5750             # shouldn't happen
5751             my ( $a, $b, $c ) = caller;
5752             warning("Program Bug: scan_identifier given bad token = $tok \n");
5753             warning("   called from sub $a  line: $c\n");
5754             report_definite_bug();
5755         }
5756         $saw_type = !$saw_alpha;
5757     }
5758     else {
5759         $i--;
5760         $saw_type = ( $tok =~ /([\$\%\@\*\&])/ );
5761     }
5762
5763     # now loop to gather the identifier
5764     my $i_save = $i;
5765
5766     while ( $i < $max_token_index ) {
5767         $i_save = $i unless ( $tok =~ /^\s*$/ );
5768         $tok    = $rtokens->[ ++$i ];
5769
5770         if ( ( $tok eq ':' ) && ( $rtokens->[ $i + 1 ] eq ':' ) ) {
5771             $tok = '::';
5772             $i++;
5773         }
5774
5775         if ( $id_scan_state eq '$' ) {    # starting variable name
5776
5777             if ( $tok eq '$' ) {
5778
5779                 $identifier .= $tok;
5780
5781                 # we've got a punctuation variable if end of line (punct.t)
5782                 if ( $i == $max_token_index ) {
5783                     $type          = 'i';
5784                     $id_scan_state = '';
5785                     last;
5786                 }
5787             }
5788
5789             # POSTDEFREF ->@ ->% ->& ->*
5790             elsif ( ( $tok =~ /^[\@\%\&\*]$/ ) && $identifier =~ /\-\>$/ ) {
5791                 $identifier .= $tok;
5792             }
5793             elsif ( $tok =~ /^[A-Za-z_]/ ) {    # alphanumeric ..
5794                 $saw_alpha     = 1;
5795                 $id_scan_state = ':';           # now need ::
5796                 $identifier .= $tok;
5797             }
5798             elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
5799                 $saw_alpha     = 1;
5800                 $id_scan_state = ':';                 # now need ::
5801                 $identifier .= $tok;
5802
5803                 # Perl will accept leading digits in identifiers,
5804                 # although they may not always produce useful results.
5805                 # Something like $main::0 is ok.  But this also works:
5806                 #
5807                 #  sub howdy::123::bubba{ print "bubba $54321!\n" }
5808                 #  howdy::123::bubba();
5809                 #
5810             }
5811             elsif ( $tok =~ /^[0-9]/ ) {    # numeric
5812                 $saw_alpha     = 1;
5813                 $id_scan_state = ':';       # now need ::
5814                 $identifier .= $tok;
5815             }
5816             elsif ( $tok eq '::' ) {
5817                 $id_scan_state = 'A';
5818                 $identifier .= $tok;
5819             }
5820
5821             # $# and POSTDEFREF ->$#
5822             elsif ( ( $tok eq '#' ) && ( $identifier =~ /\$$/ ) ) {    # $#array
5823                 $identifier .= $tok;    # keep same state, a $ could follow
5824             }
5825             elsif ( $tok eq '{' ) {
5826
5827                 # check for something like ${#} or ${©}
5828                 if (
5829                     (
5830                            $identifier eq '$'
5831                         || $identifier eq '@'
5832                         || $identifier eq '$#'
5833                     )
5834                     && $i + 2 <= $max_token_index
5835                     && $rtokens->[ $i + 2 ] eq '}'
5836                     && $rtokens->[ $i + 1 ] !~ /[\s\w]/
5837                   )
5838                 {
5839                     my $next2 = $rtokens->[ $i + 2 ];
5840                     my $next1 = $rtokens->[ $i + 1 ];
5841                     $identifier .= $tok . $next1 . $next2;
5842                     $i += 2;
5843                     $id_scan_state = '';
5844                     last;
5845                 }
5846
5847                 # skip something like ${xxx} or ->{
5848                 $id_scan_state = '';
5849
5850                 # if this is the first token of a line, any tokens for this
5851                 # identifier have already been accumulated
5852                 if ( $identifier eq '$' || $i == 0 ) { $identifier = ''; }
5853                 $i = $i_save;
5854                 last;
5855             }
5856
5857             # space ok after leading $ % * & @
5858             elsif ( $tok =~ /^\s*$/ ) {
5859
5860                 if ( $identifier =~ /^[\$\%\*\&\@]/ ) {
5861
5862                     if ( length($identifier) > 1 ) {
5863                         $id_scan_state = '';
5864                         $i             = $i_save;
5865                         $type          = 'i';    # probably punctuation variable
5866                         last;
5867                     }
5868                     else {
5869
5870                         # spaces after $'s are common, and space after @
5871                         # is harmless, so only complain about space
5872                         # after other type characters. Space after $ and
5873                         # @ will be removed in formatting.  Report space
5874                         # after % and * because they might indicate a
5875                         # parsing error.  In other words '% ' might be a
5876                         # modulo operator.  Delete this warning if it
5877                         # gets annoying.
5878                         if ( $identifier !~ /^[\@\$]$/ ) {
5879                             $message =
5880                               "Space in identifier, following $identifier\n";
5881                         }
5882                     }
5883                 }
5884
5885                 # else:
5886                 # space after '->' is ok
5887             }
5888             elsif ( $tok eq '^' ) {
5889
5890                 # check for some special variables like $^W
5891                 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5892                     $identifier .= $tok;
5893                     $id_scan_state = 'A';
5894
5895                     # Perl accepts '$^]' or '@^]', but
5896                     # there must not be a space before the ']'.
5897                     my $next1 = $rtokens->[ $i + 1 ];
5898                     if ( $next1 eq ']' ) {
5899                         $i++;
5900                         $identifier .= $next1;
5901                         $id_scan_state = "";
5902                         last;
5903                     }
5904                 }
5905                 else {
5906                     $id_scan_state = '';
5907                 }
5908             }
5909             else {    # something else
5910
5911                 if ( $in_prototype_or_signature && $tok =~ /^[\),=]/ ) {
5912                     $id_scan_state = '';
5913                     $i             = $i_save;
5914                     $type          = 'i';       # probably punctuation variable
5915                     last;
5916                 }
5917
5918                 # check for various punctuation variables
5919                 if ( $identifier =~ /^[\$\*\@\%]$/ ) {
5920                     $identifier .= $tok;
5921                 }
5922
5923                 # POSTDEFREF: Postfix reference ->$* ->%*  ->@* ->** ->&* ->$#*
5924                 elsif ( $tok eq '*' && $identifier =~ /([\@\%\$\*\&]|\$\#)$/ ) {
5925                     $identifier .= $tok;
5926                 }
5927
5928                 elsif ( $identifier eq '$#' ) {
5929
5930                     if ( $tok eq '{' ) { $type = 'i'; $i = $i_save }
5931
5932                     # perl seems to allow just these: $#: $#- $#+
5933                     elsif ( $tok =~ /^[\:\-\+]$/ ) {
5934                         $type = 'i';
5935                         $identifier .= $tok;
5936                     }
5937                     else {
5938                         $i = $i_save;
5939                         write_logfile_entry( 'Use of $# is deprecated' . "\n" );
5940                     }
5941                 }
5942                 elsif ( $identifier eq '$$' ) {
5943
5944                     # perl does not allow references to punctuation
5945                     # variables without braces.  For example, this
5946                     # won't work:
5947                     #  $:=\4;
5948                     #  $a = $$:;
5949                     # You would have to use
5950                     #  $a = ${$:};
5951
5952                     $i = $i_save;
5953                     if   ( $tok eq '{' ) { $type = 't' }
5954                     else                 { $type = 'i' }
5955                 }
5956                 elsif ( $identifier eq '->' ) {
5957                     $i = $i_save;
5958                 }
5959                 else {
5960                     $i = $i_save;
5961                     if ( length($identifier) == 1 ) { $identifier = ''; }
5962                 }
5963                 $id_scan_state = '';
5964                 last;
5965             }
5966         }
5967         elsif ( $id_scan_state eq '&' ) {    # starting sub call?
5968
5969             if ( $tok =~ /^[\$A-Za-z_]/ ) {    # alphanumeric ..
5970                 $id_scan_state = ':';          # now need ::
5971                 $saw_alpha     = 1;
5972                 $identifier .= $tok;
5973             }
5974             elsif ( $tok eq "'" && $allow_tick ) {    # alphanumeric ..
5975                 $id_scan_state = ':';                 # now need ::
5976                 $saw_alpha     = 1;
5977                 $identifier .= $tok;
5978             }
5979             elsif ( $tok =~ /^[0-9]/ ) {    # numeric..see comments above
5980                 $id_scan_state = ':';       # now need ::
5981                 $saw_alpha     = 1;
5982                 $identifier .= $tok;
5983             }
5984             elsif ( $tok =~ /^\s*$/ ) {     # allow space
5985             }
5986             elsif ( $tok eq '::' ) {        # leading ::
5987                 $id_scan_state = 'A';       # accept alpha next
5988                 $identifier .= $tok;
5989             }
5990             elsif ( $tok eq '{' ) {
5991                 if ( $identifier eq '&' || $i == 0 ) { $identifier = ''; }
5992                 $i             = $i_save;
5993                 $id_scan_state = '';
5994                 last;
5995             }
5996             else {
5997
5998                 # punctuation variable?
5999                 # testfile: cunningham4.pl
6000                 #
6001                 # We have to be careful here.  If we are in an unknown state,
6002                 # we will reject the punctuation variable.  In the following
6003                 # example the '&' is a binary operator but we are in an unknown
6004                 # state because there is no sigil on 'Prima', so we don't
6005                 # know what it is.  But it is a bad guess that
6006                 # '&~' is a function variable.
6007                 # $self->{text}->{colorMap}->[
6008                 #   Prima::PodView::COLOR_CODE_FOREGROUND
6009                 #   & ~tb::COLOR_INDEX ] =
6010                 #   $sec->{ColorCode}
6011                 if ( $identifier eq '&' && $expecting ) {
6012                     $identifier .= $tok;
6013                 }
6014                 else {
6015                     $identifier = '';
6016                     $i          = $i_save;
6017                     $type       = '&';
6018                 }
6019                 $id_scan_state = '';
6020                 last;
6021             }
6022         }
6023         elsif ( $id_scan_state eq 'A' ) {    # looking for alpha (after ::)
6024
6025             if ( $tok =~ /^[A-Za-z_]/ ) {    # found it
6026                 $identifier .= $tok;
6027                 $id_scan_state = ':';        # now need ::
6028                 $saw_alpha     = 1;
6029             }
6030             elsif ( $tok eq "'" && $allow_tick ) {
6031                 $identifier .= $tok;
6032                 $id_scan_state = ':';        # now need ::
6033                 $saw_alpha     = 1;
6034             }
6035             elsif ( $tok =~ /^[0-9]/ ) {     # numeric..see comments above
6036                 $identifier .= $tok;
6037                 $id_scan_state = ':';        # now need ::
6038                 $saw_alpha     = 1;
6039             }
6040             elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6041                 $id_scan_state = '(';
6042                 $identifier .= $tok;
6043             }
6044             elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6045                 $id_scan_state = ')';
6046                 $identifier .= $tok;
6047             }
6048             else {
6049                 $id_scan_state = '';
6050                 $i             = $i_save;
6051                 last;
6052             }
6053         }
6054         elsif ( $id_scan_state eq ':' ) {    # looking for :: after alpha
6055
6056             if ( $tok eq '::' ) {            # got it
6057                 $identifier .= $tok;
6058                 $id_scan_state = 'A';        # now require alpha
6059             }
6060             elsif ( $tok =~ /^[A-Za-z_]/ ) {    # more alphanumeric is ok here
6061                 $identifier .= $tok;
6062                 $id_scan_state = ':';           # now need ::
6063                 $saw_alpha     = 1;
6064             }
6065             elsif ( $tok =~ /^[0-9]/ ) {        # numeric..see comments above
6066                 $identifier .= $tok;
6067                 $id_scan_state = ':';           # now need ::
6068                 $saw_alpha     = 1;
6069             }
6070             elsif ( $tok eq "'" && $allow_tick ) {    # tick
6071
6072                 if ( $is_keyword{$identifier} ) {
6073                     $id_scan_state = '';              # that's all
6074                     $i             = $i_save;
6075                 }
6076                 else {
6077                     $identifier .= $tok;
6078                 }
6079             }
6080             elsif ( ( $identifier =~ /^sub / ) && ( $tok =~ /^\s*$/ ) ) {
6081                 $id_scan_state = '(';
6082                 $identifier .= $tok;
6083             }
6084             elsif ( ( $identifier =~ /^sub / ) && ( $tok eq '(' ) ) {
6085                 $id_scan_state = ')';
6086                 $identifier .= $tok;
6087             }
6088             else {
6089                 $id_scan_state = '';        # that's all
6090                 $i             = $i_save;
6091                 last;
6092             }
6093         }
6094         elsif ( $id_scan_state eq '(' ) {    # looking for ( of prototype
6095
6096             if ( $tok eq '(' ) {             # got it
6097                 $identifier .= $tok;
6098                 $id_scan_state = ')';        # now find the end of it
6099             }
6100             elsif ( $tok =~ /^\s*$/ ) {      # blank - keep going
6101                 $identifier .= $tok;
6102             }
6103             else {
6104                 $id_scan_state = '';         # that's all - no prototype
6105                 $i             = $i_save;
6106                 last;
6107             }
6108         }
6109         elsif ( $id_scan_state eq ')' ) {    # looking for ) to end
6110
6111             if ( $tok eq ')' ) {             # got it
6112                 $identifier .= $tok;
6113                 $id_scan_state = '';         # all done
6114                 last;
6115             }
6116             elsif ( $tok =~ /^[\s\$\%\\\*\@\&\;]/ ) {
6117                 $identifier .= $tok;
6118             }
6119             else {    # probable error in script, but keep going
6120                 warning("Unexpected '$tok' while seeking end of prototype\n");
6121                 $identifier .= $tok;
6122             }
6123         }
6124         else {        # can get here due to error in initialization
6125             $id_scan_state = '';
6126             $i             = $i_save;
6127             last;
6128         }
6129     }
6130
6131     if ( $id_scan_state eq ')' ) {
6132         warning("Hit end of line while seeking ) to end prototype\n");
6133     }
6134
6135     # once we enter the actual identifier, it may not extend beyond
6136     # the end of the current line
6137     if ( $id_scan_state =~ /^[A\:\(\)]/ ) {
6138         $id_scan_state = '';
6139     }
6140     if ( $i < 0 ) { $i = 0 }
6141
6142     unless ($type) {
6143
6144         if ($saw_type) {
6145
6146             if ($saw_alpha) {
6147                 if ( $identifier =~ /^->/ && $last_nonblank_type eq 'w' ) {
6148                     $type = 'w';
6149                 }
6150                 else { $type = 'i' }
6151             }
6152             elsif ( $identifier eq '->' ) {
6153                 $type = '->';
6154             }
6155             elsif (
6156                 ( length($identifier) > 1 )
6157
6158                 # In something like '@$=' we have an identifier '@$'
6159                 # In something like '$${' we have type '$$' (and only
6160                 # part of an identifier)
6161                 && !( $identifier =~ /\$$/ && $tok eq '{' )
6162                 && ( $identifier !~ /^(sub |package )$/ )
6163               )
6164             {
6165                 $type = 'i';
6166             }
6167             else { $type = 't' }
6168         }
6169         elsif ($saw_alpha) {
6170
6171             # type 'w' includes anything without leading type info
6172             # ($,%,@,*) including something like abc::def::ghi
6173             $type = 'w';
6174         }
6175         else {
6176             $type = '';
6177         }    # this can happen on a restart
6178     }
6179
6180     if ($identifier) {
6181         $tok = $identifier;
6182         if ($message) { write_logfile_entry($message) }
6183     }
6184     else {
6185         $tok = $tok_begin;
6186         $i   = $i_begin;
6187     }
6188
6189     TOKENIZER_DEBUG_FLAG_SCAN_ID && do {
6190         my ( $a, $b, $c ) = caller;
6191         print STDOUT
6192 "SCANID: called from $a $b $c with tok, i, state, identifier =$tok_begin, $i_begin, $id_scan_state_begin, $identifier_begin\n";
6193         print STDOUT
6194 "SCANID: returned with tok, i, state, identifier =$tok, $i, $id_scan_state, $identifier\n";
6195     };
6196     return ( $i, $tok, $type, $id_scan_state, $identifier );
6197 }
6198
6199 {
6200
6201     # saved package and subnames in case prototype is on separate line
6202     my ( $package_saved, $subname_saved );
6203
6204     sub do_scan_sub {
6205
6206         # do_scan_sub parses a sub name and prototype
6207         # it is called with $i_beg equal to the index of the first nonblank
6208         # token following a 'sub' token.
6209
6210         # TODO: add future error checks to be sure we have a valid
6211         # sub name.  For example, 'sub &doit' is wrong.  Also, be sure
6212         # a name is given if and only if a non-anonymous sub is
6213         # appropriate.
6214         # USES GLOBAL VARS: $current_package, $last_nonblank_token,
6215         # $in_attribute_list, %saw_function_definition,
6216         # $statement_type
6217
6218         my (
6219             $input_line, $i,             $i_beg,
6220             $tok,        $type,          $rtokens,
6221             $rtoken_map, $id_scan_state, $max_token_index
6222         ) = @_;
6223         $id_scan_state = "";    # normally we get everything in one call
6224         my $subname = undef;
6225         my $package = undef;
6226         my $proto   = undef;
6227         my $attrs   = undef;
6228         my $match;
6229
6230         my $pos_beg = $rtoken_map->[$i_beg];
6231         pos($input_line) = $pos_beg;
6232
6233         # Look for the sub NAME
6234         if (
6235             $input_line =~ m/\G\s*
6236         ((?:\w*(?:'|::))*)  # package - something that ends in :: or '
6237         (\w+)               # NAME    - required
6238         /gcx
6239           )
6240         {
6241             $match   = 1;
6242             $subname = $2;
6243
6244             $package = ( defined($1) && $1 ) ? $1 : $current_package;
6245             $package =~ s/\'/::/g;
6246             if ( $package =~ /^\:/ ) { $package = 'main' . $package }
6247             $package =~ s/::$//;
6248             my $pos  = pos($input_line);
6249             my $numc = $pos - $pos_beg;
6250             $tok  = 'sub ' . substr( $input_line, $pos_beg, $numc );
6251             $type = 'i';
6252         }
6253
6254         # Now look for PROTO ATTRS
6255         # Look for prototype/attributes which are usually on the same
6256         # line as the sub name but which might be on a separate line.
6257         # For example, we might have an anonymous sub with attributes,
6258         # or a prototype on a separate line from its sub name
6259
6260         # NOTE: We only want to parse PROTOTYPES here. If we see anything that
6261         # does not look like a prototype, we assume it is a SIGNATURE and we
6262         # will stop and let the the standard tokenizer handle it.  In
6263         # particular, we stop if we see any nested parens, braces, or commas.
6264         my $saw_opening_paren = $input_line =~ /\G\s*\(/;
6265         if (
6266             $input_line =~ m/\G(\s*\([^\)\(\}\{\,]*\))?  # PROTO
6267             (\s*:)?                              # ATTRS leading ':'
6268             /gcx
6269             && ( $1 || $2 )
6270           )
6271         {
6272             $proto = $1;
6273             $attrs = $2;
6274
6275             # If we also found the sub name on this call then append PROTO.
6276             # This is not necessary but for compatability with previous
6277             # versions when the -csc flag is used:
6278             if ( $match && $proto ) {
6279                 $tok .= $proto;
6280             }
6281             $match ||= 1;
6282
6283             # Handle prototype on separate line from subname
6284             if ($subname_saved) {
6285                 $package = $package_saved;
6286                 $subname = $subname_saved;
6287                 $tok     = $last_nonblank_token;
6288             }
6289             $type = 'i';
6290         }
6291
6292         if ($match) {
6293
6294             # ATTRS: if there are attributes, back up and let the ':' be
6295             # found later by the scanner.
6296             my $pos = pos($input_line);
6297             if ($attrs) {
6298                 $pos -= length($attrs);
6299             }
6300
6301             my $next_nonblank_token = $tok;
6302
6303             # catch case of line with leading ATTR ':' after anonymous sub
6304             if ( $pos == $pos_beg && $tok eq ':' ) {
6305                 $type              = 'A';
6306                 $in_attribute_list = 1;
6307             }
6308
6309             # Otherwise, if we found a match we must convert back from
6310             # string position to the pre_token index for continued parsing.
6311             else {
6312
6313                 # I don't think an error flag can occur here ..but ?
6314                 my $error;
6315                 ( $i, $error ) = inverse_pretoken_map( $i, $pos, $rtoken_map,
6316                     $max_token_index );
6317                 if ($error) { warning("Possibly invalid sub\n") }
6318
6319                 # check for multiple definitions of a sub
6320                 ( $next_nonblank_token, my $i_next ) =
6321                   find_next_nonblank_token_on_this_line( $i, $rtokens,
6322                     $max_token_index );
6323             }
6324
6325             if ( $next_nonblank_token =~ /^(\s*|#)$/ )
6326             {    # skip blank or side comment
6327                 my ( $rpre_tokens, $rpre_types ) =
6328                   peek_ahead_for_n_nonblank_pre_tokens(1);
6329                 if ( defined($rpre_tokens) && @{$rpre_tokens} ) {
6330                     $next_nonblank_token = $rpre_tokens->[0];
6331                 }
6332                 else {
6333                     $next_nonblank_token = '}';
6334                 }
6335             }
6336             $package_saved = "";
6337             $subname_saved = "";
6338
6339             # See what's next...
6340             if ( $next_nonblank_token eq '{' ) {
6341                 if ($subname) {
6342
6343                     # Check for multiple definitions of a sub, but
6344                     # it is ok to have multiple sub BEGIN, etc,
6345                     # so we do not complain if name is all caps
6346                     if (   $saw_function_definition{$package}{$subname}
6347                         && $subname !~ /^[A-Z]+$/ )
6348                     {
6349                         my $lno = $saw_function_definition{$package}{$subname};
6350                         warning(
6351 "already saw definition of 'sub $subname' in package '$package' at line $lno\n"
6352                         );
6353                     }
6354                     $saw_function_definition{$package}{$subname} =
6355                       $tokenizer_self->{_last_line_number};
6356                 }
6357             }
6358             elsif ( $next_nonblank_token eq ';' ) {
6359             }
6360             elsif ( $next_nonblank_token eq '}' ) {
6361             }
6362
6363             # ATTRS - if an attribute list follows, remember the name
6364             # of the sub so the next opening brace can be labeled.
6365             # Setting 'statement_type' causes any ':'s to introduce
6366             # attributes.
6367             elsif ( $next_nonblank_token eq ':' ) {
6368                 $statement_type = $tok;
6369             }
6370
6371             # if we stopped before an open paren ...
6372             elsif ( $next_nonblank_token eq '(' ) {
6373
6374                 # If we DID NOT see this paren above then it must be on the
6375                 # next line so we will set a flag to come back here and see if
6376                 # it is a PROTOTYPE
6377
6378                 # Otherwise, we assume it is a SIGNATURE rather than a
6379                 # PROTOTYPE and let the normal tokenizer handle it as a list
6380                 if ( !$saw_opening_paren ) {
6381                     $id_scan_state = 'sub';     # we must come back to get proto
6382                     $package_saved = $package;
6383                     $subname_saved = $subname;
6384                 }
6385                 $statement_type = $tok;
6386             }
6387             elsif ($next_nonblank_token) {      # EOF technically ok
6388                 warning(
6389 "expecting ':' or ';' or '{' after definition or declaration of sub '$subname' but saw '$next_nonblank_token'\n"
6390                 );
6391             }
6392             check_prototype( $proto, $package, $subname );
6393         }
6394
6395         # no match but line not blank
6396         else {
6397         }
6398         return ( $i, $tok, $type, $id_scan_state );
6399     }
6400 }
6401
6402 #########i###############################################################
6403 # Tokenizer utility routines which may use CONSTANTS but no other GLOBALS
6404 #########################################################################
6405
6406 sub find_next_nonblank_token {
6407     my ( $i, $rtokens, $max_token_index ) = @_;
6408
6409     if ( $i >= $max_token_index ) {
6410         if ( !peeked_ahead() ) {
6411             peeked_ahead(1);
6412             $rtokens =
6413               peek_ahead_for_nonblank_token( $rtokens, $max_token_index );
6414         }
6415     }
6416     my $next_nonblank_token = $rtokens->[ ++$i ];
6417
6418     if ( $next_nonblank_token =~ /^\s*$/ ) {
6419         $next_nonblank_token = $rtokens->[ ++$i ];
6420     }
6421     return ( $next_nonblank_token, $i );
6422 }
6423
6424 sub numerator_expected {
6425
6426     # this is a filter for a possible numerator, in support of guessing
6427     # for the / pattern delimiter token.
6428     # returns -
6429     #   1 - yes
6430     #   0 - can't tell
6431     #  -1 - no
6432     # Note: I am using the convention that variables ending in
6433     # _expected have these 3 possible values.
6434     my ( $i, $rtokens, $max_token_index ) = @_;
6435     my $numerator_expected = 0;
6436
6437     my $next_token = $rtokens->[ $i + 1 ];
6438     if ( $next_token eq '=' ) { $i++; }    # handle /=
6439     my ( $next_nonblank_token, $i_next ) =
6440       find_next_nonblank_token( $i, $rtokens, $max_token_index );
6441
6442     if ( $next_nonblank_token =~ /(\(|\$|\w|\.|\@)/ ) {
6443         $numerator_expected = 1;
6444     }
6445     else {
6446
6447         if ( $next_nonblank_token =~ /^\s*$/ ) {
6448             $numerator_expected = 0;
6449         }
6450         else {
6451             $numerator_expected = -1;
6452         }
6453     }
6454     return $numerator_expected;
6455 }
6456
6457 sub pattern_expected {
6458
6459     # This is the start of a filter for a possible pattern.
6460     # It looks at the token after a possible pattern and tries to
6461     # determine if that token could end a pattern.
6462     # returns -
6463     #   1 - yes
6464     #   0 - can't tell
6465     #  -1 - no
6466     my ( $i, $rtokens, $max_token_index ) = @_;
6467     my $is_pattern = 0;
6468
6469     my $next_token = $rtokens->[ $i + 1 ];
6470     if ( $next_token =~ /^[msixpodualgc]/ ) { $i++; }   # skip possible modifier
6471     my ( $next_nonblank_token, $i_next ) =
6472       find_next_nonblank_token( $i, $rtokens, $max_token_index );
6473
6474     # list of tokens which may follow a pattern
6475     # (can probably be expanded)
6476     if ( $next_nonblank_token =~ /(\)|\}|\;|\&\&|\|\||and|or|while|if|unless)/ )
6477     {
6478         $is_pattern = 1;
6479     }
6480     else {
6481
6482         if ( $next_nonblank_token =~ /^\s*$/ ) {
6483             $is_pattern = 0;
6484         }
6485         else {
6486             $is_pattern = -1;
6487         }
6488     }
6489     return $is_pattern;
6490 }
6491
6492 sub find_next_nonblank_token_on_this_line {
6493     my ( $i, $rtokens, $max_token_index ) = @_;
6494     my $next_nonblank_token;
6495
6496     if ( $i < $max_token_index ) {
6497         $next_nonblank_token = $rtokens->[ ++$i ];
6498
6499         if ( $next_nonblank_token =~ /^\s*$/ ) {
6500
6501             if ( $i < $max_token_index ) {
6502                 $next_nonblank_token = $rtokens->[ ++$i ];
6503             }
6504         }
6505     }
6506     else {
6507         $next_nonblank_token = "";
6508     }
6509     return ( $next_nonblank_token, $i );
6510 }
6511
6512 sub find_angle_operator_termination {
6513
6514     # We are looking at a '<' and want to know if it is an angle operator.
6515     # We are to return:
6516     #   $i = pretoken index of ending '>' if found, current $i otherwise
6517     #   $type = 'Q' if found, '>' otherwise
6518     my ( $input_line, $i_beg, $rtoken_map, $expecting, $max_token_index ) = @_;
6519     my $i    = $i_beg;
6520     my $type = '<';
6521     pos($input_line) = 1 + $rtoken_map->[$i];
6522
6523     my $filter;
6524
6525     # we just have to find the next '>' if a term is expected
6526     if ( $expecting == TERM ) { $filter = '[\>]' }
6527
6528     # we have to guess if we don't know what is expected
6529     elsif ( $expecting == UNKNOWN ) { $filter = '[\>\;\=\#\|\<]' }
6530
6531     # shouldn't happen - we shouldn't be here if operator is expected
6532     else { warning("Program Bug in find_angle_operator_termination\n") }
6533
6534     # To illustrate what we might be looking at, in case we are
6535     # guessing, here are some examples of valid angle operators
6536     # (or file globs):
6537     #  <tmp_imp/*>
6538     #  <FH>
6539     #  <$fh>
6540     #  <*.c *.h>
6541     #  <_>
6542     #  <jskdfjskdfj* op/* jskdjfjkosvk*> ( glob.t)
6543     #  <${PREFIX}*img*.$IMAGE_TYPE>
6544     #  <img*.$IMAGE_TYPE>
6545     #  <Timg*.$IMAGE_TYPE>
6546     #  <$LATEX2HTMLVERSIONS${dd}html[1-9].[0-9].pl>
6547     #
6548     # Here are some examples of lines which do not have angle operators:
6549     #  return unless $self->[2]++ < $#{$self->[1]};
6550     #  < 2  || @$t >
6551     #
6552     # the following line from dlister.pl caused trouble:
6553     #  print'~'x79,"\n",$D<1024?"0.$D":$D>>10,"K, $C files\n\n\n";
6554     #
6555     # If the '<' starts an angle operator, it must end on this line and
6556     # it must not have certain characters like ';' and '=' in it.  I use
6557     # this to limit the testing.  This filter should be improved if
6558     # possible.
6559
6560     if ( $input_line =~ /($filter)/g ) {
6561
6562         if ( $1 eq '>' ) {
6563
6564             # We MAY have found an angle operator termination if we get
6565             # here, but we need to do more to be sure we haven't been
6566             # fooled.
6567             my $pos = pos($input_line);
6568
6569             my $pos_beg = $rtoken_map->[$i];
6570             my $str     = substr( $input_line, $pos_beg, ( $pos - $pos_beg ) );
6571
6572             # Reject if the closing '>' follows a '-' as in:
6573             # if ( VERSION < 5.009 && $op-> name eq 'assign' ) { }
6574             if ( $expecting eq UNKNOWN ) {
6575                 my $check = substr( $input_line, $pos - 2, 1 );
6576                 if ( $check eq '-' ) {
6577                     return ( $i, $type );
6578                 }
6579             }
6580
6581             ######################################debug#####
6582             #write_diagnostics( "ANGLE? :$str\n");
6583             #print "ANGLE: found $1 at pos=$pos str=$str check=$check\n";
6584             ######################################debug#####
6585             $type = 'Q';
6586             my $error;
6587             ( $i, $error ) =
6588               inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6589
6590             # It may be possible that a quote ends midway in a pretoken.
6591             # If this happens, it may be necessary to split the pretoken.
6592             if ($error) {
6593                 warning(
6594                     "Possible tokinization error..please check this line\n");
6595                 report_possible_bug();
6596             }
6597
6598             # Now let's see where we stand....
6599             # OK if math op not possible
6600             if ( $expecting == TERM ) {
6601             }
6602
6603             # OK if there are no more than 2 pre-tokens inside
6604             # (not possible to write 2 token math between < and >)
6605             # This catches most common cases
6606             elsif ( $i <= $i_beg + 3 ) {
6607                 write_diagnostics("ANGLE(1 or 2 tokens): $str\n");
6608             }
6609
6610             # Not sure..
6611             else {
6612
6613                 # Let's try a Brace Test: any braces inside must balance
6614                 my $br = 0;
6615                 while ( $str =~ /\{/g ) { $br++ }
6616                 while ( $str =~ /\}/g ) { $br-- }
6617                 my $sb = 0;
6618                 while ( $str =~ /\[/g ) { $sb++ }
6619                 while ( $str =~ /\]/g ) { $sb-- }
6620                 my $pr = 0;
6621                 while ( $str =~ /\(/g ) { $pr++ }
6622                 while ( $str =~ /\)/g ) { $pr-- }
6623
6624                 # if braces do not balance - not angle operator
6625                 if ( $br || $sb || $pr ) {
6626                     $i    = $i_beg;
6627                     $type = '<';
6628                     write_diagnostics(
6629                         "NOT ANGLE (BRACE={$br ($pr [$sb ):$str\n");
6630                 }
6631
6632                 # we should keep doing more checks here...to be continued
6633                 # Tentatively accepting this as a valid angle operator.
6634                 # There are lots more things that can be checked.
6635                 else {
6636                     write_diagnostics(
6637                         "ANGLE-Guessing yes: $str expecting=$expecting\n");
6638                     write_logfile_entry("Guessing angle operator here: $str\n");
6639                 }
6640             }
6641         }
6642
6643         # didn't find ending >
6644         else {
6645             if ( $expecting == TERM ) {
6646                 warning("No ending > for angle operator\n");
6647             }
6648         }
6649     }
6650     return ( $i, $type );
6651 }
6652
6653 sub scan_number_do {
6654
6655     #  scan a number in any of the formats that Perl accepts
6656     #  Underbars (_) are allowed in decimal numbers.
6657     #  input parameters -
6658     #      $input_line  - the string to scan
6659     #      $i           - pre_token index to start scanning
6660     #    $rtoken_map    - reference to the pre_token map giving starting
6661     #                    character position in $input_line of token $i
6662     #  output parameters -
6663     #    $i            - last pre_token index of the number just scanned
6664     #    number        - the number (characters); or undef if not a number
6665
6666     my ( $input_line, $i, $rtoken_map, $input_type, $max_token_index ) = @_;
6667     my $pos_beg = $rtoken_map->[$i];
6668     my $pos;
6669     my $i_begin = $i;
6670     my $number  = undef;
6671     my $type    = $input_type;
6672
6673     my $first_char = substr( $input_line, $pos_beg, 1 );
6674
6675     # Look for bad starting characters; Shouldn't happen..
6676     if ( $first_char !~ /[\d\.\+\-Ee]/ ) {
6677         warning("Program bug - scan_number given character $first_char\n");
6678         report_definite_bug();
6679         return ( $i, $type, $number );
6680     }
6681
6682     # handle v-string without leading 'v' character ('Two Dot' rule)
6683     # (vstring.t)
6684     # TODO: v-strings may contain underscores
6685     pos($input_line) = $pos_beg;
6686     if ( $input_line =~ /\G((\d+)?\.\d+(\.\d+)+)/g ) {
6687         $pos = pos($input_line);
6688         my $numc = $pos - $pos_beg;
6689         $number = substr( $input_line, $pos_beg, $numc );
6690         $type   = 'v';
6691         report_v_string($number);
6692     }
6693
6694     # handle octal, hex, binary
6695     if ( !defined($number) ) {
6696         pos($input_line) = $pos_beg;
6697         if ( $input_line =~
6698             /\G[+-]?0(([xX][0-9a-fA-F_]+)|([0-7_]+)|([bB][01_]+))/g )
6699         {
6700             $pos = pos($input_line);
6701             my $numc = $pos - $pos_beg;
6702             $number = substr( $input_line, $pos_beg, $numc );
6703             $type   = 'n';
6704         }
6705     }
6706
6707     # handle decimal
6708     if ( !defined($number) ) {
6709         pos($input_line) = $pos_beg;
6710
6711         if ( $input_line =~ /\G([+-]?[\d_]*(\.[\d_]*)?([Ee][+-]?(\d+))?)/g ) {
6712             $pos = pos($input_line);
6713
6714             # watch out for things like 0..40 which would give 0. by this;
6715             if (   ( substr( $input_line, $pos - 1, 1 ) eq '.' )
6716                 && ( substr( $input_line, $pos, 1 ) eq '.' ) )
6717             {
6718                 $pos--;
6719             }
6720             my $numc = $pos - $pos_beg;
6721             $number = substr( $input_line, $pos_beg, $numc );
6722             $type   = 'n';
6723         }
6724     }
6725
6726     # filter out non-numbers like e + - . e2  .e3 +e6
6727     # the rule: at least one digit, and any 'e' must be preceded by a digit
6728     if (
6729         $number !~ /\d/    # no digits
6730         || (   $number =~ /^(.*)[eE]/
6731             && $1 !~ /\d/ )    # or no digits before the 'e'
6732       )
6733     {
6734         $number = undef;
6735         $type   = $input_type;
6736         return ( $i, $type, $number );
6737     }
6738
6739     # Found a number; now we must convert back from character position
6740     # to pre_token index. An error here implies user syntax error.
6741     # An example would be an invalid octal number like '009'.
6742     my $error;
6743     ( $i, $error ) =
6744       inverse_pretoken_map( $i, $pos, $rtoken_map, $max_token_index );
6745     if ($error) { warning("Possibly invalid number\n") }
6746
6747     return ( $i, $type, $number );
6748 }
6749
6750 sub inverse_pretoken_map {
6751
6752     # Starting with the current pre_token index $i, scan forward until
6753     # finding the index of the next pre_token whose position is $pos.
6754     my ( $i, $pos, $rtoken_map, $max_token_index ) = @_;
6755     my $error = 0;
6756
6757     while ( ++$i <= $max_token_index ) {
6758
6759         if ( $pos <= $rtoken_map->[$i] ) {
6760
6761             # Let the calling routine handle errors in which we do not
6762             # land on a pre-token boundary.  It can happen by running
6763             # perltidy on some non-perl scripts, for example.
6764             if ( $pos < $rtoken_map->[$i] ) { $error = 1 }
6765             $i--;
6766             last;
6767         }
6768     }
6769     return ( $i, $error );
6770 }
6771
6772 sub find_here_doc {
6773
6774     # find the target of a here document, if any
6775     # input parameters:
6776     #   $i - token index of the second < of <<
6777     #   ($i must be less than the last token index if this is called)
6778     # output parameters:
6779     #   $found_target = 0 didn't find target; =1 found target
6780     #   HERE_TARGET - the target string (may be empty string)
6781     #   $i - unchanged if not here doc,
6782     #    or index of the last token of the here target
6783     #   $saw_error - flag noting unbalanced quote on here target
6784     my ( $expecting, $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
6785     my $ibeg                 = $i;
6786     my $found_target         = 0;
6787     my $here_doc_target      = '';
6788     my $here_quote_character = '';
6789     my $saw_error            = 0;
6790     my ( $next_nonblank_token, $i_next_nonblank, $next_token );
6791     $next_token = $rtokens->[ $i + 1 ];
6792
6793     # perl allows a backslash before the target string (heredoc.t)
6794     my $backslash = 0;
6795     if ( $next_token eq '\\' ) {
6796         $backslash  = 1;
6797         $next_token = $rtokens->[ $i + 2 ];
6798     }
6799
6800     ( $next_nonblank_token, $i_next_nonblank ) =
6801       find_next_nonblank_token_on_this_line( $i, $rtokens, $max_token_index );
6802
6803     if ( $next_nonblank_token =~ /[\'\"\`]/ ) {
6804
6805         my $in_quote    = 1;
6806         my $quote_depth = 0;
6807         my $quote_pos   = 0;
6808         my $quoted_string;
6809
6810         (
6811             $i, $in_quote, $here_quote_character, $quote_pos, $quote_depth,
6812             $quoted_string
6813           )
6814           = follow_quoted_string( $i_next_nonblank, $in_quote, $rtokens,
6815             $here_quote_character, $quote_pos, $quote_depth, $max_token_index );
6816
6817         if ($in_quote) {    # didn't find end of quote, so no target found
6818             $i = $ibeg;
6819             if ( $expecting == TERM ) {
6820                 warning(
6821 "Did not find here-doc string terminator ($here_quote_character) before end of line \n"
6822                 );
6823                 $saw_error = 1;
6824             }
6825         }
6826         else {              # found ending quote
6827             $found_target = 1;
6828
6829             my $tokj;
6830             foreach my $j ( $i_next_nonblank + 1 .. $i - 1 ) {
6831                 $tokj = $rtokens->[$j];
6832
6833                 # we have to remove any backslash before the quote character
6834                 # so that the here-doc-target exactly matches this string
6835                 next
6836                   if ( $tokj eq "\\"
6837                     && $j < $i - 1
6838                     && $rtokens->[ $j + 1 ] eq $here_quote_character );
6839                 $here_doc_target .= $tokj;
6840             }
6841         }
6842     }
6843
6844     elsif ( ( $next_token =~ /^\s*$/ ) and ( $expecting == TERM ) ) {
6845         $found_target = 1;
6846         write_logfile_entry(
6847             "found blank here-target after <<; suggest using \"\"\n");
6848         $i = $ibeg;
6849     }
6850     elsif ( $next_token =~ /^\w/ ) {    # simple bareword or integer after <<
6851
6852         my $here_doc_expected;
6853         if ( $expecting == UNKNOWN ) {
6854             $here_doc_expected = guess_if_here_doc($next_token);
6855         }
6856         else {
6857             $here_doc_expected = 1;
6858         }
6859
6860         if ($here_doc_expected) {
6861             $found_target    = 1;
6862             $here_doc_target = $next_token;
6863             $i               = $ibeg + 1;
6864         }
6865
6866     }
6867     else {
6868
6869         if ( $expecting == TERM ) {
6870             $found_target = 1;
6871             write_logfile_entry("Note: bare here-doc operator <<\n");
6872         }
6873         else {
6874             $i = $ibeg;
6875         }
6876     }
6877
6878     # patch to neglect any prepended backslash
6879     if ( $found_target && $backslash ) { $i++ }
6880
6881     return ( $found_target, $here_doc_target, $here_quote_character, $i,
6882         $saw_error );
6883 }
6884
6885 sub do_quote {
6886
6887     # follow (or continue following) quoted string(s)
6888     # $in_quote return code:
6889     #   0 - ok, found end
6890     #   1 - still must find end of quote whose target is $quote_character
6891     #   2 - still looking for end of first of two quotes
6892     #
6893     # Returns updated strings:
6894     #  $quoted_string_1 = quoted string seen while in_quote=1
6895     #  $quoted_string_2 = quoted string seen while in_quote=2
6896     my (
6897         $i,               $in_quote,    $quote_character,
6898         $quote_pos,       $quote_depth, $quoted_string_1,
6899         $quoted_string_2, $rtokens,     $rtoken_map,
6900         $max_token_index
6901     ) = @_;
6902
6903     my $in_quote_starting = $in_quote;
6904
6905     my $quoted_string;
6906     if ( $in_quote == 2 ) {    # two quotes/quoted_string_1s to follow
6907         my $ibeg = $i;
6908         (
6909             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6910             $quoted_string
6911           )
6912           = follow_quoted_string( $i, $in_quote, $rtokens, $quote_character,
6913             $quote_pos, $quote_depth, $max_token_index );
6914         $quoted_string_2 .= $quoted_string;
6915         if ( $in_quote == 1 ) {
6916             if ( $quote_character =~ /[\{\[\<\(]/ ) { $i++; }
6917             $quote_character = '';
6918         }
6919         else {
6920             $quoted_string_2 .= "\n";
6921         }
6922     }
6923
6924     if ( $in_quote == 1 ) {    # one (more) quote to follow
6925         my $ibeg = $i;
6926         (
6927             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6928             $quoted_string
6929           )
6930           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
6931             $quote_pos, $quote_depth, $max_token_index );
6932         $quoted_string_1 .= $quoted_string;
6933         if ( $in_quote == 1 ) {
6934             $quoted_string_1 .= "\n";
6935         }
6936     }
6937     return ( $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
6938         $quoted_string_1, $quoted_string_2 );
6939 }
6940
6941 sub follow_quoted_string {
6942
6943     # scan for a specific token, skipping escaped characters
6944     # if the quote character is blank, use the first non-blank character
6945     # input parameters:
6946     #   $rtokens = reference to the array of tokens
6947     #   $i = the token index of the first character to search
6948     #   $in_quote = number of quoted strings being followed
6949     #   $beginning_tok = the starting quote character
6950     #   $quote_pos = index to check next for alphanumeric delimiter
6951     # output parameters:
6952     #   $i = the token index of the ending quote character
6953     #   $in_quote = decremented if found end, unchanged if not
6954     #   $beginning_tok = the starting quote character
6955     #   $quote_pos = index to check next for alphanumeric delimiter
6956     #   $quote_depth = nesting depth, since delimiters '{ ( [ <' can be nested.
6957     #   $quoted_string = the text of the quote (without quotation tokens)
6958     my ( $i_beg, $in_quote, $rtokens, $beginning_tok, $quote_pos, $quote_depth,
6959         $max_token_index )
6960       = @_;
6961     my ( $tok, $end_tok );
6962     my $i             = $i_beg - 1;
6963     my $quoted_string = "";
6964
6965     TOKENIZER_DEBUG_FLAG_QUOTE && do {
6966         print STDOUT
6967 "QUOTE entering with quote_pos = $quote_pos i=$i beginning_tok =$beginning_tok\n";
6968     };
6969
6970     # get the corresponding end token
6971     if ( $beginning_tok !~ /^\s*$/ ) {
6972         $end_tok = matching_end_token($beginning_tok);
6973     }
6974
6975     # a blank token means we must find and use the first non-blank one
6976     else {
6977         my $allow_quote_comments = ( $i < 0 ) ? 1 : 0; # i<0 means we saw a <cr>
6978
6979         while ( $i < $max_token_index ) {
6980             $tok = $rtokens->[ ++$i ];
6981
6982             if ( $tok !~ /^\s*$/ ) {
6983
6984                 if ( ( $tok eq '#' ) && ($allow_quote_comments) ) {
6985                     $i = $max_token_index;
6986                 }
6987                 else {
6988
6989                     if ( length($tok) > 1 ) {
6990                         if ( $quote_pos <= 0 ) { $quote_pos = 1 }
6991                         $beginning_tok = substr( $tok, $quote_pos - 1, 1 );
6992                     }
6993                     else {
6994                         $beginning_tok = $tok;
6995                         $quote_pos     = 0;
6996                     }
6997                     $end_tok     = matching_end_token($beginning_tok);
6998                     $quote_depth = 1;
6999                     last;
7000                 }
7001             }
7002             else {
7003                 $allow_quote_comments = 1;
7004             }
7005         }
7006     }
7007
7008     # There are two different loops which search for the ending quote
7009     # character.  In the rare case of an alphanumeric quote delimiter, we
7010     # have to look through alphanumeric tokens character-by-character, since
7011     # the pre-tokenization process combines multiple alphanumeric
7012     # characters, whereas for a non-alphanumeric delimiter, only tokens of
7013     # length 1 can match.
7014
7015     ###################################################################
7016     # Case 1 (rare): loop for case of alphanumeric quote delimiter..
7017     # "quote_pos" is the position the current word to begin searching
7018     ###################################################################
7019     if ( $beginning_tok =~ /\w/ ) {
7020
7021         # Note this because it is not recommended practice except
7022         # for obfuscated perl contests
7023         if ( $in_quote == 1 ) {
7024             write_logfile_entry(
7025                 "Note: alphanumeric quote delimiter ($beginning_tok) \n");
7026         }
7027
7028         while ( $i < $max_token_index ) {
7029
7030             if ( $quote_pos == 0 || ( $i < 0 ) ) {
7031                 $tok = $rtokens->[ ++$i ];
7032
7033                 if ( $tok eq '\\' ) {
7034
7035                     # retain backslash unless it hides the end token
7036                     $quoted_string .= $tok
7037                       unless $rtokens->[ $i + 1 ] eq $end_tok;
7038                     $quote_pos++;
7039                     last if ( $i >= $max_token_index );
7040                     $tok = $rtokens->[ ++$i ];
7041                 }
7042             }
7043             my $old_pos = $quote_pos;
7044
7045             unless ( defined($tok) && defined($end_tok) && defined($quote_pos) )
7046             {
7047
7048             }
7049             $quote_pos = 1 + index( $tok, $end_tok, $quote_pos );
7050
7051             if ( $quote_pos > 0 ) {
7052
7053                 $quoted_string .=
7054                   substr( $tok, $old_pos, $quote_pos - $old_pos - 1 );
7055
7056                 $quote_depth--;
7057
7058                 if ( $quote_depth == 0 ) {
7059                     $in_quote--;
7060                     last;
7061                 }
7062             }
7063             else {
7064                 $quoted_string .= substr( $tok, $old_pos );
7065             }
7066         }
7067     }
7068
7069     ########################################################################
7070     # Case 2 (normal): loop for case of a non-alphanumeric quote delimiter..
7071     ########################################################################
7072     else {
7073
7074         while ( $i < $max_token_index ) {
7075             $tok = $rtokens->[ ++$i ];
7076
7077             if ( $tok eq $end_tok ) {
7078                 $quote_depth--;
7079
7080                 if ( $quote_depth == 0 ) {
7081                     $in_quote--;
7082                     last;
7083                 }
7084             }
7085             elsif ( $tok eq $beginning_tok ) {
7086                 $quote_depth++;
7087             }
7088             elsif ( $tok eq '\\' ) {
7089
7090                 # retain backslash unless it hides the beginning or end token
7091                 $tok = $rtokens->[ ++$i ];
7092                 $quoted_string .= '\\'
7093                   unless ( $tok eq $end_tok || $tok eq $beginning_tok );
7094             }
7095             $quoted_string .= $tok;
7096         }
7097     }
7098     if ( $i > $max_token_index ) { $i = $max_token_index }
7099     return ( $i, $in_quote, $beginning_tok, $quote_pos, $quote_depth,
7100         $quoted_string );
7101 }
7102
7103 sub indicate_error {
7104     my ( $msg, $line_number, $input_line, $pos, $carrat ) = @_;
7105     interrupt_logfile();
7106     warning($msg);
7107     write_error_indicator_pair( $line_number, $input_line, $pos, $carrat );
7108     resume_logfile();
7109     return;
7110 }
7111
7112 sub write_error_indicator_pair {
7113     my ( $line_number, $input_line, $pos, $carrat ) = @_;
7114     my ( $offset, $numbered_line, $underline ) =
7115       make_numbered_line( $line_number, $input_line, $pos );
7116     $underline = write_on_underline( $underline, $pos - $offset, $carrat );
7117     warning( $numbered_line . "\n" );
7118     $underline =~ s/\s*$//;
7119     warning( $underline . "\n" );
7120     return;
7121 }
7122
7123 sub make_numbered_line {
7124
7125     #  Given an input line, its line number, and a character position of
7126     #  interest, create a string not longer than 80 characters of the form
7127     #     $lineno: sub_string
7128     #  such that the sub_string of $str contains the position of interest
7129     #
7130     #  Here is an example of what we want, in this case we add trailing
7131     #  '...' because the line is long.
7132     #
7133     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7134     #
7135     #  Here is another example, this time in which we used leading '...'
7136     #  because of excessive length:
7137     #
7138     # 2: ... er of the World Wide Web Consortium's
7139     #
7140     #  input parameters are:
7141     #   $lineno = line number
7142     #   $str = the text of the line
7143     #   $pos = position of interest (the error) : 0 = first character
7144     #
7145     #   We return :
7146     #     - $offset = an offset which corrects the position in case we only
7147     #       display part of a line, such that $pos-$offset is the effective
7148     #       position from the start of the displayed line.
7149     #     - $numbered_line = the numbered line as above,
7150     #     - $underline = a blank 'underline' which is all spaces with the same
7151     #       number of characters as the numbered line.
7152
7153     my ( $lineno, $str, $pos ) = @_;
7154     my $offset = ( $pos < 60 ) ? 0 : $pos - 40;
7155     my $excess = length($str) - $offset - 68;
7156     my $numc   = ( $excess > 0 ) ? 68 : undef;
7157
7158     if ( defined($numc) ) {
7159         if ( $offset == 0 ) {
7160             $str = substr( $str, $offset, $numc - 4 ) . " ...";
7161         }
7162         else {
7163             $str = "... " . substr( $str, $offset + 4, $numc - 4 ) . " ...";
7164         }
7165     }
7166     else {
7167
7168         if ( $offset == 0 ) {
7169         }
7170         else {
7171             $str = "... " . substr( $str, $offset + 4 );
7172         }
7173     }
7174
7175     my $numbered_line = sprintf( "%d: ", $lineno );
7176     $offset -= length($numbered_line);
7177     $numbered_line .= $str;
7178     my $underline = " " x length($numbered_line);
7179     return ( $offset, $numbered_line, $underline );
7180 }
7181
7182 sub write_on_underline {
7183
7184     # The "underline" is a string that shows where an error is; it starts
7185     # out as a string of blanks with the same length as the numbered line of
7186     # code above it, and we have to add marking to show where an error is.
7187     # In the example below, we want to write the string '--^' just below
7188     # the line of bad code:
7189     #
7190     # 2: (One of QAML 2.0's authors is a member of the World Wide Web Con ...
7191     #                 ---^
7192     # We are given the current underline string, plus a position and a
7193     # string to write on it.
7194     #
7195     # In the above example, there will be 2 calls to do this:
7196     # First call:  $pos=19, pos_chr=^
7197     # Second call: $pos=16, pos_chr=---
7198     #
7199     # This is a trivial thing to do with substr, but there is some
7200     # checking to do.
7201
7202     my ( $underline, $pos, $pos_chr ) = @_;
7203
7204     # check for error..shouldn't happen
7205     unless ( ( $pos >= 0 ) && ( $pos <= length($underline) ) ) {
7206         return $underline;
7207     }
7208     my $excess = length($pos_chr) + $pos - length($underline);
7209     if ( $excess > 0 ) {
7210         $pos_chr = substr( $pos_chr, 0, length($pos_chr) - $excess );
7211     }
7212     substr( $underline, $pos, length($pos_chr) ) = $pos_chr;
7213     return ($underline);
7214 }
7215
7216 sub pre_tokenize {
7217
7218     # Break a string, $str, into a sequence of preliminary tokens.  We
7219     # are interested in these types of tokens:
7220     #   words       (type='w'),            example: 'max_tokens_wanted'
7221     #   digits      (type = 'd'),          example: '0755'
7222     #   whitespace  (type = 'b'),          example: '   '
7223     #   any other single character (i.e. punct; type = the character itself).
7224     # We cannot do better than this yet because we might be in a quoted
7225     # string or pattern.  Caller sets $max_tokens_wanted to 0 to get all
7226     # tokens.
7227     my ( $str, $max_tokens_wanted ) = @_;
7228
7229     # we return references to these 3 arrays:
7230     my @tokens    = ();     # array of the tokens themselves
7231     my @token_map = (0);    # string position of start of each token
7232     my @type      = ();     # 'b'=whitespace, 'd'=digits, 'w'=alpha, or punct
7233
7234     do {
7235
7236         # whitespace
7237         if ( $str =~ /\G(\s+)/gc ) { push @type, 'b'; }
7238
7239         # numbers
7240         # note that this must come before words!
7241         elsif ( $str =~ /\G(\d+)/gc ) { push @type, 'd'; }
7242
7243         # words
7244         elsif ( $str =~ /\G(\w+)/gc ) { push @type, 'w'; }
7245
7246         # single-character punctuation
7247         elsif ( $str =~ /\G(\W)/gc ) { push @type, $1; }
7248
7249         # that's all..
7250         else {
7251             return ( \@tokens, \@token_map, \@type );
7252         }
7253
7254         push @tokens,    $1;
7255         push @token_map, pos($str);
7256
7257     } while ( --$max_tokens_wanted != 0 );
7258
7259     return ( \@tokens, \@token_map, \@type );
7260 }
7261
7262 sub show_tokens {
7263
7264     # this is an old debug routine
7265     # not called, but saved for reference
7266     my ( $rtokens, $rtoken_map ) = @_;
7267     my $num = scalar( @{$rtokens} );
7268
7269     foreach my $i ( 0 .. $num - 1 ) {
7270         my $len = length( $rtokens->[$i] );
7271         print STDOUT "$i:$len:$rtoken_map->[$i]:$rtokens->[$i]:\n";
7272     }
7273     return;
7274 }
7275
7276 {
7277     my %matching_end_token;
7278
7279     BEGIN {
7280         %matching_end_token = (
7281             '{' => '}',
7282             '(' => ')',
7283             '[' => ']',
7284             '<' => '>',
7285         );
7286     }
7287
7288     sub matching_end_token {
7289
7290         # return closing character for a pattern
7291         my $beginning_token = shift;
7292         if ( $matching_end_token{$beginning_token} ) {
7293             return $matching_end_token{$beginning_token};
7294         }
7295         return ($beginning_token);
7296     }
7297 }
7298
7299 sub dump_token_types {
7300     my ( $class, $fh ) = @_;
7301
7302     # This should be the latest list of token types in use
7303     # adding NEW_TOKENS: add a comment here
7304     print $fh <<'END_OF_LIST';
7305
7306 Here is a list of the token types currently used for lines of type 'CODE'.
7307 For the following tokens, the "type" of a token is just the token itself.
7308
7309 .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7310 ( ) <= >= == =~ !~ != ++ -- /= x=
7311 ... **= <<= >>= &&= ||= //= <=>
7312 , + - / * | % ! x ~ = \ ? : . < > ^ &
7313
7314 The following additional token types are defined:
7315
7316  type    meaning
7317     b    blank (white space)
7318     {    indent: opening structural curly brace or square bracket or paren
7319          (code block, anonymous hash reference, or anonymous array reference)
7320     }    outdent: right structural curly brace or square bracket or paren
7321     [    left non-structural square bracket (enclosing an array index)
7322     ]    right non-structural square bracket
7323     (    left non-structural paren (all but a list right of an =)
7324     )    right non-structural paren
7325     L    left non-structural curly brace (enclosing a key)
7326     R    right non-structural curly brace
7327     ;    terminal semicolon
7328     f    indicates a semicolon in a "for" statement
7329     h    here_doc operator <<
7330     #    a comment
7331     Q    indicates a quote or pattern
7332     q    indicates a qw quote block
7333     k    a perl keyword
7334     C    user-defined constant or constant function (with void prototype = ())
7335     U    user-defined function taking parameters
7336     G    user-defined function taking block parameter (like grep/map/eval)
7337     M    (unused, but reserved for subroutine definition name)
7338     P    (unused, but -html uses it to label pod text)
7339     t    type indicater such as %,$,@,*,&,sub
7340     w    bare word (perhaps a subroutine call)
7341     i    identifier of some type (with leading %, $, @, *, &, sub, -> )
7342     n    a number
7343     v    a v-string
7344     F    a file test operator (like -e)
7345     Y    File handle
7346     Z    identifier in indirect object slot: may be file handle, object
7347     J    LABEL:  code block label
7348     j    LABEL after next, last, redo, goto
7349     p    unary +
7350     m    unary -
7351     pp   pre-increment operator ++
7352     mm   pre-decrement operator --
7353     A    : used as attribute separator
7354
7355     Here are the '_line_type' codes used internally:
7356     SYSTEM         - system-specific code before hash-bang line
7357     CODE           - line of perl code (including comments)
7358     POD_START      - line starting pod, such as '=head'
7359     POD            - pod documentation text
7360     POD_END        - last line of pod section, '=cut'
7361     HERE           - text of here-document
7362     HERE_END       - last line of here-doc (target word)
7363     FORMAT         - format section
7364     FORMAT_END     - last line of format section, '.'
7365     DATA_START     - __DATA__ line
7366     DATA           - unidentified text following __DATA__
7367     END_START      - __END__ line
7368     END            - unidentified text following __END__
7369     ERROR          - we are in big trouble, probably not a perl script
7370 END_OF_LIST
7371
7372     return;
7373 }
7374
7375 BEGIN {
7376
7377     # These names are used in error messages
7378     @opening_brace_names = qw# '{' '[' '(' '?' #;
7379     @closing_brace_names = qw# '}' ']' ')' ':' #;
7380
7381     my @q;
7382
7383     my @digraphs = qw(
7384       .. :: << >> ** && .. || // -> => += -= .= %= &= |= ^= *= <>
7385       <= >= == =~ !~ != ++ -- /= x= ~~ ~. |. &. ^.
7386     );
7387     @is_digraph{@digraphs} = (1) x scalar(@digraphs);
7388
7389     my @trigraphs = qw( ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~);
7390     @is_trigraph{@trigraphs} = (1) x scalar(@trigraphs);
7391
7392     my @tetragraphs = qw( <<>> );
7393     @is_tetragraph{@tetragraphs} = (1) x scalar(@tetragraphs);
7394
7395     # make a hash of all valid token types for self-checking the tokenizer
7396     # (adding NEW_TOKENS : select a new character and add to this list)
7397     my @valid_token_types = qw#
7398       A b C G L R f h Q k t w i q n p m F pp mm U j J Y Z v
7399       { } ( ) [ ] ; + - / * | % ! x ~ = \ ? : . < > ^ &
7400       #;
7401     push( @valid_token_types, @digraphs );
7402     push( @valid_token_types, @trigraphs );
7403     push( @valid_token_types, @tetragraphs );
7404     push( @valid_token_types, ( '#', ',', 'CORE::' ) );
7405     @is_valid_token_type{@valid_token_types} = (1) x scalar(@valid_token_types);
7406
7407     # a list of file test letters, as in -e (Table 3-4 of 'camel 3')
7408     my @file_test_operators =
7409       qw( A B C M O R S T W X b c d e f g k l o p r s t u w x z);
7410     @is_file_test_operator{@file_test_operators} =
7411       (1) x scalar(@file_test_operators);
7412
7413     # these functions have prototypes of the form (&), so when they are
7414     # followed by a block, that block MAY BE followed by an operator.
7415     # Smartmatch operator ~~ may be followed by anonymous hash or array ref
7416     @q = qw( do eval );
7417     @is_block_operator{@q} = (1) x scalar(@q);
7418
7419     # these functions allow an identifier in the indirect object slot
7420     @q = qw( print printf sort exec system say);
7421     @is_indirect_object_taker{@q} = (1) x scalar(@q);
7422
7423     # These tokens may precede a code block
7424     # patched for SWITCH/CASE/CATCH.  Actually these could be removed
7425     # now and we could let the extended-syntax coding handle them
7426     @q =
7427       qw( BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue if elsif else
7428       unless do while until eval for foreach map grep sort
7429       switch case given when catch try finally);
7430     @is_code_block_token{@q} = (1) x scalar(@q);
7431
7432     # I'll build the list of keywords incrementally
7433     my @Keywords = ();
7434
7435     # keywords and tokens after which a value or pattern is expected,
7436     # but not an operator.  In other words, these should consume terms
7437     # to their right, or at least they are not expected to be followed
7438     # immediately by operators.
7439     my @value_requestor = qw(
7440       AUTOLOAD
7441       BEGIN
7442       CHECK
7443       DESTROY
7444       END
7445       EQ
7446       GE
7447       GT
7448       INIT
7449       LE
7450       LT
7451       NE
7452       UNITCHECK
7453       abs
7454       accept
7455       alarm
7456       and
7457       atan2
7458       bind
7459       binmode
7460       bless
7461       break
7462       caller
7463       chdir
7464       chmod
7465       chomp
7466       chop
7467       chown
7468       chr
7469       chroot
7470       close
7471       closedir
7472       cmp
7473       connect
7474       continue
7475       cos
7476       crypt
7477       dbmclose
7478       dbmopen
7479       defined
7480       delete
7481       die
7482       dump
7483       each
7484       else
7485       elsif
7486       eof
7487       eq
7488       exec
7489       exists
7490       exit
7491       exp
7492       fcntl
7493       fileno
7494       flock
7495       for
7496       foreach
7497       formline
7498       ge
7499       getc
7500       getgrgid
7501       getgrnam
7502       gethostbyaddr
7503       gethostbyname
7504       getnetbyaddr
7505       getnetbyname
7506       getpeername
7507       getpgrp
7508       getpriority
7509       getprotobyname
7510       getprotobynumber
7511       getpwnam
7512       getpwuid
7513       getservbyname
7514       getservbyport
7515       getsockname
7516       getsockopt
7517       glob
7518       gmtime
7519       goto
7520       grep
7521       gt
7522       hex
7523       if
7524       index
7525       int
7526       ioctl
7527       join
7528       keys
7529       kill
7530       last
7531       lc
7532       lcfirst
7533       le
7534       length
7535       link
7536       listen
7537       local
7538       localtime
7539       lock
7540       log
7541       lstat
7542       lt
7543       map
7544       mkdir
7545       msgctl
7546       msgget
7547       msgrcv
7548       msgsnd
7549       my
7550       ne
7551       next
7552       no
7553       not
7554       oct
7555       open
7556       opendir
7557       or
7558       ord
7559       our
7560       pack
7561       pipe
7562       pop
7563       pos
7564       print
7565       printf
7566       prototype
7567       push
7568       quotemeta
7569       rand
7570       read
7571       readdir
7572       readlink
7573       readline
7574       readpipe
7575       recv
7576       redo
7577       ref
7578       rename
7579       require
7580       reset
7581       return
7582       reverse
7583       rewinddir
7584       rindex
7585       rmdir
7586       scalar
7587       seek
7588       seekdir
7589       select
7590       semctl
7591       semget
7592       semop
7593       send
7594       sethostent
7595       setnetent
7596       setpgrp
7597       setpriority
7598       setprotoent
7599       setservent
7600       setsockopt
7601       shift
7602       shmctl
7603       shmget
7604       shmread
7605       shmwrite
7606       shutdown
7607       sin
7608       sleep
7609       socket
7610       socketpair
7611       sort
7612       splice
7613       split
7614       sprintf
7615       sqrt
7616       srand
7617       stat
7618       study
7619       substr
7620       symlink
7621       syscall
7622       sysopen
7623       sysread
7624       sysseek
7625       system
7626       syswrite
7627       tell
7628       telldir
7629       tie
7630       tied
7631       truncate
7632       uc
7633       ucfirst
7634       umask
7635       undef
7636       unless
7637       unlink
7638       unpack
7639       unshift
7640       untie
7641       until
7642       use
7643       utime
7644       values
7645       vec
7646       waitpid
7647       warn
7648       while
7649       write
7650       xor
7651
7652       switch
7653       case
7654       given
7655       when
7656       err
7657       say
7658
7659       catch
7660     );
7661
7662     # patched above for SWITCH/CASE given/when err say
7663     # 'err' is a fairly safe addition.
7664     # TODO: 'default' still needed if appropriate
7665     # 'use feature' seen, but perltidy works ok without it.
7666     # Concerned that 'default' could break code.
7667     push( @Keywords, @value_requestor );
7668
7669     # These are treated the same but are not keywords:
7670     my @extra_vr = qw(
7671       constant
7672       vars
7673     );
7674     push( @value_requestor, @extra_vr );
7675
7676     @expecting_term_token{@value_requestor} = (1) x scalar(@value_requestor);
7677
7678     # this list contains keywords which do not look for arguments,
7679     # so that they might be followed by an operator, or at least
7680     # not a term.
7681     my @operator_requestor = qw(
7682       endgrent
7683       endhostent
7684       endnetent
7685       endprotoent
7686       endpwent
7687       endservent
7688       fork
7689       getgrent
7690       gethostent
7691       getlogin
7692       getnetent
7693       getppid
7694       getprotoent
7695       getpwent
7696       getservent
7697       setgrent
7698       setpwent
7699       time
7700       times
7701       wait
7702       wantarray
7703     );
7704
7705     push( @Keywords, @operator_requestor );
7706
7707     # These are treated the same but are not considered keywords:
7708     my @extra_or = qw(
7709       STDERR
7710       STDIN
7711       STDOUT
7712     );
7713
7714     push( @operator_requestor, @extra_or );
7715
7716     @expecting_operator_token{@operator_requestor} =
7717       (1) x scalar(@operator_requestor);
7718
7719     # these token TYPES expect trailing operator but not a term
7720     # note: ++ and -- are post-increment and decrement, 'C' = constant
7721     my @operator_requestor_types = qw( ++ -- C <> q );
7722     @expecting_operator_types{@operator_requestor_types} =
7723       (1) x scalar(@operator_requestor_types);
7724
7725     # these token TYPES consume values (terms)
7726     # note: pp and mm are pre-increment and decrement
7727     # f=semicolon in for,  F=file test operator
7728     my @value_requestor_type = qw#
7729       L { ( [ ~ !~ =~ ; . .. ... A : && ! || // = + - x
7730       **= += -= .= /= *= %= x= &= |= ^= <<= >>= &&= ||= //=
7731       <= >= == != => \ > < % * / ? & | ** <=> ~~ !~~
7732       f F pp mm Y p m U J G j >> << ^ t
7733       ~. ^. |. &. ^.= |.= &.=
7734       #;
7735     push( @value_requestor_type, ',' )
7736       ;    # (perl doesn't like a ',' in a qw block)
7737     @expecting_term_types{@value_requestor_type} =
7738       (1) x scalar(@value_requestor_type);
7739
7740     # Note: the following valid token types are not assigned here to
7741     # hashes requesting to be followed by values or terms, but are
7742     # instead currently hard-coded into sub operator_expected:
7743     # ) -> :: Q R Z ] b h i k n v w } #
7744
7745     # For simple syntax checking, it is nice to have a list of operators which
7746     # will really be unhappy if not followed by a term.  This includes most
7747     # of the above...
7748     %really_want_term = %expecting_term_types;
7749
7750     # with these exceptions...
7751     delete $really_want_term{'U'}; # user sub, depends on prototype
7752     delete $really_want_term{'F'}; # file test works on $_ if no following term
7753     delete $really_want_term{'Y'}; # indirect object, too risky to check syntax;
7754                                    # let perl do it
7755
7756     @q = qw(q qq qw qx qr s y tr m);
7757     @is_q_qq_qw_qx_qr_s_y_tr_m{@q} = (1) x scalar(@q);
7758
7759     # These keywords are handled specially in the tokenizer code:
7760     my @special_keywords = qw(
7761       do
7762       eval
7763       format
7764       m
7765       package
7766       q
7767       qq
7768       qr
7769       qw
7770       qx
7771       s
7772       sub
7773       tr
7774       y
7775     );
7776     push( @Keywords, @special_keywords );
7777
7778     # Keywords after which list formatting may be used
7779     # WARNING: do not include |map|grep|eval or perl may die on
7780     # syntax errors (map1.t).
7781     my @keyword_taking_list = qw(
7782       and
7783       chmod
7784       chomp
7785       chop
7786       chown
7787       dbmopen
7788       die
7789       elsif
7790       exec
7791       fcntl
7792       for
7793       foreach
7794       formline
7795       getsockopt
7796       if
7797       index
7798       ioctl
7799       join
7800       kill
7801       local
7802       msgctl
7803       msgrcv
7804       msgsnd
7805       my
7806       open
7807       or
7808       our
7809       pack
7810       print
7811       printf
7812       push
7813       read
7814       readpipe
7815       recv
7816       return
7817       reverse
7818       rindex
7819       seek
7820       select
7821       semctl
7822       semget
7823       send
7824       setpriority
7825       setsockopt
7826       shmctl
7827       shmget
7828       shmread
7829       shmwrite
7830       socket
7831       socketpair
7832       sort
7833       splice
7834       split
7835       sprintf
7836       substr
7837       syscall
7838       sysopen
7839       sysread
7840       sysseek
7841       system
7842       syswrite
7843       tie
7844       unless
7845       unlink
7846       unpack
7847       unshift
7848       until
7849       vec
7850       warn
7851       while
7852       given
7853       when
7854     );
7855     @is_keyword_taking_list{@keyword_taking_list} =
7856       (1) x scalar(@keyword_taking_list);
7857
7858     # perl functions which may be unary operators
7859     my @keyword_taking_optional_args = qw(
7860       chomp
7861       eof
7862       eval
7863       lc
7864       pop
7865       shift
7866       uc
7867       undef
7868     );
7869     @is_keyword_taking_optional_args{@keyword_taking_optional_args} =
7870       (1) x scalar(@keyword_taking_optional_args);
7871
7872     # These are not used in any way yet
7873     #    my @unused_keywords = qw(
7874     #     __FILE__
7875     #     __LINE__
7876     #     __PACKAGE__
7877     #     );
7878
7879     #  The list of keywords was originally extracted from function 'keyword' in
7880     #  perl file toke.c version 5.005.03, using this utility, plus a
7881     #  little editing: (file getkwd.pl):
7882     #  while (<>) { while (/\"(.*)\"/g) { print "$1\n"; } }
7883     #  Add 'get' prefix where necessary, then split into the above lists.
7884     #  This list should be updated as necessary.
7885     #  The list should not contain these special variables:
7886     #  ARGV DATA ENV SIG STDERR STDIN STDOUT
7887     #  __DATA__ __END__
7888
7889     @is_keyword{@Keywords} = (1) x scalar(@Keywords);
7890 }
7891 1;
7892