+ #----------------------------------------
+ # Starting final if-elsif- chain of tests
+ #----------------------------------------
+
+ # This is the return flag:
+ # true => this is the last token on the line
+ # false => keep tokenizing the line
+ my $is_last;
+
+ # The following blocks of code must update these vars:
+ # $type - the final token type, must always be set
+
+ # In addition, if additional pretokens are added:
+ # $tok - the final token
+ # $i - the index of the last pretoken
+
+ # They may also need to check and set various flags
+
+ # Quote a word followed by => operator
+ # unless the word __END__ or __DATA__ and the only word on
+ # the line.
+ if ( !$is_END_or_DATA
+ && $next_nonblank_token eq '='
+ && $rtokens->[ $i_next + 1 ] eq '>' )
+ {
+ do_QUOTED_BAREWORD();
+ }
+
+ # quote a bare word within braces..like xxx->{s}; note that we
+ # must be sure this is not a structural brace, to avoid
+ # mistaking {s} in the following for a quoted bare word:
+ # for(@[){s}bla}BLA}
+ # Also treat q in something like var{-q} as a bare word, not
+ # a quote operator
+ elsif (
+ $next_nonblank_token eq '}'
+ && (
+ $last_nonblank_type eq 'L'
+ || ( $last_nonblank_type eq 'm'
+ && $last_last_nonblank_type eq 'L' )
+ )
+ )
+ {
+ $type = 'w';
+ }
+
+ # Scan a bare word following a -> as an identifier; it could
+ # have a long package name. Fixes c037, c041.
+ elsif ( $last_nonblank_token eq '->' ) {
+ scan_bare_identifier();
+
+ # Patch for c043, part 4; use type 'w' after a '->'.
+ # This is just a safety check on sub scan_bare_identifier,
+ # which should get this case correct.
+ $type = 'w';
+ }
+
+ # handle operator x (now we know it isn't $x=)
+ elsif (
+ $expecting == OPERATOR
+ && substr( $tok, 0, 1 ) eq 'x'
+ && ( length($tok) == 1
+ || substr( $tok, 1, 1 ) =~ /^\d/ )
+ )
+ {
+ do_X_OPERATOR();
+ }
+ elsif ( $tok_kw eq 'CORE::' ) {
+ $type = $tok = $tok_kw;
+ $i += 2;
+ }
+ elsif ( ( $tok eq 'strict' )
+ and ( $last_nonblank_token eq 'use' ) )
+ {
+ $tokenizer_self->[_saw_use_strict_] = 1;
+ scan_bare_identifier();
+ }
+
+ elsif ( ( $tok eq 'warnings' )
+ and ( $last_nonblank_token eq 'use' ) )
+ {
+ $tokenizer_self->[_saw_perl_dash_w_] = 1;
+
+ # scan as identifier, so that we pick up something like:
+ # use warnings::register
+ scan_bare_identifier();
+ }
+
+ elsif (
+ $tok eq 'AutoLoader'
+ && $tokenizer_self->[_look_for_autoloader_]
+ && (
+ $last_nonblank_token eq 'use'
+
+ # these regexes are from AutoSplit.pm, which we want
+ # to mimic
+ || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
+ || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
+ )
+ )
+ {
+ write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
+ $tokenizer_self->[_saw_autoloader_] = 1;
+ $tokenizer_self->[_look_for_autoloader_] = 0;
+ scan_bare_identifier();
+ }
+
+ elsif (
+ $tok eq 'SelfLoader'
+ && $tokenizer_self->[_look_for_selfloader_]
+ && ( $last_nonblank_token eq 'use'
+ || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
+ || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
+ )
+ {
+ write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
+ $tokenizer_self->[_saw_selfloader_] = 1;
+ $tokenizer_self->[_look_for_selfloader_] = 0;
+ scan_bare_identifier();
+ }
+
+ elsif ( ( $tok eq 'constant' )
+ and ( $last_nonblank_token eq 'use' ) )
+ {
+ do_USE_CONSTANT();
+ }
+
+ # various quote operators
+ elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
+ do_QUOTE_OPERATOR();
+ }
+
+ # check for a statement label
+ elsif (
+ ( $next_nonblank_token eq ':' )
+ && ( $rtokens->[ $i_next + 1 ] ne ':' )
+ && ( $i_next <= $max_token_index ) # colon on same line
+
+ # like 'sub : lvalue' ?
+ ##&& !$sub_attribute_ok_here # like 'sub : lvalue' ?
+ && !sub_attribute_ok_here( $tok_kw, $next_nonblank_token, $i_next )
+ && label_ok()
+ )
+ {
+ if ( $tok !~ /[A-Z]/ ) {
+ push @{ $tokenizer_self->[_rlower_case_labels_at_] },
+ $input_line_number;
+ }
+ $type = 'J';
+ $tok .= ':';
+ $i = $i_next;
+ }
+
+ # 'sub' or alias
+ elsif ( $is_sub{$tok_kw} ) {
+ error_if_expecting_OPERATOR()
+ if ( $expecting == OPERATOR );
+ initialize_subname();
+ scan_id();
+ }
+
+ # 'package'
+ elsif ( $is_package{$tok_kw} ) {
+ error_if_expecting_OPERATOR()
+ if ( $expecting == OPERATOR );
+ scan_id();
+ }
+
+ # Fix for c035: split 'format' from 'is_format_END_DATA' to be
+ # more restrictive. Require a new statement to be ok here.
+ elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
+ $type = ';'; # make tokenizer look for TERM next
+ $tokenizer_self->[_in_format_] = 1;
+ $is_last = 1; ## is last token on this line
+ }
+
+ # Note on token types for format, __DATA__, __END__:
+ # It simplifies things to give these type ';', so that when we
+ # start rescanning we will be expecting a token of type TERM.
+ # We will switch to type 'k' before outputting the tokens.
+ elsif ( $is_END_DATA{$tok_kw} ) {
+ $type = ';'; # make tokenizer look for TERM next
+
+ # Remember that we are in one of these three sections
+ $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
+ $is_last = 1; ## is last token on this line
+ }
+
+ elsif ( $is_keyword{$tok_kw} ) {
+ do_KEYWORD();
+ }
+
+ # check for inline label following
+ # /^(redo|last|next|goto)$/
+ elsif (( $last_nonblank_type eq 'k' )
+ && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
+ {
+ $type = 'j';
+ }
+
+ # something else --
+ else {
+ do_UNKNOWN_BAREWORD($next_nonblank_token);
+ }
+
+ return $is_last;
+
+ } ## end sub do_BAREWORD
+
+ sub do_FOLLOW_QUOTE {
+
+ # Continue following a quote on a new line
+ $type = $quote_type;
+
+ unless ( @{$routput_token_list} ) { # initialize if continuation line
+ push( @{$routput_token_list}, $i );
+ $routput_token_type->[$i] = $type;
+
+ }
+
+ # Removed to fix b1280. This is not needed and was causing the
+ # starting type 'qw' to be lost, leading to mis-tokenization of
+ # a trailing block brace in a parenless for stmt 'for .. qw.. {'
+ ##$tok = $quote_character if ($quote_character);
+
+ # scan for the end of the quote or pattern
+ (
+ $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
+ $quoted_string_1, $quoted_string_2
+ )
+ = do_quote(
+ $i, $in_quote, $quote_character,
+ $quote_pos, $quote_depth, $quoted_string_1,
+ $quoted_string_2, $rtokens, $rtoken_map,
+ $max_token_index
+ );
+
+ # all done if we didn't find it
+ if ($in_quote) { return }
+
+ # save pattern and replacement text for rescanning
+ my $qs1 = $quoted_string_1;
+
+ # re-initialize for next search
+ $quote_character = EMPTY_STRING;
+ $quote_pos = 0;
+ $quote_type = 'Q';
+ $quoted_string_1 = EMPTY_STRING;
+ $quoted_string_2 = EMPTY_STRING;
+ if ( ++$i > $max_token_index ) { return }
+
+ # look for any modifiers
+ if ($allowed_quote_modifiers) {
+
+ # check for exact quote modifiers
+ if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
+ my $str = $rtokens->[$i];
+ my $saw_modifier_e;
+ while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
+ my $pos = pos($str);
+ my $char = substr( $str, $pos - 1, 1 );
+ $saw_modifier_e ||= ( $char eq 'e' );
+ }
+
+ # For an 'e' quote modifier we must scan the replacement
+ # text for here-doc targets...
+ # but if the modifier starts a new line we can skip
+ # this because either the here doc will be fully
+ # contained in the replacement text (so we can
+ # ignore it) or Perl will not find it.
+ # See test 'here2.in'.
+ if ( $saw_modifier_e && $i_tok >= 0 ) {
+
+ my $rht = scan_replacement_text($qs1);
+
+ # Change type from 'Q' to 'h' for quotes with
+ # here-doc targets so that the formatter (see sub
+ # process_line_of_CODE) will not make any line
+ # breaks after this point.
+ if ($rht) {
+ push @{$rhere_target_list}, @{$rht};
+ $type = 'h';
+ if ( $i_tok < 0 ) {
+ my $ilast = $routput_token_list->[-1];
+ $routput_token_type->[$ilast] = $type;
+ }
+ }
+ }
+
+ if ( defined( pos($str) ) ) {
+
+ # matched
+ if ( pos($str) == length($str) ) {
+ if ( ++$i > $max_token_index ) { return }
+ }
+
+ # Looks like a joined quote modifier
+ # and keyword, maybe something like
+ # s/xxx/yyy/gefor @k=...
+ # Example is "galgen.pl". Would have to split
+ # the word and insert a new token in the
+ # pre-token list. This is so rare that I haven't
+ # done it. Will just issue a warning citation.
+
+ # This error might also be triggered if my quote
+ # modifier characters are incomplete
+ else {
+ warning(<<EOM);
+
+Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
+Please put a space between quote modifiers and trailing keywords.
+EOM
+
+ # print "token $rtokens->[$i]\n";
+ # my $num = length($str) - pos($str);
+ # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
+ # print "continuing with new token $rtokens->[$i]\n";
+
+ # skipping past this token does least damage
+ if ( ++$i > $max_token_index ) { return }
+ }
+ }
+ else {
+
+ # example file: rokicki4.pl
+ # This error might also be triggered if my quote
+ # modifier characters are incomplete
+ write_logfile_entry(
+ "Note: found word $str at quote modifier location\n");
+ }
+ }
+
+ # re-initialize
+ $allowed_quote_modifiers = EMPTY_STRING;
+ }
+ return;
+ } ## end sub do_FOLLOW_QUOTE
+
+ # ------------------------------------------------------------
+ # begin hash of code for handling most token types
+ # ------------------------------------------------------------
+ my $tokenization_code = {
+
+ '>' => \&do_GREATER_THAN_SIGN,
+ '|' => \&do_VERTICAL_LINE,
+ '$' => \&do_DOLLAR_SIGN,
+ '(' => \&do_LEFT_PARENTHESIS,
+ ')' => \&do_RIGHT_PARENTHESIS,
+ ',' => \&do_COMMA,
+ ';' => \&do_SEMICOLON,
+ '"' => \&do_QUOTATION_MARK,
+ "'" => \&do_APOSTROPHE,
+ '`' => \&do_BACKTICK,
+ '/' => \&do_SLASH,
+ '{' => \&do_LEFT_CURLY_BRACKET,
+ '}' => \&do_RIGHT_CURLY_BRACKET,
+ '&' => \&do_AMPERSAND,
+ '<' => \&do_LESS_THAN_SIGN,
+ '?' => \&do_QUESTION_MARK,
+ '*' => \&do_STAR,
+ '.' => \&do_DOT,
+ ':' => \&do_COLON,
+ '+' => \&do_PLUS_SIGN,
+ '@' => \&do_AT_SIGN,
+ '%' => \&do_PERCENT_SIGN,
+ '[' => \&do_LEFT_SQUARE_BRACKET,
+ ']' => \&do_RIGHT_SQUARE_BRACKET,
+ '-' => \&do_MINUS_SIGN,
+ '^' => \&do_CARAT_SIGN,
+ '::' => \&do_DOUBLE_COLON,
+ '<<' => \&do_LEFT_SHIFT,
+ '<<~' => \&do_NEW_HERE_DOC,
+ '->' => \&do_POINTER,
+ '++' => \&do_PLUS_PLUS,
+ '=>' => \&do_FAT_COMMA,
+ '--' => \&do_MINUS_MINUS,
+ '&&' => \&do_LOGICAL_AND,
+ '||' => \&do_LOGICAL_OR,
+ '//' => \&do_SLASH_SLASH,
+
+ # No special code for these types yet, but syntax checks
+ # could be added.
+ ## '!' => undef,
+ ## '!=' => undef,
+ ## '!~' => undef,
+ ## '%=' => undef,
+ ## '&&=' => undef,
+ ## '&=' => undef,
+ ## '+=' => undef,
+ ## '-=' => undef,
+ ## '..' => undef,
+ ## '..' => undef,
+ ## '...' => undef,
+ ## '.=' => undef,
+ ## '<<=' => undef,
+ ## '<=' => undef,
+ ## '<=>' => undef,
+ ## '<>' => undef,
+ ## '=' => undef,
+ ## '==' => undef,
+ ## '=~' => undef,
+ ## '>=' => undef,
+ ## '>>' => undef,
+ ## '>>=' => undef,
+ ## '\\' => undef,
+ ## '^=' => undef,
+ ## '|=' => undef,
+ ## '||=' => undef,
+ ## '//=' => undef,
+ ## '~' => undef,
+ ## '~~' => undef,
+ ## '!~~' => undef,
+
+ };
+
+ # ------------------------------------------------------------
+ # end hash of code for handling individual token types
+ # ------------------------------------------------------------