From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Sun, 25 Oct 2020 15:12:13 +0000 (-0700)
Subject: avoid formatting files with more types of severe errors
X-Git-Tag: 20201001.03~41
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=2a86f51c5dd693a524d10aaf91c0f2f256c515b5;p=perltidy.git

avoid formatting files with more types of severe errors
---

diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm
index de5652b8..8077ee73 100644
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -164,6 +164,7 @@ BEGIN {
         _rlower_case_labels_at_              => $i++,
         _extended_syntax_                    => $i++,
         _maximum_level_                      => $i++,
+        _true_brace_error_count_             => $i++,
     };
 }
 
@@ -324,6 +325,7 @@ sub new {
     $self->[_rlower_case_labels_at_]              = undef;
     $self->[_extended_syntax_]                    = $args{extended_syntax};
     $self->[_maximum_level_]                      = 0;
+    $self->[_true_brace_error_count_]             = 0;
     bless $self, $class;
 
     $tokenizer_self = $self;
@@ -443,16 +445,38 @@ sub get_maximum_level {
 
 sub report_tokenization_errors {
 
-    my $self         = shift;
+    my ($self) = @_;
+
+    # Report any tokenization errors and return a flag '$severe_error'.
+    # Set $severe_error = 1 if the tokenization errors are so severe that
+    # the formatter should not attempt to format the file. Instead, it will
+    # just output the file verbatim.
+
+    # set severe error flag if tokenizer has encountered file reading problems
+    # (i.e. unexpected binary characters)
     my $severe_error = $self->[_in_error_];
 
     my $level = get_indentation_level();
     if ( $level != $tokenizer_self->[_starting_level_] ) {
         warning("final indentation level: $level\n");
+        my $level_diff = $tokenizer_self->[_starting_level_] - $level;
+
+        # Set severe error flag if the level error is greater than 1.
+        # The formatter can function for any level error but it is probably
+        # best not to attempt formatting for a high level error.
+        $severe_error = 1 if ( $level_diff < -1 || $level_diff > 1 );
     }
 
     check_final_nesting_depths();
 
+    # Likewise, large numbers of brace errors usually indicate non-perl
+    # scripts, so set the severe error flag at a low number.  This is similar
+    # to the level check, but different because braces may balance but be
+    # incorrectly interlaced.
+    if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
+        $severe_error = 1;
+    }
+
     if ( $tokenizer_self->[_look_for_hash_bang_]
         && !$tokenizer_self->[_saw_hash_bang_] )
     {
@@ -507,6 +531,7 @@ sub report_tokenization_errors {
         }
     }
 
+    # Something is seriously wrong if we ended inside a quote
     if ( $tokenizer_self->[_in_quote_] ) {
         $severe_error = 1;
         my $line_start_quote = $tokenizer_self->[_line_start_quote_];
@@ -524,25 +549,12 @@ sub report_tokenization_errors {
         $severe_error = 1;
     }
 
-    my $logger_object = $tokenizer_self->[_logger_object_];
-
-# TODO: eventually may want to activate this to cause file to be output verbatim
-    if (0) {
-
-        # Set the severe error for a fairly high warning count because
-        # some of the warnings do not harm formatting, such as duplicate
-        # sub names.
-        my $warning_count = $logger_object->get_warning_count();
-        if ( $warning_count > 50 ) {
-            $severe_error = 1;
-        }
-
-        # Brace errors are significant, so set the severe error flag at
-        # a low number.
-        my $saw_brace_error = get_saw_brace_error();
-        if ( $saw_brace_error > 2 ) {
-            $severe_error = 1;
-        }
+    # Multiple "unexpected" type tokenization errors usually indicate parsing
+    # non-perl scripts, or that something is seriously wrong, so we should
+    # avoid formatting them.  This can happen for example if we run perltidy on
+    # a shell script or an html file.
+    if ( $tokenizer_self->[_unexpected_error_count_] > 3 ) {
+        $severe_error = 1;
     }
 
     unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
@@ -4404,8 +4416,10 @@ BEGIN {
     @q = qw( w );
     @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
 
-    # Always expecting OPERATOR following these types:
-    # FIXME: see notes below for types n,v,q,i
+    # Always expecting OPERATOR ...
+    # 'n' and 'v' are currently excluded because they might be VERSION numbers
+    # 'i' is currently excluded because it might be a package
+    # 'q' is currently excluded because it might be a prototype
     @q = qw( -- C -> h R ++ ] Q <> );    ## n v q i );
     push @q, ')';
     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
@@ -4512,6 +4526,16 @@ sub operator_expected {
     } ## end type 'k'
 
     # closing container token...
+
+    # Note that the actual token for type '}' may also be a ')'.
+
+    # Also note that $last_nonblank_token is not the token corresponding to 
+    # $last_nonblank_type when the type is a closing container.  In that
+    # case it is the token before the corresponding opening container token.
+    # So for example, for this snippet
+    #       $a = do { BLOCK } / 2;
+    # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
+
     elsif ( $last_nonblank_type eq '}' ) {
         $op_expected = UNKNOWN;
 
@@ -4573,9 +4597,10 @@ sub operator_expected {
     } ## end type '}'
 
     # number or v-string...
-    # FIXME: Numbers in 'use' statement should have a different type; not 'n'
-    # or 'v' suggest implementing new type 'V' for numbers in a use statement
-    # TODO: mark these numbers as type 'w'
+    # An exception is for VERSION numbers in a 'use' statement which has the format
+    #     use Module VERSION LIST 
+    # We could avoid this exception by writing a special sub to parse 'use' statements
+    # and perhaps mark these numbers with a new type V (for VERSION) 
     elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
         $op_expected = OPERATOR;
         if ( $statement_type eq 'use' ) {
@@ -4589,8 +4614,7 @@ sub operator_expected {
     elsif ( $last_nonblank_type eq 'q' ) {
         $op_expected = OPERATOR;
         if ( $last_nonblank_token eq 'prototype' )
-
-          #|| $last_nonblank_token eq 'switch' )
+          ##|| $last_nonblank_token eq 'switch' )
         {
             $op_expected = TERM;
         }
@@ -5254,6 +5278,10 @@ EOM
             indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
         }
         increment_brace_error();
+
+        # keep track of errors in braces alone (ignoring ternary nesting errors)
+        $tokenizer_self->[_true_brace_error_count_]++
+          if ( $closing_brace_names[$aa] ne "':'" );
     }
     return ( $seqno, $outdent );
 }
diff --git a/local-docs/BugLog.pod b/local-docs/BugLog.pod
index 14289ebd..88039423 100644
--- a/local-docs/BugLog.pod
+++ b/local-docs/BugLog.pod
@@ -2,6 +2,34 @@
 
 =over 4
 
+=item b<more types of severe errors will prevent formatting>
+
+Files for which 'severe errors' are found have always been output verbatim
+rather than being formatted.  The definition of 'severe error' has been
+expanded to include a final indentation level error greater than 1, more than 2
+brace errors, and more than 3 "unexpected token type" parsing errors.  The goal
+is to avoid formatting a non-perl script or a perl script with severe errors.
+So for example the following snippet has a level error of 2
+
+    {{{{
+    }}
+
+was previously output with default parameters as
+
+    {
+        {
+            {
+                {}
+            }
+
+
+along with an error message. But now it is just output verbatim as
+
+    {{{{
+    }}
+
+along with an error message.
+
 =item b<added 'state' as keyword>
 
 A statement such as the following was generating an error message at the colon: