avoid formatting files with more types of severe errors

author Steve Hancock <perltidy@users.sourceforge.net>

Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)

committer Steve Hancock <perltidy@users.sourceforge.net>

Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)
author Steve Hancock <perltidy@users.sourceforge.net>
Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)
committer Steve Hancock <perltidy@users.sourceforge.net>
Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)
diff --git a/lib/Perl/Tidy/Tokenizer.pm b/lib/Perl/Tidy/Tokenizer.pm

index de5652b829e85cb1c8288fbac38e32fb8ca6db2a..8077ee73a9512eea35e416d95a47a225db25eaa6 100644 (file)
--- a/lib/Perl/Tidy/Tokenizer.pm
+++ b/lib/Perl/Tidy/Tokenizer.pm
@@ -164,6 +164,7 @@ BEGIN {
          _rlower_case_labels_at_              => $i++,
          _extended_syntax_                    => $i++,
          _maximum_level_                      => $i++,
+        _true_brace_error_count_             => $i++,
      };
  }
  
@@ -324,6 +325,7 @@ sub new {
      $self->[_rlower_case_labels_at_]              = undef;
      $self->[_extended_syntax_]                    = $args{extended_syntax};
      $self->[_maximum_level_]                      = 0;
+    $self->[_true_brace_error_count_]             = 0;
      bless $self, $class;
  
      $tokenizer_self = $self;
@@ -443,16 +445,38 @@ sub get_maximum_level {
  
  sub report_tokenization_errors {
  
-    my $self         = shift;
+    my ($self) = @_;
+
+    # Report any tokenization errors and return a flag '$severe_error'.
+    # Set $severe_error = 1 if the tokenization errors are so severe that
+    # the formatter should not attempt to format the file. Instead, it will
+    # just output the file verbatim.
+
+    # set severe error flag if tokenizer has encountered file reading problems
+    # (i.e. unexpected binary characters)
      my $severe_error = $self->[_in_error_];
  
      my $level = get_indentation_level();
      if ( $level != $tokenizer_self->[_starting_level_] ) {
          warning("final indentation level: $level\n");
+        my $level_diff = $tokenizer_self->[_starting_level_] - $level;
+
+        # Set severe error flag if the level error is greater than 1.
+        # The formatter can function for any level error but it is probably
+        # best not to attempt formatting for a high level error.
+        $severe_error = 1 if ( $level_diff < -1 || $level_diff > 1 );
      }
  
      check_final_nesting_depths();
  
+    # Likewise, large numbers of brace errors usually indicate non-perl
+    # scripts, so set the severe error flag at a low number.  This is similar
+    # to the level check, but different because braces may balance but be
+    # incorrectly interlaced.
+    if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
+        $severe_error = 1;
+    }
+
      if ( $tokenizer_self->[_look_for_hash_bang_]
          && !$tokenizer_self->[_saw_hash_bang_] )
      {
@@ -507,6 +531,7 @@ sub report_tokenization_errors {
          }
      }
  
+    # Something is seriously wrong if we ended inside a quote
      if ( $tokenizer_self->[_in_quote_] ) {
          $severe_error = 1;
          my $line_start_quote = $tokenizer_self->[_line_start_quote_];
@@ -524,25 +549,12 @@ sub report_tokenization_errors {
          $severe_error = 1;
      }
  
-    my $logger_object = $tokenizer_self->[_logger_object_];
-
-# TODO: eventually may want to activate this to cause file to be output verbatim
-    if (0) {
-
-        # Set the severe error for a fairly high warning count because
-        # some of the warnings do not harm formatting, such as duplicate
-        # sub names.
-        my $warning_count = $logger_object->get_warning_count();
-        if ( $warning_count > 50 ) {
-            $severe_error = 1;
-        }
-
-        # Brace errors are significant, so set the severe error flag at
-        # a low number.
-        my $saw_brace_error = get_saw_brace_error();
-        if ( $saw_brace_error > 2 ) {
-            $severe_error = 1;
-        }
+    # Multiple "unexpected" type tokenization errors usually indicate parsing
+    # non-perl scripts, or that something is seriously wrong, so we should
+    # avoid formatting them.  This can happen for example if we run perltidy on
+    # a shell script or an html file.
+    if ( $tokenizer_self->[_unexpected_error_count_] > 3 ) {
+        $severe_error = 1;
      }
  
      unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
@@ -4404,8 +4416,10 @@ BEGIN {
      @q = qw( w );
      @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
  
-    # Always expecting OPERATOR following these types:
-    # FIXME: see notes below for types n,v,q,i
+    # Always expecting OPERATOR ...
+    # 'n' and 'v' are currently excluded because they might be VERSION numbers
+    # 'i' is currently excluded because it might be a package
+    # 'q' is currently excluded because it might be a prototype
      @q = qw( -- C -> h R ++ ] Q <> );    ## n v q i );
      push @q, ')';
      @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
@@ -4512,6 +4526,16 @@ sub operator_expected {
      } ## end type 'k'
  
      # closing container token...
+
+    # Note that the actual token for type '}' may also be a ')'.
+
+    # Also note that $last_nonblank_token is not the token corresponding to 
+    # $last_nonblank_type when the type is a closing container.  In that
+    # case it is the token before the corresponding opening container token.
+    # So for example, for this snippet
+    #       $a = do { BLOCK } / 2;
+    # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
+
      elsif ( $last_nonblank_type eq '}' ) {
          $op_expected = UNKNOWN;
  
@@ -4573,9 +4597,10 @@ sub operator_expected {
      } ## end type '}'
  
      # number or v-string...
-    # FIXME: Numbers in 'use' statement should have a different type; not 'n'
-    # or 'v' suggest implementing new type 'V' for numbers in a use statement
-    # TODO: mark these numbers as type 'w'
+    # An exception is for VERSION numbers in a 'use' statement which has the format
+    #     use Module VERSION LIST 
+    # We could avoid this exception by writing a special sub to parse 'use' statements
+    # and perhaps mark these numbers with a new type V (for VERSION) 
      elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
          $op_expected = OPERATOR;
          if ( $statement_type eq 'use' ) {
@@ -4589,8 +4614,7 @@ sub operator_expected {
      elsif ( $last_nonblank_type eq 'q' ) {
          $op_expected = OPERATOR;
          if ( $last_nonblank_token eq 'prototype' )
-
-          #|| $last_nonblank_token eq 'switch' )
+          ##|| $last_nonblank_token eq 'switch' )
          {
              $op_expected = TERM;
          }
@@ -5254,6 +5278,10 @@ EOM
              indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
          }
          increment_brace_error();
+
+        # keep track of errors in braces alone (ignoring ternary nesting errors)
+        $tokenizer_self->[_true_brace_error_count_]++
+          if ( $closing_brace_names[$aa] ne "':'" );
      }
      return ( $seqno, $outdent );
  }
diff --git a/local-docs/BugLog.pod b/local-docs/BugLog.pod

index 14289ebdad1c6219512273b6c62bb9559de588c3..880394238ee84e57ad5bd5402a774acde249ca5f 100644 (file)
--- a/local-docs/BugLog.pod
+++ b/local-docs/BugLog.pod
@@ -2,6 +2,34 @@
  
  =over 4
  
+=item B<more types of severe errors will prevent formatting>
+
+Files for which 'severe errors' are found have always been output verbatim
+rather than being formatted.  The definition of 'severe error' has been
+expanded to include a final indentation level error greater than 1, more than 2
+brace errors, and more than 3 "unexpected token type" parsing errors.  The goal
+is to avoid formatting a non-perl script or a perl script with severe errors.
+So for example the following snippet has a level error of 2
+
+{{{{
+}}
+
+was previously output with default parameters as
+
+{
+    {
+        {
+            {}
+        }
+
+
+along with an error message. But now it is just output verbatim as
+
+{{{{
+}}
+
+along with an error message.
+
  =item b<added 'state' as keyword>
  
  A statement such as the following was generating an error message at the colon:
author	Steve Hancock <perltidy@users.sourceforge.net>
	Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)
committer	Steve Hancock <perltidy@users.sourceforge.net>
	Sun, 25 Oct 2020 15:12:13 +0000 (08:12 -0700)
lib/Perl/Tidy/Tokenizer.pm		patch \| blob \| history
local-docs/BugLog.pod		patch \| blob \| history