git.donarmstrong.com Git - perltidy.git/commitdiff
consolidate coding involving utf8::is_utf8
author Steve Hancock <perltidy@users.sourceforge.net>
Sat, 21 Mar 2020 14:18:59 +0000 (07:18 -0700)
committer Steve Hancock <perltidy@users.sourceforge.net>
Sat, 21 Mar 2020 14:18:59 +0000 (07:18 -0700)
lib/Perl/Tidy.pm

index b4174c48a5897808df6a08d5beec5e402b019974..49583b2472e3b6c9da372762b3f0085bc3d5a32a 100644 (file)
@@ -909,83 +909,83 @@ EOM
             $buf .= $line;
         }
 
-        # Decode the input file as requested. There are 3 cases:
-        my $encoding_in = "";
+        # Decode the input stream if necessary or requested
+        my $encoding_in              = "";
+        my $rOpts_character_encoding = $rOpts->{'character-encoding'};
 
-        # Case 1. No input stream encoding.  This is appropriate
+        # Case 1. See if we already have an encoded string. In that
+        # case, we have to ignore any encoding flag.
+        if ( utf8::is_utf8($buf) ) {
+            $encoding_in = "utf8";
+        }
+
+        # Case 2. No input stream encoding requested.  This is appropriate
         # for single-byte encodings like ascii, latin-1, etc
-        if (  !$rOpts->{'character-encoding'}
-            || $rOpts->{'character-encoding'} eq 'none' )
+        elsif ( !$rOpts_character_encoding
+            || $rOpts_character_encoding eq 'none' )
         {
 
             # nothing to do
-
         }
 
-        # Case 2. guess input stream encoding if requested
-        elsif ( $rOpts->{'character-encoding'} =~ /^guess$/i ) {
+        # Case 3. guess input stream encoding if requested
+        elsif ( $rOpts_character_encoding =~ /^guess$/i ) {
 
-            # First check if the module has been passed an encoded string
-            if ( utf8::is_utf8($buf) ) {
-                $encoding_in = "utf8";
-            }
+            # The guessing strategy is simple: use Encode::Guess to guess
+            # an encoding.  If and only if the guess is utf8, try decoding and
+            # use it if successful.  Otherwise, we proceed assuming the
+            # characters are encoded as single bytes (same as if 'none' had
+            # been specified as the encoding).
 
-            else {
+            # In testing I have found that including additional guess 'suspect'
+            # encodings sometimes works but can sometimes lead to disaster by
+            # using an incorrect decoding.  The user can always specify a
+            # specific input encoding.
+            my $buf_in = $buf;
 
-                # Use a very simple guessing strategy: if the guess is utf8,
-                # we test decoding with it and use it if successful.
-                # Otherwise, we proceed assuming the characters are encoded as
-                # single bytes.  I have found that anything more complicated
-                # may sometimes work but may also lead to the disaster of
-                # using an incorrect decoding.
-                my $buf_in = $buf;
-
-                my $decoder = guess_encoding( $buf_in, 'utf8' );
-                if ( ref($decoder) ) {
-                    $encoding_in = $decoder->name;
-                    if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) {
-                        $encoding_in = "";
-                        $buf         = $buf_in;
-                    }
-                    else {
+            my $decoder = guess_encoding( $buf_in, 'utf8' );
+            if ( ref($decoder) ) {
+                $encoding_in = $decoder->name;
+                if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) {
+                    $encoding_in = "";
+                    $buf         = $buf_in;
+                }
+                else {
 
-                        eval { $buf = $decoder->decode($buf_in); };
-                        if ($@) {
+                    eval { $buf = $decoder->decode($buf_in); };
+                    if ($@) {
 
-                            # Note that a guess failed, but keep going
-                            # This warning can eventually be removed
-                            Warn(
+                        # Note that a guess failed, but keep going
+                        # This warning can eventually be removed
+                        Warn(
 "file: $input_file: bad guess to decode source as $encoding_in\n"
-                            );
-                            $encoding_in = "";
-                            $buf         = $buf_in;
-                        }
+                        );
+                        $encoding_in = "";
+                        $buf         = $buf_in;
                     }
                 }
             }
         }
 
-        # Case 3. Decode with a specific encoding
+        # Case 4. Decode with a specific encoding
         else {
-            $encoding_in = $rOpts->{'character-encoding'};
+            $encoding_in = $rOpts_character_encoding;
 
             # a string or string ref passed in by a module call may or
             # may not have already been decoded, so we have to be careful
             # not to try to do it again.
-            if ( !utf8::is_utf8($buf) ) {
-                eval {
-                    $buf = Encode::decode( $encoding_in, $buf,
-                        Encode::FB_CROAK | Encode::LEAVE_SRC );
-                };
-                if ($@) {
-
-                    # Quit if we cannot decode by the requested encoding;
-                    # Something is not right.
-                    Warn(
+            eval {
+                $buf = Encode::decode( $encoding_in, $buf,
+                    Encode::FB_CROAK | Encode::LEAVE_SRC );
+            };
+            if ($@) {
+
+                # Quit if we cannot decode by the requested encoding;
+                # Something is not right.
+                Warn(
 "skipping file: $input_file: Unable to decode source as $encoding_in\n"
-                    );
-                    next;
-                }
+                );
+                next;
             }
         }