check utf8::is_utf8 in case guess is used

author Steve Hancock <perltidy@users.sourceforge.net>

Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)

committer Steve Hancock <perltidy@users.sourceforge.net>

Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)
author Steve Hancock <perltidy@users.sourceforge.net>
Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)
committer Steve Hancock <perltidy@users.sourceforge.net>
Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)
diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm

index bc2dc9d7e567a0bfafc82ecc4b9afcb5207bd95f..b4174c48a5897808df6a08d5beec5e402b019974 100644 (file)
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -925,33 +925,42 @@ EOM
          # Case 2. guess input stream encoding if requested
          elsif ( $rOpts->{'character-encoding'} =~ /^guess$/i ) {
  
-            # Use a very simple guessing strategy: if the guess is utf8, we
-            # test decoding with it and use it if successful. Otherwise, we
-            # proceed assuming the characters are encoded as single bytes.  I
-            # have found that anything more complicated may sometimes work but
-            # may also lead to the disaster of using an incorrect decoding.
-            my $buf_in = $buf;
-
-            my $decoder = guess_encoding( $buf_in, 'utf8' );
-            if ( ref($decoder) ) {
-                $encoding_in = $decoder->name;
-                if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) {
-                    $encoding_in = "";
-                    $buf         = $buf_in;
-                }
-                else {
+            # First check if the module has been passed an already-decoded string
+            if ( utf8::is_utf8($buf) ) {
+                $encoding_in = "utf8";
+            }
  
-                    eval { $buf = $decoder->decode($buf_in); };
-                    if ($@) {
+            else {
  
-                        # Note that a guess failed, but keep going
-                        # This warning can eventually be removed
-                        Warn(
-"file: $input_file: bad guess to decode source as $encoding_in\n"
-                        );
+                # Use a very simple guessing strategy: if the guess is utf8,
+                # we test decoding with it and use it if successful.
+                # Otherwise, we proceed assuming the characters are encoded as
+                # single bytes.  I have found that anything more complicated
+                # may sometimes work but may also lead to the disaster of
+                # using an incorrect decoding.
+                my $buf_in = $buf;
+
+                my $decoder = guess_encoding( $buf_in, 'utf8' );
+                if ( ref($decoder) ) {
+                    $encoding_in = $decoder->name;
+                    if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) {
                          $encoding_in = "";
                          $buf         = $buf_in;
                      }
+                    else {
+
+                        eval { $buf = $decoder->decode($buf_in); };
+                        if ($@) {
+
+                            # Note that a guess failed, but keep going
+                            # This warning can eventually be removed
+                            Warn(
+"file: $input_file: bad guess to decode source as $encoding_in\n"
+                            );
+                            $encoding_in = "";
+                            $buf         = $buf_in;
+                        }
+                    }
                  }
              }
          }
@@ -959,6 +968,10 @@ EOM
          # Case 3. Decode with a specific encoding
          else {
              $encoding_in = $rOpts->{'character-encoding'};
+
+            # A string or string ref passed in by a module call may or
+            # may not have already been decoded, so we have to be careful
+            # not to try to decode it again.
              if ( !utf8::is_utf8($buf) ) {
                  eval {
                      $buf = Encode::decode( $encoding_in, $buf,
author	Steve Hancock <perltidy@users.sourceforge.net>
	Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)
committer	Steve Hancock <perltidy@users.sourceforge.net>
	Sat, 21 Mar 2020 03:08:08 +0000 (20:08 -0700)