From 98e3859dabd9291544b65fb2f3e3842a465aa015 Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Fri, 20 Mar 2020 20:08:08 -0700 Subject: [PATCH] check utf8::is_utf8 in case guess is used --- lib/Perl/Tidy.pm | 57 +++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm index bc2dc9d7..b4174c48 100644 --- a/lib/Perl/Tidy.pm +++ b/lib/Perl/Tidy.pm @@ -925,33 +925,42 @@ EOM # Case 2. guess input stream encoding if requested elsif ( $rOpts->{'character-encoding'} =~ /^guess$/i ) { - # Use a very simple guessing strategy: if the guess is utf8, we - # test decoding with it and use it if successful. Otherwise, we - # proceed assuming the characters are encoded as single bytes. I - # have found that anything more complicated may sometimes work but - # may also lead to the disaster of using an incorrect decoding. - my $buf_in = $buf; - - my $decoder = guess_encoding( $buf_in, 'utf8' ); - if ( ref($decoder) ) { - $encoding_in = $decoder->name; - if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) { - $encoding_in = ""; - $buf = $buf_in; - } - else { + # First check if the module has been passed an already-decoded string + if ( utf8::is_utf8($buf) ) { + $encoding_in = "utf8"; + } - eval { $buf = $decoder->decode($buf_in); }; - if ($@) { + else { - # Note that a guess failed, but keep going - # This warning can eventually be removed - Warn( -"file: $input_file: bad guess to decode source as $encoding_in\n" - ); + # Use a very simple guessing strategy: if the guess is utf8, + # we test decoding with it and use it if successful. + # Otherwise, we proceed assuming the characters are encoded as + # single bytes. I have found that anything more complicated + # may sometimes work but may also lead to the disaster of + # using an incorrect decoding. 
+ my $buf_in = $buf; + + my $decoder = guess_encoding( $buf_in, 'utf8' ); + if ( ref($decoder) ) { + $encoding_in = $decoder->name; + if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) { $encoding_in = ""; $buf = $buf_in; } + else { + + eval { $buf = $decoder->decode($buf_in); }; + if ($@) { + + # Note that a guess failed, but keep going + # This warning can eventually be removed + Warn( +"file: $input_file: bad guess to decode source as $encoding_in\n" + ); + $encoding_in = ""; + $buf = $buf_in; + } + } } } } @@ -959,6 +968,10 @@ EOM # Case 3. Decode with a specific encoding else { $encoding_in = $rOpts->{'character-encoding'}; + + # a string or string ref passed in by a module call may or + # may not have already been decoded, so we have to be careful + # not to try to do it again. if ( !utf8::is_utf8($buf) ) { eval { $buf = Encode::decode( $encoding_in, $buf, -- 2.39.5