From 48f8b1e57122fe154b4387d3753cf24393a9f93e Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Fri, 20 Mar 2020 19:28:54 -0700
Subject: [PATCH] added option --character-encoding=guess

---
 CHANGES.md                 |   6 ++
 bin/perltidy               |  47 ++++++--
 lib/Perl/Tidy.pm           | 212 +++++++++++++++++++++++--------------
 lib/Perl/Tidy/Formatter.pm |   7 --
 lib/Perl/Tidy/LineSink.pm  |  40 +++----
 lib/Perl/Tidy/Logger.pm    |  19 ++--
 6 files changed, 197 insertions(+), 134 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index edab3077..f411d7cc 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,12 @@
 
 ## 2020 01 10.01
 
+    - Added --character-encoding=guess or -guess to have perltidy guess
+      if a file is encoded as -utf8 or some older single-byte encoding. This
+      is useful when processing a mixture of file types, such as utf8 and 
+      latin-1.  Also, specific encodings of input files other than utf8 may
+      now be given, for example --character-encoding=euc-jp.
+
     - Fix for git#22, Preserve function signature on a single line. An
       unwanted line break was being introduced when a closing signature paren
       followed a closing do brace.
diff --git a/bin/perltidy b/bin/perltidy
index 3a0d843e..ab77be59 100755
--- a/bin/perltidy
+++ b/bin/perltidy
@@ -587,22 +587,47 @@ this flag is in effect.
 
 =item B<-enc=s>,  B<--character-encoding=s>
 
-where B<s>=B<none> or B<utf8>.  This flag tells perltidy the character encoding
-of both the input and output character streams.  The value B<utf8> causes the
-stream to be read and written as UTF-8.  The value B<none> causes the stream to
-be processed without special encoding assumptions.  At present there is no
-automatic detection of character encoding (even if there is a C<'use utf8'>
-statement in your code) so this flag must be set for streams encoded in UTF-8.
-Incorrectly setting this parameter can cause data corruption, so please
-carefully check the output.
+This flag indicates the character encoding, if any, of the input data stream.
+Perltidy does not look for the encoding directives in the soure stream, such
+as B<use utf8>, and instead relies on this flag to determine the encoding.
+(It has to be this way because perltidy is often working on snippets of code
+rather than a complete file).
 
-The default is B<none>.  
+The possible values for B<s> are (1) the name of an encoding recognized by the
+Encode.pm module, (2) B<none> if no encoding is used, or (3) <guess> if
+perltidy should guess.  
 
-The abbreviations B<-utf8> or B<-UTF8> are equivalent to B<-enc=utf8>.
-So to process a file named B<file.pl> which is encoded in UTF-8 you can use:
+For example, the value B<utf8> causes the stream to be read and written as
+UTF-8.  If the input stream cannot be decoded with a specified encoding then
+processing is not done.  
+
+The value B<none> causes the stream to be processed without special encoding
+assumptions.  This is appropriate for files which are written in single-byte
+character encodings such as latin-1.  
+
+The value B<guess> tells perltidy to guess between either utf8 encoding or no
+encoding (meaning one character per byte).  The guess uses the Encode::Guess
+module and this restricted range of guesses covers the most common cases.
+Testing showed that considering any greater number of encodings as guess
+suspects is too risky.
+
+The current default is B<none>.  
+
+The abbreviations B<-utf8> or B<-UTF8> are equivalent to B<-enc=utf8>, and the
+abbreviation B<-guess> is equivalent to <-enc=guess>.  So to process a file
+named B<file.pl> which is encoded in UTF-8 you can use:
 
    perltidy -utf8 file.pl
 
+or
+   perltidy -guess file.pl
+
+To process a file in B<euc-jp> you could use
+   
+   perltidy -enc=euc-jp file.pl
+
+A perltidy output file is unencoded if the input file is unencoded, and otherwise it is encoded as B<utf8>, even if the input encoding was not B<utf8>.
+
 =item B<-ole=s>,  B<--output-line-ending=s>
 
 where s=C<win>, C<dos>, C<unix>, or C<mac>.  This flag tells perltidy
diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm
index bdb9e341..12e4a2b9 100644
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -85,7 +85,6 @@ use vars qw{
   @EXPORT
   $missing_file_spec
   $fh_stderr
-  $rOpts_character_encoding
   $Warn_count
 };
 
@@ -94,6 +93,7 @@ use vars qw{
 
 use Cwd;
 use Encode ();
+use Encode::Guess;
 use IO::File;
 use File::Basename;
 use File::Copy;
@@ -132,12 +132,15 @@ sub streamhandle {
     #                      (check for 'print' method for 'w' mode)
     #                      (check for 'getline' method for 'r' mode)
 
-    # An optional flag $character_encoding may be given.
-    # The current options are:
-    # 1. '' or 'none' -> the file will not use binary encoding
-    # 2. 'utf8' -> the file will be utf8 encoded
-    # 3. Any other string: use simple binmode
-    my ( $filename, $mode, $character_encoding ) = @_;
+    # An optional flag $is_encoded_data may be given, as follows:
+
+    # Case 1. Any non-empty string: encoded data is being transferred, set
+    # encoding to be utf8 for files and for stdin.
+
+    # Case 2. Not given, or an empty string: unencoded binary data is being
+    # transferred, set binary mode for files and for stdin.
+
+    my ( $filename, $mode, $is_encoded_data ) = @_;
 
     my $ref = ref($filename);
     my $New;
@@ -207,12 +210,16 @@ EOM
             $New = sub { IO::File->new( $filename, $mode ) };
         }
     }
-    $fh = $New->( $filename, $mode )
-      or Warn("Couldn't open file:$filename in mode:$mode : $!\n");
+    $fh = $New->( $filename, $mode );
+    if ( !$fh ) {
+
+        Warn("Couldn't open file:$filename in mode:$mode : $!\n");
 
-    if ( $fh && $character_encoding && $character_encoding ne 'none' ) {
+    }
+    else {
 
-        if ( $character_encoding eq 'utf8' ) {
+        # Case 1: handle encoded data
+        if ($is_encoded_data) {
             if ( ref($fh) eq 'IO::File' ) {
                 $fh->binmode(":raw:encoding(UTF-8)");
             }
@@ -221,10 +228,11 @@ EOM
             }
         }
 
-        # Patch for RT 122030
-        elsif ( ref($fh) eq 'IO::File' ) { $fh->binmode(); }
-
-        elsif ( $fh eq '-' ) { binmode STDOUT }
+        # Case 2: handle unencoded data
+        else {
+            if    ( ref($fh) eq 'IO::File' ) { $fh->binmode(); }
+            elsif ( $fh eq '-' )             { binmode STDOUT }
+        }
     }
 
     return $fh, ( $ref or $filename );
@@ -640,8 +648,6 @@ EOM
         user => '',
     );
 
-    $rOpts_character_encoding = $rOpts->{'character-encoding'};
-
     # be sure we have a valid output format
     unless ( exists $default_file_extension{ $rOpts->{'format'} } ) {
         my $formats = join ' ',
@@ -898,56 +904,103 @@ EOM
         my %saw_md5;
         my $digest_input = 0;
 
-        # Prefilters and postfilters: The prefilter is a code reference
-        # that will be applied to the source before tidying, and the
-        # postfilter is a code reference to the result before outputting.
-        if (
-            $prefilter
-            || (   $rOpts_character_encoding
-                && $rOpts_character_encoding eq 'utf8' )
-            || $rOpts->{'assert-tidy'}
-            || $rOpts->{'assert-untidy'}
-            || $do_convergence_test
-          )
+        my $buf = '';
+        while ( my $line = $source_object->get_line() ) {
+            $buf .= $line;
+        }
+
+        # Decode the input file as requested. There are 3 cases:
+        my $encoding_in = "";
+
+        # Case 1. No input stream encoding.  This is appropriate
+        # for single-byte encodings like ascii, latin-1, etc
+        if (  !$rOpts->{'character-encoding'}
+            || $rOpts->{'character-encoding'} eq 'none' )
         {
-            my $buf = '';
-            while ( my $line = $source_object->get_line() ) {
-                $buf .= $line;
-            }
 
-            if (   $rOpts_character_encoding
-                && $rOpts_character_encoding eq 'utf8'
-                && !utf8::is_utf8($buf) )
-            {
-                eval {
-                    $buf = Encode::decode( 'UTF-8', $buf,
-                        Encode::FB_CROAK | Encode::LEAVE_SRC );
-                };
-                if ($@) {
-                    Warn(
-"skipping file: $input_file: Unable to decode source as UTF-8\n"
-                    );
-                    next;
+            # nothing to do
+
+        }
+
+        # Case 2. guess input stream encoding if requested
+        elsif ( $rOpts->{'character-encoding'} =~ /^guess$/i ) {
+
+            # Use a very simple guessing strategy: if the guess is utf8, we
+            # test decoding with it and use it if successful. Otherwise, we
+            # proceed assuming the characters are encoded as single bytes.  I
+            # have found that anything more complicated may sometimes work but
+            # may also lead to the disaster of using an incorrect decoding.
+            my $buf_in = $buf;
+
+            my $decoder = guess_encoding( $buf_in, 'utf8' );
+            if ( ref($decoder) ) {
+                $encoding_in = $decoder->name;
+                if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) {
+                    $encoding_in = "";
+                    $buf         = $buf_in;
+                }
+                else {
+
+                    eval { $buf = $decoder->decode($buf_in); };
+                    if ($@) {
+
+                        # Note that a guess failed, but keep going
+                        # This warning can eventually be removed
+                        Warn(
+"file: $input_file: bad guess to decode source as $encoding_in\n"
+                        );
+                        $encoding_in = "";
+                        $buf         = $buf_in;
+                    }
                 }
             }
+        }
 
-            # MD5 sum of input file is evaluated before any prefilter
-            if ( $rOpts->{'assert-tidy'} || $rOpts->{'assert-untidy'} ) {
-                $digest_input = $md5_hex->($buf);
+        # Case 3. Decode with a specific encoding
+        else {
+            $encoding_in = $rOpts->{'character-encoding'};
+            eval {
+                $buf = Encode::decode( $encoding_in, $buf,
+                    Encode::FB_CROAK | Encode::LEAVE_SRC );
+            };
+            if ($@) {
+
+                # Quit if we cannot decode by the requested encoding;
+                # Something is not right.
+                Warn(
+"skipping file: $input_file: Unable to decode source as $encoding_in\n"
+                );
+                next;
             }
+        }
 
-            $buf = $prefilter->($buf) if $prefilter;
+        # Set the encoding to be used for all further i/o: If we have
+        # decoded the data with any format, then we must continue to
+        # read and write it as encoded data, and we will normalize these
+        # operations with utf8.  If we have not decoded the data, then
+        # we must not treat it as encoded data.
+        my $is_encoded_data = $encoding_in ? 'utf8' : "";
 
-        # starting MD5 sum for convergence test is evaluated after any prefilter
-            if ($do_convergence_test) {
-                my $digest = $md5_hex->($buf);
-                $saw_md5{$digest} = 1;
-            }
+        # MD5 sum of input file is evaluated before any prefilter
+        if ( $rOpts->{'assert-tidy'} || $rOpts->{'assert-untidy'} ) {
+            $digest_input = $md5_hex->($buf);
+        }
 
-            $source_object = Perl::Tidy::LineSource->new( \$buf, $rOpts,
-                $rpending_logfile_message );
+        # Prefilters and postfilters: The prefilter is a code reference
+        # that will be applied to the source before tidying, and the
+        # postfilter is a code reference to the result before outputting.
+
+        $buf = $prefilter->($buf) if $prefilter;
+
+        # starting MD5 sum for convergence test is evaluated after any prefilter
+        if ($do_convergence_test) {
+            my $digest = $md5_hex->($buf);
+            $saw_md5{$digest} = 1;
         }
 
+        $source_object = Perl::Tidy::LineSource->new( \$buf, $rOpts,
+            $rpending_logfile_message );
+
         # register this file name with the Diagnostics package
         $diagnostics_object->set_input_file($input_file)
           if $diagnostics_object;
@@ -1037,12 +1090,6 @@ EOM
             $line_separator = find_input_line_ending($input_file);
         }
 
-        # Eventually all I/O may be done with binmode, but for now it is
-        # only done when a user requests a particular line separator
-        # through the -ple or -ole flags
-        my $binmode = defined($line_separator)
-          || defined($rOpts_character_encoding);
-        $binmode = 1;  ## TESTING
         $line_separator = "\n" unless defined($line_separator);
 
         my ( $sink_object, $postfilter_buffer );
@@ -1052,12 +1099,14 @@ EOM
         {
             $sink_object =
               Perl::Tidy::LineSink->new( \$postfilter_buffer, $tee_file,
-                $line_separator, $rOpts, $rpending_logfile_message, $binmode );
+                $line_separator, $rOpts, $rpending_logfile_message,
+                $is_encoded_data );
         }
         else {
             $sink_object =
               Perl::Tidy::LineSink->new( $output_file, $tee_file,
-                $line_separator, $rOpts, $rpending_logfile_message, $binmode );
+                $line_separator, $rOpts, $rpending_logfile_message,
+                $is_encoded_data );
         }
 
         #---------------------------------------------------------------
@@ -1070,7 +1119,7 @@ EOM
 
         my $logger_object =
           Perl::Tidy::Logger->new( $rOpts, $log_file, $warning_file,
-            $fh_stderr, $saw_extrude, $display_name );
+            $fh_stderr, $saw_extrude, $display_name, $is_encoded_data );
         write_logfile_header(
             $rOpts,        $logger_object, $config_file,
             $rraw_options, $Windows_type,  $readable_options,
@@ -1108,7 +1157,7 @@ EOM
                 $sink_object =
                   Perl::Tidy::LineSink->new( \$sink_buffer, $tee_file,
                     $line_separator, $rOpts, $rpending_logfile_message,
-                    $binmode );
+                    $is_encoded_data );
             }
             else {
                 $sink_object = $sink_object_final;
@@ -1274,7 +1323,8 @@ EOM
             $sink_object->close_output_file();
             $sink_object =
               Perl::Tidy::LineSink->new( $output_file, $tee_file,
-                $line_separator, $rOpts, $rpending_logfile_message, $binmode );
+                $line_separator, $rOpts, $rpending_logfile_message,
+                $is_encoded_data );
 
             my $buf =
                 $postfilter
@@ -1353,18 +1403,17 @@ EOM
             # everything if we closed it.
             seek( $output_file, 0, 0 )
               or Die("unable to rewind a temporary file for -b option: $!\n");
+
+            # TODO: maybe use streamhandle here
             my $fout = IO::File->new("> $input_file")
               or Die(
 "problem re-opening $input_file for write for -b option; check file and directory permissions: $!\n"
               );
-            if ($binmode) {
-                if (   $rOpts->{'character-encoding'}
-                    && $rOpts->{'character-encoding'} eq 'utf8' )
-                {
-                    binmode $fout, ":raw:encoding(UTF-8)";
-                }
-                else { binmode $fout }
+            if ($is_encoded_data) {
+                binmode $fout, ":raw:encoding(UTF-8)";
             }
+            else { binmode $fout }
+
             my $line;
             while ( $line = $output_file->getline() ) {
                 $fout->print($line);
@@ -1395,8 +1444,9 @@ EOM
                     #rt128477: avoid inconsistent owner/group and suid/sgid
                     if ( $uid_i != $uid_o || $gid_i != $gid_o ) {
 
-               # try to change owner and group to match input file if in -b mode
-               # note: chown returns number of files successfully changed
+                        # try to change owner and group to match input file if
+                        # in -b mode.  Note: chown returns number of files
+                        # successfully changed.
                         if ( $in_place_modify
                             && chown( $uid_i, $gid_i, $output_file ) )
                         {
@@ -2027,10 +2077,7 @@ sub generate_options {
     %option_range = (
         'format'             => [ 'tidy', 'html', 'user' ],
         'output-line-ending' => [ 'dos',  'win',  'mac', 'unix' ],
-        'character-encoding' => [ 'none', 'utf8' ],
-
-        'space-backslash-quote' => [ 0, 2 ],
-
+        'space-backslash-quote'    => [ 0, 2 ],
         'block-brace-tightness'    => [ 0, 2 ],
         'brace-tightness'          => [ 0, 2 ],
         'paren-tightness'          => [ 0, 2 ],
@@ -2207,8 +2254,9 @@ sub generate_options {
         'cb'             => [qw(cuddled-else)],
         'cuddled-blocks' => [qw(cuddled-else)],
 
-        'utf8' => [qw(character-encoding=utf8)],
-        'UTF8' => [qw(character-encoding=utf8)],
+        'utf8'  => [qw(character-encoding=utf8)],
+        'UTF8'  => [qw(character-encoding=utf8)],
+        'guess' => [qw(character-encoding=guess)],
 
         'swallow-optional-blank-lines'   => [qw(kbl=0)],
         'noswallow-optional-blank-lines' => [qw(kbl=1)],
diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm
index c5dadcef..1653b705 100644
--- a/lib/Perl/Tidy/Formatter.pm
+++ b/lib/Perl/Tidy/Formatter.pm
@@ -6007,13 +6007,6 @@ EOM
         $rOpts->{'long-block-line-count'} = 1000000;
     }
 
-    my $enc = $rOpts->{'character-encoding'};
-    if ( $enc && $enc !~ /^(none|utf8)$/i ) {
-        Die(<<EOM);
-Unrecognized character-encoding '$enc'; expecting one of: (none, utf8)
-EOM
-    }
-
     my $ole = $rOpts->{'output-line-ending'};
     if ($ole) {
         my %endings = (
diff --git a/lib/Perl/Tidy/LineSink.pm b/lib/Perl/Tidy/LineSink.pm
index 3854bc2a..511852fb 100644
--- a/lib/Perl/Tidy/LineSink.pm
+++ b/lib/Perl/Tidy/LineSink.pm
@@ -13,7 +13,7 @@ our $VERSION = '20200110.01';
 sub new {
 
     my ( $class, $output_file, $tee_file, $line_separator, $rOpts,
-        $rpending_logfile_message, $binmode )
+        $rpending_logfile_message, $is_encoded_data )
       = @_;
     my $fh     = undef;
     my $fh_tee = undef;
@@ -21,26 +21,10 @@ sub new {
     my $output_file_open = 0;
 
     if ( $rOpts->{'format'} eq 'tidy' ) {
-        ( $fh, $output_file ) = Perl::Tidy::streamhandle( $output_file, 'w' );
+        ( $fh, $output_file ) =
+          Perl::Tidy::streamhandle( $output_file, 'w', $is_encoded_data );
         unless ($fh) { Perl::Tidy::Die("Cannot write to output stream\n"); }
         $output_file_open = 1;
-        if ($binmode) {
-            if (   $rOpts->{'character-encoding'}
-                && $rOpts->{'character-encoding'} eq 'utf8' )
-            {
-                if ( ref($fh) eq 'IO::File' ) {
-                    $fh->binmode(":raw:encoding(UTF-8)");
-                }
-                elsif ( $output_file eq '-' ) {
-                    binmode STDOUT, ":raw:encoding(UTF-8)";
-                }
-            }
-
-            # Patch for RT 122030
-            elsif ( ref($fh) eq 'IO::File' ) { $fh->binmode(); }
-
-            elsif ( $output_file eq '-' ) { binmode STDOUT }
-        }
     }
 
     # in order to check output syntax when standard output is used,
@@ -68,7 +52,7 @@ EOM
         _tee_file         => $tee_file,
         _tee_file_opened  => 0,
         _line_separator   => $line_separator,
-        _binmode          => $binmode,
+        _is_encoded_data  => $is_encoded_data,
     }, $class;
 }
 
@@ -104,12 +88,16 @@ sub tee_off {
 }
 
 sub really_open_tee_file {
-    my $self     = shift;
-    my $tee_file = $self->{_tee_file};
-    my $fh_tee;
-    $fh_tee = IO::File->new(">$tee_file")
-      or Perl::Tidy::Die("couldn't open TEE file $tee_file: $!\n");
-    binmode $fh_tee if $self->{_binmode};
+    my $self            = shift;
+    my $tee_file        = $self->{_tee_file};
+    my $is_encoded_data = $self->{_is_encoded_data};
+
+    my ( $fh_tee, $filename ) =
+      Perl::Tidy::streamhandle( $tee_file, 'w', $is_encoded_data );
+    if ( !$fh_tee ) {
+        Perl::Tidy::Die("couldn't open TEE file $tee_file: $!\n");
+    }
+
     $self->{_tee_file_opened} = 1;
     $self->{_fh_tee}          = $fh_tee;
     return;
diff --git a/lib/Perl/Tidy/Logger.pm b/lib/Perl/Tidy/Logger.pm
index 4c872b99..73e24af7 100644
--- a/lib/Perl/Tidy/Logger.pm
+++ b/lib/Perl/Tidy/Logger.pm
@@ -12,7 +12,7 @@ our $VERSION = '20200110.01';
 sub new {
 
     my ( $class, $rOpts, $log_file, $warning_file, $fh_stderr, $saw_extrude,
-        $display_name )
+        $display_name, $is_encoded_data )
       = @_;
 
     my $fh_warnings = $rOpts->{'standard-error-output'} ? $fh_stderr : undef;
@@ -50,6 +50,7 @@ sub new {
         _warning_file                  => $warning_file,
         _warning_count                 => 0,
         _complaint_count               => 0,
+        _is_encoded_data               => $is_encoded_data,
         _saw_code_bug      => -1,                   # -1=no 0=maybe 1=for sure
         _saw_brace_error   => 0,
         _saw_extrude       => $saw_extrude,
@@ -311,13 +312,13 @@ sub warning {
     my $rOpts = $self->{_rOpts};
     unless ( $rOpts->{'quiet'} ) {
 
-        my $warning_count = $self->{_warning_count};
-        my $fh_warnings   = $self->{_fh_warnings};
+        my $warning_count   = $self->{_warning_count};
+        my $fh_warnings     = $self->{_fh_warnings};
+        my $is_encoded_data = $self->{_is_encoded_data};
         if ( !$fh_warnings ) {
             my $warning_file = $self->{_warning_file};
             ( $fh_warnings, my $filename ) =
-              Perl::Tidy::streamhandle( $warning_file, 'w',
-                $rOpts->{'character-encoding'} );
+              Perl::Tidy::streamhandle( $warning_file, 'w', $is_encoded_data );
             $fh_warnings or Perl::Tidy::Die("couldn't open $filename $!\n");
             Perl::Tidy::Warn("## Please see file $filename\n")
               unless ref($warning_file);
@@ -501,9 +502,10 @@ sub finish {
     $self->ask_user_for_bug_report( $infile_syntax_ok, $formatter );
 
     if ($save_logfile) {
-        my $log_file = $self->{_log_file};
-        my ( $fh, $filename ) = Perl::Tidy::streamhandle( $log_file, 'w',
-            $rOpts->{'character-encoding'} );
+        my $log_file        = $self->{_log_file};
+        my $is_encoded_data = $self->{_is_encoded_data};
+        my ( $fh, $filename ) =
+          Perl::Tidy::streamhandle( $log_file, 'w', $is_encoded_data );
         if ($fh) {
             my $routput_array = $self->{_output_array};
             foreach ( @{$routput_array} ) { $fh->print($_) }
@@ -515,3 +517,4 @@ sub finish {
     return;
 }
 1;
+
-- 
2.39.5