From 48f8b1e57122fe154b4387d3753cf24393a9f93e Mon Sep 17 00:00:00 2001 From: Steve Hancock Date: Fri, 20 Mar 2020 19:28:54 -0700 Subject: [PATCH] added option --character-encoding=guess --- CHANGES.md | 6 ++ bin/perltidy | 47 ++++++-- lib/Perl/Tidy.pm | 212 +++++++++++++++++++++++-------------- lib/Perl/Tidy/Formatter.pm | 7 -- lib/Perl/Tidy/LineSink.pm | 40 +++---- lib/Perl/Tidy/Logger.pm | 19 ++-- 6 files changed, 197 insertions(+), 134 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index edab3077..f411d7cc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,12 @@ ## 2020 01 10.01 + - Added --character-encoding=guess or -guess to have perltidy guess + if a file is encoded as -utf8 or some older single-byte encoding. This + is useful when processing a mixture of file types, such as utf8 and + latin-1. Also, specific encodings of input files other than utf8 may + now be given, for example --character-encoding=euc-jp. + - Fix for git#22, Preserve function signature on a single line. An unwanted line break was being introduced when a closing signature paren followed a closing do brace. diff --git a/bin/perltidy b/bin/perltidy index 3a0d843e..ab77be59 100755 --- a/bin/perltidy +++ b/bin/perltidy @@ -587,22 +587,47 @@ this flag is in effect. =item B<-enc=s>, B<--character-encoding=s> -where B=B or B. This flag tells perltidy the character encoding -of both the input and output character streams. The value B causes the -stream to be read and written as UTF-8. The value B causes the stream to -be processed without special encoding assumptions. At present there is no -automatic detection of character encoding (even if there is a C<'use utf8'> -statement in your code) so this flag must be set for streams encoded in UTF-8. -Incorrectly setting this parameter can cause data corruption, so please -carefully check the output. +This flag indicates the character encoding, if any, of the input data stream. +Perltidy does not look for the encoding directives in the soure stream, such +as B, and instead relies on this flag to determine the encoding. +(It has to be this way because perltidy is often working on snippets of code +rather than a complete file). -The default is B. +The possible values for B are (1) the name of an encoding recognized by the +Encode.pm module, (2) B if no encoding is used, or (3) if +perltidy should guess. -The abbreviations B<-utf8> or B<-UTF8> are equivalent to B<-enc=utf8>. -So to process a file named B which is encoded in UTF-8 you can use: +For example, the value B causes the stream to be read and written as +UTF-8. If the input stream cannot be decoded with a specified encoding then +processing is not done. + +The value B causes the stream to be processed without special encoding +assumptions. This is appropriate for files which are written in single-byte +character encodings such as latin-1. + +The value B tells perltidy to guess between either utf8 encoding or no +encoding (meaning one character per byte). The guess uses the Encode::Guess +module and this restricted range of guesses covers the most common cases. +Testing showed that considering any greater number of encodings as guess +suspects is too risky. + +The current default is B. + +The abbreviations B<-utf8> or B<-UTF8> are equivalent to B<-enc=utf8>, and the +abbreviation B<-guess> is equivalent to <-enc=guess>. So to process a file +named B which is encoded in UTF-8 you can use: perltidy -utf8 file.pl +or + perltidy -guess file.pl + +To process a file in B you could use + + perltidy -enc=euc-jp file.pl + +A perltidy output file is unencoded if the input file is unencoded, and otherwise it is encoded as B, even if the input encoding was not B. + =item B<-ole=s>, B<--output-line-ending=s> where s=C, C, C, or C. This flag tells perltidy diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm index bdb9e341..12e4a2b9 100644 --- a/lib/Perl/Tidy.pm +++ b/lib/Perl/Tidy.pm @@ -85,7 +85,6 @@ use vars qw{ @EXPORT $missing_file_spec $fh_stderr - $rOpts_character_encoding $Warn_count }; @@ -94,6 +93,7 @@ use vars qw{ use Cwd; use Encode (); +use Encode::Guess; use IO::File; use File::Basename; use File::Copy; @@ -132,12 +132,15 @@ sub streamhandle { # (check for 'print' method for 'w' mode) # (check for 'getline' method for 'r' mode) - # An optional flag $character_encoding may be given. - # The current options are: - # 1. '' or 'none' -> the file will not use binary encoding - # 2. 'utf8' -> the file will be utf8 encoded - # 3. Any other string: use simple binmode - my ( $filename, $mode, $character_encoding ) = @_; + # An optional flag $is_encoded_data may be given, as follows: + + # Case 1. Any non-empty string: encoded data is being transferred, set + # encoding to be utf8 for files and for stdin. + + # Case 2. Not given, or an empty string: unencoded binary data is being + # transferred, set binary mode for files and for stdin. + + my ( $filename, $mode, $is_encoded_data ) = @_; my $ref = ref($filename); my $New; @@ -207,12 +210,16 @@ EOM $New = sub { IO::File->new( $filename, $mode ) }; } } - $fh = $New->( $filename, $mode ) - or Warn("Couldn't open file:$filename in mode:$mode : $!\n"); + $fh = $New->( $filename, $mode ); + if ( !$fh ) { + + Warn("Couldn't open file:$filename in mode:$mode : $!\n"); - if ( $fh && $character_encoding && $character_encoding ne 'none' ) { + } + else { - if ( $character_encoding eq 'utf8' ) { + # Case 1: handle encoded data + if ($is_encoded_data) { if ( ref($fh) eq 'IO::File' ) { $fh->binmode(":raw:encoding(UTF-8)"); } @@ -221,10 +228,11 @@ EOM } } - # Patch for RT 122030 - elsif ( ref($fh) eq 'IO::File' ) { $fh->binmode(); } - - elsif ( $fh eq '-' ) { binmode STDOUT } + # Case 2: handle unencoded data + else { + if ( ref($fh) eq 'IO::File' ) { $fh->binmode(); } + elsif ( $fh eq '-' ) { binmode STDOUT } + } } return $fh, ( $ref or $filename ); @@ -640,8 +648,6 @@ EOM user => '', ); - $rOpts_character_encoding = $rOpts->{'character-encoding'}; - # be sure we have a valid output format unless ( exists $default_file_extension{ $rOpts->{'format'} } ) { my $formats = join ' ', @@ -898,56 +904,103 @@ EOM my %saw_md5; my $digest_input = 0; - # Prefilters and postfilters: The prefilter is a code reference - # that will be applied to the source before tidying, and the - # postfilter is a code reference to the result before outputting. - if ( - $prefilter - || ( $rOpts_character_encoding - && $rOpts_character_encoding eq 'utf8' ) - || $rOpts->{'assert-tidy'} - || $rOpts->{'assert-untidy'} - || $do_convergence_test - ) + my $buf = ''; + while ( my $line = $source_object->get_line() ) { + $buf .= $line; + } + + # Decode the input file as requested. There are 3 cases: + my $encoding_in = ""; + + # Case 1. No input stream encoding. This is appropriate + # for single-byte encodings like ascii, latin-1, etc + if ( !$rOpts->{'character-encoding'} + || $rOpts->{'character-encoding'} eq 'none' ) { - my $buf = ''; - while ( my $line = $source_object->get_line() ) { - $buf .= $line; - } - if ( $rOpts_character_encoding - && $rOpts_character_encoding eq 'utf8' - && !utf8::is_utf8($buf) ) - { - eval { - $buf = Encode::decode( 'UTF-8', $buf, - Encode::FB_CROAK | Encode::LEAVE_SRC ); - }; - if ($@) { - Warn( -"skipping file: $input_file: Unable to decode source as UTF-8\n" - ); - next; + # nothing to do + + } + + # Case 2. guess input stream encoding if requested + elsif ( $rOpts->{'character-encoding'} =~ /^guess$/i ) { + + # Use a very simple guessing strategy: if the guess is utf8, we + # test decoding with it and use it if successful. Otherwise, we + # proceed assuming the characters are encoded as single bytes. I + # have found that anything more complicated may sometimes work but + # may also lead to the disaster of using an incorrect decoding. + my $buf_in = $buf; + + my $decoder = guess_encoding( $buf_in, 'utf8' ); + if ( ref($decoder) ) { + $encoding_in = $decoder->name; + if ( $encoding_in !~ /^(UTF-8|utf8)$/ ) { + $encoding_in = ""; + $buf = $buf_in; + } + else { + + eval { $buf = $decoder->decode($buf_in); }; + if ($@) { + + # Note that a guess failed, but keep going + # This warning can eventually be removed + Warn( +"file: $input_file: bad guess to decode source as $encoding_in\n" + ); + $encoding_in = ""; + $buf = $buf_in; + } } } + } - # MD5 sum of input file is evaluated before any prefilter - if ( $rOpts->{'assert-tidy'} || $rOpts->{'assert-untidy'} ) { - $digest_input = $md5_hex->($buf); + # Case 3. Decode with a specific encoding + else { + $encoding_in = $rOpts->{'character-encoding'}; + eval { + $buf = Encode::decode( $encoding_in, $buf, + Encode::FB_CROAK | Encode::LEAVE_SRC ); + }; + if ($@) { + + # Quit if we cannot decode by the requested encoding; + # Something is not right. + Warn( +"skipping file: $input_file: Unable to decode source as $encoding_in\n" + ); + next; } + } - $buf = $prefilter->($buf) if $prefilter; + # Set the encoding to be used for all further i/o: If we have + # decoded the data with any format, then we must continue to + # read and write it as encoded data, and we will normalize these + # operations with utf8. If we have not decoded the data, then + # we must not treat it as encoded data. + my $is_encoded_data = $encoding_in ? 'utf8' : ""; - # starting MD5 sum for convergence test is evaluated after any prefilter - if ($do_convergence_test) { - my $digest = $md5_hex->($buf); - $saw_md5{$digest} = 1; - } + # MD5 sum of input file is evaluated before any prefilter + if ( $rOpts->{'assert-tidy'} || $rOpts->{'assert-untidy'} ) { + $digest_input = $md5_hex->($buf); + } - $source_object = Perl::Tidy::LineSource->new( \$buf, $rOpts, - $rpending_logfile_message ); + # Prefilters and postfilters: The prefilter is a code reference + # that will be applied to the source before tidying, and the + # postfilter is a code reference to the result before outputting. + + $buf = $prefilter->($buf) if $prefilter; + + # starting MD5 sum for convergence test is evaluated after any prefilter + if ($do_convergence_test) { + my $digest = $md5_hex->($buf); + $saw_md5{$digest} = 1; } + $source_object = Perl::Tidy::LineSource->new( \$buf, $rOpts, + $rpending_logfile_message ); + # register this file name with the Diagnostics package $diagnostics_object->set_input_file($input_file) if $diagnostics_object; @@ -1037,12 +1090,6 @@ EOM $line_separator = find_input_line_ending($input_file); } - # Eventually all I/O may be done with binmode, but for now it is - # only done when a user requests a particular line separator - # through the -ple or -ole flags - my $binmode = defined($line_separator) - || defined($rOpts_character_encoding); - $binmode = 1; ## TESTING $line_separator = "\n" unless defined($line_separator); my ( $sink_object, $postfilter_buffer ); @@ -1052,12 +1099,14 @@ EOM { $sink_object = Perl::Tidy::LineSink->new( \$postfilter_buffer, $tee_file, - $line_separator, $rOpts, $rpending_logfile_message, $binmode ); + $line_separator, $rOpts, $rpending_logfile_message, + $is_encoded_data ); } else { $sink_object = Perl::Tidy::LineSink->new( $output_file, $tee_file, - $line_separator, $rOpts, $rpending_logfile_message, $binmode ); + $line_separator, $rOpts, $rpending_logfile_message, + $is_encoded_data ); } #--------------------------------------------------------------- @@ -1070,7 +1119,7 @@ EOM my $logger_object = Perl::Tidy::Logger->new( $rOpts, $log_file, $warning_file, - $fh_stderr, $saw_extrude, $display_name ); + $fh_stderr, $saw_extrude, $display_name, $is_encoded_data ); write_logfile_header( $rOpts, $logger_object, $config_file, $rraw_options, $Windows_type, $readable_options, @@ -1108,7 +1157,7 @@ EOM $sink_object = Perl::Tidy::LineSink->new( \$sink_buffer, $tee_file, $line_separator, $rOpts, $rpending_logfile_message, - $binmode ); + $is_encoded_data ); } else { $sink_object = $sink_object_final; @@ -1274,7 +1323,8 @@ EOM $sink_object->close_output_file(); $sink_object = Perl::Tidy::LineSink->new( $output_file, $tee_file, - $line_separator, $rOpts, $rpending_logfile_message, $binmode ); + $line_separator, $rOpts, $rpending_logfile_message, + $is_encoded_data ); my $buf = $postfilter @@ -1353,18 +1403,17 @@ EOM # everything if we closed it. seek( $output_file, 0, 0 ) or Die("unable to rewind a temporary file for -b option: $!\n"); + + # TODO: maybe use streamhandle here my $fout = IO::File->new("> $input_file") or Die( "problem re-opening $input_file for write for -b option; check file and directory permissions: $!\n" ); - if ($binmode) { - if ( $rOpts->{'character-encoding'} - && $rOpts->{'character-encoding'} eq 'utf8' ) - { - binmode $fout, ":raw:encoding(UTF-8)"; - } - else { binmode $fout } + if ($is_encoded_data) { + binmode $fout, ":raw:encoding(UTF-8)"; } + else { binmode $fout } + my $line; while ( $line = $output_file->getline() ) { $fout->print($line); @@ -1395,8 +1444,9 @@ EOM #rt128477: avoid inconsistent owner/group and suid/sgid if ( $uid_i != $uid_o || $gid_i != $gid_o ) { - # try to change owner and group to match input file if in -b mode - # note: chown returns number of files successfully changed + # try to change owner and group to match input file if + # in -b mode. Note: chown returns number of files + # successfully changed. if ( $in_place_modify && chown( $uid_i, $gid_i, $output_file ) ) { @@ -2027,10 +2077,7 @@ sub generate_options { %option_range = ( 'format' => [ 'tidy', 'html', 'user' ], 'output-line-ending' => [ 'dos', 'win', 'mac', 'unix' ], - 'character-encoding' => [ 'none', 'utf8' ], - - 'space-backslash-quote' => [ 0, 2 ], - + 'space-backslash-quote' => [ 0, 2 ], 'block-brace-tightness' => [ 0, 2 ], 'brace-tightness' => [ 0, 2 ], 'paren-tightness' => [ 0, 2 ], @@ -2207,8 +2254,9 @@ sub generate_options { 'cb' => [qw(cuddled-else)], 'cuddled-blocks' => [qw(cuddled-else)], - 'utf8' => [qw(character-encoding=utf8)], - 'UTF8' => [qw(character-encoding=utf8)], + 'utf8' => [qw(character-encoding=utf8)], + 'UTF8' => [qw(character-encoding=utf8)], + 'guess' => [qw(character-encoding=guess)], 'swallow-optional-blank-lines' => [qw(kbl=0)], 'noswallow-optional-blank-lines' => [qw(kbl=1)], diff --git a/lib/Perl/Tidy/Formatter.pm b/lib/Perl/Tidy/Formatter.pm index c5dadcef..1653b705 100644 --- a/lib/Perl/Tidy/Formatter.pm +++ b/lib/Perl/Tidy/Formatter.pm @@ -6007,13 +6007,6 @@ EOM $rOpts->{'long-block-line-count'} = 1000000; } - my $enc = $rOpts->{'character-encoding'}; - if ( $enc && $enc !~ /^(none|utf8)$/i ) { - Die(<{'output-line-ending'}; if ($ole) { my %endings = ( diff --git a/lib/Perl/Tidy/LineSink.pm b/lib/Perl/Tidy/LineSink.pm index 3854bc2a..511852fb 100644 --- a/lib/Perl/Tidy/LineSink.pm +++ b/lib/Perl/Tidy/LineSink.pm @@ -13,7 +13,7 @@ our $VERSION = '20200110.01'; sub new { my ( $class, $output_file, $tee_file, $line_separator, $rOpts, - $rpending_logfile_message, $binmode ) + $rpending_logfile_message, $is_encoded_data ) = @_; my $fh = undef; my $fh_tee = undef; @@ -21,26 +21,10 @@ sub new { my $output_file_open = 0; if ( $rOpts->{'format'} eq 'tidy' ) { - ( $fh, $output_file ) = Perl::Tidy::streamhandle( $output_file, 'w' ); + ( $fh, $output_file ) = + Perl::Tidy::streamhandle( $output_file, 'w', $is_encoded_data ); unless ($fh) { Perl::Tidy::Die("Cannot write to output stream\n"); } $output_file_open = 1; - if ($binmode) { - if ( $rOpts->{'character-encoding'} - && $rOpts->{'character-encoding'} eq 'utf8' ) - { - if ( ref($fh) eq 'IO::File' ) { - $fh->binmode(":raw:encoding(UTF-8)"); - } - elsif ( $output_file eq '-' ) { - binmode STDOUT, ":raw:encoding(UTF-8)"; - } - } - - # Patch for RT 122030 - elsif ( ref($fh) eq 'IO::File' ) { $fh->binmode(); } - - elsif ( $output_file eq '-' ) { binmode STDOUT } - } } # in order to check output syntax when standard output is used, @@ -68,7 +52,7 @@ EOM _tee_file => $tee_file, _tee_file_opened => 0, _line_separator => $line_separator, - _binmode => $binmode, + _is_encoded_data => $is_encoded_data, }, $class; } @@ -104,12 +88,16 @@ sub tee_off { } sub really_open_tee_file { - my $self = shift; - my $tee_file = $self->{_tee_file}; - my $fh_tee; - $fh_tee = IO::File->new(">$tee_file") - or Perl::Tidy::Die("couldn't open TEE file $tee_file: $!\n"); - binmode $fh_tee if $self->{_binmode}; + my $self = shift; + my $tee_file = $self->{_tee_file}; + my $is_encoded_data = $self->{_is_encoded_data}; + + my ( $fh_tee, $filename ) = + Perl::Tidy::streamhandle( $tee_file, 'w', $is_encoded_data ); + if ( !$fh_tee ) { + Perl::Tidy::Die("couldn't open TEE file $tee_file: $!\n"); + } + $self->{_tee_file_opened} = 1; $self->{_fh_tee} = $fh_tee; return; diff --git a/lib/Perl/Tidy/Logger.pm b/lib/Perl/Tidy/Logger.pm index 4c872b99..73e24af7 100644 --- a/lib/Perl/Tidy/Logger.pm +++ b/lib/Perl/Tidy/Logger.pm @@ -12,7 +12,7 @@ our $VERSION = '20200110.01'; sub new { my ( $class, $rOpts, $log_file, $warning_file, $fh_stderr, $saw_extrude, - $display_name ) + $display_name, $is_encoded_data ) = @_; my $fh_warnings = $rOpts->{'standard-error-output'} ? $fh_stderr : undef; @@ -50,6 +50,7 @@ sub new { _warning_file => $warning_file, _warning_count => 0, _complaint_count => 0, + _is_encoded_data => $is_encoded_data, _saw_code_bug => -1, # -1=no 0=maybe 1=for sure _saw_brace_error => 0, _saw_extrude => $saw_extrude, @@ -311,13 +312,13 @@ sub warning { my $rOpts = $self->{_rOpts}; unless ( $rOpts->{'quiet'} ) { - my $warning_count = $self->{_warning_count}; - my $fh_warnings = $self->{_fh_warnings}; + my $warning_count = $self->{_warning_count}; + my $fh_warnings = $self->{_fh_warnings}; + my $is_encoded_data = $self->{_is_encoded_data}; if ( !$fh_warnings ) { my $warning_file = $self->{_warning_file}; ( $fh_warnings, my $filename ) = - Perl::Tidy::streamhandle( $warning_file, 'w', - $rOpts->{'character-encoding'} ); + Perl::Tidy::streamhandle( $warning_file, 'w', $is_encoded_data ); $fh_warnings or Perl::Tidy::Die("couldn't open $filename $!\n"); Perl::Tidy::Warn("## Please see file $filename\n") unless ref($warning_file); @@ -501,9 +502,10 @@ sub finish { $self->ask_user_for_bug_report( $infile_syntax_ok, $formatter ); if ($save_logfile) { - my $log_file = $self->{_log_file}; - my ( $fh, $filename ) = Perl::Tidy::streamhandle( $log_file, 'w', - $rOpts->{'character-encoding'} ); + my $log_file = $self->{_log_file}; + my $is_encoded_data = $self->{_is_encoded_data}; + my ( $fh, $filename ) = + Perl::Tidy::streamhandle( $log_file, 'w', $is_encoded_data ); if ($fh) { my $routput_array = $self->{_output_array}; foreach ( @{$routput_array} ) { $fh->print($_) } @@ -515,3 +517,4 @@ sub finish { return; } 1; + -- 2.39.5