limit --character-encoding=s to s='none', 'guess', or 'utf8'

author Steve Hancock <perltidy@users.sourceforge.net>

Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)

committer Steve Hancock <perltidy@users.sourceforge.net>

Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)
author Steve Hancock <perltidy@users.sourceforge.net>
Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)
committer Steve Hancock <perltidy@users.sourceforge.net>
Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)
diff --git a/CHANGES.md b/CHANGES.md

index 25c20a695ccedf26535414828859c6f53907fc36..d053725e482185e70af3682c45d7696c80b55862 100644 (file)
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@
  
  ## 2021 10 29.06
  
+
      - A new flag --break-after-labels=i, or -bal=i, was added as requested
        in git #86.  This controls line breaks after labels, as follows:
  
@@ -25,6 +26,14 @@
      - Fix issue git #82, an error handling something like ${bareword} in a possible
        indirect object location.
  
+    - The possible values of the string 's' for the flag '--character-encoding=s'
+      have been limited to 'utf8' (or UTF-8), 'none', or 'guess'.  Previously an
+      arbitrary encoding could also be specified, but as a result of discussions
+      regarding git #83 it became clear that this was a bad idea and could lead
+      to problems since the output encoding was still restricted to UTF-8. Users
+      who need to work in other encodings can write a short program calling
+      Perl::Tidy with pre- and post-processing to handle encoding/decoding.
+
      - A new flag, --encode-output-strings, or -eos, has been added to resolve
        issue git #83.  This issue involves the interface between Perl::Tidy and
        calling programs, and tidyall in particular.  If you use tidyall and have
diff --git a/bin/perltidy b/bin/perltidy

index b9914c5480772931498700e2d0ba180f43afd380..9dae624951d4eb6e572757e319b84954fca42b36 100755 (executable)
--- a/bin/perltidy
+++ b/bin/perltidy
@@ -598,29 +598,29 @@ this flag is in effect.
  
  =item B<-enc=s>,  B<--character-encoding=s>
  
-This flag indicates the character encoding, if any, of the input data stream.
+This flag indicates whether the input data stream uses a character encoding.
  Perltidy does not look for the encoding directives in the source stream, such
  as B<use utf8>, and instead relies on this flag to determine the encoding.
  (Note that perltidy often works on snippets of code rather than complete files
  so it cannot rely on B<use utf8> directives).
  
-The possible values for B<s> are (1) the name of an encoding recognized by the
-Encode.pm module, (2) B<none> if no encoding is used, or (3) <guess> if
-perltidy should guess.
-
-For example, the value B<utf8> causes the stream to be read and written as
-UTF-8.  If the input stream cannot be decoded with a specified encoding then
-processing is not done.
+The possible values for B<s> are:
+(1) B<none> if no encoding is used, or
+(2) B<utf8> or B<UTF8> or B<UTF-8>, or
+(3) B<guess> if perltidy should guess between these two possibilities.
  
  The value B<none> causes the stream to be processed without special encoding
  assumptions.  This is appropriate for files which are written in single-byte
  character encodings such as latin-1.
  
+The value B<utf8> causes the stream to be read and written as
+UTF-8.  If the input stream cannot be decoded with this encoding then
+processing is not done.
+
  The value B<guess> tells perltidy to guess between either utf8 encoding or no
-encoding (meaning one character per byte).  The guess uses the Encode::Guess
-module and this restricted range of guesses covers the most common cases.
-Testing showed that considering any greater number of encodings as guess
-suspects is too risky.
+encoding (meaning one character per byte).  The 'guess' option uses the
+Encode::Guess module which has been found to be quite reliable at detecting
+if a file is encoded in utf8 or not.
  
  The current default is B<guess>.
  
@@ -633,13 +633,15 @@ named B<file.pl> which is encoded in UTF-8 you can use:
  or
     perltidy -guess file.pl
  
-To process a file in B<euc-jp> you could use
+or simply
+
+   perltidy file.pl
  
-   perltidy -enc=euc-jp file.pl
+since B<-guess> is the default.
  
-A perltidy output file is unencoded if the input file is unencoded, and
-otherwise it is encoded as B<utf8>, even if the input encoding was not
-B<utf8>.
+To process files with an encoding other than UTF-8, it would be necessary to
+write a short program which calls the Perl::Tidy module with some pre- and
+post-processing to handle decoding and encoding.
  
  =item B<-eos=s>,   B<--encode-output-strings=s>
  
@@ -896,6 +898,10 @@ containers (not just lists in parentheses).  The next section describes how to
  limit this style to, for example, just function calls.  The default indentation
  method will be applied elsewhere.
  
+(3) If you use B<-xlp> then long side comments can limit the indentation over
+multiple lines.  Consider adding the flag B<--ignore-side-comment-lengths> to
+prevent this, or minimizing the use of side comments.
+
  =item B<-lpxl=s>,  B<--line-up-parentheses-exclusion-list>
  
  The following discussion mentions B<-lp> but applies equally to B<-xlp>.
diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm

index 642cfdb5b1c471417f083d2b23a8802aee88019b..3693b91cef77371b291a671eb7fb0baed3f1e848 100644 (file)
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -1087,6 +1087,9 @@ EOM
                      }
                  }
              }
+            $encoding_log_message .= <<EOM;
+Unable to guess a character encoding
+EOM
          }
  
          # Case 4. Decode with a specific encoding
@@ -3348,6 +3351,15 @@ sub check_options {
      # check and handle any interactions among the basic options..
      #---------------------------------------------------------------
  
+    # Since perltidy only encodes in utf8, problems can occur if we let it
+    # decode anything else.  See discussions for issue git #83.
+    my $encoding = $rOpts->{'character-encoding'};
+    if ( $encoding !~ /^\s*(guess|none|utf8|utf-8)\s*$/i ) {
+        Die(<<EOM);
+--character-encoding = '$encoding' is not allowed; the options are: 'none', 'guess', 'utf8'
+EOM
+    }
+
      # Since -vt, -vtc, and -cti are abbreviations, but under
      # msdos, an unquoted input parameter like vtc=1 will be
      # seen as 2 parameters, vtc and 1, so the abbreviations
author	Steve Hancock <perltidy@users.sourceforge.net>
	Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)
committer	Steve Hancock <perltidy@users.sourceforge.net>
	Mon, 31 Jan 2022 18:55:20 +0000 (10:55 -0800)
CHANGES.md		patch \| blob \| history
bin/perltidy		patch \| blob \| history
lib/Perl/Tidy.pm		patch \| blob \| history