From 194a6ee5e98583a18fc58b44c548161df9473f24 Mon Sep 17 00:00:00 2001
From: Steve Hancock <perltidy@users.sourceforge.net>
Date: Mon, 31 Jan 2022 10:55:20 -0800
Subject: [PATCH] limit --character-encoding=s to s='none', 'guess', or 'utf8'

---
 CHANGES.md       |  9 +++++++++
 bin/perltidy     | 40 +++++++++++++++++++++++-----------------
 lib/Perl/Tidy.pm | 12 ++++++++++++
 3 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 25c20a69..d053725e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@
 
 ## 2021 10 29.06
 
+
     - A new flag --break-after-labels=i, or -bal=i, was added as requested
       in git #86.  This controls line breaks after labels, as follows:
 
@@ -25,6 +26,14 @@
     - Fix issue git #82, an error handling something like ${bareword} in a possible
       indirect object location.
 
+    - The possible values of the string 's' for the flag '--character-encoding=s'
+      have been limited to 'utf8' (or UTF-8), 'none', or 'guess'.  Previously an
+      arbitrary encoding could also be specified, but as a result of discussions
+      regarding git #83 it became clear that this was a bad idea and could lead
+      to problems since the output encoding was still restricted to UTF-8. Users
+      who need to work in other encodings can write a short program calling
+      Perl::Tidy with pre- and post-processing to handle encoding/decoding.
+
     - A new flag, --encode-output-strings, or -eos, has been added to resolve
       issue git #83.  This issue involves the interface between Perl::Tidy and
       calling programs, and tidyall in particular.  If you use tidyall and have
diff --git a/bin/perltidy b/bin/perltidy
index b9914c54..9dae6249 100755
--- a/bin/perltidy
+++ b/bin/perltidy
@@ -598,29 +598,29 @@ this flag is in effect.
 
 =item B<-enc=s>,  B<--character-encoding=s>
 
-This flag indicates the character encoding, if any, of the input data stream.
+This flag indicates if the input data stream uses a character encoding.
 Perltidy does not look for the encoding directives in the soure stream, such
 as B<use utf8>, and instead relies on this flag to determine the encoding.
 (Note that perltidy often works on snippets of code rather than complete files
 so it cannot rely on B<use utf8> directives).
 
-The possible values for B<s> are (1) the name of an encoding recognized by the
-Encode.pm module, (2) B<none> if no encoding is used, or (3) <guess> if
-perltidy should guess.
-
-For example, the value B<utf8> causes the stream to be read and written as
-UTF-8.  If the input stream cannot be decoded with a specified encoding then
-processing is not done.
+The possible values for B<s> are:
+(1) B<none> if no encoding is used, or
+(2) B<utf8> or B<UTF8> or B<UTF-8>, or
+(3) B<guess> if perltidy should guess between these two possibilities.
 
 The value B<none> causes the stream to be processed without special encoding
 assumptions.  This is appropriate for files which are written in single-byte
 character encodings such as latin-1.
 
+The value B<utf8> causes the stream to be read and written as
+UTF-8.  If the input stream cannot be decoded with this encoding then
+processing is not done.
+
 The value B<guess> tells perltidy to guess between either utf8 encoding or no
-encoding (meaning one character per byte).  The guess uses the Encode::Guess
-module and this restricted range of guesses covers the most common cases.
-Testing showed that considering any greater number of encodings as guess
-suspects is too risky.
+encoding (meaning one character per byte).  The 'guess' option uses the
+Encode::Guess module which has been found to be quite reliable at detecting
+if a file is encoded in utf8 or not.
 
 The current default is B<guess>.
 
@@ -633,13 +633,15 @@ named B<file.pl> which is encoded in UTF-8 you can use:
 or
    perltidy -guess file.pl
 
-To process a file in B<euc-jp> you could use
+or simply
+
+   perltidy file.pl
 
-   perltidy -enc=euc-jp file.pl
+since B<-guess> is the default.
 
-A perltidy output file is unencoded if the input file is unencoded, and
-otherwise it is encoded as B<utf8>, even if the input encoding was not
-B<utf8>.
+To process files with an encoding other than UTF-8, it would be necessary to
+write a short program which calls the Perl::Tidy module with some pre- and
+post-processing to handle decoding and encoding.
 
 =item B<-eos=s>,   B<--encode-output-strings=s>
 
@@ -896,6 +898,10 @@ containers (not just lists in parentheses).  The next section describes how to
 limit this style to, for example, just function calls.  The default indentation
 method will be applied elsewhere.
 
+(3) If you use B<-xlp> then long side comments can limit the indentation over
+multiple lines.  Consider adding the flag B<--ignore-side-comment-lengths> to
+prevent this, or minimizing the use of side comments.
+
 =item B<-lpxl=s>,  B<--line-up-parentheses-exclusion-list>
 
 The following discussion mentions B<-lp> but applies equally to B<-xlp>.
diff --git a/lib/Perl/Tidy.pm b/lib/Perl/Tidy.pm
index 642cfdb5..3693b91c 100644
--- a/lib/Perl/Tidy.pm
+++ b/lib/Perl/Tidy.pm
@@ -1087,6 +1087,9 @@ EOM
                     }
                 }
             }
+            $encoding_log_message .= <<EOM;
+Unable to guess a character encoding
+EOM
         }
 
         # Case 4. Decode with a specific encoding
@@ -3348,6 +3351,15 @@ sub check_options {
     # check and handle any interactions among the basic options..
     #---------------------------------------------------------------
 
+    # Since perltidy only encodes in utf8, problems can occur if we let it
+    # decode anything else.  See discussions for issue git #83.
+    my $encoding = $rOpts->{'character-encoding'};
+    if ( $encoding !~ /^\s*(guess|none|utf8|utf-8)\s*$/i ) {
+        Die(<<EOM);
+--character-encoding = '$encoding' is not allowed; the options are: 'none', 'guess', 'utf8'
+EOM
+    }
+
     # Since -vt, -vtc, and -cti are abbreviations, but under
     # msdos, an unquoted input parameter like vtc=1 will be
     # seen as 2 parameters, vtc and 1, so the abbreviations
-- 
2.39.5