decode_rfc1522 now returns utf8 octets

[debbugs.git] / Debbugs / Common.pm
diff --git a/Debbugs/Common.pm b/Debbugs/Common.pm

index 915fa859a7f8bd290f6037d3c8b17ab3fa757cf7..eb068edbf4bc589a586989fda55b0ba9678fbacb 100644 (file)
--- a/Debbugs/Common.pm
+++ b/Debbugs/Common.pm
@@ -50,12 +50,14 @@ BEGIN{
                                 qw(cleanup_eval_fail),
                                 qw(hash_slice),
                                ],
+                    utf8   => [qw(encode_utf8_structure encode_utf8_safely),
+                                qw(convert_to_utf8)],
                      date   => [qw(secs_to_english)],
                      quit   => [qw(quit)],
                      lock   => [qw(filelock unfilelock lockpid)],
                     );
       @EXPORT_OK = ();
-     Exporter::export_ok_tags(qw(lock quit date util misc));
+     Exporter::export_ok_tags(keys %EXPORT_TAGS);
       $EXPORT_TAGS{all} = [@EXPORT_OK];
  }
  
@@ -70,6 +72,9 @@ use IO::Scalar;
  use Debbugs::MIME qw(decode_rfc1522);
  use Mail::Address;
  use Cwd qw(cwd);
+use Encode qw(encode_utf8 is_utf8 decode);
+use Text::Iconv;
+use Storable qw(dclone);
  
  use Params::Validate qw(validate_with :types);
  
@@ -429,6 +434,7 @@ sub __add_to_hash {
      $type //= 'address';
      my $fh = IO::File->new($fn,'r') or
         die "Unable to open $fn for reading: $!";
+    binmode($fh,':encoding(UTF-8)');
      while (<$fh>) {
         chomp;
         next unless m/^(\S+)\s+(\S.*\S)\s*$/;
@@ -823,7 +829,8 @@ sub globify_scalar {
           if (defined ref($scalar)) {
                if (ref($scalar) eq 'SCALAR' and
                    not UNIVERSAL::isa($scalar,'GLOB')) {
-                   return IO::Scalar->new($scalar);
+                   open $handle, '>:scalar:utf8', $scalar;
+                   return $handle;
                }
                else {
                     return $scalar;
@@ -836,7 +843,7 @@ sub globify_scalar {
                carp "Given a non-scalar reference, non-glob to globify_scalar; returning /dev/null handle";
           }
       }
-     return IO::File->new('/dev/null','w');
+     return IO::File->new('/dev/null','>:utf8');
  }
  
  =head2 cleanup_eval_fail()
@@ -869,6 +876,8 @@ sub cleanup_eval_fail {
      }
      # ditch the "at foo/bar/baz.pm line 5"
      $error =~ s/\sat\s\S+\sline\s\d+//;
+    # ditch croak messages
+    $error =~ s/^\t+.+\n?//g;
      # ditch trailing multiple periods in case there was a cascade of
      # die messages.
      $error =~ s/\.+$/\./;
@@ -892,6 +901,148 @@ sub hash_slice(\%@) {
      return map {exists $hashref->{$_}?($_,$hashref->{$_}):()} @keys;
  }
  
+
+=head1 UTF-8
+
+These functions are exported with the :utf8 tag
+
+=head2 encode_utf8_structure
+
+     %newdata = encode_utf8_structure(%newdata);
+
+Takes a complex data structure and encodes any strings with is_utf8
+set into their constituent octets.
+
+=cut
+
+our $depth = 0;
+sub encode_utf8_structure {
+    ++$depth;
+    my @ret;
+    for my $_ (@_) {
+       if (ref($_) eq 'HASH') {
+           push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})};
+       }
+       elsif (ref($_) eq 'ARRAY') {
+           push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})];
+       }
+       elsif (ref($_)) {
+           # we don't know how to handle non hash or non arrays
+           push @ret,$_;
+       }
+       else {
+           push @ret,encode_utf8_safely($_);
+       }
+    }
+    --$depth;
+    return @ret;
+}
+
+=head2 encode_utf8_safely
+
+     $octets = encode_utf8_safely($string);
+
+Given a $string, returns the octet equivalent of $string if $string is
+in perl's internal encoding; otherwise returns $string.
+
+Silently returns REFs without encoding them. [If you want to deeply
+encode REFs, see encode_utf8_structure.]
+
+=cut
+
+
+sub encode_utf8_safely{
+    my @ret;
+    for my $r (@_) {
+        if (not ref($r) and is_utf8($r)) {
+           $r = encode_utf8($r);
+       }
+       push @ret,$r;
+    }
+    return wantarray ? @ret : (length @_ > 1 ? @ret : $_[0]);
+}
+
+=head2 convert_to_utf8
+
+    $utf8 = convert_to_utf8("text","charset");
+
+=cut
+
+our %iconv_converters;
+
+sub convert_to_utf8 {
+    my ($data,$charset) = @_;
+    if (is_utf8($data)) {
+        return encode_utf8($data);
+    }
+    $charset = uc($charset);
+    if (not defined $iconv_converters{$charset}) {
+        eval {
+            $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or
+                die "Unable to create converter for '$charset'";
+        };
+        if ($@) {
+            warn $@;
+            # We weren't able to create the converter, so use Encode
+            # instead
+            return __fallback_convert_to_utf8($data,$charset);
+        }
+        # It shouldn't be necessary when converting to UTF8, but lets
+        # allow for transliteration and silent discarding of broken
+        # sequences
+        eval {
+            $iconv_converters{$charset}->set_attr("transliterate");
+            $iconv_converters{$charset}->set_attr("discard_ilseq");
+        };
+        # This shouldn't fail on Debian systems; we're warning here
+        # just in case we've made a mistake above. This warning should
+        # probably be disabled on non-GNU libc systems.
+        warn $@ if $@;
+    }
+    if (not defined $iconv_converters{$charset}) {
+        warn "The converter for $charset wasn't created properly somehow!";
+        return __fallback_convert_to_utf8($data,$charset);
+    }
+    my $converted_data = $iconv_converters{$charset}->convert($data);
+    # if the conversion failed, retval will be undefined or perhaps
+    # -1.
+    if (not defined $iconv_converters{$charset}->retval() or
+        $iconv_converters{$charset}->retval() < 0
+       ) {
+        # Fallback to encode, which will probably also fail.
+        return __fallback_convert_to_utf8($data,$charset);
+    }
+    return $converted_data;
+}
+
+# Bug #61342 et al.
+# we're switching this to return UTF8 octets instead of perl's internal
+# encoding
+sub __Fallback_convert_to_utf8 {
+     my ($data, $charset) = @_;
+     # raw data just gets returned (that's the charset WordDecorder
+     # uses when it doesn't know what to do)
+     return $data if $charset eq 'raw';
+     if (not defined $charset and not is_utf8($data)) {
+         warn ("Undefined charset, and string '$data' is not in perl's internal encoding");
+         return $data;
+     }
+     # lets assume everything that doesn't have a charset is utf8
+     $charset //= 'utf8';
+     my $result;
+     eval {
+        $result = decode($charset,$data) unless is_utf8($data);
+         $result = encode_utf8($result);
+     };
+     if ($@) {
+         warn "Unable to decode charset; '$charset' and '$data': $@";
+         return $data;
+     }
+     return $result;
+}
+
+
+
  1;
  
  __END__