X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=Debbugs%2FUTF8.pm;h=01351f3668f04e2e33704eb672300006bf656579;hb=8e2468e4725d9784c5e99915cee75c3cc41903cf;hp=e96734035e61a1dff4085dea5fde21ffb0a21ce7;hpb=9dd18b2a38fae81a660504da29147937596ba206;p=debbugs.git diff --git a/Debbugs/UTF8.pm b/Debbugs/UTF8.pm index e967340..01351f3 100644 --- a/Debbugs/UTF8.pm +++ b/Debbugs/UTF8.pm @@ -28,14 +28,14 @@ charsets to UTF8. use warnings; use strict; use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT); -use base qw(Exporter); +use Exporter qw(import); BEGIN{ $VERSION = 1.00; $DEBUG = 0 unless defined $DEBUG; %EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely), - qw(convert_to_utf8)], + qw(convert_to_utf8 decode_utf8_safely)], ); @EXPORT = (@{$EXPORT_TAGS{utf8}}); @EXPORT_OK = (); @@ -46,7 +46,7 @@ BEGIN{ use Carp; $Carp::Verbose = 1; -use Encode qw(encode_utf8 is_utf8 decode); +use Encode qw(encode_utf8 is_utf8 decode decode_utf8); use Text::Iconv; use Storable qw(dclone); @@ -68,7 +68,7 @@ our $depth = 0; sub encode_utf8_structure { ++$depth; my @ret; - for my $_ (@_) { + for $_ (@_) { if (ref($_) eq 'HASH') { push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})}; } @@ -108,59 +108,91 @@ sub encode_utf8_safely{ } push @ret,$r; } - return wantarray ? @ret : (length @_ > 1 ? @ret : $_[0]); + return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]); } +=head2 decode_utf8_safely + + $string = decode_utf8_safely($octets); + +Given $octets in UTF8, returns the perl-internal equivalent of $octets +if $octets does not have is_utf8 set; otherwise returns $octets. + +Silently returns REFs without encoding them. + +=cut + + +sub decode_utf8_safely{ + my @ret; + for my $r (@_) { + if (not ref($r) and not is_utf8($r)) { + $r = decode_utf8($r); + } + push @ret, $r; + } + return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]); +} + + + + =head2 convert_to_utf8 $utf8 = convert_to_utf8("text","charset"); =cut -our %iconv_converters; - sub convert_to_utf8 { - my ($data,$charset) = @_; + my ($data,$charset,$internal_call) = @_; + $internal_call //= 0; if (is_utf8($data)) { - return encode_utf8($data); + cluck("utf8 flag is set when calling convert_to_utf8"); + return $data; } - $charset = uc($charset); + $charset = uc($charset//'UTF-8'); if ($charset eq 'RAW') { - return $data; + croak("Charset must not be raw when calling convert_to_utf8"); } - if (not defined $iconv_converters{$charset}) { - eval { - $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or - die "Unable to create converter for '$charset'"; - }; - if ($@) { - warn $@; - # We weren't able to create the converter, so use Encode - # instead - return __fallback_convert_to_utf8($data,$charset); - } + ## if the charset is unknown or unknown 8 bit, assume that it's UTF-8. + if ($charset =~ /unknown/i) { + $charset = 'UTF-8' } - if (not defined $iconv_converters{$charset}) { - warn "The converter for $charset wasn't created properly somehow!"; + my $iconv_converter; + eval { + $iconv_converter = Text::Iconv->new($charset,"UTF-8") or + die "Unable to create converter for '$charset'"; + }; + if ($@) { + return undef if $internal_call; + warn $@; + # We weren't able to create the converter, so use Encode + # instead return __fallback_convert_to_utf8($data,$charset); } - my $converted_data = $iconv_converters{$charset}->convert($data); - $converted_data = "$converted_data"; + my $converted_data = $iconv_converter->convert($data); # if the conversion failed, retval will be undefined or perhaps # -1. - my $retval = $iconv_converters{$charset}->retval(); + my $retval = $iconv_converter->retval(); if (not defined $retval or $retval < 0 ) { + # try iso8559-1 first + if (not $internal_call) { + my $call_back_data = convert_to_utf8($data,'ISO8859-1',1); + # if there's an à (0xC3), it's probably something + # horrible, and we shouldn't try to convert it. + if (defined $call_back_data and $call_back_data !~ /\x{C3}/) { + return $call_back_data; + } + } # Fallback to encode, which will probably also fail. return __fallback_convert_to_utf8($data,$charset); } - return $converted_data; + return decode("UTF-8",$converted_data); } -# Bug #61342 et al. -# we're switching this to return UTF8 octets instead of perl's internal -# encoding +# this returns data in perl's internal encoding sub __fallback_convert_to_utf8 { my ($data, $charset) = @_; # raw data just gets returned (that's the charset WordDecorder @@ -172,10 +204,13 @@ sub __fallback_convert_to_utf8 { } # lets assume everything that doesn't have a charset is utf8 $charset //= 'utf8'; + ## if the charset is unknown, assume it's UTF-8 + if ($charset =~ /unknown/i) { + $charset = 'utf8'; + } my $result; eval { - $result = decode($charset,$data) unless is_utf8($data); - $result = encode_utf8($result); + $result = decode($charset,$data,0); }; if ($@) { warn "Unable to decode charset; '$charset' and '$data': $@";