use warnings;
use strict;
use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT);
-use base qw(Exporter);
+use Exporter qw(import);
BEGIN{
$VERSION = 1.00;
$DEBUG = 0 unless defined $DEBUG;
%EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely),
- qw(convert_to_utf8)],
+ qw(convert_to_utf8 decode_utf8_safely)],
);
@EXPORT = (@{$EXPORT_TAGS{utf8}});
@EXPORT_OK = ();
use Carp;
$Carp::Verbose = 1;
-use Encode qw(encode_utf8 is_utf8 decode);
+use Encode qw(encode_utf8 is_utf8 decode decode_utf8);
use Text::Iconv;
use Storable qw(dclone);
sub encode_utf8_structure {
++$depth;
my @ret;
- for my $_ (@_) {
+ for $_ (@_) {
if (ref($_) eq 'HASH') {
push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})};
}
}
push @ret,$r;
}
- return wantarray ? @ret : (length @_ > 1 ? @ret : $_[0]);
+ return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
}
+=head2 decode_utf8_safely
+
+ $string = decode_utf8_safely($octets);
+
+Given $octets in UTF8, returns the perl-internal equivalent of $octets
+if $octets does not have is_utf8 set; otherwise returns $octets.
+
+Silently returns REFs without encoding them.
+
+=cut
+
+
+sub decode_utf8_safely{
+ my @ret;
+ for my $r (@_) {
+ if (not ref($r) and not is_utf8($r)) {
+ $r = decode_utf8($r);
+ }
+ push @ret, $r;
+ }
+ return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
+}
+
+
+
+
=head2 convert_to_utf8
$utf8 = convert_to_utf8("text","charset");
=cut
-our %iconv_converters;
-
sub convert_to_utf8 {
- my ($data,$charset) = @_;
+ my ($data,$charset,$internal_call) = @_;
+ $internal_call //= 0;
if (is_utf8($data)) {
- return encode_utf8($data);
+ cluck("utf8 flag is set when calling convert_to_utf8");
+ return $data;
}
- $charset = uc($charset);
+ $charset = uc($charset//'UTF-8');
if ($charset eq 'RAW') {
- return $data;
+ croak("Charset must not be raw when calling convert_to_utf8");
}
- if (not defined $iconv_converters{$charset}) {
- eval {
- $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or
- die "Unable to create converter for '$charset'";
- };
- if ($@) {
- warn $@;
- # We weren't able to create the converter, so use Encode
- # instead
- return __fallback_convert_to_utf8($data,$charset);
- }
+ ## if the charset is unknown or unknown 8 bit, assume that it's UTF-8.
+ if ($charset =~ /unknown/i) {
+ $charset = 'UTF-8'
}
- if (not defined $iconv_converters{$charset}) {
- warn "The converter for $charset wasn't created properly somehow!";
+ my $iconv_converter;
+ eval {
+ $iconv_converter = Text::Iconv->new($charset,"UTF-8") or
+ die "Unable to create converter for '$charset'";
+ };
+ if ($@) {
+ return undef if $internal_call;
+ warn $@;
+ # We weren't able to create the converter, so use Encode
+ # instead
return __fallback_convert_to_utf8($data,$charset);
}
- my $converted_data = $iconv_converters{$charset}->convert($data);
- $converted_data = "$converted_data";
+ my $converted_data = $iconv_converter->convert($data);
# if the conversion failed, retval will be undefined or perhaps
# -1.
- my $retval = $iconv_converters{$charset}->retval();
+ my $retval = $iconv_converter->retval();
if (not defined $retval or
$retval < 0
) {
+ # try iso8559-1 first
+ if (not $internal_call) {
+ my $call_back_data = convert_to_utf8($data,'ISO8859-1',1);
+ # if there's an à (0xC3), it's probably something
+ # horrible, and we shouldn't try to convert it.
+ if (defined $call_back_data and $call_back_data !~ /\x{C3}/) {
+ return $call_back_data;
+ }
+ }
# Fallback to encode, which will probably also fail.
return __fallback_convert_to_utf8($data,$charset);
}
- return $converted_data;
+ return decode("UTF-8",$converted_data);
}
-# Bug #61342 et al.
-# we're switching this to return UTF8 octets instead of perl's internal
-# encoding
+# this returns data in perl's internal encoding
sub __fallback_convert_to_utf8 {
my ($data, $charset) = @_;
# raw data just gets returned (that's the charset WordDecorder
}
# lets assume everything that doesn't have a charset is utf8
$charset //= 'utf8';
+ ## if the charset is unknown, assume it's UTF-8
+ if ($charset =~ /unknown/i) {
+ $charset = 'utf8';
+ }
my $result;
eval {
- $result = decode($charset,$data) unless is_utf8($data);
- $result = encode_utf8($result);
+ $result = decode($charset,$data,0);
};
if ($@) {
warn "Unable to decode charset; '$charset' and '$data': $@";