use warnings;
use strict;
use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT);
-use base qw(Exporter);
+use Exporter qw(import);
BEGIN{
$VERSION = 1.00;
sub encode_utf8_structure {
++$depth;
my @ret;
- for my $_ (@_) {
+ for $_ (@_) {
if (ref($_) eq 'HASH') {
push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})};
}
=cut
-our %iconv_converters;
-
sub convert_to_utf8 {
- my ($data,$charset) = @_;
+ my ($data,$charset,$internal_call) = @_;
+ $internal_call //= 0;
if (is_utf8($data)) {
cluck("utf8 flag is set when calling convert_to_utf8");
return $data;
if ($charset eq 'RAW') {
croak("Charset must not be raw when calling convert_to_utf8");
}
- if (not defined $iconv_converters{$charset}) {
- eval {
- $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or
- die "Unable to create converter for '$charset'";
- };
- if ($@) {
- warn $@;
- # We weren't able to create the converter, so use Encode
- # instead
- return __fallback_convert_to_utf8($data,$charset);
- }
+ ## if the charset is unknown or unknown 8 bit, assume that it's UTF-8.
+ if ($charset =~ /unknown/i) {
+ $charset = 'UTF-8'
}
- if (not defined $iconv_converters{$charset}) {
- warn "The converter for $charset wasn't created properly somehow!";
+ my $iconv_converter;
+ eval {
+ $iconv_converter = Text::Iconv->new($charset,"UTF-8") or
+ die "Unable to create converter for '$charset'";
+ };
+ if ($@) {
+ return undef if $internal_call;
+ warn $@;
+ # We weren't able to create the converter, so use Encode
+ # instead
return __fallback_convert_to_utf8($data,$charset);
}
- my $converted_data = $iconv_converters{$charset}->convert($data);
+ my $converted_data = $iconv_converter->convert($data);
# if the conversion failed, retval will be undefined or perhaps
# -1.
- my $retval = $iconv_converters{$charset}->retval();
+ my $retval = $iconv_converter->retval();
if (not defined $retval or
$retval < 0
) {
- warn "failed to convert to utf8 (charset: $charset, data: $data)";
+ # try iso8559-1 first
+ if (not $internal_call) {
+ my $call_back_data = convert_to_utf8($data,'ISO8859-1',1);
+ # if there's an à (0xC3), it's probably something
+ # horrible, and we shouldn't try to convert it.
+ if (defined $call_back_data and $call_back_data !~ /\x{C3}/) {
+ return $call_back_data;
+ }
+ }
# Fallback to encode, which will probably also fail.
return __fallback_convert_to_utf8($data,$charset);
}
}
# lets assume everything that doesn't have a charset is utf8
$charset //= 'utf8';
+ ## if the charset is unknown, assume it's UTF-8
+ if ($charset =~ /unknown/i) {
+ $charset = 'utf8';
+ }
my $result;
eval {
- $result = decode($charset,$data);
+ $result = decode($charset,$data,0);
};
if ($@) {
warn "Unable to decode charset; '$charset' and '$data': $@";