1 # This module is part of debbugs, and is released
2 # under the terms of the GPL version 2, or any later
3 # version at your option.
4 # See the file README and COPYING for more information.
6 # Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
12 Debbugs::UTF8 -- Routines for handling conversion of charsets to UTF8
21 This module contains routines which convert from various different
30 use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT);
31 use Exporter qw(import);
35 $DEBUG = 0 unless defined $DEBUG;
37 %EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely),
38 qw(convert_to_utf8 decode_utf8_safely)],
40 @EXPORT = (@{$EXPORT_TAGS{utf8}});
42 Exporter::export_ok_tags(keys %EXPORT_TAGS);
43 $EXPORT_TAGS{all} = [@EXPORT_OK];
49 use Encode qw(encode_utf8 is_utf8 decode decode_utf8);
51 use Storable qw(dclone);
56 These functions are exported with the :utf8 tag
58 =head2 encode_utf8_structure
60 %newdata = encode_utf8_structure(%newdata);
62 Takes a complex data structure and encodes any strings with is_utf8
63 set into their constituent octets.
68 sub encode_utf8_structure {
72 if (ref($_) eq 'HASH') {
73 push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})};
75 elsif (ref($_) eq 'ARRAY') {
76 push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})];
79 # we don't know how to handle non hash or non arrays
83 push @ret,encode_utf8_safely($_);
90 =head2 encode_utf8_safely
92 $octets = encode_utf8_safely($string);
94 Given a $string, returns the octet equivalent of $string if $string is
95 in perl's internal encoding; otherwise returns $string.
97 Silently returns REFs without encoding them. [If you want to deeply
98 encode REFs, see encode_utf8_structure.]
103 sub encode_utf8_safely{
106 if (not ref($r) and is_utf8($r)) {
107 $r = encode_utf8($r);
111 return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
114 =head2 decode_utf8_safely
116 $string = decode_utf8_safely($octets);
118 Given $octets in UTF8, returns the perl-internal equivalent of $octets
119 if $octets does not have is_utf8 set; otherwise returns $octets.
121 Silently returns REFs without encoding them.
126 sub decode_utf8_safely{
129 if (not ref($r) and not is_utf8($r)) {
130 $r = decode_utf8($r);
134 return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
140 =head2 convert_to_utf8
142 $utf8 = convert_to_utf8("text","charset");
146 sub convert_to_utf8 {
147 my ($data,$charset,$internal_call) = @_;
148 $internal_call //= 0;
149 if (is_utf8($data)) {
150 cluck("utf8 flag is set when calling convert_to_utf8");
153 $charset = uc($charset//'UTF-8');
154 if ($charset eq 'RAW') {
155 croak("Charset must not be raw when calling convert_to_utf8");
159 $iconv_converter = Text::Iconv->new($charset,"UTF-8") or
160 die "Unable to create converter for '$charset'";
163 return undef if $internal_call;
165 # We weren't able to create the converter, so use Encode
167 return __fallback_convert_to_utf8($data,$charset);
169 my $converted_data = $iconv_converter->convert($data);
170 # if the conversion failed, retval will be undefined or perhaps
172 my $retval = $iconv_converter->retval();
173 if (not defined $retval or
176 # try iso8559-1 first
177 if (not $internal_call) {
178 my $call_back_data = convert_to_utf8($data,'ISO8859-1',1);
179 # if there's an à (0xC3), it's probably something
180 # horrible, and we shouldn't try to convert it.
181 if (defined $call_back_data and $call_back_data !~ /\x{C3}/) {
182 return $call_back_data;
185 # Fallback to encode, which will probably also fail.
186 return __fallback_convert_to_utf8($data,$charset);
188 return decode("UTF-8",$converted_data);
191 # this returns data in perl's internal encoding
192 sub __fallback_convert_to_utf8 {
193 my ($data, $charset) = @_;
194 # raw data just gets returned (that's the charset WordDecorder
195 # uses when it doesn't know what to do)
196 return $data if $charset eq 'raw';
197 if (not defined $charset and not is_utf8($data)) {
198 warn ("Undefined charset, and string '$data' is not in perl's internal encoding");
201 # lets assume everything that doesn't have a charset is utf8
205 $result = decode($charset,$data,0);
208 warn "Unable to decode charset; '$charset' and '$data': $@";