From: Don Armstrong Date: Fri, 1 Mar 2013 05:56:28 +0000 (-0800) Subject: move UTF8 routines out of Debbugs::Common and into Debbugs::UTF8 so Debbugs::Config... X-Git-Tag: release/2.6.0~313^2~11 X-Git-Url: https://git.donarmstrong.com/?p=debbugs.git;a=commitdiff_plain;h=9dd18b2a38fae81a660504da29147937596ba206 move UTF8 routines out of Debbugs::Common and into Debbugs::UTF8 so Debbugs::Config isn't loaded early --- diff --git a/Debbugs/Common.pm b/Debbugs/Common.pm index eb068ed..279893c 100644 --- a/Debbugs/Common.pm +++ b/Debbugs/Common.pm @@ -50,8 +50,6 @@ BEGIN{ qw(cleanup_eval_fail), qw(hash_slice), ], - utf8 => [qw(encode_utf8_structure encode_utf8_safely), - qw(convert_to_utf8)], date => [qw(secs_to_english)], quit => [qw(quit)], lock => [qw(filelock unfilelock lockpid)], @@ -72,8 +70,6 @@ use IO::Scalar; use Debbugs::MIME qw(decode_rfc1522); use Mail::Address; use Cwd qw(cwd); -use Encode qw(encode_utf8 is_utf8 decode); -use Text::Iconv; use Storable qw(dclone); use Params::Validate qw(validate_with :types); @@ -902,147 +898,6 @@ sub hash_slice(\%@) { } -=head1 UTF-8 - -These functions are exported with the :utf8 tag - -=head2 encode_utf8_structure - - %newdata = encode_utf8_structure(%newdata); - -Takes a complex data structure and encodes any strings with is_utf8 -set into their constituent octets. - -=cut - -our $depth = 0; -sub encode_utf8_structure { - ++$depth; - my @ret; - for my $_ (@_) { - if (ref($_) eq 'HASH') { - push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})}; - } - elsif (ref($_) eq 'ARRAY') { - push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})]; - } - elsif (ref($_)) { - # we don't know how to handle non hash or non arrays - push @ret,$_; - } - else { - push @ret,encode_utf8_safely($_); - } - } - --$depth; - return @ret; -} - -=head2 encode_utf8_safely - - $octets = encode_utf8_safely($string); - -Given a $string, returns the octet equivalent of $string if $string is -in perl's internal encoding; otherwise returns $string. - -Silently returns REFs without encoding them. [If you want to deeply -encode REFs, see encode_utf8_structure.] - -=cut - - -sub encode_utf8_safely{ - my @ret; - for my $r (@_) { - if (not ref($r) and is_utf8($r)) { - $r = encode_utf8($r); - } - push @ret,$r; - } - return wantarray ? @ret : (length @_ > 1 ? @ret : $_[0]); -} - -=head2 convert_to_utf8 - - $utf8 = convert_to_utf8("text","charset"); - -=cut - -our %iconv_converters; - -sub convert_to_utf8 { - my ($data,$charset) = @_; - if (is_utf8($data)) { - return encode_utf8($data); - } - $charset = uc($charset); - if (not defined $iconv_converters{$charset}) { - eval { - $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or - die "Unable to create converter for '$charset'"; - }; - if ($@) { - warn $@; - # We weren't able to create the converter, so use Encode - # instead - return __fallback_convert_to_utf8($data,$charset); - } - # It shouldn't be necessary when converting to UTF8, but lets - # allow for transliteration and silent discarding of broken - # sequences - eval { - $iconv_converters{$charset}->set_attr("transliterate"); - $iconv_converters{$charset}->set_attr("discard_ilseq"); - }; - # This shouldn't fail on Debian systems; we're warning here - # just in case we've made a mistake above. This warning should - # probably be disabled on non-GNU libc systems. - warn $@ if $@; - } - if (not defined $iconv_converters{$charset}) { - warn "The converter for $charset wasn't created properly somehow!"; - return __fallback_convert_to_utf8($data,$charset); - } - my $converted_data = $iconv_converters{$charset}->convert($data); - # if the conversion failed, retval will be undefined or perhaps - # -1. - if (not defined $iconv_converters{$charset}->retval() or - $iconv_converters{$charset}->retval() < 0 - ) { - # Fallback to encode, which will probably also fail. - return __fallback_convert_to_utf8($data,$charset); - } - return $converted_data; -} - -# Bug #61342 et al. -# we're switching this to return UTF8 octets instead of perl's internal -# encoding -sub __Fallback_convert_to_utf8 { - my ($data, $charset) = @_; - # raw data just gets returned (that's the charset WordDecorder - # uses when it doesn't know what to do) - return $data if $charset eq 'raw'; - if (not defined $charset and not is_utf8($data)) { - warn ("Undefined charset, and string '$data' is not in perl's internal encoding"); - return $data; - } - # lets assume everything that doesn't have a charset is utf8 - $charset //= 'utf8'; - my $result; - eval { - $result = decode($charset,$data) unless is_utf8($data); - $result = encode_utf8($result); - }; - if ($@) { - warn "Unable to decode charset; '$charset' and '$data': $@"; - return $data; - } - return $result; -} - - - 1; __END__ diff --git a/Debbugs/Control.pm b/Debbugs/Control.pm index 3119ef8..44463ba 100644 --- a/Debbugs/Control.pm +++ b/Debbugs/Control.pm @@ -110,7 +110,8 @@ BEGIN{ } use Debbugs::Config qw(:config); -use Debbugs::Common qw(:lock buglog :misc get_hashname sort_versions :utf8); +use Debbugs::Common qw(:lock buglog :misc get_hashname sort_versions); +use Debbugs::UTF8; use Debbugs::Status qw(bug_archiveable :read :hook writebug new_bug splitpackages split_status_fields get_bug_status); use Debbugs::CGI qw(html_escape); use Debbugs::Log qw(:misc :write); diff --git a/Debbugs/MIME.pm b/Debbugs/MIME.pm index 8c7b0cd..482a30b 100644 --- a/Debbugs/MIME.pm +++ b/Debbugs/MIME.pm @@ -55,7 +55,7 @@ use POSIX qw(strftime); use List::MoreUtils qw(apply); # for convert_to_utf8 -use Debbugs::Common qw(convert_to_utf8); +use Debbugs::UTF8 qw(convert_to_utf8); # for decode_rfc1522 use MIME::WordDecoder qw(); diff --git a/Debbugs/Mail.pm b/Debbugs/Mail.pm index d26c860..9fa282b 100644 --- a/Debbugs/Mail.pm +++ b/Debbugs/Mail.pm @@ -49,6 +49,7 @@ use Debbugs::MIME qw(encode_rfc1522); use Debbugs::Config qw(:config); use Params::Validate qw(:types validate_with); use Encode qw(encode is_utf8); +use Debbugs::UTF8 qw(encode_utf8_safely); use Debbugs::Packages; @@ -398,7 +399,7 @@ sub encode_headers{ my ($header,$body) = split /\n\n/, $message, 2; $header = encode_rfc1522($header); - return $header . qq(\n\n). $body; + return $header . qq(\n\n). encode_utf8_safely($body); } =head2 rfc822_date diff --git a/Debbugs/Status.pm b/Debbugs/Status.pm index a2aeabe..7d72c03 100644 --- a/Debbugs/Status.pm +++ b/Debbugs/Status.pm @@ -37,7 +37,8 @@ use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT); use base qw(Exporter); use Params::Validate qw(validate_with :types); -use Debbugs::Common qw(:util :lock :quit :misc :utf8); +use Debbugs::Common qw(:util :lock :quit :misc); +use Debbugs::UTF8; use Debbugs::Config qw(:config); use Debbugs::MIME qw(decode_rfc1522 encode_rfc1522); use Debbugs::Packages qw(makesourceversions make_source_versions getversions get_versions binary_to_source); diff --git a/Debbugs/UTF8.pm b/Debbugs/UTF8.pm new file mode 100644 index 0000000..e967340 --- /dev/null +++ b/Debbugs/UTF8.pm @@ -0,0 +1,191 @@ +# This module is part of debbugs, and is released +# under the terms of the GPL version 2, or any later +# version at your option. +# See the file README and COPYING for more information. +# +# Copyright 2013 by Don Armstrong . + +package Debbugs::UTF8; + +=head1 NAME + +Debbugs::UTF8 -- Routines for handling conversion of charsets to UTF8 + +=head1 SYNOPSIS + +use Debbugs::UTF8; + + +=head1 DESCRIPTION + +This module contains routines which convert from various different +charsets to UTF8. + +=head1 FUNCTIONS + +=cut + +use warnings; +use strict; +use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT); +use base qw(Exporter); + +BEGIN{ + $VERSION = 1.00; + $DEBUG = 0 unless defined $DEBUG; + + %EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely), + qw(convert_to_utf8)], + ); + @EXPORT = (@{$EXPORT_TAGS{utf8}}); + @EXPORT_OK = (); + Exporter::export_ok_tags(keys %EXPORT_TAGS); + $EXPORT_TAGS{all} = [@EXPORT_OK]; +} + +use Carp; +$Carp::Verbose = 1; + +use Encode qw(encode_utf8 is_utf8 decode); +use Text::Iconv; +use Storable qw(dclone); + + +=head1 UTF-8 + +These functions are exported with the :utf8 tag + +=head2 encode_utf8_structure + + %newdata = encode_utf8_structure(%newdata); + +Takes a complex data structure and encodes any strings with is_utf8 +set into their constituent octets. + +=cut + +our $depth = 0; +sub encode_utf8_structure { + ++$depth; + my @ret; + for my $_ (@_) { + if (ref($_) eq 'HASH') { + push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})}; + } + elsif (ref($_) eq 'ARRAY') { + push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})]; + } + elsif (ref($_)) { + # we don't know how to handle non hash or non arrays + push @ret,$_; + } + else { + push @ret,encode_utf8_safely($_); + } + } + --$depth; + return @ret; +} + +=head2 encode_utf8_safely + + $octets = encode_utf8_safely($string); + +Given a $string, returns the octet equivalent of $string if $string is +in perl's internal encoding; otherwise returns $string. + +Silently returns REFs without encoding them. [If you want to deeply +encode REFs, see encode_utf8_structure.] + +=cut + + +sub encode_utf8_safely{ + my @ret; + for my $r (@_) { + if (not ref($r) and is_utf8($r)) { + $r = encode_utf8($r); + } + push @ret,$r; + } + return wantarray ? @ret : (length @_ > 1 ? @ret : $_[0]); +} + +=head2 convert_to_utf8 + + $utf8 = convert_to_utf8("text","charset"); + +=cut + +our %iconv_converters; + +sub convert_to_utf8 { + my ($data,$charset) = @_; + if (is_utf8($data)) { + return encode_utf8($data); + } + $charset = uc($charset); + if ($charset eq 'RAW') { + return $data; + } + if (not defined $iconv_converters{$charset}) { + eval { + $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or + die "Unable to create converter for '$charset'"; + }; + if ($@) { + warn $@; + # We weren't able to create the converter, so use Encode + # instead + return __fallback_convert_to_utf8($data,$charset); + } + } + if (not defined $iconv_converters{$charset}) { + warn "The converter for $charset wasn't created properly somehow!"; + return __fallback_convert_to_utf8($data,$charset); + } + my $converted_data = $iconv_converters{$charset}->convert($data); + $converted_data = "$converted_data"; + # if the conversion failed, retval will be undefined or perhaps + # -1. + my $retval = $iconv_converters{$charset}->retval(); + if (not defined $retval or + $retval < 0 + ) { + # Fallback to encode, which will probably also fail. + return __fallback_convert_to_utf8($data,$charset); + } + return $converted_data; +} + +# Bug #61342 et al. +# we're switching this to return UTF8 octets instead of perl's internal +# encoding +sub __fallback_convert_to_utf8 { + my ($data, $charset) = @_; + # raw data just gets returned (that's the charset WordDecorder + # uses when it doesn't know what to do) + return $data if $charset eq 'raw'; + if (not defined $charset and not is_utf8($data)) { + warn ("Undefined charset, and string '$data' is not in perl's internal encoding"); + return $data; + } + # lets assume everything that doesn't have a charset is utf8 + $charset //= 'utf8'; + my $result; + eval { + $result = decode($charset,$data) unless is_utf8($data); + $result = encode_utf8($result); + }; + if ($@) { + warn "Unable to decode charset; '$charset' and '$data': $@"; + return $data; + } + return $result; +} + + + +1; + +__END__