From: Don Armstrong Date: Thu, 14 Mar 2013 19:59:41 +0000 (-0700) Subject: Merge branch 'don/fix_encoding' X-Git-Tag: release/2.6.0~313 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=04265c83beff7e1ccb28a88ff7257fe36b2fc45d;hp=0b63438d6beca43a78a941605db8fd446a3af7c2;p=debbugs.git Merge branch 'don/fix_encoding' --- diff --git a/Debbugs/CGI/Bugreport.pm b/Debbugs/CGI/Bugreport.pm index 154ee04..707c3a0 100644 --- a/Debbugs/CGI/Bugreport.pm +++ b/Debbugs/CGI/Bugreport.pm @@ -32,9 +32,10 @@ use base qw(Exporter); use IO::Scalar; use Params::Validate qw(validate_with :types); -use Debbugs::MIME qw(convert_to_utf8 decode_rfc1522 create_mime_message); +use Debbugs::MIME qw(decode_rfc1522 create_mime_message); use Debbugs::CGI qw(:url :html :util); use Debbugs::Common qw(globify_scalar english_join); +use Debbugs::UTF8; use Debbugs::Config qw(:config); use POSIX qw(strftime); use Encode qw(decode_utf8); @@ -334,7 +335,7 @@ sub handle_record{ local $_ = $record->{type}; if (/html/) { # $record->{text} is not in perl's internal encoding; convert it - my $text = decode_utf8(decode_rfc1522($record->{text})); + my $text = decode_rfc1522(decode_utf8($record->{text})); my ($time) = $text =~ //; my $class = $text =~ /^(?:Acknowledgement|Reply|Information|Report|Notification)/m ? 'infmessage':'msgreceived'; $output .= $text; diff --git a/Debbugs/Common.pm b/Debbugs/Common.pm index f3c8218..1331ac0 100644 --- a/Debbugs/Common.pm +++ b/Debbugs/Common.pm @@ -50,7 +50,6 @@ BEGIN{ qw(cleanup_eval_fail), qw(hash_slice), ], - utf8 => [qw(encode_utf8_structure)], date => [qw(secs_to_english)], quit => [qw(quit)], lock => [qw(filelock unfilelock lockpid)], @@ -71,7 +70,6 @@ use IO::Scalar; use Debbugs::MIME qw(decode_rfc1522); use Mail::Address; use Cwd qw(cwd); -use Encode qw(encode_utf8 is_utf8); use Storable qw(dclone); use Params::Validate qw(validate_with :types); @@ -827,7 +825,7 @@ sub globify_scalar { if (defined ref($scalar)) { if (ref($scalar) eq 'SCALAR' and not UNIVERSAL::isa($scalar,'GLOB')) { - open $handle, '>:scalar:utf8', $scalar; + open $handle, '>:scalar:encoding(UTF-8)', $scalar; return $handle; } else { @@ -841,7 +839,7 @@ sub globify_scalar { carp "Given a non-scalar reference, non-glob to globify_scalar; returning /dev/null handle"; } } - return IO::File->new('/dev/null','>:utf8'); + return IO::File->new('/dev/null','>:encoding(UTF-8)'); } =head2 cleanup_eval_fail() @@ -900,55 +898,6 @@ sub hash_slice(\%@) { } -=head1 UTF-8 - -These functions are exported with the :utf8 tag - -=head2 encode_utf8_structure - - %newdata = encode_utf8_structure(%newdata); - -Takes a complex data structure and encodes any strings with is_utf8 -set into their constituent octets. - -=cut - -our $depth = 0; -sub encode_utf8_structure { - ++$depth; - my @ret; - for my $_ (@_) { - if (ref($_) eq 'HASH') { - push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})}; - } - elsif (ref($_) eq 'ARRAY') { - push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})]; - } - elsif (ref($_)) { - # we don't know how to handle non hash or non arrays - push @ret,$_; - } - else { - push @ret,__encode_utf8($_); - } - } - --$depth; - return @ret; -} - -sub __encode_utf8 { - my @ret; - for my $r (@_) { - if (not ref($r) and is_utf8($r)) { - $r = encode_utf8($r); - } - push @ret,$r; - } - return @ret; -} - - - 1; __END__ diff --git a/Debbugs/Control.pm b/Debbugs/Control.pm index 5739734..9587624 100644 --- a/Debbugs/Control.pm +++ b/Debbugs/Control.pm @@ -110,7 +110,8 @@ BEGIN{ } use Debbugs::Config qw(:config); -use Debbugs::Common qw(:lock buglog :misc get_hashname sort_versions :utf8); +use Debbugs::Common qw(:lock buglog :misc get_hashname sort_versions); +use Debbugs::UTF8; use Debbugs::Status qw(bug_archiveable :read :hook writebug new_bug splitpackages split_status_fields get_bug_status); use Debbugs::CGI qw(html_escape); use Debbugs::Log qw(:misc :write); @@ -3437,25 +3438,25 @@ sub append_action_to_log{ } my $msg = join('', (exists $param{command} ? - "\n":"" + "\n":"" ), (length $param{requester} ? - "\n":"" + "\n":"" ), (length $param{request_addr} ? - "\n":"" + "\n":"" ), "\n", $data_diff, - "".html_escape(encode_utf8($param{action}))."\n"); + "".html_escape(encode_utf8_safely($param{action}))."\n"); if (length $param{requester}) { - $msg .= "Request was from ".html_escape(encode_utf8($param{requester}))."\n"; + $msg .= "Request was from ".html_escape(encode_utf8_safely($param{requester}))."\n"; } if (length $param{request_addr}) { - $msg .= "to ".html_escape(encode_utf8($param{request_addr})).""; + $msg .= "to ".html_escape(encode_utf8_safely($param{request_addr})).""; } if (length $param{desc}) { - $msg .= ":
\n".encode_utf8($param{desc})."\n"; + $msg .= ":
\n".encode_utf8_safely($param{desc})."\n"; } else { $msg .= ".\n"; diff --git a/Debbugs/Log.pm b/Debbugs/Log.pm index 2ae7af7..8b99b7d 100644 --- a/Debbugs/Log.pm +++ b/Debbugs/Log.pm @@ -39,7 +39,7 @@ use Carp; use Debbugs::Common qw(getbuglocation getbugcomponent make_list); use Params::Validate qw(:types validate_with); -use Encode qw(encode is_utf8); +use Encode qw(encode encode_utf8 is_utf8); =head1 NAME @@ -390,8 +390,12 @@ sub write_log_records croak "record type '$type' with no text field" unless defined $record->{text}; # I am not sure if we really want to croak here; but this is # almost certainly a bug if is_utf8 is on. - # croak "probably wrong encoding" if is_utf8($record->{text}); - my ($text) = escape_log($record->{text}); + my $text = $record->{text}; + if (is_utf8($text)) { + carp('Record text was in the wrong encoding (perl internal instead of utf8 octets)'); + $text = encode_utf8($text) + } + ($text) = escape_log($text); if ($type eq 'autocheck') { print {$logfh} "\01\n$text\03\n" or die "Unable to write to logfile: $!"; diff --git a/Debbugs/MIME.pm b/Debbugs/MIME.pm index 481be7b..afc0776 100644 --- a/Debbugs/MIME.pm +++ b/Debbugs/MIME.pm @@ -41,7 +41,6 @@ BEGIN { %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)], rfc1522 => [qw(decode_rfc1522 encode_rfc1522)], - utf8 => [qw(convert_to_utf8)], ); @EXPORT_OK=(); Exporter::export_ok_tags(keys %EXPORT_TAGS); @@ -55,11 +54,11 @@ use MIME::Parser; use POSIX qw(strftime); use List::MoreUtils qw(apply); -# for decode_rfc1522 -use MIME::WordDecoder qw(); -use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8); +# for convert_to_utf8 +use Debbugs::UTF8 qw(convert_to_utf8); -# for encode_rfc1522 +# for decode_rfc1522 and encode_rfc1522 +use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8); use MIME::Words qw(); sub getmailbody @@ -69,7 +68,7 @@ sub getmailbody if ($type eq 'text/plain' or ($type =~ m#text/?# and $type ne 'text/html') or $type eq 'application/pgp') { - return $entity->bodyhandle; + return $entity; } elsif ($type eq 'multipart/alternative') { # RFC 2046 says we should use the last part we recognize. for my $part (reverse $entity->parts) { @@ -101,14 +100,24 @@ sub parse @headerlines = @{$entity->head->header}; chomp @headerlines; - my $entity_body = getmailbody($entity); - @bodylines = $entity_body ? $entity_body->as_lines() : (); + my $entity_body = getmailbody($entity); + my $entity_body_handle; + my $charset; + if (defined $entity_body) { + $entity_body_handle = $entity_body->bodyhandle(); + $charset = $entity_body->head()->mime_attr('content-type.charset'); + } + @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : (); + @bodylines = map {convert_to_utf8($_,$charset)} @bodylines; chomp @bodylines; } else { # Legacy pre-MIME code, kept around in case MIME::Parser fails. my @msg = split /\n/, $_[0]; my $i; + # assume us-ascii unless charset is set; probably bad, but we + # really shouldn't get to this point anyway + my $charset = 'us-ascii'; for ($i = 0; $i <= $#msg; ++$i) { $_ = $msg[$i]; last unless length; @@ -116,10 +125,12 @@ sub parse ++$i; $_ .= "\n" . $msg[$i]; } + if (/charset=\"([^\"]+)\"/) { + $charset = $1; + } push @headerlines, $_; } - - @bodylines = @msg[$i .. $#msg]; + @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg]; } rmtree $tempdir, 0, 1; @@ -193,8 +204,8 @@ sub create_mime_message{ # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it. my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8', 'Encoding' => 'quoted-printable', - (map{encode_rfc1522($_)} @{$headers}), - Data => $body + (map{encode_rfc1522(encode_utf8($_))} @{$headers}), + Data => encode_utf8($body), ); # Attach the attachments @@ -229,23 +240,6 @@ sub create_mime_message{ } -# Bug #61342 et al. - -sub convert_to_utf8 { - my ($data, $charset) = @_; - # raw data just gets returned (that's the charset WordDecorder - # uses when it doesn't know what to do) - return $data if $charset eq 'raw' or is_utf8($data,1); - my $result; - eval { - $result = decode($charset,$data); - }; - if ($@) { - warn "Unable to decode charset; '$charset' and '$data': $@"; - return $data; - } - return $result; -} =head2 decode_rfc1522 @@ -256,24 +250,27 @@ Turn RFC-1522 names into the UTF-8 equivalent. =cut -BEGIN { - # Set up the default RFC1522 decoder, which turns all charsets that - # are supported into the appropriate UTF-8 charset. - MIME::WordDecoder->default(new MIME::WordDecoder( - ['*' => \&convert_to_utf8, - ])); -} - sub decode_rfc1522 { my ($string) = @_; # this is craptacular, but leading space is hacked off by unmime. # Save it. my $leading_space = ''; - $leading_space = $1 if $string =~ s/^(\s+)//; - # unmime calls the default MIME::WordDecoder handler set up at - # initialization time. - return $leading_space . MIME::WordDecoder::unmime($string); + $leading_space = $1 if $string =~ s/^(\ +)//; + # we must do this to switch off the utf8 flag before calling decode_mimewords + $string = encode_utf8($string); + my @mime_words = MIME::Words::decode_mimewords($string); + my $tmp = $leading_space . + join('', + (map { + if (@{$_} > 1) { + convert_to_utf8(${$_}[0],${$_}[1]); + } else { + decode_utf8(${$_}[0]); + } + } @mime_words) + ); + return $tmp; } =head2 encode_rfc1522 @@ -293,9 +290,10 @@ sub encode_rfc1522 { # handle being passed undef properly return undef if not defined $rawstr; - if (is_utf8($rawstr)) { - $rawstr= encode_utf8($rawstr); - } + + # convert to octets if we are given a string in perl's internal + # encoding + $rawstr= encode_utf8($rawstr) if is_utf8($rawstr); # We process words in reverse so we can preserve spacing between # encoded words. This regex splits on word|nonword boundaries and # nonword|nonword boundaries. We also consider parenthesis and " diff --git a/Debbugs/Mail.pm b/Debbugs/Mail.pm index d26c860..9fa282b 100644 --- a/Debbugs/Mail.pm +++ b/Debbugs/Mail.pm @@ -49,6 +49,7 @@ use Debbugs::MIME qw(encode_rfc1522); use Debbugs::Config qw(:config); use Params::Validate qw(:types validate_with); use Encode qw(encode is_utf8); +use Debbugs::UTF8 qw(encode_utf8_safely); use Debbugs::Packages; @@ -398,7 +399,7 @@ sub encode_headers{ my ($header,$body) = split /\n\n/, $message, 2; $header = encode_rfc1522($header); - return $header . qq(\n\n). $body; + return $header . qq(\n\n). encode_utf8_safely($body); } =head2 rfc822_date diff --git a/Debbugs/SOAP.pm b/Debbugs/SOAP.pm index 5cb08bb..c1fc85f 100644 --- a/Debbugs/SOAP.pm +++ b/Debbugs/SOAP.pm @@ -41,7 +41,8 @@ BEGIN{ use IO::File; use Debbugs::Status qw(get_bug_status); -use Debbugs::Common qw(make_list getbuglocation getbugcomponent :utf8); +use Debbugs::Common qw(make_list getbuglocation getbugcomponent); +use Debbugs::UTF8; use Debbugs::Packages; use Storable qw(nstore retrieve dclone); diff --git a/Debbugs/Status.pm b/Debbugs/Status.pm index a2aeabe..7d72c03 100644 --- a/Debbugs/Status.pm +++ b/Debbugs/Status.pm @@ -37,7 +37,8 @@ use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT); use base qw(Exporter); use Params::Validate qw(validate_with :types); -use Debbugs::Common qw(:util :lock :quit :misc :utf8); +use Debbugs::Common qw(:util :lock :quit :misc); +use Debbugs::UTF8; use Debbugs::Config qw(:config); use Debbugs::MIME qw(decode_rfc1522 encode_rfc1522); use Debbugs::Packages qw(makesourceversions make_source_versions getversions get_versions binary_to_source); diff --git a/Debbugs/UTF8.pm b/Debbugs/UTF8.pm new file mode 100644 index 0000000..74a4042 --- /dev/null +++ b/Debbugs/UTF8.pm @@ -0,0 +1,215 @@ +# This module is part of debbugs, and is released +# under the terms of the GPL version 2, or any later +# version at your option. +# See the file README and COPYING for more information. +# +# Copyright 2013 by Don Armstrong . + +package Debbugs::UTF8; + +=head1 NAME + +Debbugs::UTF8 -- Routines for handling conversion of charsets to UTF8 + +=head1 SYNOPSIS + +use Debbugs::UTF8; + + +=head1 DESCRIPTION + +This module contains routines which convert from various different +charsets to UTF8. + +=head1 FUNCTIONS + +=cut + +use warnings; +use strict; +use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT); +use base qw(Exporter); + +BEGIN{ + $VERSION = 1.00; + $DEBUG = 0 unless defined $DEBUG; + + %EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely), + qw(convert_to_utf8 decode_utf8_safely)], + ); + @EXPORT = (@{$EXPORT_TAGS{utf8}}); + @EXPORT_OK = (); + Exporter::export_ok_tags(keys %EXPORT_TAGS); + $EXPORT_TAGS{all} = [@EXPORT_OK]; +} + +use Carp; +$Carp::Verbose = 1; + +use Encode qw(encode_utf8 is_utf8 decode decode_utf8); +use Text::Iconv; +use Storable qw(dclone); + + +=head1 UTF-8 + +These functions are exported with the :utf8 tag + +=head2 encode_utf8_structure + + %newdata = encode_utf8_structure(%newdata); + +Takes a complex data structure and encodes any strings with is_utf8 +set into their constituent octets. + +=cut + +our $depth = 0; +sub encode_utf8_structure { + ++$depth; + my @ret; + for my $_ (@_) { + if (ref($_) eq 'HASH') { + push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})}; + } + elsif (ref($_) eq 'ARRAY') { + push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})]; + } + elsif (ref($_)) { + # we don't know how to handle non hash or non arrays + push @ret,$_; + } + else { + push @ret,encode_utf8_safely($_); + } + } + --$depth; + return @ret; +} + +=head2 encode_utf8_safely + + $octets = encode_utf8_safely($string); + +Given a $string, returns the octet equivalent of $string if $string is +in perl's internal encoding; otherwise returns $string. + +Silently returns REFs without encoding them. [If you want to deeply +encode REFs, see encode_utf8_structure.] + +=cut + + +sub encode_utf8_safely{ + my @ret; + for my $r (@_) { + if (not ref($r) and is_utf8($r)) { + $r = encode_utf8($r); + } + push @ret,$r; + } + return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]); +} + +=head2 decode_utf8_safely + + $string = decode_utf8_safely($octets); + +Given $octets in UTF8, returns the perl-internal equivalent of $octets +if $octets does not have is_utf8 set; otherwise returns $octets. + +Silently returns REFs without encoding them. + +=cut + + +sub decode_utf8_safely{ + my @ret; + for my $r (@_) { + if (not ref($r) and not is_utf8($r)) { + $r = decode_utf8($r); + } + push @ret, $r; + } + return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]); +} + + + + +=head2 convert_to_utf8 + + $utf8 = convert_to_utf8("text","charset"); + +=cut + +our %iconv_converters; + +sub convert_to_utf8 { + my ($data,$charset) = @_; + if (is_utf8($data)) { + cluck("utf8 flag is set when calling convert_to_utf8"); + return $data; + } + $charset = uc($charset); + if ($charset eq 'RAW') { + croak("Charset must not be raw when calling convert_to_utf8"); + } + if (not defined $iconv_converters{$charset}) { + eval { + $iconv_converters{$charset} = Text::Iconv->new($charset,"UTF-8") or + die "Unable to create converter for '$charset'"; + }; + if ($@) { + warn $@; + # We weren't able to create the converter, so use Encode + # instead + return __fallback_convert_to_utf8($data,$charset); + } + } + if (not defined $iconv_converters{$charset}) { + warn "The converter for $charset wasn't created properly somehow!"; + return __fallback_convert_to_utf8($data,$charset); + } + my $converted_data = $iconv_converters{$charset}->convert($data); + # if the conversion failed, retval will be undefined or perhaps + # -1. + my $retval = $iconv_converters{$charset}->retval(); + if (not defined $retval or + $retval < 0 + ) { + warn "failed to convert to utf8"; + # Fallback to encode, which will probably also fail. + return __fallback_convert_to_utf8($data,$charset); + } + return decode("UTF-8",$converted_data); +} + +# this returns data in perl's internal encoding +sub __fallback_convert_to_utf8 { + my ($data, $charset) = @_; + # raw data just gets returned (that's the charset WordDecorder + # uses when it doesn't know what to do) + return $data if $charset eq 'raw'; + if (not defined $charset and not is_utf8($data)) { + warn ("Undefined charset, and string '$data' is not in perl's internal encoding"); + return $data; + } + # lets assume everything that doesn't have a charset is utf8 + $charset //= 'utf8'; + my $result; + eval { + $result = decode($charset,$data); + }; + if ($@) { + warn "Unable to decode charset; '$charset' and '$data': $@"; + return $data; + } + return $result; +} + + + +1; + +__END__ diff --git a/cgi/bugreport.cgi b/cgi/bugreport.cgi index 55617ae..70f4229 100755 --- a/cgi/bugreport.cgi +++ b/cgi/bugreport.cgi @@ -8,8 +8,6 @@ BEGIN{ delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; } -# STDOUT should be using the utf8 io layer -binmode(STDOUT,':utf8'); use POSIX qw(strftime); use MIME::Parser; @@ -38,6 +36,8 @@ use List::Util qw(max); use CGI::Simple; my $q = new CGI::Simple; +# STDOUT should be using the utf8 io layer +binmode(STDOUT,':raw:encoding(UTF-8)'); my %param = cgi_parameters(query => $q, single => [qw(bug msg att boring terse), diff --git a/cgi/pkgreport.cgi b/cgi/pkgreport.cgi index d29c3ab..eb7a61a 100755 --- a/cgi/pkgreport.cgi +++ b/cgi/pkgreport.cgi @@ -18,9 +18,7 @@ BEGIN{ delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; } -# STDOUT should be in utf8 mode -binmode(STDOUT,':utf8'); - +binmode(STDOUT,':encoding(UTF-8)'); use POSIX qw(strftime nice); use Debbugs::Config qw(:globals :text :config); diff --git a/t/01_mime.t b/t/01_mime.t index d103eb1..dcd3b76 100644 --- a/t/01_mime.t +++ b/t/01_mime.t @@ -30,12 +30,12 @@ ok(Debbugs::MIME::decode_rfc1522(q(=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= 2; +use Test::More tests => 3; use Encode qw(decode_utf8); use_ok('Debbugs::Common'); -is_deeply(Debbugs::Common::encode_utf8_structure( +use_ok('Debbugs::UTF8'); +is_deeply(Debbugs::UTF8::encode_utf8_structure( {a => decode_utf8('föö'), b => [map {decode_utf8($_)} qw(blëh bl♥h)], }), diff --git a/t/13_utf8_mail.t b/t/13_utf8_mail.t index 8ada76a..2491e14 100644 --- a/t/13_utf8_mail.t +++ b/t/13_utf8_mail.t @@ -6,6 +6,8 @@ use Test::More tests => 12; use warnings; use strict; +use utf8; + # Here, we're going to shoot messages through a set of things that can # happen. @@ -127,7 +129,7 @@ ok(system('sh','-c','find '.$sendmail_dir.q( -type f | xargs grep -q "Subject: P # This is an eval because $ENV{DEBBUGS_CONFIG_FILE} isn't set at BEGIN{} time eval "use Debbugs::Status qw(read_bug writebug);"; my $status = read_bug(bug=>1); -ok($status->{subject} eq decode_utf8('ütff8 title encoding test'),'bug 1 retitled'); +ok($status->{subject} eq 'ütff8 title encoding test','bug 1 retitled'); ok($status->{severity} eq 'wishlist','bug 1 wishlisted'); ok(system('sh','-c','[ $(egrep "retitle.*encoding test" '.$spool_dir.'/db-h/01/1.log|grep -v "=C3=BCt=EF=AC=808"|wc -l) -eq 0 ]') == 0, 'Control messages escaped properly');