Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($VERSION @EXPORT_OK);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38
  39     @EXPORT_OK = qw(parse decode_rfc1522 encode_rfc1522 convert_to_utf8 create_mime_message getmailbody);
  40 }
  41
  42 use File::Path;
  43 use File::Temp qw();
  44 use MIME::Parser;
  45
  46 use POSIX qw(strftime);
  47 use List::MoreUtils qw(apply);
  48
  49 # for decode_rfc1522
  50 use MIME::WordDecoder qw();
  51 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  52
  53 # for encode_rfc1522
  54 use MIME::Words qw();
  55
  56 sub getmailbody
  57 {
  58     my $entity = shift;
  59     my $type = $entity->effective_type;
  60     if ($type eq 'text/plain' or
  61             ($type =~ m#text/?# and $type ne 'text/html') or
  62             $type eq 'application/pgp') {
  63         return $entity->bodyhandle;
  64     } elsif ($type eq 'multipart/alternative') {
  65         # RFC 2046 says we should use the last part we recognize.
  66         for my $part (reverse $entity->parts) {
  67             my $ret = getmailbody($part);
  68             return $ret if $ret;
  69         }
  70     } else {
  71         # For other multipart types, we just pretend they're
  72         # multipart/mixed and run through in order.
  73         for my $part ($entity->parts) {
  74             my $ret = getmailbody($part);
  75             return $ret if $ret;
  76         }
  77     }
  78     return undef;
  79 }
  80
  81 sub parse
  82 {
  83     # header and decoded body respectively
  84     my (@headerlines, @bodylines);
  85
  86     my $parser = MIME::Parser->new();
  87     my $tempdir = File::Temp::tempdir();
  88     $parser->output_under($tempdir);
  89     my $entity = eval { $parser->parse_data($_[0]) };
  90
  91     if ($entity and $entity->head->tags) {
  92         @headerlines = @{$entity->head->header};
  93         chomp @headerlines;
  94
  95         my $entity_body = getmailbody($entity);
  96         @bodylines = $entity_body ? $entity_body->as_lines() : ();
  97         chomp @bodylines;
  98     } else {
  99         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 100         my @msg = split /\n/, $_[0];
 101         my $i;
 102
 103         for ($i = 0; $i <= $#msg; ++$i) {
 104             $_ = $msg[$i];
 105             last unless length;
 106             while ($msg[$i + 1] =~ /^\s/) {
 107                 ++$i;
 108                 $_ .= "\n" . $msg[$i];
 109             }
 110             push @headerlines, $_;
 111         }
 112
 113         @bodylines = @msg[$i .. $#msg];
 114     }
 115
 116     rmtree $tempdir, 0, 1;
 117
 118     # Remove blank lines.
 119     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 120
 121     # Strip off RFC2440-style PGP clearsigning.
 122     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 123         shift @bodylines while @bodylines and
 124             length $bodylines[0] and
 125                 # we currently don't strip \r; handle this for the
 126                 # time being, though eventually it should be stripped
 127                 # too, I think. [See #565981]
 128                 $bodylines[0] ne "\r";
 129         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 130         for my $findsig (0 .. $#bodylines) {
 131             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 132                 $#bodylines = $findsig - 1;
 133                 last;
 134             }
 135         }
 136         map { s/^- // } @bodylines;
 137     }
 138
 139     return { header => [@headerlines], body => [@bodylines]};
 140 }
 141
 142 =head2 create_mime_message
 143
 144      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 145
 146 Creates a MIME encoded message with headers given by the first
 147 argument, and a message given by the second.
 148
 149 Optional attachments can be specified in the third arrayref argument.
 150
 151 Whether to include the date in the header is the final argument; it
 152 defaults to true, setting the Date header if one is not already
 153 present.
 154
 155 Headers are passed directly to MIME::Entity::build, the message is the
 156 first attachment.
 157
 158 Each of the elements of the attachment arrayref is attached as an
 159 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 160 hashref, the contents are passed as an argument to
 161 MIME::Entity::attach
 162
 163 =cut
 164
 165 sub create_mime_message{
 166      my ($headers,$body,$attachments,$include_date) = @_;
 167      $attachments = [] if not defined $attachments;
 168      $include_date = 1 if not defined $include_date;
 169
 170      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 171      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 172
 173      if ($include_date) {
 174          my %headers = apply {lc($_)} @{$headers};
 175          if (not exists $headers{date}) {
 176              push @{$headers},
 177                  ('Date',
 178                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 179                  );
 180          }
 181      }
 182
 183      # Build the message
 184      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 185      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 186                                    'Encoding'     => 'quoted-printable',
 187                                    (map{encode_rfc1522($_)} @{$headers}),
 188                                    Data    => $body
 189                                   );
 190
 191      # Attach the attachments
 192      for my $attachment (@{$attachments}) {
 193           if (ref($attachment) eq 'HASH') {
 194                $msg->attach(%{$attachment});
 195           }
 196           else {
 197                # This is *craptacular*, but because various MTAs
 198                # (sendmail and exim4, at least) appear to eat From
 199                # lines in message/rfc822 attachments, we need eat
 200                # the entire From line ourselves so the MTA doesn't
 201                # leave \n detrius around.
 202                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 203                     # make a copy so that we don't screw up anything
 204                     # that is expecting this arrayref to stay constant
 205                     $attachment = [@{$attachment}];
 206                     # remove the from line
 207                     splice @$attachment, 1, 1;
 208                }
 209                elsif (not ref($attachment)) {
 210                     # It's a scalar; remove the from line
 211                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 212                }
 213                $msg->attach(Type => 'message/rfc822',
 214                             Data => $attachment,
 215                             Encoding => '7bit',
 216                            );
 217           }
 218      }
 219      return $msg->as_string;
 220 }
 221
 222
 223 # Bug #61342 et al.
 224
 225 sub convert_to_utf8 {
 226      my ($data, $charset) = @_;
 227      # raw data just gets returned (that's the charset WordDecorder
 228      # uses when it doesn't know what to do)
 229      return $data if $charset eq 'raw' or is_utf8($data,1);
 230      my $result;
 231      eval {
 232          $result = decode($charset,$data);
 233      };
 234      if ($@) {
 235           warn "Unable to decode charset; '$charset' and '$data': $@";
 236           return $data;
 237      }
 238      return $result;
 239 }
 240
 241
 242 =head2 decode_rfc1522
 243
 244     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 245
 246 Turn RFC-1522 names into the UTF-8 equivalent.
 247
 248 =cut
 249
 250 BEGIN {
 251     # Set up the default RFC1522 decoder, which turns all charsets that
 252     # are supported into the appropriate UTF-8 charset.
 253     MIME::WordDecoder->default(new MIME::WordDecoder(
 254         ['*' => \&convert_to_utf8,
 255         ]));
 256 }
 257
 258 sub decode_rfc1522 {
 259     my ($string) = @_;
 260
 261     # this is craptacular, but leading space is hacked off by unmime.
 262     # Save it.
 263     my $leading_space = '';
 264     $leading_space = $1 if $string =~ s/^(\s+)//;
 265     # unmime calls the default MIME::WordDecoder handler set up at
 266     # initialization time.
 267     return $leading_space . MIME::WordDecoder::unmime($string);
 268 }
 269
 270 =head2 encode_rfc1522
 271
 272      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 273
 274 Encodes headers according to the RFC1522 standard by calling
 275 MIME::Words::encode_mimeword on distinct words as appropriate.
 276
 277 =cut
 278
 279 # We cannot use MIME::Words::encode_mimewords because that function
 280 # does not handle spaces properly at all.
 281
 282 sub encode_rfc1522 {
 283      my ($rawstr) = @_;
 284
 285      # handle being passed undef properly
 286      return undef if not defined $rawstr;
 287      if (is_utf8($rawstr)) {
 288          $rawstr= encode_utf8($rawstr);
 289      }
 290      # We process words in reverse so we can preserve spacing between
 291      # encoded words. This regex splits on word|nonword boundaries and
 292      # nonword|nonword boundaries. We also consider parenthesis and "
 293      # to be nonwords to avoid escaping them in comments in violation
 294      # of RFC1522
 295      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 296
 297      my $previous_word_encoded = 0;
 298      my $string = '';
 299      for my $word (@words) {
 300           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 301                $string = $word.$string;
 302                $previous_word_encoded=0;
 303           }
 304           elsif ($word =~ /^[\s\n]$/) {
 305                $string = $word.$string;
 306                $previous_word_encoded = 0 if $word eq "\n";
 307           }
 308           else {
 309                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 310                # RFC 1522 mandates that segments be at most 76 characters
 311                # long. If that's the case, we split the word up into 10
 312                # character pieces and encode it. We must use the Encode
 313                # magic here to avoid breaking on bit boundaries here.
 314                if (length $encoded > 75) {
 315                     # Turn utf8 into the internal perl representation
 316                     # so . is a character, not a byte.
 317                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 318                     my @encoded;
 319                     # Strip it into 10 character long segments, and encode
 320                     # the segments
 321                     # XXX It's possible that these segments are > 76 characters
 322                     while ($tempstr =~ s/(.{1,10})$//) {
 323                          # turn the character back into the utf8 representation.
 324                          my $tempword = encode_utf8($1);
 325                          # It may actually be better to eventually use
 326                          # the base64 encoding here, but I'm not sure
 327                          # if that's as widely supported as quoted
 328                          # printable.
 329                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 330                     }
 331                     $encoded = join(" ",@encoded);
 332                     # If the previous word was encoded, we must
 333                     # include a trailing _ that gets encoded as a
 334                     # space.
 335                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 336                     $string = $encoded.$string;
 337                }
 338                else {
 339                     # If the previous word was encoded, we must
 340                     # include a trailing _ that gets encoded as a
 341                     # space.
 342                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 343                     $string = $encoded.$string;
 344                }
 345                $previous_word_encoded = 1;
 346           }
 347      }
 348      return $string;
 349 }
 350
 351 1;