Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =encoding utf8
  14
  15 =head1 NAME
  16
  17 Debbugs::MIME -- Mime handling routines for debbugs
  18
  19 =head1 SYNOPSIS
  20
  21  use Debbugs::MIME qw(parse decode_rfc1522);
  22
  23 =head1 DESCRIPTION
  24
  25
  26 =head1 BUGS
  27
  28 None known.
  29
  30 =cut
  31
  32 use warnings;
  33 use strict;
  34
  35 use Exporter qw(import);
  36 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  37
  38 BEGIN {
  39     $VERSION = 1.00;
  40     $DEBUG = 0 unless defined $DEBUG;
  41
  42     @EXPORT = ();
  43
  44     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  45                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522 handle_escaped_commas)],
  46                    );
  47     @EXPORT_OK=();
  48     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  49     $EXPORT_TAGS{all} = [@EXPORT_OK];
  50 }
  51
  52 use File::Path qw(remove_tree);
  53 use File::Temp qw(tempdir);
  54 use MIME::Parser;
  55 use Mail::Message::Field;
  56
  57 use POSIX qw(strftime);
  58 use List::AllUtils qw(apply);
  59
  60 # for convert_to_utf8
  61 use Debbugs::UTF8 qw(convert_to_utf8);
  62
  63 # for decode_rfc1522 and encode_rfc1522
  64 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  65 use MIME::Words qw();
  66
  67 sub getmailbody
  68 {
  69     my $entity = shift;
  70     my $type = $entity->effective_type;
  71     if ($type eq 'text/plain' or
  72             ($type =~ m#text/?# and $type ne 'text/html') or
  73             $type eq 'application/pgp') {
  74         return $entity;
  75     } elsif ($type eq 'multipart/alternative') {
  76         # RFC 2046 says we should use the last part we recognize.
  77         for my $part (reverse $entity->parts) {
  78             my $ret = getmailbody($part);
  79             return $ret if $ret;
  80         }
  81     } else {
  82         # For other multipart types, we just pretend they're
  83         # multipart/mixed and run through in order.
  84         for my $part ($entity->parts) {
  85             my $ret = getmailbody($part);
  86             return $ret if $ret;
  87         }
  88     }
  89     return undef;
  90 }
  91
  92 sub parse
  93 {
  94     # header and decoded body respectively
  95     my (@headerlines, @bodylines);
  96
  97     my $parser = MIME::Parser->new();
  98     my $tempdir = tempdir(CLEANUP => 1);
  99     $parser->output_under($tempdir);
 100     my $entity = eval { $parser->parse_data($_[0]) };
 101
 102     if ($entity and $entity->head->tags) {
 103         @headerlines = @{$entity->head->header};
 104         chomp @headerlines;
 105
 106         my $entity_body = getmailbody($entity);
 107         my $entity_body_handle;
 108         my $charset;
 109         if (defined $entity_body) {
 110             $entity_body_handle = $entity_body->bodyhandle();
 111             $charset = $entity_body->head()->mime_attr('content-type.charset');
 112         }
 113         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 114         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 115         chomp @bodylines;
 116     } else {
 117         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 118         my @msg = split /\n/, $_[0];
 119         my $i;
 120
 121         # assume us-ascii unless charset is set; probably bad, but we
 122         # really shouldn't get to this point anyway
 123         my $charset = 'us-ascii';
 124         for ($i = 0; $i <= $#msg; ++$i) {
 125             $_ = $msg[$i];
 126             last unless length;
 127             while ($msg[$i + 1] =~ /^\s/) {
 128                 ++$i;
 129                 $_ .= "\n" . $msg[$i];
 130             }
 131             if (/charset=\"([^\"]+)\"/) {
 132                 $charset = $1;
 133             }
 134             push @headerlines, $_;
 135         }
 136         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 137     }
 138
 139     remove_tree($tempdir,{verbose => 0, safe => 1});
 140
 141     # Remove blank lines.
 142     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 143
 144     # Strip off RFC2440-style PGP clearsigning.
 145     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 146         shift @bodylines while @bodylines and
 147             length $bodylines[0] and
 148                 # we currently don't strip \r; handle this for the
 149                 # time being, though eventually it should be stripped
 150                 # too, I think. [See #565981]
 151                 $bodylines[0] ne "\r";
 152         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 153         for my $findsig (0 .. $#bodylines) {
 154             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 155                 $#bodylines = $findsig - 1;
 156                 last;
 157             }
 158         }
 159         map { s/^- // } @bodylines;
 160     }
 161
 162     return { header => [@headerlines], body => [@bodylines]};
 163 }
 164
 165 =head2 create_mime_message
 166
 167      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 168
 169 Creates a MIME encoded message with headers given by the first
 170 argument, and a message given by the second.
 171
 172 Optional attachments can be specified in the third arrayref argument.
 173
 174 Whether to include the date in the header is the final argument; it
 175 defaults to true, setting the Date header if one is not already
 176 present.
 177
 178 Headers are passed directly to MIME::Entity::build, the message is the
 179 first attachment.
 180
 181 Each of the elements of the attachment arrayref is attached as an
 182 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 183 hashref, the contents are passed as an argument to
 184 MIME::Entity::attach
 185
 186 =cut
 187
 188 sub create_mime_message{
 189      my ($headers,$body,$attachments,$include_date) = @_;
 190      $attachments = [] if not defined $attachments;
 191      $include_date = 1 if not defined $include_date;
 192
 193      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 194      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 195
 196      if ($include_date) {
 197          my %headers = apply {defined $_ ? lc($_) : ''} @{$headers};
 198          if (not exists $headers{date}) {
 199              push @{$headers},
 200                  ('Date',
 201                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 202                  );
 203          }
 204      }
 205
 206      # Build the message
 207      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 208      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 209                                    'Encoding'     => 'quoted-printable',
 210                                    (map{encode_rfc1522(encode_utf8(defined $_ ? $_:''))} @{$headers}),
 211                                    Data    => encode_utf8($body),
 212                                   );
 213
 214      # Attach the attachments
 215      for my $attachment (@{$attachments}) {
 216           if (ref($attachment) eq 'HASH') {
 217                $msg->attach(%{$attachment});
 218           }
 219           else {
 220                # This is *craptacular*, but because various MTAs
 221                # (sendmail and exim4, at least) appear to eat From
 222                # lines in message/rfc822 attachments, we need eat
 223                # the entire From line ourselves so the MTA doesn't
 224                # leave \n detrius around.
 225                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 226                     # make a copy so that we don't screw up anything
 227                     # that is expecting this arrayref to stay constant
 228                     $attachment = [@{$attachment}];
 229                     # remove the from line
 230                     splice @$attachment, 1, 1;
 231                }
 232                elsif (not ref($attachment)) {
 233                     # It's a scalar; remove the from line
 234                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 235                }
 236                $msg->attach(Type => 'message/rfc822',
 237                             Data => $attachment,
 238                             Encoding => '7bit',
 239                            );
 240           }
 241      }
 242      return $msg->as_string;
 243 }
 244
 245
 246
 247
 248 =head2 decode_rfc1522
 249
 250     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 251
 252 Turn RFC-1522 names into the UTF-8 equivalent.
 253
 254 =cut
 255
 256 sub decode_rfc1522 {
 257     my ($string) = @_;
 258
 259     # this is craptacular, but leading space is hacked off by unmime.
 260     # Save it.
 261     my $leading_space = '';
 262     $leading_space = $1 if $string =~ s/^(\ +)//;
 263     # we must do this to switch off the utf8 flag before calling decode_mimewords
 264     $string = encode_utf8($string);
 265     my @mime_words = MIME::Words::decode_mimewords($string);
 266     my $tmp = $leading_space .
 267         join('',
 268              (map {
 269                  if (@{$_} > 1) {
 270                      convert_to_utf8(${$_}[0],${$_}[1]);
 271                  } else {
 272                      decode_utf8(${$_}[0]);
 273                  }
 274              } @mime_words)
 275             );
 276     return $tmp;
 277 }
 278
 279 =head2 encode_rfc1522
 280
 281      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 282
 283 Encodes headers according to the RFC1522 standard by calling
 284 MIME::Words::encode_mimeword on distinct words as appropriate.
 285
 286 =cut
 287
 288 # We cannot use MIME::Words::encode_mimewords because that function
 289 # does not handle spaces properly at all.
 290
 291 sub encode_rfc1522 {
 292      my ($rawstr) = @_;
 293
 294      # handle being passed undef properly
 295      return undef if not defined $rawstr;
 296
 297      # convert to octets if we are given a string in perl's internal
 298      # encoding
 299      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 300      # We process words in reverse so we can preserve spacing between
 301      # encoded words. This regex splits on word|nonword boundaries and
 302      # nonword|nonword boundaries. We also consider parenthesis and "
 303      # to be nonwords to avoid escaping them in comments in violation
 304      # of RFC1522
 305      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 306
 307      my $previous_word_encoded = 0;
 308      my $string = '';
 309      for my $word (@words) {
 310           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 311                $string = $word.$string;
 312                $previous_word_encoded=0;
 313           }
 314           elsif ($word =~ /^[\s\n]$/) {
 315                $string = $word.$string;
 316                $previous_word_encoded = 0 if $word eq "\n";
 317           }
 318           else {
 319                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 320                # RFC 1522 mandates that segments be at most 76 characters
 321                # long. If that's the case, we split the word up into 10
 322                # character pieces and encode it. We must use the Encode
 323                # magic here to avoid breaking on bit boundaries here.
 324                if (length $encoded > 75) {
 325                     # Turn utf8 into the internal perl representation
 326                     # so . is a character, not a byte.
 327                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 328                     my @encoded;
 329                     # Strip it into 10 character long segments, and encode
 330                     # the segments
 331                     # XXX It's possible that these segments are > 76 characters
 332                     while ($tempstr =~ s/(.{1,10})$//) {
 333                          # turn the character back into the utf8 representation.
 334                          my $tempword = encode_utf8($1);
 335                          # It may actually be better to eventually use
 336                          # the base64 encoding here, but I'm not sure
 337                          # if that's as widely supported as quoted
 338                          # printable.
 339                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 340                     }
 341                     $encoded = join(" ",@encoded);
 342                     # If the previous word was encoded, we must
 343                     # include a trailing _ that gets encoded as a
 344                     # space.
 345                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 346                     $string = $encoded.$string;
 347                }
 348                else {
 349                     # If the previous word was encoded, we must
 350                     # include a trailing _ that gets encoded as a
 351                     # space.
 352                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 353                     $string = $encoded.$string;
 354                }
 355                $previous_word_encoded = 1;
 356           }
 357      }
 358      return $string;
 359 }
 360
 361 =head2
 362
 363     $header = handle_escaped_commas('','From: ')
 364
 365 Handle commas in addresses which have been RFC1522 escaped and now need to be
 366 quoted to avoid parsing as a record separator.
 367
 368 =cut
 369
 370 sub handle_escaped_commas {
 371     my ($modified_hdr, $orig_hdr) = @_;
 372
 373     my $field = Mail::Message::Field->new($orig_hdr);
 374     # if the header isn't structured, it can't contain an address
 375     if (not $field->isStructured()) {
 376         return $modified_hdr
 377     }
 378     if ($field->name() !~ m/^(?:to|from|reply-to)$/) {
 379         return $modified_hdr
 380     }
 381     my @addresses = $field->addresses();
 382     if (not @addresses) {
 383         return $modified_hdr
 384     }
 385     my @return_addresses;
 386     for my $address (@addresses) {
 387         $address->phrase(decode_rfc1522($address->phrase()));
 388         push @return_addresses, $address->format();
 389     }
 390     return join(', ',@return_addresses)
 391 }
 392
 393 1;