lib/Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =encoding utf8
  14
  15 =head1 NAME
  16
  17 Debbugs::MIME -- Mime handling routines for debbugs
  18
  19 =head1 SYNOPSIS
  20
  21  use Debbugs::MIME qw(parse decode_rfc1522);
  22
  23 =head1 DESCRIPTION
  24
  25
  26 =head1 BUGS
  27
  28 None known.
  29
  30 =cut
  31
  32 use warnings;
  33 use strict;
  34
  35 use Exporter qw(import);
  36 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  37
  38 BEGIN {
  39     $VERSION = 1.00;
  40     $DEBUG = 0 unless defined $DEBUG;
  41
  42     @EXPORT = ();
  43
  44     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody),
  45                              qw(parse_to_mime_entity),
  46                             ],
  47                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522 handle_escaped_commas)],
  48                    );
  49     @EXPORT_OK=();
  50     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  51     $EXPORT_TAGS{all} = [@EXPORT_OK];
  52 }
  53
  54 use File::Path qw(remove_tree);
  55 use File::Temp qw(tempdir);
  56 use MIME::Parser;
  57 use Mail::Message::Field;
  58
  59 use POSIX qw(strftime);
  60 use List::AllUtils qw(apply);
  61
  62 # for convert_to_utf8
  63 use Debbugs::UTF8 qw(convert_to_utf8);
  64
  65 # for decode_rfc1522 and encode_rfc1522
  66 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  67 use MIME::Words qw();
  68
  69 sub getmailbody
  70 {
  71     my $entity = shift;
  72     my $type = $entity->effective_type;
  73     if ($type eq 'text/plain' or
  74             ($type =~ m#text/?# and $type ne 'text/html') or
  75             $type eq 'application/pgp') {
  76         return $entity;
  77     } elsif ($type eq 'multipart/alternative') {
  78         # RFC 2046 says we should use the last part we recognize.
  79         for my $part (reverse $entity->parts) {
  80             my $ret = getmailbody($part);
  81             return $ret if $ret;
  82         }
  83     } else {
  84         # For other multipart types, we just pretend they're
  85         # multipart/mixed and run through in order.
  86         for my $part ($entity->parts) {
  87             my $ret = getmailbody($part);
  88             return $ret if $ret;
  89         }
  90     }
  91     return undef;
  92 }
  93
  94 =head2 parse_to_mime_entity
  95
  96      $entity = parse_to_mime_entity($record);
  97
  98 Returns a MIME::Entity from a record (from Debbugs::Log), a filehandle, or a
  99 scalar mail message. Will die upon failure.
 100
 101 Intermediate parsing results will be output under a temporary directory which
 102 should be cleaned up upon process exit.
 103
 104 =cut
 105
 106 sub parse_to_mime_entity {
 107     my ($record) = @_;
 108     my $parser = MIME::Parser->new();
 109     my $entity;
 110     # this will be cleaned up once we exit
 111     my $tempdir = File::Temp->newdir();
 112     $parser->output_dir($tempdir->dirname());
 113     if (ref($record) eq 'HASH') {
 114         if ($record->{inner_file}) {
 115             $entity = $parser->parse($record->{fh}) or
 116                 die "Unable to parse entity";
 117         } else {
 118             $entity = $parser->parse_data($record->{text}) or
 119                 die "Unable to parse entity";
 120         }
 121     } elsif (ref($record)) {
 122         $entity = $parser->parse($record) or
 123             die "Unable to parse entity";
 124     } else {
 125         $entity = $parser->parse_data($record) or
 126             die "Unable to parse entity";
 127     }
 128     return $entity;
 129 }
 130
 131 sub parse
 132 {
 133     # header and decoded body respectively
 134     my (@headerlines, @bodylines);
 135
 136     my $parser = MIME::Parser->new();
 137     my $tempdir = tempdir(CLEANUP => 1);
 138     $parser->output_under($tempdir);
 139     my $entity = eval { $parser->parse_data($_[0]) };
 140
 141     if ($entity and $entity->head->tags) {
 142         @headerlines = @{$entity->head->header};
 143         chomp @headerlines;
 144
 145         my $entity_body = getmailbody($entity);
 146         my $entity_body_handle;
 147         my $charset;
 148         if (defined $entity_body) {
 149             $entity_body_handle = $entity_body->bodyhandle();
 150             $charset = $entity_body->head()->mime_attr('content-type.charset');
 151         }
 152         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 153         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 154         chomp @bodylines;
 155     } else {
 156         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 157         my @msg = split /\n/, $_[0];
 158         my $i;
 159
 160         # assume us-ascii unless charset is set; probably bad, but we
 161         # really shouldn't get to this point anyway
 162         my $charset = 'us-ascii';
 163         for ($i = 0; $i <= $#msg; ++$i) {
 164             $_ = $msg[$i];
 165             last unless length;
 166             while ($msg[$i + 1] =~ /^\s/) {
 167                 ++$i;
 168                 $_ .= "\n" . $msg[$i];
 169             }
 170             if (/charset=\"([^\"]+)\"/) {
 171                 $charset = $1;
 172             }
 173             push @headerlines, $_;
 174         }
 175         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 176     }
 177
 178     remove_tree($tempdir,{verbose => 0, safe => 1});
 179
 180     # Remove blank lines.
 181     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 182
 183     # Strip off RFC2440-style PGP clearsigning.
 184     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 185         shift @bodylines while @bodylines and
 186             length $bodylines[0] and
 187                 # we currently don't strip \r; handle this for the
 188                 # time being, though eventually it should be stripped
 189                 # too, I think. [See #565981]
 190                 $bodylines[0] ne "\r";
 191         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 192         for my $findsig (0 .. $#bodylines) {
 193             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 194                 $#bodylines = $findsig - 1;
 195                 last;
 196             }
 197         }
 198         map { s/^- // } @bodylines;
 199     }
 200
 201     return { header => [@headerlines], body => [@bodylines]};
 202 }
 203
 204 =head2 create_mime_message
 205
 206      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 207
 208 Creates a MIME encoded message with headers given by the first
 209 argument, and a message given by the second.
 210
 211 Optional attachments can be specified in the third arrayref argument.
 212
 213 Whether to include the date in the header is the final argument; it
 214 defaults to true, setting the Date header if one is not already
 215 present.
 216
 217 Headers are passed directly to MIME::Entity::build, the message is the
 218 first attachment.
 219
 220 Each of the elements of the attachment arrayref is attached as an
 221 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 222 hashref, the contents are passed as an argument to
 223 MIME::Entity::attach
 224
 225 =cut
 226
 227 sub create_mime_message{
 228      my ($headers,$body,$attachments,$include_date) = @_;
 229      $attachments = [] if not defined $attachments;
 230      $include_date = 1 if not defined $include_date;
 231
 232      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 233      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 234
 235      if ($include_date) {
 236          my %headers = apply {defined $_ ? lc($_) : ''} @{$headers};
 237          if (not exists $headers{date}) {
 238              push @{$headers},
 239                  ('Date',
 240                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 241                  );
 242          }
 243      }
 244
 245      # Build the message
 246      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 247      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 248                                    'Encoding'     => 'quoted-printable',
 249                                    (map{encode_rfc1522(encode_utf8(defined $_ ? $_:''))} @{$headers}),
 250                                    Data    => encode_utf8($body),
 251                                   );
 252
 253      # Attach the attachments
 254      for my $attachment (@{$attachments}) {
 255           if (ref($attachment) eq 'HASH') {
 256                $msg->attach(%{$attachment});
 257           }
 258           else {
 259                # This is *craptacular*, but because various MTAs
 260                # (sendmail and exim4, at least) appear to eat From
 261                # lines in message/rfc822 attachments, we need eat
 262                # the entire From line ourselves so the MTA doesn't
 263                # leave \n detrius around.
 264                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 265                     # make a copy so that we don't screw up anything
 266                     # that is expecting this arrayref to stay constant
 267                     $attachment = [@{$attachment}];
 268                     # remove the from line
 269                     splice @$attachment, 1, 1;
 270                }
 271                elsif (not ref($attachment)) {
 272                     # It's a scalar; remove the from line
 273                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 274                }
 275                $msg->attach(Type => 'message/rfc822',
 276                             Data => $attachment,
 277                             Encoding => '7bit',
 278                            );
 279           }
 280      }
 281      return $msg->as_string;
 282 }
 283
 284
 285
 286
 287 =head2 decode_rfc1522
 288
 289     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 290
 291 Turn RFC-1522 names into the UTF-8 equivalent.
 292
 293 =cut
 294
 295 sub decode_rfc1522 {
 296     my ($string) = @_;
 297
 298     # this is craptacular, but leading space is hacked off by unmime.
 299     # Save it.
 300     my $leading_space = '';
 301     $leading_space = $1 if $string =~ s/^(\ +)//;
 302     # we must do this to switch off the utf8 flag before calling decode_mimewords
 303     $string = encode_utf8($string);
 304     my @mime_words = MIME::Words::decode_mimewords($string);
 305     my $tmp = $leading_space .
 306         join('',
 307              (map {
 308                  if (@{$_} > 1) {
 309                      convert_to_utf8(${$_}[0],${$_}[1]);
 310                  } else {
 311                      decode_utf8(${$_}[0]);
 312                  }
 313              } @mime_words)
 314             );
 315     return $tmp;
 316 }
 317
 318 =head2 encode_rfc1522
 319
 320      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 321
 322 Encodes headers according to the RFC1522 standard by calling
 323 MIME::Words::encode_mimeword on distinct words as appropriate.
 324
 325 =cut
 326
 327 # We cannot use MIME::Words::encode_mimewords because that function
 328 # does not handle spaces properly at all.
 329
 330 sub encode_rfc1522 {
 331      my ($rawstr) = @_;
 332
 333      # handle being passed undef properly
 334      return undef if not defined $rawstr;
 335
 336      # convert to octets if we are given a string in perl's internal
 337      # encoding
 338      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 339      # We process words in reverse so we can preserve spacing between
 340      # encoded words. This regex splits on word|nonword boundaries and
 341      # nonword|nonword boundaries. We also consider parenthesis and "
 342      # to be nonwords to avoid escaping them in comments in violation
 343      # of RFC1522
 344      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 345
 346      my $previous_word_encoded = 0;
 347      my $string = '';
 348      for my $word (@words) {
 349           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 350                $string = $word.$string;
 351                $previous_word_encoded=0;
 352           }
 353           elsif ($word =~ /^[\s\n]$/) {
 354                $string = $word.$string;
 355                $previous_word_encoded = 0 if $word eq "\n";
 356           }
 357           else {
 358                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 359                # RFC 1522 mandates that segments be at most 76 characters
 360                # long. If that's the case, we split the word up into 10
 361                # character pieces and encode it. We must use the Encode
 362                # magic here to avoid breaking on bit boundaries here.
 363                if (length $encoded > 75) {
 364                     # Turn utf8 into the internal perl representation
 365                     # so . is a character, not a byte.
 366                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 367                     my @encoded;
 368                     # Strip it into 10 character long segments, and encode
 369                     # the segments
 370                     # XXX It's possible that these segments are > 76 characters
 371                     while ($tempstr =~ s/(.{1,10})$//) {
 372                          # turn the character back into the utf8 representation.
 373                          my $tempword = encode_utf8($1);
 374                          # It may actually be better to eventually use
 375                          # the base64 encoding here, but I'm not sure
 376                          # if that's as widely supported as quoted
 377                          # printable.
 378                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 379                     }
 380                     $encoded = join(" ",@encoded);
 381                     # If the previous word was encoded, we must
 382                     # include a trailing _ that gets encoded as a
 383                     # space.
 384                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 385                     $string = $encoded.$string;
 386                }
 387                else {
 388                     # If the previous word was encoded, we must
 389                     # include a trailing _ that gets encoded as a
 390                     # space.
 391                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 392                     $string = $encoded.$string;
 393                }
 394                $previous_word_encoded = 1;
 395           }
 396      }
 397      return $string;
 398 }
 399
 400 =head2
 401
 402     $header = handle_escaped_commas('','From: ')
 403
 404 Handle commas in addresses which have been RFC1522 escaped and now need to be
 405 quoted to avoid parsing as a record separator.
 406
 407 =cut
 408
 409 sub handle_escaped_commas {
 410     my ($modified_hdr, $orig_hdr) = @_;
 411
 412     my $field = Mail::Message::Field->new($orig_hdr);
 413     # if the header isn't structured, it can't contain an address
 414     if (not $field->isStructured()) {
 415         return $modified_hdr
 416     }
 417     if ($field->name() !~ m/^(?:to|from|reply-to)$/) {
 418         return $modified_hdr
 419     }
 420     my @addresses = $field->addresses();
 421     if (not @addresses) {
 422         return $modified_hdr
 423     }
 424     my @return_addresses;
 425     for my $address (@addresses) {
 426         $address->phrase(decode_rfc1522($address->phrase()));
 427         push @return_addresses, $address->format();
 428     }
 429     return join(', ',@return_addresses)
 430 }
 431
 432 1;