Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =encoding utf8
  14
  15 =head1 NAME
  16
  17 Debbugs::MIME -- Mime handling routines for debbugs
  18
  19 =head1 SYNOPSIS
  20
  21  use Debbugs::MIME qw(parse decode_rfc1522);
  22
  23 =head1 DESCRIPTION
  24
  25
  26 =head1 BUGS
  27
  28 None known.
  29
  30 =cut
  31
  32 use warnings;
  33 use strict;
  34
  35 use Exporter qw(import);
  36 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  37
  38 BEGIN {
  39     $VERSION = 1.00;
  40     $DEBUG = 0 unless defined $DEBUG;
  41
  42     @EXPORT = ();
  43
  44     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody),
  45                              qw(parse_to_mime_entity),
  46                             ],
  47                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  48                    );
  49     @EXPORT_OK=();
  50     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  51     $EXPORT_TAGS{all} = [@EXPORT_OK];
  52 }
  53
  54 use File::Path qw(remove_tree);
  55 use File::Temp qw(tempdir);
  56 use MIME::Parser;
  57
  58 use POSIX qw(strftime);
  59 use List::AllUtils qw(apply);
  60
  61 # for convert_to_utf8
  62 use Debbugs::UTF8 qw(convert_to_utf8);
  63
  64 # for decode_rfc1522 and encode_rfc1522
  65 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  66 use MIME::Words qw();
  67
  68 sub getmailbody
  69 {
  70     my $entity = shift;
  71     my $type = $entity->effective_type;
  72     if ($type eq 'text/plain' or
  73             ($type =~ m#text/?# and $type ne 'text/html') or
  74             $type eq 'application/pgp') {
  75         return $entity;
  76     } elsif ($type eq 'multipart/alternative') {
  77         # RFC 2046 says we should use the last part we recognize.
  78         for my $part (reverse $entity->parts) {
  79             my $ret = getmailbody($part);
  80             return $ret if $ret;
  81         }
  82     } else {
  83         # For other multipart types, we just pretend they're
  84         # multipart/mixed and run through in order.
  85         for my $part ($entity->parts) {
  86             my $ret = getmailbody($part);
  87             return $ret if $ret;
  88         }
  89     }
  90     return undef;
  91 }
  92
  93 =head2 parse_to_mime_entity
  94
  95      $entity = parse_to_mime_entity($record);
  96
  97 Returns a MIME::Entity from a record (from Debbugs::Log), a filehandle, or a
  98 scalar mail message. Will die upon failure.
  99
 100 Intermediate parsing results will be output under a temporary directory which
 101 should be cleaned up upon process exit.
 102
 103 =cut
 104
 105 sub parse_to_mime_entity {
 106     my ($record) = @_;
 107     my $parser = MIME::Parser->new();
 108     my $entity;
 109     # this will be cleaned up once we exit
 110     my $tempdir = File::Temp->newdir();
 111     $parser->output_dir($tempdir->dirname());
 112     if (ref($record) eq 'HASH') {
 113         if ($record->{inner_file}) {
 114             $entity = $parser->parse($record->{fh}) or
 115                 die "Unable to parse entity";
 116         } else {
 117             $entity = $parser->parse_data($record->{text}) or
 118                 die "Unable to parse entity";
 119         }
 120     } elsif (ref($record)) {
 121         $entity = $parser->parse($record) or
 122             die "Unable to parse entity";
 123     } else {
 124         $entity = $parser->parse_data($record) or
 125             die "Unable to parse entity";
 126     }
 127     return $entity;
 128 }
 129
 130 sub parse
 131 {
 132     # header and decoded body respectively
 133     my (@headerlines, @bodylines);
 134
 135     my $parser = MIME::Parser->new();
 136     my $tempdir = tempdir(CLEANUP => 1);
 137     $parser->output_under($tempdir);
 138     my $entity = eval { $parser->parse_data($_[0]) };
 139
 140     if ($entity and $entity->head->tags) {
 141         @headerlines = @{$entity->head->header};
 142         chomp @headerlines;
 143
 144         my $entity_body = getmailbody($entity);
 145         my $entity_body_handle;
 146         my $charset;
 147         if (defined $entity_body) {
 148             $entity_body_handle = $entity_body->bodyhandle();
 149             $charset = $entity_body->head()->mime_attr('content-type.charset');
 150         }
 151         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 152         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 153         chomp @bodylines;
 154     } else {
 155         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 156         my @msg = split /\n/, $_[0];
 157         my $i;
 158
 159         # assume us-ascii unless charset is set; probably bad, but we
 160         # really shouldn't get to this point anyway
 161         my $charset = 'us-ascii';
 162         for ($i = 0; $i <= $#msg; ++$i) {
 163             $_ = $msg[$i];
 164             last unless length;
 165             while ($msg[$i + 1] =~ /^\s/) {
 166                 ++$i;
 167                 $_ .= "\n" . $msg[$i];
 168             }
 169             if (/charset=\"([^\"]+)\"/) {
 170                 $charset = $1;
 171             }
 172             push @headerlines, $_;
 173         }
 174         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 175     }
 176
 177     remove_tree($tempdir,{verbose => 0, safe => 1});
 178
 179     # Remove blank lines.
 180     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 181
 182     # Strip off RFC2440-style PGP clearsigning.
 183     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 184         shift @bodylines while @bodylines and
 185             length $bodylines[0] and
 186                 # we currently don't strip \r; handle this for the
 187                 # time being, though eventually it should be stripped
 188                 # too, I think. [See #565981]
 189                 $bodylines[0] ne "\r";
 190         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 191         for my $findsig (0 .. $#bodylines) {
 192             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 193                 $#bodylines = $findsig - 1;
 194                 last;
 195             }
 196         }
 197         map { s/^- // } @bodylines;
 198     }
 199
 200     return { header => [@headerlines], body => [@bodylines]};
 201 }
 202
 203 =head2 create_mime_message
 204
 205      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 206
 207 Creates a MIME encoded message with headers given by the first
 208 argument, and a message given by the second.
 209
 210 Optional attachments can be specified in the third arrayref argument.
 211
 212 Whether to include the date in the header is the final argument; it
 213 defaults to true, setting the Date header if one is not already
 214 present.
 215
 216 Headers are passed directly to MIME::Entity::build, the message is the
 217 first attachment.
 218
 219 Each of the elements of the attachment arrayref is attached as an
 220 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 221 hashref, the contents are passed as an argument to
 222 MIME::Entity::attach
 223
 224 =cut
 225
 226 sub create_mime_message{
 227      my ($headers,$body,$attachments,$include_date) = @_;
 228      $attachments = [] if not defined $attachments;
 229      $include_date = 1 if not defined $include_date;
 230
 231      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 232      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 233
 234      if ($include_date) {
 235          my %headers = apply {defined $_ ? lc($_) : ''} @{$headers};
 236          if (not exists $headers{date}) {
 237              push @{$headers},
 238                  ('Date',
 239                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 240                  );
 241          }
 242      }
 243
 244      # Build the message
 245      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 246      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 247                                    'Encoding'     => 'quoted-printable',
 248                                    (map{encode_rfc1522(encode_utf8(defined $_ ? $_:''))} @{$headers}),
 249                                    Data    => encode_utf8($body),
 250                                   );
 251
 252      # Attach the attachments
 253      for my $attachment (@{$attachments}) {
 254           if (ref($attachment) eq 'HASH') {
 255                $msg->attach(%{$attachment});
 256           }
 257           else {
 258                # This is *craptacular*, but because various MTAs
 259                # (sendmail and exim4, at least) appear to eat From
 260                # lines in message/rfc822 attachments, we need eat
 261                # the entire From line ourselves so the MTA doesn't
 262                # leave \n detrius around.
 263                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 264                     # make a copy so that we don't screw up anything
 265                     # that is expecting this arrayref to stay constant
 266                     $attachment = [@{$attachment}];
 267                     # remove the from line
 268                     splice @$attachment, 1, 1;
 269                }
 270                elsif (not ref($attachment)) {
 271                     # It's a scalar; remove the from line
 272                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 273                }
 274                $msg->attach(Type => 'message/rfc822',
 275                             Data => $attachment,
 276                             Encoding => '7bit',
 277                            );
 278           }
 279      }
 280      return $msg->as_string;
 281 }
 282
 283
 284
 285
 286 =head2 decode_rfc1522
 287
 288     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 289
 290 Turn RFC-1522 names into the UTF-8 equivalent.
 291
 292 =cut
 293
 294 sub decode_rfc1522 {
 295     my ($string) = @_;
 296
 297     # this is craptacular, but leading space is hacked off by unmime.
 298     # Save it.
 299     my $leading_space = '';
 300     $leading_space = $1 if $string =~ s/^(\ +)//;
 301     # we must do this to switch off the utf8 flag before calling decode_mimewords
 302     $string = encode_utf8($string);
 303     my @mime_words = MIME::Words::decode_mimewords($string);
 304     my $tmp = $leading_space .
 305         join('',
 306              (map {
 307                  if (@{$_} > 1) {
 308                      convert_to_utf8(${$_}[0],${$_}[1]);
 309                  } else {
 310                      decode_utf8(${$_}[0]);
 311                  }
 312              } @mime_words)
 313             );
 314     return $tmp;
 315 }
 316
 317 =head2 encode_rfc1522
 318
 319      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 320
 321 Encodes headers according to the RFC1522 standard by calling
 322 MIME::Words::encode_mimeword on distinct words as appropriate.
 323
 324 =cut
 325
 326 # We cannot use MIME::Words::encode_mimewords because that function
 327 # does not handle spaces properly at all.
 328
 329 sub encode_rfc1522 {
 330      my ($rawstr) = @_;
 331
 332      # handle being passed undef properly
 333      return undef if not defined $rawstr;
 334
 335      # convert to octets if we are given a string in perl's internal
 336      # encoding
 337      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 338      # We process words in reverse so we can preserve spacing between
 339      # encoded words. This regex splits on word|nonword boundaries and
 340      # nonword|nonword boundaries. We also consider parenthesis and "
 341      # to be nonwords to avoid escaping them in comments in violation
 342      # of RFC1522
 343      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 344
 345      my $previous_word_encoded = 0;
 346      my $string = '';
 347      for my $word (@words) {
 348           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 349                $string = $word.$string;
 350                $previous_word_encoded=0;
 351           }
 352           elsif ($word =~ /^[\s\n]$/) {
 353                $string = $word.$string;
 354                $previous_word_encoded = 0 if $word eq "\n";
 355           }
 356           else {
 357                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 358                # RFC 1522 mandates that segments be at most 76 characters
 359                # long. If that's the case, we split the word up into 10
 360                # character pieces and encode it. We must use the Encode
 361                # magic here to avoid breaking on bit boundaries here.
 362                if (length $encoded > 75) {
 363                     # Turn utf8 into the internal perl representation
 364                     # so . is a character, not a byte.
 365                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 366                     my @encoded;
 367                     # Strip it into 10 character long segments, and encode
 368                     # the segments
 369                     # XXX It's possible that these segments are > 76 characters
 370                     while ($tempstr =~ s/(.{1,10})$//) {
 371                          # turn the character back into the utf8 representation.
 372                          my $tempword = encode_utf8($1);
 373                          # It may actually be better to eventually use
 374                          # the base64 encoding here, but I'm not sure
 375                          # if that's as widely supported as quoted
 376                          # printable.
 377                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 378                     }
 379                     $encoded = join(" ",@encoded);
 380                     # If the previous word was encoded, we must
 381                     # include a trailing _ that gets encoded as a
 382                     # space.
 383                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 384                     $string = $encoded.$string;
 385                }
 386                else {
 387                     # If the previous word was encoded, we must
 388                     # include a trailing _ that gets encoded as a
 389                     # space.
 390                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 391                     $string = $encoded.$string;
 392                }
 393                $previous_word_encoded = 1;
 394           }
 395      }
 396      return $string;
 397 }
 398
 399 1;