Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =encoding utf8
  14
  15 =head1 NAME
  16
  17 Debbugs::MIME -- Mime handling routines for debbugs
  18
  19 =head1 SYNOPSIS
  20
  21  use Debbugs::MIME qw(parse decode_rfc1522);
  22
  23 =head1 DESCRIPTION
  24
  25
  26 =head1 BUGS
  27
  28 None known.
  29
  30 =cut
  31
  32 use warnings;
  33 use strict;
  34
  35 use Exporter qw(import);
  36 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  37
  38 BEGIN {
  39     $VERSION = 1.00;
  40     $DEBUG = 0 unless defined $DEBUG;
  41
  42     @EXPORT = ();
  43
  44     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  45                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  46                    );
  47     @EXPORT_OK=();
  48     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  49     $EXPORT_TAGS{all} = [@EXPORT_OK];
  50 }
  51
  52 use File::Path qw(remove_tree);
  53 use File::Temp qw(tempdir);
  54 use MIME::Parser;
  55
  56 use POSIX qw(strftime);
  57 use List::MoreUtils qw(apply);
  58
  59 # for convert_to_utf8
  60 use Debbugs::UTF8 qw(convert_to_utf8);
  61
  62 # for decode_rfc1522 and encode_rfc1522
  63 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  64 use MIME::Words qw();
  65
  66 sub getmailbody
  67 {
  68     my $entity = shift;
  69     my $type = $entity->effective_type;
  70     if ($type eq 'text/plain' or
  71             ($type =~ m#text/?# and $type ne 'text/html') or
  72             $type eq 'application/pgp') {
  73         return $entity;
  74     } elsif ($type eq 'multipart/alternative') {
  75         # RFC 2046 says we should use the last part we recognize.
  76         for my $part (reverse $entity->parts) {
  77             my $ret = getmailbody($part);
  78             return $ret if $ret;
  79         }
  80     } else {
  81         # For other multipart types, we just pretend they're
  82         # multipart/mixed and run through in order.
  83         for my $part ($entity->parts) {
  84             my $ret = getmailbody($part);
  85             return $ret if $ret;
  86         }
  87     }
  88     return undef;
  89 }
  90
  91 =item parse_to_mime_entity
  92
  93      $entity = parse_to_mime_entity($record);
  94
  95 Returns a MIME::Entity from a record (from Debbugs::Log), a filehandle, or a
  96 scalar mail message. Will die upon failure.
  97
  98 Intermediate parsing results will be output under a temporary directory which
  99 should be cleaned up upon process exit.
 100
 101 =cut
 102
 103 sub parse_to_mime_entity {
 104     my ($record) = @_;
 105     my $parser = MIME::Parser->new();
 106     my $entity;
 107     # this will be cleaned up once we exit
 108     $tempdir = File::Temp->newdir();
 109     $parser->output_dir($tempdir->dirname());
 110     if (ref($record) eq 'HASH') {
 111         if ($record->{inner_file}) {
 112             $entity = $parser->parse($record->{fh}) or
 113                 die "Unable to parse entity";
 114         } else {
 115             $entity = $parser->parse_data($record->{text}) or
 116                 die "Unable to parse entity";
 117         }
 118     } elsif (ref($record)) {
 119         $entity = $parser->parse($record) or
 120             die "Unable to parse entity";
 121     } else {
 122         $entity = $parser->parse_data($record) or
 123             die "Unable to parse entity";
 124     }
 125     return $entity;
 126 }
 127
 128 sub parse
 129 {
 130     # header and decoded body respectively
 131     my (@headerlines, @bodylines);
 132
 133     my $parser = MIME::Parser->new();
 134     my $tempdir = tempdir(CLEANUP => 1);
 135     $parser->output_under($tempdir);
 136     my $entity = eval { $parser->parse_data($_[0]) };
 137
 138     if ($entity and $entity->head->tags) {
 139         @headerlines = @{$entity->head->header};
 140         chomp @headerlines;
 141
 142         my $entity_body = getmailbody($entity);
 143         my $entity_body_handle;
 144         my $charset;
 145         if (defined $entity_body) {
 146             $entity_body_handle = $entity_body->bodyhandle();
 147             $charset = $entity_body->head()->mime_attr('content-type.charset');
 148         }
 149         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 150         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 151         chomp @bodylines;
 152     } else {
 153         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 154         my @msg = split /\n/, $_[0];
 155         my $i;
 156
 157         # assume us-ascii unless charset is set; probably bad, but we
 158         # really shouldn't get to this point anyway
 159         my $charset = 'us-ascii';
 160         for ($i = 0; $i <= $#msg; ++$i) {
 161             $_ = $msg[$i];
 162             last unless length;
 163             while ($msg[$i + 1] =~ /^\s/) {
 164                 ++$i;
 165                 $_ .= "\n" . $msg[$i];
 166             }
 167             if (/charset=\"([^\"]+)\"/) {
 168                 $charset = $1;
 169             }
 170             push @headerlines, $_;
 171         }
 172         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 173     }
 174
 175     remove_tree($tempdir,{verbose => 0, safe => 1});
 176
 177     # Remove blank lines.
 178     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 179
 180     # Strip off RFC2440-style PGP clearsigning.
 181     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 182         shift @bodylines while @bodylines and
 183             length $bodylines[0] and
 184                 # we currently don't strip \r; handle this for the
 185                 # time being, though eventually it should be stripped
 186                 # too, I think. [See #565981]
 187                 $bodylines[0] ne "\r";
 188         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 189         for my $findsig (0 .. $#bodylines) {
 190             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 191                 $#bodylines = $findsig - 1;
 192                 last;
 193             }
 194         }
 195         map { s/^- // } @bodylines;
 196     }
 197
 198     return { header => [@headerlines], body => [@bodylines]};
 199 }
 200
 201 =head2 create_mime_message
 202
 203      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 204
 205 Creates a MIME encoded message with headers given by the first
 206 argument, and a message given by the second.
 207
 208 Optional attachments can be specified in the third arrayref argument.
 209
 210 Whether to include the date in the header is the final argument; it
 211 defaults to true, setting the Date header if one is not already
 212 present.
 213
 214 Headers are passed directly to MIME::Entity::build, the message is the
 215 first attachment.
 216
 217 Each of the elements of the attachment arrayref is attached as an
 218 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 219 hashref, the contents are passed as an argument to
 220 MIME::Entity::attach
 221
 222 =cut
 223
 224 sub create_mime_message{
 225      my ($headers,$body,$attachments,$include_date) = @_;
 226      $attachments = [] if not defined $attachments;
 227      $include_date = 1 if not defined $include_date;
 228
 229      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 230      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 231
 232      if ($include_date) {
 233          my %headers = apply {defined $_ ? lc($_) : ''} @{$headers};
 234          if (not exists $headers{date}) {
 235              push @{$headers},
 236                  ('Date',
 237                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 238                  );
 239          }
 240      }
 241
 242      # Build the message
 243      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 244      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 245                                    'Encoding'     => 'quoted-printable',
 246                                    (map{encode_rfc1522(encode_utf8(defined $_ ? $_:''))} @{$headers}),
 247                                    Data    => encode_utf8($body),
 248                                   );
 249
 250      # Attach the attachments
 251      for my $attachment (@{$attachments}) {
 252           if (ref($attachment) eq 'HASH') {
 253                $msg->attach(%{$attachment});
 254           }
 255           else {
 256                # This is *craptacular*, but because various MTAs
 257                # (sendmail and exim4, at least) appear to eat From
 258                # lines in message/rfc822 attachments, we need eat
 259                # the entire From line ourselves so the MTA doesn't
 260                # leave \n detrius around.
 261                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 262                     # make a copy so that we don't screw up anything
 263                     # that is expecting this arrayref to stay constant
 264                     $attachment = [@{$attachment}];
 265                     # remove the from line
 266                     splice @$attachment, 1, 1;
 267                }
 268                elsif (not ref($attachment)) {
 269                     # It's a scalar; remove the from line
 270                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 271                }
 272                $msg->attach(Type => 'message/rfc822',
 273                             Data => $attachment,
 274                             Encoding => '7bit',
 275                            );
 276           }
 277      }
 278      return $msg->as_string;
 279 }
 280
 281
 282
 283
 284 =head2 decode_rfc1522
 285
 286     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 287
 288 Turn RFC-1522 names into the UTF-8 equivalent.
 289
 290 =cut
 291
 292 sub decode_rfc1522 {
 293     my ($string) = @_;
 294
 295     # this is craptacular, but leading space is hacked off by unmime.
 296     # Save it.
 297     my $leading_space = '';
 298     $leading_space = $1 if $string =~ s/^(\ +)//;
 299     # we must do this to switch off the utf8 flag before calling decode_mimewords
 300     $string = encode_utf8($string);
 301     my @mime_words = MIME::Words::decode_mimewords($string);
 302     my $tmp = $leading_space .
 303         join('',
 304              (map {
 305                  if (@{$_} > 1) {
 306                      convert_to_utf8(${$_}[0],${$_}[1]);
 307                  } else {
 308                      decode_utf8(${$_}[0]);
 309                  }
 310              } @mime_words)
 311             );
 312     return $tmp;
 313 }
 314
 315 =head2 encode_rfc1522
 316
 317      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 318
 319 Encodes headers according to the RFC1522 standard by calling
 320 MIME::Words::encode_mimeword on distinct words as appropriate.
 321
 322 =cut
 323
 324 # We cannot use MIME::Words::encode_mimewords because that function
 325 # does not handle spaces properly at all.
 326
 327 sub encode_rfc1522 {
 328      my ($rawstr) = @_;
 329
 330      # handle being passed undef properly
 331      return undef if not defined $rawstr;
 332
 333      # convert to octets if we are given a string in perl's internal
 334      # encoding
 335      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 336      # We process words in reverse so we can preserve spacing between
 337      # encoded words. This regex splits on word|nonword boundaries and
 338      # nonword|nonword boundaries. We also consider parenthesis and "
 339      # to be nonwords to avoid escaping them in comments in violation
 340      # of RFC1522
 341      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 342
 343      my $previous_word_encoded = 0;
 344      my $string = '';
 345      for my $word (@words) {
 346           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 347                $string = $word.$string;
 348                $previous_word_encoded=0;
 349           }
 350           elsif ($word =~ /^[\s\n]$/) {
 351                $string = $word.$string;
 352                $previous_word_encoded = 0 if $word eq "\n";
 353           }
 354           else {
 355                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 356                # RFC 1522 mandates that segments be at most 76 characters
 357                # long. If that's the case, we split the word up into 10
 358                # character pieces and encode it. We must use the Encode
 359                # magic here to avoid breaking on bit boundaries here.
 360                if (length $encoded > 75) {
 361                     # Turn utf8 into the internal perl representation
 362                     # so . is a character, not a byte.
 363                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 364                     my @encoded;
 365                     # Strip it into 10 character long segments, and encode
 366                     # the segments
 367                     # XXX It's possible that these segments are > 76 characters
 368                     while ($tempstr =~ s/(.{1,10})$//) {
 369                          # turn the character back into the utf8 representation.
 370                          my $tempword = encode_utf8($1);
 371                          # It may actually be better to eventually use
 372                          # the base64 encoding here, but I'm not sure
 373                          # if that's as widely supported as quoted
 374                          # printable.
 375                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 376                     }
 377                     $encoded = join(" ",@encoded);
 378                     # If the previous word was encoded, we must
 379                     # include a trailing _ that gets encoded as a
 380                     # space.
 381                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 382                     $string = $encoded.$string;
 383                }
 384                else {
 385                     # If the previous word was encoded, we must
 386                     # include a trailing _ that gets encoded as a
 387                     # space.
 388                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 389                     $string = $encoded.$string;
 390                }
 391                $previous_word_encoded = 1;
 392           }
 393      }
 394      return $string;
 395 }
 396
 397 1;