Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38     $DEBUG = 0 unless defined $DEBUG;
  39
  40     @EXPORT = ();
  41
  42     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  43                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  44                    );
  45     @EXPORT_OK=();
  46     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  47     $EXPORT_TAGS{all} = [@EXPORT_OK];
  48 }
  49
  50 use File::Path;
  51 use File::Temp qw();
  52 use MIME::Parser;
  53
  54 use POSIX qw(strftime);
  55 use List::MoreUtils qw(apply);
  56
  57 # for convert_to_utf8
  58 use Debbugs::UTF8 qw(convert_to_utf8);
  59
  60 # for decode_rfc1522
  61 use MIME::WordDecoder qw();
  62 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  63
  64 # for encode_rfc1522
  65 use MIME::Words qw();
  66
  67 sub getmailbody
  68 {
  69     my $entity = shift;
  70     my $type = $entity->effective_type;
  71     if ($type eq 'text/plain' or
  72             ($type =~ m#text/?# and $type ne 'text/html') or
  73             $type eq 'application/pgp') {
  74         return $entity;
  75     } elsif ($type eq 'multipart/alternative') {
  76         # RFC 2046 says we should use the last part we recognize.
  77         for my $part (reverse $entity->parts) {
  78             my $ret = getmailbody($part);
  79             return $ret if $ret;
  80         }
  81     } else {
  82         # For other multipart types, we just pretend they're
  83         # multipart/mixed and run through in order.
  84         for my $part ($entity->parts) {
  85             my $ret = getmailbody($part);
  86             return $ret if $ret;
  87         }
  88     }
  89     return undef;
  90 }
  91
  92 sub parse
  93 {
  94     # header and decoded body respectively
  95     my (@headerlines, @bodylines);
  96
  97     my $parser = MIME::Parser->new();
  98     my $tempdir = File::Temp::tempdir();
  99     $parser->output_under($tempdir);
 100     my $entity = eval { $parser->parse_data($_[0]) };
 101
 102     if ($entity and $entity->head->tags) {
 103         @headerlines = @{$entity->head->header};
 104         chomp @headerlines;
 105
 106         my $entity_body = getmailbody($entity);
 107         my $entity_body_handle;
 108         my $charset;
 109         if (defined $entity_body) {
 110             $entity_body_handle = $entity_body->bodyhandle();
 111             $charset = $entity_body->head()->mime_attr('content-type.charset');
 112         }
 113         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 114         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 115         chomp @bodylines;
 116     } else {
 117         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 118         my @msg = split /\n/, $_[0];
 119         my $i;
 120
 121         for ($i = 0; $i <= $#msg; ++$i) {
 122             $_ = $msg[$i];
 123             last unless length;
 124             while ($msg[$i + 1] =~ /^\s/) {
 125                 ++$i;
 126                 $_ .= "\n" . $msg[$i];
 127             }
 128             push @headerlines, $_;
 129         }
 130
 131         @bodylines = @msg[$i .. $#msg];
 132     }
 133
 134     rmtree $tempdir, 0, 1;
 135
 136     # Remove blank lines.
 137     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 138
 139     # Strip off RFC2440-style PGP clearsigning.
 140     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 141         shift @bodylines while @bodylines and
 142             length $bodylines[0] and
 143                 # we currently don't strip \r; handle this for the
 144                 # time being, though eventually it should be stripped
 145                 # too, I think. [See #565981]
 146                 $bodylines[0] ne "\r";
 147         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 148         for my $findsig (0 .. $#bodylines) {
 149             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 150                 $#bodylines = $findsig - 1;
 151                 last;
 152             }
 153         }
 154         map { s/^- // } @bodylines;
 155     }
 156
 157     return { header => [@headerlines], body => [@bodylines]};
 158 }
 159
 160 =head2 create_mime_message
 161
 162      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 163
 164 Creates a MIME encoded message with headers given by the first
 165 argument, and a message given by the second.
 166
 167 Optional attachments can be specified in the third arrayref argument.
 168
 169 Whether to include the date in the header is the final argument; it
 170 defaults to true, setting the Date header if one is not already
 171 present.
 172
 173 Headers are passed directly to MIME::Entity::build, the message is the
 174 first attachment.
 175
 176 Each of the elements of the attachment arrayref is attached as an
 177 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 178 hashref, the contents are passed as an argument to
 179 MIME::Entity::attach
 180
 181 =cut
 182
 183 sub create_mime_message{
 184      my ($headers,$body,$attachments,$include_date) = @_;
 185      $attachments = [] if not defined $attachments;
 186      $include_date = 1 if not defined $include_date;
 187
 188      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 189      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 190
 191      if ($include_date) {
 192          my %headers = apply {lc($_)} @{$headers};
 193          if (not exists $headers{date}) {
 194              push @{$headers},
 195                  ('Date',
 196                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 197                  );
 198          }
 199      }
 200
 201      # Build the message
 202      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 203      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 204                                    'Encoding'     => 'quoted-printable',
 205                                    (map{encode_rfc1522($_)} @{$headers}),
 206                                    Data    => is_utf8($body)?encode_utf8($body):$body,
 207                                   );
 208
 209      # Attach the attachments
 210      for my $attachment (@{$attachments}) {
 211           if (ref($attachment) eq 'HASH') {
 212                $msg->attach(%{$attachment});
 213           }
 214           else {
 215                # This is *craptacular*, but because various MTAs
 216                # (sendmail and exim4, at least) appear to eat From
 217                # lines in message/rfc822 attachments, we need eat
 218                # the entire From line ourselves so the MTA doesn't
 219                # leave \n detrius around.
 220                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 221                     # make a copy so that we don't screw up anything
 222                     # that is expecting this arrayref to stay constant
 223                     $attachment = [@{$attachment}];
 224                     # remove the from line
 225                     splice @$attachment, 1, 1;
 226                }
 227                elsif (not ref($attachment)) {
 228                     # It's a scalar; remove the from line
 229                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 230                }
 231                $msg->attach(Type => 'message/rfc822',
 232                             Data => $attachment,
 233                             Encoding => '7bit',
 234                            );
 235           }
 236      }
 237      return $msg->as_string;
 238 }
 239
 240
 241
 242
 243 =head2 decode_rfc1522
 244
 245     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 246
 247 Turn RFC-1522 names into the UTF-8 equivalent.
 248
 249 =cut
 250
 251 BEGIN {
 252     # Set up the default RFC1522 decoder, which turns all charsets that
 253     # are supported into the appropriate UTF-8 charset.
 254     MIME::WordDecoder->default(new MIME::WordDecoder(
 255         ['*' => \&convert_to_utf8,
 256         ]));
 257 }
 258
 259 sub decode_rfc1522 {
 260     my ($string) = @_;
 261
 262     # this is craptacular, but leading space is hacked off by unmime.
 263     # Save it.
 264     my $leading_space = '';
 265     $leading_space = $1 if $string =~ s/^(\s+)//;
 266     # unmime calls the default MIME::WordDecoder handler set up at
 267     # initialization time.
 268     return $leading_space . MIME::WordDecoder::unmime($string);
 269 }
 270
 271 =head2 encode_rfc1522
 272
 273      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 274
 275 Encodes headers according to the RFC1522 standard by calling
 276 MIME::Words::encode_mimeword on distinct words as appropriate.
 277
 278 =cut
 279
 280 # We cannot use MIME::Words::encode_mimewords because that function
 281 # does not handle spaces properly at all.
 282
 283 sub encode_rfc1522 {
 284      my ($rawstr) = @_;
 285
 286      # handle being passed undef properly
 287      return undef if not defined $rawstr;
 288      if (is_utf8($rawstr)) {
 289          $rawstr= encode_utf8($rawstr);
 290      }
 291      # We process words in reverse so we can preserve spacing between
 292      # encoded words. This regex splits on word|nonword boundaries and
 293      # nonword|nonword boundaries. We also consider parenthesis and "
 294      # to be nonwords to avoid escaping them in comments in violation
 295      # of RFC1522
 296      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 297
 298      my $previous_word_encoded = 0;
 299      my $string = '';
 300      for my $word (@words) {
 301           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 302                $string = $word.$string;
 303                $previous_word_encoded=0;
 304           }
 305           elsif ($word =~ /^[\s\n]$/) {
 306                $string = $word.$string;
 307                $previous_word_encoded = 0 if $word eq "\n";
 308           }
 309           else {
 310                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 311                # RFC 1522 mandates that segments be at most 76 characters
 312                # long. If that's the case, we split the word up into 10
 313                # character pieces and encode it. We must use the Encode
 314                # magic here to avoid breaking on bit boundaries here.
 315                if (length $encoded > 75) {
 316                     # Turn utf8 into the internal perl representation
 317                     # so . is a character, not a byte.
 318                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 319                     my @encoded;
 320                     # Strip it into 10 character long segments, and encode
 321                     # the segments
 322                     # XXX It's possible that these segments are > 76 characters
 323                     while ($tempstr =~ s/(.{1,10})$//) {
 324                          # turn the character back into the utf8 representation.
 325                          my $tempword = encode_utf8($1);
 326                          # It may actually be better to eventually use
 327                          # the base64 encoding here, but I'm not sure
 328                          # if that's as widely supported as quoted
 329                          # printable.
 330                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 331                     }
 332                     $encoded = join(" ",@encoded);
 333                     # If the previous word was encoded, we must
 334                     # include a trailing _ that gets encoded as a
 335                     # space.
 336                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 337                     $string = $encoded.$string;
 338                }
 339                else {
 340                     # If the previous word was encoded, we must
 341                     # include a trailing _ that gets encoded as a
 342                     # space.
 343                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 344                     $string = $encoded.$string;
 345                }
 346                $previous_word_encoded = 1;
 347           }
 348      }
 349      return $string;
 350 }
 351
 352 1;