Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 use strict;
  14
  15 use base qw(Exporter);
  16 use vars qw($VERSION @EXPORT_OK);
  17
  18 BEGIN {
  19     $VERSION = 1.00;
  20
  21     @EXPORT_OK = qw(parse decode_rfc1522 encode_rfc1522 convert_to_utf8 create_mime_message getmailbody);
  22 }
  23
  24 use File::Path;
  25 use MIME::Parser;
  26
  27 use POSIX qw(strftime);
  28 use List::MoreUtils qw(apply);
  29
  30 # for decode_rfc1522
  31 use MIME::WordDecoder qw();
  32 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  33
  34 # for encode_rfc1522
  35 use MIME::Words qw();
  36
  37 sub getmailbody
  38 {
  39     my $entity = shift;
  40     my $type = $entity->effective_type;
  41     if ($type eq 'text/plain' or
  42             ($type =~ m#text/# and $type ne 'text/html') or
  43             $type eq 'application/pgp') {
  44         return $entity->bodyhandle;
  45     } elsif ($type eq 'multipart/alternative') {
  46         # RFC 2046 says we should use the last part we recognize.
  47         for my $part (reverse $entity->parts) {
  48             my $ret = getmailbody($part);
  49             return $ret if $ret;
  50         }
  51     } else {
  52         # For other multipart types, we just pretend they're
  53         # multipart/mixed and run through in order.
  54         for my $part ($entity->parts) {
  55             my $ret = getmailbody($part);
  56             return $ret if $ret;
  57         }
  58     }
  59     return undef;
  60 }
  61
  62 sub parse
  63 {
  64     # header and decoded body respectively
  65     my (@headerlines, @bodylines);
  66
  67     my $parser = MIME::Parser->new();
  68     mkdir "mime.tmp.$$", 0777;
  69     $parser->output_under("mime.tmp.$$");
  70     my $entity = eval { $parser->parse_data($_[0]) };
  71
  72     if ($entity and $entity->head->tags) {
  73         @headerlines = @{$entity->head->header};
  74         chomp @headerlines;
  75
  76         my $entity_body = getmailbody($entity);
  77         @bodylines = $entity_body ? $entity_body->as_lines() : ();
  78         chomp @bodylines;
  79     } else {
  80         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
  81         my @msg = split /\n/, $_[0];
  82         my $i;
  83
  84         for ($i = 0; $i <= $#msg; ++$i) {
  85             $_ = $msg[$i];
  86             last unless length;
  87             while ($msg[$i + 1] =~ /^\s/) {
  88                 ++$i;
  89                 $_ .= "\n" . $msg[$i];
  90             }
  91             push @headerlines, $_;
  92         }
  93
  94         @bodylines = @msg[$i .. $#msg];
  95     }
  96
  97     rmtree "mime.tmp.$$", 0, 1;
  98
  99     # Remove blank lines.
 100     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 101
 102     # Strip off RFC2440-style PGP clearsigning.
 103     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 104         shift @bodylines while @bodylines and length $bodylines[0];
 105         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 106         for my $findsig (0 .. $#bodylines) {
 107             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 108                 $#bodylines = $findsig - 1;
 109                 last;
 110             }
 111         }
 112         map { s/^- // } @bodylines;
 113     }
 114
 115     return { header => [@headerlines], body => [@bodylines]};
 116 }
 117
 118 =head2 create_mime_message
 119
 120      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 121
 122 Creates a MIME encoded message with headers given by the first
 123 argument, and a message given by the second.
 124
 125 Optional attachments can be specified in the third arrayref argument.
 126
 127 Whether to include the date in the header is the final argument; it
 128 defaults to true, setting the Date header if one is not already
 129 present.
 130
 131 Headers are passed directly to MIME::Entity::build, the message is the
 132 first attachment.
 133
 134 Each of the elements of the attachment arrayref is attached as an
 135 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 136 hashref, the contents are passed as an argument to
 137 MIME::Entity::attach
 138
 139 =cut
 140
 141 sub create_mime_message{
 142      my ($headers,$body,$attachments,$include_date) = @_;
 143      $attachments = [] if not defined $attachments;
 144      $include_date = 1 if not defined $include_date;
 145
 146      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 147      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 148
 149      if ($include_date) {
 150          my %headers = apply {lc($_)} @{$headers};
 151          if (not exists $headers{date}) {
 152              push @{$headers},
 153                  ('Date',
 154                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 155                  );
 156          }
 157      }
 158
 159      # Build the message
 160      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 161      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 162                                    'Encoding'     => 'quoted-printable',
 163                                    (map{encode_rfc1522($_)} @{$headers}),
 164                                    Data    => $body
 165                                   );
 166
 167      # Attach the attachments
 168      for my $attachment (@{$attachments}) {
 169           if (ref($attachment) eq 'HASH') {
 170                $msg->attach(%{$attachment});
 171           }
 172           else {
 173                # This is *craptacular*, but because various MTAs
 174                # (sendmail and exim4, at least) appear to eat From
 175                # lines in message/rfc822 attachments, we need eat
 176                # the entire From line ourselves so the MTA doesn't
 177                # leave \n detrius around.
 178                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 179                     # make a copy so that we don't screw up anything
 180                     # that is expecting this arrayref to stay constant
 181                     $attachment = [@{$attachment}];
 182                     # remove the from line
 183                     splice @$attachment, 1, 1;
 184                }
 185                elsif (not ref($attachment)) {
 186                     # It's a scalar; remove the from line
 187                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 188                }
 189                $msg->attach(Type => 'message/rfc822',
 190                             Data => $attachment,
 191                             Encoding => '7bit',
 192                            );
 193           }
 194      }
 195      return $msg->as_string;
 196 }
 197
 198
 199 # Bug #61342 et al.
 200
 201 sub convert_to_utf8 {
 202      my ($data, $charset) = @_;
 203      # raw data just gets returned (that's the charset WordDecorder
 204      # uses when it doesn't know what to do)
 205      return $data if $charset eq 'raw' or is_utf8($data,1);
 206      my $result;
 207      eval {
 208           # this encode/decode madness is to make sure that the data
 209           # really is valid utf8 and that the is_utf8 flag is off.
 210           $result = encode("utf8",decode($charset,$data))
 211      };
 212      if ($@) {
 213           warn "Unable to decode charset; '$charset' and '$data': $@";
 214           return $data;
 215      }
 216      return $result;
 217 }
 218
 219
 220 =head2 decode_rfc1522
 221
 222     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 223
 224 Turn RFC-1522 names into the UTF-8 equivalent.
 225
 226 =cut
 227
 228 BEGIN {
 229     # Set up the default RFC1522 decoder, which turns all charsets that
 230     # are supported into the appropriate UTF-8 charset.
 231     MIME::WordDecoder->default(new MIME::WordDecoder(
 232         ['*' => \&convert_to_utf8,
 233         ]));
 234 }
 235
 236 sub decode_rfc1522 {
 237     my ($string) = @_;
 238
 239     # this is craptacular, but leading space is hacked off by unmime.
 240     # Save it.
 241     my $leading_space = '';
 242     $leading_space = $1 if $string =~ s/^(\s+)//;
 243     # unmime calls the default MIME::WordDecoder handler set up at
 244     # initialization time.
 245     return $leading_space . MIME::WordDecoder::unmime($string);
 246 }
 247
 248 =head2 encode_rfc1522
 249
 250      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 251
 252 Encodes headers according to the RFC1522 standard by calling
 253 MIME::Words::encode_mimeword on distinct words as appropriate.
 254
 255 =cut
 256
 257 # We cannot use MIME::Words::encode_mimewords because that function
 258 # does not handle spaces properly at all.
 259
 260 sub encode_rfc1522 {
 261      my ($rawstr) = @_;
 262
 263      # handle being passed undef properly
 264      return undef if not defined $rawstr;
 265      # We process words in reverse so we can preserve spacing between
 266      # encoded words. This regex splits on word|nonword boundaries and
 267      # nonword|nonword boundaries.
 268      my @words = reverse split /(?:(?<=[\s\n])|(?=[\s\n]))/m, $rawstr;
 269
 270      my $previous_word_encoded = 0;
 271      my $string = '';
 272      for my $word (@words) {
 273           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 274                $string = $word.$string;
 275                $previous_word_encoded=0;
 276           }
 277           elsif ($word =~ /^[\s\n]$/) {
 278                $string = $word.$string;
 279                $previous_word_encoded = 0 if $word eq "\n";
 280           }
 281           else {
 282                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 283                # RFC 1522 mandates that segments be at most 76 characters
 284                # long. If that's the case, we split the word up into 10
 285                # character pieces and encode it. We must use the Encode
 286                # magic here to avoid breaking on bit boundaries here.
 287                if (length $encoded > 75) {
 288                     # Turn utf8 into the internal perl representation
 289                     # so . is a character, not a byte.
 290                     my $tempstr = decode_utf8($word,Encode::FB_DEFAULT);
 291                     my @encoded;
 292                     # Strip it into 10 character long segments, and encode
 293                     # the segments
 294                     # XXX It's possible that these segments are > 76 characters
 295                     while ($tempstr =~ s/(.{1,10})$//) {
 296                          # turn the character back into the utf8 representation.
 297                          my $tempword = encode_utf8($1);
 298                          # It may actually be better to eventually use
 299                          # the base64 encoding here, but I'm not sure
 300                          # if that's as widely supported as quoted
 301                          # printable.
 302                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 303                     }
 304                     $encoded = join(" ",@encoded);
 305                     # If the previous word was encoded, we must
 306                     # include a trailing _ that gets encoded as a
 307                     # space.
 308                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 309                     $string = $encoded.$string;
 310                }
 311                else {
 312                     # If the previous word was encoded, we must
 313                     # include a trailing _ that gets encoded as a
 314                     # space.
 315                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 316                     $string = $encoded.$string;
 317                }
 318                $previous_word_encoded = 1;
 319           }
 320      }
 321      return $string;
 322 }
 323
 324 1;