Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38     $DEBUG = 0 unless defined $DEBUG;
  39
  40     @EXPORT = ();
  41
  42     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  43                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  44                     utf8 => [qw(convert_to_utf8)],
  45                    );
  46     @EXPORT_OK=();
  47     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  48     $EXPORT_TAGS{all} = [@EXPORT_OK];
  49 }
  50
  51 use File::Path;
  52 use File::Temp qw();
  53 use MIME::Parser;
  54
  55 use POSIX qw(strftime);
  56 use List::MoreUtils qw(apply);
  57
  58 # for decode_rfc1522
  59 use MIME::WordDecoder qw();
  60 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  61
  62 # for encode_rfc1522
  63 use MIME::Words qw();
  64
  65 sub getmailbody
  66 {
  67     my $entity = shift;
  68     my $type = $entity->effective_type;
  69     if ($type eq 'text/plain' or
  70             ($type =~ m#text/?# and $type ne 'text/html') or
  71             $type eq 'application/pgp') {
  72         return $entity->bodyhandle;
  73     } elsif ($type eq 'multipart/alternative') {
  74         # RFC 2046 says we should use the last part we recognize.
  75         for my $part (reverse $entity->parts) {
  76             my $ret = getmailbody($part);
  77             return $ret if $ret;
  78         }
  79     } else {
  80         # For other multipart types, we just pretend they're
  81         # multipart/mixed and run through in order.
  82         for my $part ($entity->parts) {
  83             my $ret = getmailbody($part);
  84             return $ret if $ret;
  85         }
  86     }
  87     return undef;
  88 }
  89
  90 sub parse
  91 {
  92     # header and decoded body respectively
  93     my (@headerlines, @bodylines);
  94
  95     my $parser = MIME::Parser->new();
  96     my $tempdir = File::Temp::tempdir();
  97     $parser->output_under($tempdir);
  98     my $entity = eval { $parser->parse_data($_[0]) };
  99
 100     if ($entity and $entity->head->tags) {
 101         @headerlines = @{$entity->head->header};
 102         chomp @headerlines;
 103
 104         my $entity_body = getmailbody($entity);
 105         @bodylines = $entity_body ? $entity_body->as_lines() : ();
 106         chomp @bodylines;
 107     } else {
 108         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 109         my @msg = split /\n/, $_[0];
 110         my $i;
 111
 112         for ($i = 0; $i <= $#msg; ++$i) {
 113             $_ = $msg[$i];
 114             last unless length;
 115             while ($msg[$i + 1] =~ /^\s/) {
 116                 ++$i;
 117                 $_ .= "\n" . $msg[$i];
 118             }
 119             push @headerlines, $_;
 120         }
 121
 122         @bodylines = @msg[$i .. $#msg];
 123     }
 124
 125     rmtree $tempdir, 0, 1;
 126
 127     # Remove blank lines.
 128     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 129
 130     # Strip off RFC2440-style PGP clearsigning.
 131     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 132         shift @bodylines while @bodylines and
 133             length $bodylines[0] and
 134                 # we currently don't strip \r; handle this for the
 135                 # time being, though eventually it should be stripped
 136                 # too, I think. [See #565981]
 137                 $bodylines[0] ne "\r";
 138         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 139         for my $findsig (0 .. $#bodylines) {
 140             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 141                 $#bodylines = $findsig - 1;
 142                 last;
 143             }
 144         }
 145         map { s/^- // } @bodylines;
 146     }
 147
 148     return { header => [@headerlines], body => [@bodylines]};
 149 }
 150
 151 =head2 create_mime_message
 152
 153      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 154
 155 Creates a MIME encoded message with headers given by the first
 156 argument, and a message given by the second.
 157
 158 Optional attachments can be specified in the third arrayref argument.
 159
 160 Whether to include the date in the header is the final argument; it
 161 defaults to true, setting the Date header if one is not already
 162 present.
 163
 164 Headers are passed directly to MIME::Entity::build, the message is the
 165 first attachment.
 166
 167 Each of the elements of the attachment arrayref is attached as an
 168 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 169 hashref, the contents are passed as an argument to
 170 MIME::Entity::attach
 171
 172 =cut
 173
 174 sub create_mime_message{
 175      my ($headers,$body,$attachments,$include_date) = @_;
 176      $attachments = [] if not defined $attachments;
 177      $include_date = 1 if not defined $include_date;
 178
 179      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 180      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 181
 182      if ($include_date) {
 183          my %headers = apply {lc($_)} @{$headers};
 184          if (not exists $headers{date}) {
 185              push @{$headers},
 186                  ('Date',
 187                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 188                  );
 189          }
 190      }
 191
 192      # Build the message
 193      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 194      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 195                                    'Encoding'     => 'quoted-printable',
 196                                    (map{encode_rfc1522($_)} @{$headers}),
 197                                    Data    => $body
 198                                   );
 199
 200      # Attach the attachments
 201      for my $attachment (@{$attachments}) {
 202           if (ref($attachment) eq 'HASH') {
 203                $msg->attach(%{$attachment});
 204           }
 205           else {
 206                # This is *craptacular*, but because various MTAs
 207                # (sendmail and exim4, at least) appear to eat From
 208                # lines in message/rfc822 attachments, we need eat
 209                # the entire From line ourselves so the MTA doesn't
 210                # leave \n detrius around.
 211                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 212                     # make a copy so that we don't screw up anything
 213                     # that is expecting this arrayref to stay constant
 214                     $attachment = [@{$attachment}];
 215                     # remove the from line
 216                     splice @$attachment, 1, 1;
 217                }
 218                elsif (not ref($attachment)) {
 219                     # It's a scalar; remove the from line
 220                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 221                }
 222                $msg->attach(Type => 'message/rfc822',
 223                             Data => $attachment,
 224                             Encoding => '7bit',
 225                            );
 226           }
 227      }
 228      return $msg->as_string;
 229 }
 230
 231
 232 # Bug #61342 et al.
 233
 234 sub convert_to_utf8 {
 235      my ($data, $charset) = @_;
 236      # raw data just gets returned (that's the charset WordDecorder
 237      # uses when it doesn't know what to do)
 238      return $data if $charset eq 'raw' or is_utf8($data,1);
 239      my $result;
 240      eval {
 241          $result = decode($charset,$data);
 242      };
 243      if ($@) {
 244           warn "Unable to decode charset; '$charset' and '$data': $@";
 245           return $data;
 246      }
 247      return $result;
 248 }
 249
 250
 251 =head2 decode_rfc1522
 252
 253     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 254
 255 Turn RFC-1522 names into the UTF-8 equivalent.
 256
 257 =cut
 258
 259 BEGIN {
 260     # Set up the default RFC1522 decoder, which turns all charsets that
 261     # are supported into the appropriate UTF-8 charset.
 262     MIME::WordDecoder->default(new MIME::WordDecoder(
 263         ['*' => \&convert_to_utf8,
 264         ]));
 265 }
 266
 267 sub decode_rfc1522 {
 268     my ($string) = @_;
 269
 270     # this is craptacular, but leading space is hacked off by unmime.
 271     # Save it.
 272     my $leading_space = '';
 273     $leading_space = $1 if $string =~ s/^(\s+)//;
 274     # unmime calls the default MIME::WordDecoder handler set up at
 275     # initialization time.
 276     return $leading_space . MIME::WordDecoder::unmime($string);
 277 }
 278
 279 =head2 encode_rfc1522
 280
 281      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 282
 283 Encodes headers according to the RFC1522 standard by calling
 284 MIME::Words::encode_mimeword on distinct words as appropriate.
 285
 286 =cut
 287
 288 # We cannot use MIME::Words::encode_mimewords because that function
 289 # does not handle spaces properly at all.
 290
 291 sub encode_rfc1522 {
 292      my ($rawstr) = @_;
 293
 294      # handle being passed undef properly
 295      return undef if not defined $rawstr;
 296      if (is_utf8($rawstr)) {
 297          $rawstr= encode_utf8($rawstr);
 298      }
 299      # We process words in reverse so we can preserve spacing between
 300      # encoded words. This regex splits on word|nonword boundaries and
 301      # nonword|nonword boundaries. We also consider parenthesis and "
 302      # to be nonwords to avoid escaping them in comments in violation
 303      # of RFC1522
 304      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 305
 306      my $previous_word_encoded = 0;
 307      my $string = '';
 308      for my $word (@words) {
 309           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 310                $string = $word.$string;
 311                $previous_word_encoded=0;
 312           }
 313           elsif ($word =~ /^[\s\n]$/) {
 314                $string = $word.$string;
 315                $previous_word_encoded = 0 if $word eq "\n";
 316           }
 317           else {
 318                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 319                # RFC 1522 mandates that segments be at most 76 characters
 320                # long. If that's the case, we split the word up into 10
 321                # character pieces and encode it. We must use the Encode
 322                # magic here to avoid breaking on bit boundaries here.
 323                if (length $encoded > 75) {
 324                     # Turn utf8 into the internal perl representation
 325                     # so . is a character, not a byte.
 326                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 327                     my @encoded;
 328                     # Strip it into 10 character long segments, and encode
 329                     # the segments
 330                     # XXX It's possible that these segments are > 76 characters
 331                     while ($tempstr =~ s/(.{1,10})$//) {
 332                          # turn the character back into the utf8 representation.
 333                          my $tempword = encode_utf8($1);
 334                          # It may actually be better to eventually use
 335                          # the base64 encoding here, but I'm not sure
 336                          # if that's as widely supported as quoted
 337                          # printable.
 338                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 339                     }
 340                     $encoded = join(" ",@encoded);
 341                     # If the previous word was encoded, we must
 342                     # include a trailing _ that gets encoded as a
 343                     # space.
 344                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 345                     $string = $encoded.$string;
 346                }
 347                else {
 348                     # If the previous word was encoded, we must
 349                     # include a trailing _ that gets encoded as a
 350                     # space.
 351                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 352                     $string = $encoded.$string;
 353                }
 354                $previous_word_encoded = 1;
 355           }
 356      }
 357      return $string;
 358 }
 359
 360 1;