Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38     $DEBUG = 0 unless defined $DEBUG;
  39
  40     @EXPORT = ();
  41
  42     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  43                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  44                    );
  45     @EXPORT_OK=();
  46     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  47     $EXPORT_TAGS{all} = [@EXPORT_OK];
  48 }
  49
  50 use File::Path;
  51 use File::Temp qw();
  52 use MIME::Parser;
  53
  54 use POSIX qw(strftime);
  55 use List::MoreUtils qw(apply);
  56
  57 # for convert_to_utf8
  58 use Debbugs::UTF8 qw(convert_to_utf8);
  59
  60 # for decode_rfc1522 and encode_rfc1522
  61 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  62 use MIME::Words qw();
  63
  64 sub getmailbody
  65 {
  66     my $entity = shift;
  67     my $type = $entity->effective_type;
  68     if ($type eq 'text/plain' or
  69             ($type =~ m#text/?# and $type ne 'text/html') or
  70             $type eq 'application/pgp') {
  71         return $entity;
  72     } elsif ($type eq 'multipart/alternative') {
  73         # RFC 2046 says we should use the last part we recognize.
  74         for my $part (reverse $entity->parts) {
  75             my $ret = getmailbody($part);
  76             return $ret if $ret;
  77         }
  78     } else {
  79         # For other multipart types, we just pretend they're
  80         # multipart/mixed and run through in order.
  81         for my $part ($entity->parts) {
  82             my $ret = getmailbody($part);
  83             return $ret if $ret;
  84         }
  85     }
  86     return undef;
  87 }
  88
  89 sub parse
  90 {
  91     # header and decoded body respectively
  92     my (@headerlines, @bodylines);
  93
  94     my $parser = MIME::Parser->new();
  95     my $tempdir = File::Temp::tempdir();
  96     $parser->output_under($tempdir);
  97     my $entity = eval { $parser->parse_data($_[0]) };
  98
  99     if ($entity and $entity->head->tags) {
 100         @headerlines = @{$entity->head->header};
 101         chomp @headerlines;
 102
 103         my $entity_body = getmailbody($entity);
 104         my $entity_body_handle;
 105         my $charset;
 106         if (defined $entity_body) {
 107             $entity_body_handle = $entity_body->bodyhandle();
 108             $charset = $entity_body->head()->mime_attr('content-type.charset');
 109         }
 110         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 111         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 112         chomp @bodylines;
 113     } else {
 114         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 115         my @msg = split /\n/, $_[0];
 116         my $i;
 117
 118         for ($i = 0; $i <= $#msg; ++$i) {
 119             $_ = $msg[$i];
 120             last unless length;
 121             while ($msg[$i + 1] =~ /^\s/) {
 122                 ++$i;
 123                 $_ .= "\n" . $msg[$i];
 124             }
 125             push @headerlines, $_;
 126         }
 127
 128         @bodylines = @msg[$i .. $#msg];
 129     }
 130
 131     rmtree $tempdir, 0, 1;
 132
 133     # Remove blank lines.
 134     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 135
 136     # Strip off RFC2440-style PGP clearsigning.
 137     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 138         shift @bodylines while @bodylines and
 139             length $bodylines[0] and
 140                 # we currently don't strip \r; handle this for the
 141                 # time being, though eventually it should be stripped
 142                 # too, I think. [See #565981]
 143                 $bodylines[0] ne "\r";
 144         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 145         for my $findsig (0 .. $#bodylines) {
 146             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 147                 $#bodylines = $findsig - 1;
 148                 last;
 149             }
 150         }
 151         map { s/^- // } @bodylines;
 152     }
 153
 154     return { header => [@headerlines], body => [@bodylines]};
 155 }
 156
 157 =head2 create_mime_message
 158
 159      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 160
 161 Creates a MIME encoded message with headers given by the first
 162 argument, and a message given by the second.
 163
 164 Optional attachments can be specified in the third arrayref argument.
 165
 166 Whether to include the date in the header is the final argument; it
 167 defaults to true, setting the Date header if one is not already
 168 present.
 169
 170 Headers are passed directly to MIME::Entity::build, the message is the
 171 first attachment.
 172
 173 Each of the elements of the attachment arrayref is attached as an
 174 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 175 hashref, the contents are passed as an argument to
 176 MIME::Entity::attach
 177
 178 =cut
 179
 180 sub create_mime_message{
 181      my ($headers,$body,$attachments,$include_date) = @_;
 182      $attachments = [] if not defined $attachments;
 183      $include_date = 1 if not defined $include_date;
 184
 185      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 186      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 187
 188      if ($include_date) {
 189          my %headers = apply {lc($_)} @{$headers};
 190          if (not exists $headers{date}) {
 191              push @{$headers},
 192                  ('Date',
 193                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 194                  );
 195          }
 196      }
 197
 198      # Build the message
 199      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 200      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 201                                    'Encoding'     => 'quoted-printable',
 202                                    (map{encode_rfc1522(encode_utf8($_))} @{$headers}),
 203                                    Data    => encode_utf8($body),
 204                                   );
 205
 206      # Attach the attachments
 207      for my $attachment (@{$attachments}) {
 208           if (ref($attachment) eq 'HASH') {
 209                $msg->attach(%{$attachment});
 210           }
 211           else {
 212                # This is *craptacular*, but because various MTAs
 213                # (sendmail and exim4, at least) appear to eat From
 214                # lines in message/rfc822 attachments, we need eat
 215                # the entire From line ourselves so the MTA doesn't
 216                # leave \n detrius around.
 217                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 218                     # make a copy so that we don't screw up anything
 219                     # that is expecting this arrayref to stay constant
 220                     $attachment = [@{$attachment}];
 221                     # remove the from line
 222                     splice @$attachment, 1, 1;
 223                }
 224                elsif (not ref($attachment)) {
 225                     # It's a scalar; remove the from line
 226                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 227                }
 228                $msg->attach(Type => 'message/rfc822',
 229                             Data => $attachment,
 230                             Encoding => '7bit',
 231                            );
 232           }
 233      }
 234      return $msg->as_string;
 235 }
 236
 237
 238
 239
 240 =head2 decode_rfc1522
 241
 242     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 243
 244 Turn RFC-1522 names into the UTF-8 equivalent.
 245
 246 =cut
 247
 248 sub decode_rfc1522 {
 249     my ($string) = @_;
 250
 251     # this is craptacular, but leading space is hacked off by unmime.
 252     # Save it.
 253     my $leading_space = '';
 254     $leading_space = $1 if $string =~ s/^(\ +)//;
 255     # we must do this to switch off the utf8 flag before calling decode_mimewords
 256     $string = encode_utf8($string);
 257     my @mime_words = MIME::Words::decode_mimewords($string);
 258     my $tmp = $leading_space .
 259         join('',
 260              (map {
 261                  if (@{$_} > 1) {
 262                      convert_to_utf8(${$_}[0],${$_}[1]);
 263                  } else {
 264                      decode_utf8(${$_}[0]);
 265                  }
 266              } @mime_words)
 267             );
 268     return $tmp;
 269 }
 270
 271 =head2 encode_rfc1522
 272
 273      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 274
 275 Encodes headers according to the RFC1522 standard by calling
 276 MIME::Words::encode_mimeword on distinct words as appropriate.
 277
 278 =cut
 279
 280 # We cannot use MIME::Words::encode_mimewords because that function
 281 # does not handle spaces properly at all.
 282
 283 sub encode_rfc1522 {
 284      my ($rawstr) = @_;
 285
 286      # handle being passed undef properly
 287      return undef if not defined $rawstr;
 288
 289      # convert to octets if we are given a string in perl's internal
 290      # encoding
 291      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 292      # We process words in reverse so we can preserve spacing between
 293      # encoded words. This regex splits on word|nonword boundaries and
 294      # nonword|nonword boundaries. We also consider parenthesis and "
 295      # to be nonwords to avoid escaping them in comments in violation
 296      # of RFC1522
 297      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 298
 299      my $previous_word_encoded = 0;
 300      my $string = '';
 301      for my $word (@words) {
 302           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 303                $string = $word.$string;
 304                $previous_word_encoded=0;
 305           }
 306           elsif ($word =~ /^[\s\n]$/) {
 307                $string = $word.$string;
 308                $previous_word_encoded = 0 if $word eq "\n";
 309           }
 310           else {
 311                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 312                # RFC 1522 mandates that segments be at most 76 characters
 313                # long. If that's the case, we split the word up into 10
 314                # character pieces and encode it. We must use the Encode
 315                # magic here to avoid breaking on bit boundaries here.
 316                if (length $encoded > 75) {
 317                     # Turn utf8 into the internal perl representation
 318                     # so . is a character, not a byte.
 319                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 320                     my @encoded;
 321                     # Strip it into 10 character long segments, and encode
 322                     # the segments
 323                     # XXX It's possible that these segments are > 76 characters
 324                     while ($tempstr =~ s/(.{1,10})$//) {
 325                          # turn the character back into the utf8 representation.
 326                          my $tempword = encode_utf8($1);
 327                          # It may actually be better to eventually use
 328                          # the base64 encoding here, but I'm not sure
 329                          # if that's as widely supported as quoted
 330                          # printable.
 331                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 332                     }
 333                     $encoded = join(" ",@encoded);
 334                     # If the previous word was encoded, we must
 335                     # include a trailing _ that gets encoded as a
 336                     # space.
 337                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 338                     $string = $encoded.$string;
 339                }
 340                else {
 341                     # If the previous word was encoded, we must
 342                     # include a trailing _ that gets encoded as a
 343                     # space.
 344                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 345                     $string = $encoded.$string;
 346                }
 347                $previous_word_encoded = 1;
 348           }
 349      }
 350      return $string;
 351 }
 352
 353 1;