Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38     $DEBUG = 0 unless defined $DEBUG;
  39
  40     @EXPORT = ();
  41
  42     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  43                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  44                    );
  45     @EXPORT_OK=();
  46     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  47     $EXPORT_TAGS{all} = [@EXPORT_OK];
  48 }
  49
  50 use File::Path;
  51 use File::Temp qw();
  52 use MIME::Parser;
  53
  54 use POSIX qw(strftime);
  55 use List::MoreUtils qw(apply);
  56
  57 # for convert_to_utf8
  58 use Debbugs::UTF8 qw(convert_to_utf8);
  59
  60 # for decode_rfc1522 and encode_rfc1522
  61 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  62 use MIME::Words qw();
  63
  64 sub getmailbody
  65 {
  66     my $entity = shift;
  67     my $type = $entity->effective_type;
  68     if ($type eq 'text/plain' or
  69             ($type =~ m#text/?# and $type ne 'text/html') or
  70             $type eq 'application/pgp') {
  71         return $entity;
  72     } elsif ($type eq 'multipart/alternative') {
  73         # RFC 2046 says we should use the last part we recognize.
  74         for my $part (reverse $entity->parts) {
  75             my $ret = getmailbody($part);
  76             return $ret if $ret;
  77         }
  78     } else {
  79         # For other multipart types, we just pretend they're
  80         # multipart/mixed and run through in order.
  81         for my $part ($entity->parts) {
  82             my $ret = getmailbody($part);
  83             return $ret if $ret;
  84         }
  85     }
  86     return undef;
  87 }
  88
  89 sub parse
  90 {
  91     # header and decoded body respectively
  92     my (@headerlines, @bodylines);
  93
  94     my $parser = MIME::Parser->new();
  95     my $tempdir = File::Temp::tempdir();
  96     $parser->output_under($tempdir);
  97     my $entity = eval { $parser->parse_data($_[0]) };
  98
  99     if ($entity and $entity->head->tags) {
 100         @headerlines = @{$entity->head->header};
 101         chomp @headerlines;
 102
 103         my $entity_body = getmailbody($entity);
 104         my $entity_body_handle;
 105         my $charset;
 106         if (defined $entity_body) {
 107             $entity_body_handle = $entity_body->bodyhandle();
 108             $charset = $entity_body->head()->mime_attr('content-type.charset');
 109         }
 110         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 111         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 112         chomp @bodylines;
 113     } else {
 114         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 115         my @msg = split /\n/, $_[0];
 116         my $i;
 117
 118         # assume us-ascii unless charset is set; probably bad, but we
 119         # really shouldn't get to this point anyway
 120         my $charset = 'us-ascii';
 121         for ($i = 0; $i <= $#msg; ++$i) {
 122             $_ = $msg[$i];
 123             last unless length;
 124             while ($msg[$i + 1] =~ /^\s/) {
 125                 ++$i;
 126                 $_ .= "\n" . $msg[$i];
 127             }
 128             if (/charset=\"([^\"]+)\"/) {
 129                 $charset = $1;
 130             }
 131             push @headerlines, $_;
 132         }
 133         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 134     }
 135
 136     rmtree $tempdir, 0, 1;
 137
 138     # Remove blank lines.
 139     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 140
 141     # Strip off RFC2440-style PGP clearsigning.
 142     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 143         shift @bodylines while @bodylines and
 144             length $bodylines[0] and
 145                 # we currently don't strip \r; handle this for the
 146                 # time being, though eventually it should be stripped
 147                 # too, I think. [See #565981]
 148                 $bodylines[0] ne "\r";
 149         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 150         for my $findsig (0 .. $#bodylines) {
 151             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 152                 $#bodylines = $findsig - 1;
 153                 last;
 154             }
 155         }
 156         map { s/^- // } @bodylines;
 157     }
 158
 159     return { header => [@headerlines], body => [@bodylines]};
 160 }
 161
 162 =head2 create_mime_message
 163
 164      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 165
 166 Creates a MIME encoded message with headers given by the first
 167 argument, and a message given by the second.
 168
 169 Optional attachments can be specified in the third arrayref argument.
 170
 171 Whether to include the date in the header is the final argument; it
 172 defaults to true, setting the Date header if one is not already
 173 present.
 174
 175 Headers are passed directly to MIME::Entity::build, the message is the
 176 first attachment.
 177
 178 Each of the elements of the attachment arrayref is attached as an
 179 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 180 hashref, the contents are passed as an argument to
 181 MIME::Entity::attach
 182
 183 =cut
 184
 185 sub create_mime_message{
 186      my ($headers,$body,$attachments,$include_date) = @_;
 187      $attachments = [] if not defined $attachments;
 188      $include_date = 1 if not defined $include_date;
 189
 190      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 191      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 192
 193      if ($include_date) {
 194          my %headers = apply {lc($_)} @{$headers};
 195          if (not exists $headers{date}) {
 196              push @{$headers},
 197                  ('Date',
 198                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 199                  );
 200          }
 201      }
 202
 203      # Build the message
 204      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 205      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 206                                    'Encoding'     => 'quoted-printable',
 207                                    (map{encode_rfc1522(encode_utf8($_))} @{$headers}),
 208                                    Data    => encode_utf8($body),
 209                                   );
 210
 211      # Attach the attachments
 212      for my $attachment (@{$attachments}) {
 213           if (ref($attachment) eq 'HASH') {
 214                $msg->attach(%{$attachment});
 215           }
 216           else {
 217                # This is *craptacular*, but because various MTAs
 218                # (sendmail and exim4, at least) appear to eat From
 219                # lines in message/rfc822 attachments, we need eat
 220                # the entire From line ourselves so the MTA doesn't
 221                # leave \n detrius around.
 222                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 223                     # make a copy so that we don't screw up anything
 224                     # that is expecting this arrayref to stay constant
 225                     $attachment = [@{$attachment}];
 226                     # remove the from line
 227                     splice @$attachment, 1, 1;
 228                }
 229                elsif (not ref($attachment)) {
 230                     # It's a scalar; remove the from line
 231                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 232                }
 233                $msg->attach(Type => 'message/rfc822',
 234                             Data => $attachment,
 235                             Encoding => '7bit',
 236                            );
 237           }
 238      }
 239      return $msg->as_string;
 240 }
 241
 242
 243
 244
 245 =head2 decode_rfc1522
 246
 247     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 248
 249 Turn RFC-1522 names into the UTF-8 equivalent.
 250
 251 =cut
 252
 253 sub decode_rfc1522 {
 254     my ($string) = @_;
 255
 256     # this is craptacular, but leading space is hacked off by unmime.
 257     # Save it.
 258     my $leading_space = '';
 259     $leading_space = $1 if $string =~ s/^(\ +)//;
 260     # we must do this to switch off the utf8 flag before calling decode_mimewords
 261     $string = encode_utf8($string);
 262     my @mime_words = MIME::Words::decode_mimewords($string);
 263     my $tmp = $leading_space .
 264         join('',
 265              (map {
 266                  if (@{$_} > 1) {
 267                      convert_to_utf8(${$_}[0],${$_}[1]);
 268                  } else {
 269                      decode_utf8(${$_}[0]);
 270                  }
 271              } @mime_words)
 272             );
 273     return $tmp;
 274 }
 275
 276 =head2 encode_rfc1522
 277
 278      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 279
 280 Encodes headers according to the RFC1522 standard by calling
 281 MIME::Words::encode_mimeword on distinct words as appropriate.
 282
 283 =cut
 284
 285 # We cannot use MIME::Words::encode_mimewords because that function
 286 # does not handle spaces properly at all.
 287
 288 sub encode_rfc1522 {
 289      my ($rawstr) = @_;
 290
 291      # handle being passed undef properly
 292      return undef if not defined $rawstr;
 293
 294      # convert to octets if we are given a string in perl's internal
 295      # encoding
 296      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 297      # We process words in reverse so we can preserve spacing between
 298      # encoded words. This regex splits on word|nonword boundaries and
 299      # nonword|nonword boundaries. We also consider parenthesis and "
 300      # to be nonwords to avoid escaping them in comments in violation
 301      # of RFC1522
 302      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 303
 304      my $previous_word_encoded = 0;
 305      my $string = '';
 306      for my $word (@words) {
 307           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 308                $string = $word.$string;
 309                $previous_word_encoded=0;
 310           }
 311           elsif ($word =~ /^[\s\n]$/) {
 312                $string = $word.$string;
 313                $previous_word_encoded = 0 if $word eq "\n";
 314           }
 315           else {
 316                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 317                # RFC 1522 mandates that segments be at most 76 characters
 318                # long. If that's the case, we split the word up into 10
 319                # character pieces and encode it. We must use the Encode
 320                # magic here to avoid breaking on bit boundaries here.
 321                if (length $encoded > 75) {
 322                     # Turn utf8 into the internal perl representation
 323                     # so . is a character, not a byte.
 324                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 325                     my @encoded;
 326                     # Strip it into 10 character long segments, and encode
 327                     # the segments
 328                     # XXX It's possible that these segments are > 76 characters
 329                     while ($tempstr =~ s/(.{1,10})$//) {
 330                          # turn the character back into the utf8 representation.
 331                          my $tempword = encode_utf8($1);
 332                          # It may actually be better to eventually use
 333                          # the base64 encoding here, but I'm not sure
 334                          # if that's as widely supported as quoted
 335                          # printable.
 336                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 337                     }
 338                     $encoded = join(" ",@encoded);
 339                     # If the previous word was encoded, we must
 340                     # include a trailing _ that gets encoded as a
 341                     # space.
 342                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 343                     $string = $encoded.$string;
 344                }
 345                else {
 346                     # If the previous word was encoded, we must
 347                     # include a trailing _ that gets encoded as a
 348                     # space.
 349                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 350                     $string = $encoded.$string;
 351                }
 352                $previous_word_encoded = 1;
 353           }
 354      }
 355      return $string;
 356 }
 357
 358 1;