Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =encoding utf8
  14
  15 =head1 NAME
  16
  17 Debbugs::MIME -- Mime handling routines for debbugs
  18
  19 =head1 SYNOPSIS
  20
  21  use Debbugs::MIME qw(parse decode_rfc1522);
  22
  23 =head1 DESCRIPTION
  24
  25
  26 =head1 BUGS
  27
  28 None known.
  29
  30 =cut
  31
  32 use warnings;
  33 use strict;
  34
  35 use Exporter qw(import);
  36 use vars qw($DEBUG $VERSION @EXPORT_OK %EXPORT_TAGS @EXPORT);
  37
  38 BEGIN {
  39     $VERSION = 1.00;
  40     $DEBUG = 0 unless defined $DEBUG;
  41
  42     @EXPORT = ();
  43
  44     %EXPORT_TAGS = (mime => [qw(parse create_mime_message getmailbody)],
  45                     rfc1522 => [qw(decode_rfc1522 encode_rfc1522)],
  46                    );
  47     @EXPORT_OK=();
  48     Exporter::export_ok_tags(keys %EXPORT_TAGS);
  49     $EXPORT_TAGS{all} = [@EXPORT_OK];
  50 }
  51
  52 use File::Path qw(remove_tree);
  53 use File::Temp qw(tempdir);
  54 use MIME::Parser;
  55
  56 use POSIX qw(strftime);
  57 use List::MoreUtils qw(apply);
  58
  59 # for convert_to_utf8
  60 use Debbugs::UTF8 qw(convert_to_utf8);
  61
  62 # for decode_rfc1522 and encode_rfc1522
  63 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  64 use MIME::Words qw();
  65
  66 sub getmailbody
  67 {
  68     my $entity = shift;
  69     my $type = $entity->effective_type;
  70     if ($type eq 'text/plain' or
  71             ($type =~ m#text/?# and $type ne 'text/html') or
  72             $type eq 'application/pgp') {
  73         return $entity;
  74     } elsif ($type eq 'multipart/alternative') {
  75         # RFC 2046 says we should use the last part we recognize.
  76         for my $part (reverse $entity->parts) {
  77             my $ret = getmailbody($part);
  78             return $ret if $ret;
  79         }
  80     } else {
  81         # For other multipart types, we just pretend they're
  82         # multipart/mixed and run through in order.
  83         for my $part ($entity->parts) {
  84             my $ret = getmailbody($part);
  85             return $ret if $ret;
  86         }
  87     }
  88     return undef;
  89 }
  90
  91 sub parse
  92 {
  93     # header and decoded body respectively
  94     my (@headerlines, @bodylines);
  95
  96     my $parser = MIME::Parser->new();
  97     my $tempdir = tempdir(CLEANUP => 1);
  98     $parser->output_under($tempdir);
  99     my $entity = eval { $parser->parse_data($_[0]) };
 100
 101     if ($entity and $entity->head->tags) {
 102         @headerlines = @{$entity->head->header};
 103         chomp @headerlines;
 104
 105         my $entity_body = getmailbody($entity);
 106         my $entity_body_handle;
 107         my $charset;
 108         if (defined $entity_body) {
 109             $entity_body_handle = $entity_body->bodyhandle();
 110             $charset = $entity_body->head()->mime_attr('content-type.charset');
 111         }
 112         @bodylines = $entity_body_handle ? $entity_body_handle->as_lines() : ();
 113         @bodylines = map {convert_to_utf8($_,$charset)} @bodylines;
 114         chomp @bodylines;
 115     } else {
 116         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 117         my @msg = split /\n/, $_[0];
 118         my $i;
 119
 120         # assume us-ascii unless charset is set; probably bad, but we
 121         # really shouldn't get to this point anyway
 122         my $charset = 'us-ascii';
 123         for ($i = 0; $i <= $#msg; ++$i) {
 124             $_ = $msg[$i];
 125             last unless length;
 126             while ($msg[$i + 1] =~ /^\s/) {
 127                 ++$i;
 128                 $_ .= "\n" . $msg[$i];
 129             }
 130             if (/charset=\"([^\"]+)\"/) {
 131                 $charset = $1;
 132             }
 133             push @headerlines, $_;
 134         }
 135         @bodylines = map {convert_to_utf8($_,$charset)} @msg[$i .. $#msg];
 136     }
 137
 138     remove_tree($tempdir,{verbose => 0, safe => 1});
 139
 140     # Remove blank lines.
 141     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 142
 143     # Strip off RFC2440-style PGP clearsigning.
 144     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 145         shift @bodylines while @bodylines and
 146             length $bodylines[0] and
 147                 # we currently don't strip \r; handle this for the
 148                 # time being, though eventually it should be stripped
 149                 # too, I think. [See #565981]
 150                 $bodylines[0] ne "\r";
 151         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 152         for my $findsig (0 .. $#bodylines) {
 153             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 154                 $#bodylines = $findsig - 1;
 155                 last;
 156             }
 157         }
 158         map { s/^- // } @bodylines;
 159     }
 160
 161     return { header => [@headerlines], body => [@bodylines]};
 162 }
 163
 164 =head2 create_mime_message
 165
 166      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 167
 168 Creates a MIME encoded message with headers given by the first
 169 argument, and a message given by the second.
 170
 171 Optional attachments can be specified in the third arrayref argument.
 172
 173 Whether to include the date in the header is the final argument; it
 174 defaults to true, setting the Date header if one is not already
 175 present.
 176
 177 Headers are passed directly to MIME::Entity::build, the message is the
 178 first attachment.
 179
 180 Each of the elements of the attachment arrayref is attached as an
 181 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 182 hashref, the contents are passed as an argument to
 183 MIME::Entity::attach
 184
 185 =cut
 186
 187 sub create_mime_message{
 188      my ($headers,$body,$attachments,$include_date) = @_;
 189      $attachments = [] if not defined $attachments;
 190      $include_date = 1 if not defined $include_date;
 191
 192      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 193      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 194
 195      if ($include_date) {
 196          my %headers = apply {defined $_ ? lc($_) : ''} @{$headers};
 197          if (not exists $headers{date}) {
 198              push @{$headers},
 199                  ('Date',
 200                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 201                  );
 202          }
 203      }
 204
 205      # Build the message
 206      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 207      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 208                                    'Encoding'     => 'quoted-printable',
 209                                    (map{encode_rfc1522(encode_utf8(defined $_ ? $_:''))} @{$headers}),
 210                                    Data    => encode_utf8($body),
 211                                   );
 212
 213      # Attach the attachments
 214      for my $attachment (@{$attachments}) {
 215           if (ref($attachment) eq 'HASH') {
 216                $msg->attach(%{$attachment});
 217           }
 218           else {
 219                # This is *craptacular*, but because various MTAs
 220                # (sendmail and exim4, at least) appear to eat From
 221                # lines in message/rfc822 attachments, we need eat
 222                # the entire From line ourselves so the MTA doesn't
 223                # leave \n detrius around.
 224                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 225                     # make a copy so that we don't screw up anything
 226                     # that is expecting this arrayref to stay constant
 227                     $attachment = [@{$attachment}];
 228                     # remove the from line
 229                     splice @$attachment, 1, 1;
 230                }
 231                elsif (not ref($attachment)) {
 232                     # It's a scalar; remove the from line
 233                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 234                }
 235                $msg->attach(Type => 'message/rfc822',
 236                             Data => $attachment,
 237                             Encoding => '7bit',
 238                            );
 239           }
 240      }
 241      return $msg->as_string;
 242 }
 243
 244
 245
 246
 247 =head2 decode_rfc1522
 248
 249     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 250
 251 Turn RFC-1522 names into the UTF-8 equivalent.
 252
 253 =cut
 254
 255 sub decode_rfc1522 {
 256     my ($string) = @_;
 257
 258     # this is craptacular, but leading space is hacked off by unmime.
 259     # Save it.
 260     my $leading_space = '';
 261     $leading_space = $1 if $string =~ s/^(\ +)//;
 262     # we must do this to switch off the utf8 flag before calling decode_mimewords
 263     $string = encode_utf8($string);
 264     my @mime_words = MIME::Words::decode_mimewords($string);
 265     my $tmp = $leading_space .
 266         join('',
 267              (map {
 268                  if (@{$_} > 1) {
 269                      convert_to_utf8(${$_}[0],${$_}[1]);
 270                  } else {
 271                      decode_utf8(${$_}[0]);
 272                  }
 273              } @mime_words)
 274             );
 275     return $tmp;
 276 }
 277
 278 =head2 encode_rfc1522
 279
 280      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 281
 282 Encodes headers according to the RFC1522 standard by calling
 283 MIME::Words::encode_mimeword on distinct words as appropriate.
 284
 285 =cut
 286
 287 # We cannot use MIME::Words::encode_mimewords because that function
 288 # does not handle spaces properly at all.
 289
 290 sub encode_rfc1522 {
 291      my ($rawstr) = @_;
 292
 293      # handle being passed undef properly
 294      return undef if not defined $rawstr;
 295
 296      # convert to octets if we are given a string in perl's internal
 297      # encoding
 298      $rawstr= encode_utf8($rawstr) if is_utf8($rawstr);
 299      # We process words in reverse so we can preserve spacing between
 300      # encoded words. This regex splits on word|nonword boundaries and
 301      # nonword|nonword boundaries. We also consider parenthesis and "
 302      # to be nonwords to avoid escaping them in comments in violation
 303      # of RFC1522
 304      my @words = reverse split /(?:(?<=[\s\n\)\(\"])|(?=[\s\n\)\(\"]))/m, $rawstr;
 305
 306      my $previous_word_encoded = 0;
 307      my $string = '';
 308      for my $word (@words) {
 309           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 310                $string = $word.$string;
 311                $previous_word_encoded=0;
 312           }
 313           elsif ($word =~ /^[\s\n]$/) {
 314                $string = $word.$string;
 315                $previous_word_encoded = 0 if $word eq "\n";
 316           }
 317           else {
 318                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 319                # RFC 1522 mandates that segments be at most 76 characters
 320                # long. If that's the case, we split the word up into 10
 321                # character pieces and encode it. We must use the Encode
 322                # magic here to avoid breaking on bit boundaries here.
 323                if (length $encoded > 75) {
 324                     # Turn utf8 into the internal perl representation
 325                     # so . is a character, not a byte.
 326                     my $tempstr = is_utf8($word)?$word:decode_utf8($word,Encode::FB_DEFAULT);
 327                     my @encoded;
 328                     # Strip it into 10 character long segments, and encode
 329                     # the segments
 330                     # XXX It's possible that these segments are > 76 characters
 331                     while ($tempstr =~ s/(.{1,10})$//) {
 332                          # turn the character back into the utf8 representation.
 333                          my $tempword = encode_utf8($1);
 334                          # It may actually be better to eventually use
 335                          # the base64 encoding here, but I'm not sure
 336                          # if that's as widely supported as quoted
 337                          # printable.
 338                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 339                     }
 340                     $encoded = join(" ",@encoded);
 341                     # If the previous word was encoded, we must
 342                     # include a trailing _ that gets encoded as a
 343                     # space.
 344                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 345                     $string = $encoded.$string;
 346                }
 347                else {
 348                     # If the previous word was encoded, we must
 349                     # include a trailing _ that gets encoded as a
 350                     # space.
 351                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 352                     $string = $encoded.$string;
 353                }
 354                $previous_word_encoded = 1;
 355           }
 356      }
 357      return $string;
 358 }
 359
 360 1;