Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($VERSION @EXPORT_OK);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38
  39     @EXPORT_OK = qw(parse decode_rfc1522 encode_rfc1522 convert_to_utf8 create_mime_message getmailbody);
  40 }
  41
  42 use File::Path;
  43 use MIME::Parser;
  44
  45 use POSIX qw(strftime);
  46 use List::MoreUtils qw(apply);
  47
  48 # for decode_rfc1522
  49 use MIME::WordDecoder qw();
  50 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  51
  52 # for encode_rfc1522
  53 use MIME::Words qw();
  54
  55 sub getmailbody
  56 {
  57     my $entity = shift;
  58     my $type = $entity->effective_type;
  59     if ($type eq 'text/plain' or
  60             ($type =~ m#text/# and $type ne 'text/html') or
  61             $type eq 'application/pgp') {
  62         return $entity->bodyhandle;
  63     } elsif ($type eq 'multipart/alternative') {
  64         # RFC 2046 says we should use the last part we recognize.
  65         for my $part (reverse $entity->parts) {
  66             my $ret = getmailbody($part);
  67             return $ret if $ret;
  68         }
  69     } else {
  70         # For other multipart types, we just pretend they're
  71         # multipart/mixed and run through in order.
  72         for my $part ($entity->parts) {
  73             my $ret = getmailbody($part);
  74             return $ret if $ret;
  75         }
  76     }
  77     return undef;
  78 }
  79
  80 sub parse
  81 {
  82     # header and decoded body respectively
  83     my (@headerlines, @bodylines);
  84
  85     my $parser = MIME::Parser->new();
  86     mkdir "mime.tmp.$$", 0777;
  87     $parser->output_under("mime.tmp.$$");
  88     my $entity = eval { $parser->parse_data($_[0]) };
  89
  90     if ($entity and $entity->head->tags) {
  91         @headerlines = @{$entity->head->header};
  92         chomp @headerlines;
  93
  94         my $entity_body = getmailbody($entity);
  95         @bodylines = $entity_body ? $entity_body->as_lines() : ();
  96         chomp @bodylines;
  97     } else {
  98         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
  99         my @msg = split /\n/, $_[0];
 100         my $i;
 101
 102         for ($i = 0; $i <= $#msg; ++$i) {
 103             $_ = $msg[$i];
 104             last unless length;
 105             while ($msg[$i + 1] =~ /^\s/) {
 106                 ++$i;
 107                 $_ .= "\n" . $msg[$i];
 108             }
 109             push @headerlines, $_;
 110         }
 111
 112         @bodylines = @msg[$i .. $#msg];
 113     }
 114
 115     rmtree "mime.tmp.$$", 0, 1;
 116
 117     # Remove blank lines.
 118     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 119
 120     # Strip off RFC2440-style PGP clearsigning.
 121     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 122         shift @bodylines while @bodylines and length $bodylines[0];
 123         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 124         for my $findsig (0 .. $#bodylines) {
 125             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 126                 $#bodylines = $findsig - 1;
 127                 last;
 128             }
 129         }
 130         map { s/^- // } @bodylines;
 131     }
 132
 133     return { header => [@headerlines], body => [@bodylines]};
 134 }
 135
 136 =head2 create_mime_message
 137
 138      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 139
 140 Creates a MIME encoded message with headers given by the first
 141 argument, and a message given by the second.
 142
 143 Optional attachments can be specified in the third arrayref argument.
 144
 145 Whether to include the date in the header is the final argument; it
 146 defaults to true, setting the Date header if one is not already
 147 present.
 148
 149 Headers are passed directly to MIME::Entity::build, the message is the
 150 first attachment.
 151
 152 Each of the elements of the attachment arrayref is attached as an
 153 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 154 hashref, the contents are passed as an argument to
 155 MIME::Entity::attach
 156
 157 =cut
 158
 159 sub create_mime_message{
 160      my ($headers,$body,$attachments,$include_date) = @_;
 161      $attachments = [] if not defined $attachments;
 162      $include_date = 1 if not defined $include_date;
 163
 164      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 165      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 166
 167      if ($include_date) {
 168          my %headers = apply {lc($_)} @{$headers};
 169          if (not exists $headers{date}) {
 170              push @{$headers},
 171                  ('Date',
 172                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 173                  );
 174          }
 175      }
 176
 177      # Build the message
 178      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 179      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 180                                    'Encoding'     => 'quoted-printable',
 181                                    (map{encode_rfc1522($_)} @{$headers}),
 182                                    Data    => $body
 183                                   );
 184
 185      # Attach the attachments
 186      for my $attachment (@{$attachments}) {
 187           if (ref($attachment) eq 'HASH') {
 188                $msg->attach(%{$attachment});
 189           }
 190           else {
 191                # This is *craptacular*, but because various MTAs
 192                # (sendmail and exim4, at least) appear to eat From
 193                # lines in message/rfc822 attachments, we need eat
 194                # the entire From line ourselves so the MTA doesn't
 195                # leave \n detrius around.
 196                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 197                     # make a copy so that we don't screw up anything
 198                     # that is expecting this arrayref to stay constant
 199                     $attachment = [@{$attachment}];
 200                     # remove the from line
 201                     splice @$attachment, 1, 1;
 202                }
 203                elsif (not ref($attachment)) {
 204                     # It's a scalar; remove the from line
 205                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 206                }
 207                $msg->attach(Type => 'message/rfc822',
 208                             Data => $attachment,
 209                             Encoding => '7bit',
 210                            );
 211           }
 212      }
 213      return $msg->as_string;
 214 }
 215
 216
 217 # Bug #61342 et al.
 218
 219 sub convert_to_utf8 {
 220      my ($data, $charset) = @_;
 221      # raw data just gets returned (that's the charset WordDecorder
 222      # uses when it doesn't know what to do)
 223      return $data if $charset eq 'raw' or is_utf8($data,1);
 224      my $result;
 225      eval {
 226           # this encode/decode madness is to make sure that the data
 227           # really is valid utf8 and that the is_utf8 flag is off.
 228           $result = encode("utf8",decode($charset,$data))
 229      };
 230      if ($@) {
 231           warn "Unable to decode charset; '$charset' and '$data': $@";
 232           return $data;
 233      }
 234      return $result;
 235 }
 236
 237
 238 =head2 decode_rfc1522
 239
 240     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 241
 242 Turn RFC-1522 names into the UTF-8 equivalent.
 243
 244 =cut
 245
 246 BEGIN {
 247     # Set up the default RFC1522 decoder, which turns all charsets that
 248     # are supported into the appropriate UTF-8 charset.
 249     MIME::WordDecoder->default(new MIME::WordDecoder(
 250         ['*' => \&convert_to_utf8,
 251         ]));
 252 }
 253
 254 sub decode_rfc1522 {
 255     my ($string) = @_;
 256
 257     # this is craptacular, but leading space is hacked off by unmime.
 258     # Save it.
 259     my $leading_space = '';
 260     $leading_space = $1 if $string =~ s/^(\s+)//;
 261     # unmime calls the default MIME::WordDecoder handler set up at
 262     # initialization time.
 263     return $leading_space . MIME::WordDecoder::unmime($string);
 264 }
 265
 266 =head2 encode_rfc1522
 267
 268      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 269
 270 Encodes headers according to the RFC1522 standard by calling
 271 MIME::Words::encode_mimeword on distinct words as appropriate.
 272
 273 =cut
 274
 275 # We cannot use MIME::Words::encode_mimewords because that function
 276 # does not handle spaces properly at all.
 277
 278 sub encode_rfc1522 {
 279      my ($rawstr) = @_;
 280
 281      # handle being passed undef properly
 282      return undef if not defined $rawstr;
 283      # We process words in reverse so we can preserve spacing between
 284      # encoded words. This regex splits on word|nonword boundaries and
 285      # nonword|nonword boundaries.
 286      my @words = reverse split /(?:(?<=[\s\n])|(?=[\s\n]))/m, $rawstr;
 287
 288      my $previous_word_encoded = 0;
 289      my $string = '';
 290      for my $word (@words) {
 291           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 292                $string = $word.$string;
 293                $previous_word_encoded=0;
 294           }
 295           elsif ($word =~ /^[\s\n]$/) {
 296                $string = $word.$string;
 297                $previous_word_encoded = 0 if $word eq "\n";
 298           }
 299           else {
 300                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 301                # RFC 1522 mandates that segments be at most 76 characters
 302                # long. If that's the case, we split the word up into 10
 303                # character pieces and encode it. We must use the Encode
 304                # magic here to avoid breaking on bit boundaries here.
 305                if (length $encoded > 75) {
 306                     # Turn utf8 into the internal perl representation
 307                     # so . is a character, not a byte.
 308                     my $tempstr = decode_utf8($word,Encode::FB_DEFAULT);
 309                     my @encoded;
 310                     # Strip it into 10 character long segments, and encode
 311                     # the segments
 312                     # XXX It's possible that these segments are > 76 characters
 313                     while ($tempstr =~ s/(.{1,10})$//) {
 314                          # turn the character back into the utf8 representation.
 315                          my $tempword = encode_utf8($1);
 316                          # It may actually be better to eventually use
 317                          # the base64 encoding here, but I'm not sure
 318                          # if that's as widely supported as quoted
 319                          # printable.
 320                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 321                     }
 322                     $encoded = join(" ",@encoded);
 323                     # If the previous word was encoded, we must
 324                     # include a trailing _ that gets encoded as a
 325                     # space.
 326                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 327                     $string = $encoded.$string;
 328                }
 329                else {
 330                     # If the previous word was encoded, we must
 331                     # include a trailing _ that gets encoded as a
 332                     # space.
 333                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 334                     $string = $encoded.$string;
 335                }
 336                $previous_word_encoded = 1;
 337           }
 338      }
 339      return $string;
 340 }
 341
 342 1;