Debbugs/MIME.pm

   1 # This module is part of debbugs, and is released
   2 # under the terms of the GPL version 2, or any later
   3 # version at your option.
   4 # See the file README and COPYING for more information.
   5 #
   6 # [Other people have contributed to this file; their copyrights should
   7 # go here too.]
   8 # Copyright 2006 by Don Armstrong <don@donarmstrong.com>.
   9
  10
  11 package Debbugs::MIME;
  12
  13 =head1 NAME
  14
  15 Debbugs::MIME -- Mime handling routines for debbugs
  16
  17 =head1 SYNOPSIS
  18
  19  use Debbugs::MIME qw(parse decode_rfc1522);
  20
  21 =head1 DESCRIPTION
  22
  23
  24 =head1 BUGS
  25
  26 None known.
  27
  28 =cut
  29
  30 use warnings;
  31 use strict;
  32
  33 use base qw(Exporter);
  34 use vars qw($VERSION @EXPORT_OK);
  35
  36 BEGIN {
  37     $VERSION = 1.00;
  38
  39     @EXPORT_OK = qw(parse decode_rfc1522 encode_rfc1522 convert_to_utf8 create_mime_message getmailbody);
  40 }
  41
  42 use File::Path;
  43 use File::Temp qw();
  44 use MIME::Parser;
  45
  46 use POSIX qw(strftime);
  47 use List::MoreUtils qw(apply);
  48
  49 # for decode_rfc1522
  50 use MIME::WordDecoder qw();
  51 use Encode qw(decode encode encode_utf8 decode_utf8 is_utf8);
  52
  53 # for encode_rfc1522
  54 use MIME::Words qw();
  55
  56 sub getmailbody
  57 {
  58     my $entity = shift;
  59     my $type = $entity->effective_type;
  60     if ($type eq 'text/plain' or
  61             ($type =~ m#text/# and $type ne 'text/html') or
  62             $type eq 'application/pgp') {
  63         return $entity->bodyhandle;
  64     } elsif ($type eq 'multipart/alternative') {
  65         # RFC 2046 says we should use the last part we recognize.
  66         for my $part (reverse $entity->parts) {
  67             my $ret = getmailbody($part);
  68             return $ret if $ret;
  69         }
  70     } else {
  71         # For other multipart types, we just pretend they're
  72         # multipart/mixed and run through in order.
  73         for my $part ($entity->parts) {
  74             my $ret = getmailbody($part);
  75             return $ret if $ret;
  76         }
  77     }
  78     return undef;
  79 }
  80
  81 sub parse
  82 {
  83     # header and decoded body respectively
  84     my (@headerlines, @bodylines);
  85
  86     my $parser = MIME::Parser->new();
  87     my $tempdir = File::Temp::tempdir();
  88     $parser->output_under($tempdir);
  89     my $entity = eval { $parser->parse_data($_[0]) };
  90
  91     if ($entity and $entity->head->tags) {
  92         @headerlines = @{$entity->head->header};
  93         chomp @headerlines;
  94
  95         my $entity_body = getmailbody($entity);
  96         @bodylines = $entity_body ? $entity_body->as_lines() : ();
  97         chomp @bodylines;
  98     } else {
  99         # Legacy pre-MIME code, kept around in case MIME::Parser fails.
 100         my @msg = split /\n/, $_[0];
 101         my $i;
 102
 103         for ($i = 0; $i <= $#msg; ++$i) {
 104             $_ = $msg[$i];
 105             last unless length;
 106             while ($msg[$i + 1] =~ /^\s/) {
 107                 ++$i;
 108                 $_ .= "\n" . $msg[$i];
 109             }
 110             push @headerlines, $_;
 111         }
 112
 113         @bodylines = @msg[$i .. $#msg];
 114     }
 115
 116     rmtree $tempdir, 0, 1;
 117
 118     # Remove blank lines.
 119     shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 120
 121     # Strip off RFC2440-style PGP clearsigning.
 122     if (@bodylines and $bodylines[0] =~ /^-----BEGIN PGP SIGNED/) {
 123         shift @bodylines while @bodylines and length $bodylines[0];
 124         shift @bodylines while @bodylines and $bodylines[0] !~ /\S/;
 125         for my $findsig (0 .. $#bodylines) {
 126             if ($bodylines[$findsig] =~ /^-----BEGIN PGP SIGNATURE/) {
 127                 $#bodylines = $findsig - 1;
 128                 last;
 129             }
 130         }
 131         map { s/^- // } @bodylines;
 132     }
 133
 134     return { header => [@headerlines], body => [@bodylines]};
 135 }
 136
 137 =head2 create_mime_message
 138
 139      create_mime_message([To=>'don@debian.org'],$body,[$attach1, $attach2],$include_date);
 140
 141 Creates a MIME encoded message with headers given by the first
 142 argument, and a message given by the second.
 143
 144 Optional attachments can be specified in the third arrayref argument.
 145
 146 Whether to include the date in the header is the final argument; it
 147 defaults to true, setting the Date header if one is not already
 148 present.
 149
 150 Headers are passed directly to MIME::Entity::build, the message is the
 151 first attachment.
 152
 153 Each of the elements of the attachment arrayref is attached as an
 154 rfc822 message if it is a scalar or an arrayref; otherwise if it is a
 155 hashref, the contents are passed as an argument to
 156 MIME::Entity::attach
 157
 158 =cut
 159
 160 sub create_mime_message{
 161      my ($headers,$body,$attachments,$include_date) = @_;
 162      $attachments = [] if not defined $attachments;
 163      $include_date = 1 if not defined $include_date;
 164
 165      die "The first argument to create_mime_message must be an arrayref" unless ref($headers) eq 'ARRAY';
 166      die "The third argument to create_mime_message must be an arrayref" unless ref($attachments) eq 'ARRAY';
 167
 168      if ($include_date) {
 169          my %headers = apply {lc($_)} @{$headers};
 170          if (not exists $headers{date}) {
 171              push @{$headers},
 172                  ('Date',
 173                   strftime("%a, %d %b %Y %H:%M:%S +0000",gmtime)
 174                  );
 175          }
 176      }
 177
 178      # Build the message
 179      # MIME::Entity is stupid, and doesn't rfc1522 encode its headers, so we do it for it.
 180      my $msg = MIME::Entity->build('Content-Type' => 'text/plain; charset=utf-8',
 181                                    'Encoding'     => 'quoted-printable',
 182                                    (map{encode_rfc1522($_)} @{$headers}),
 183                                    Data    => $body
 184                                   );
 185
 186      # Attach the attachments
 187      for my $attachment (@{$attachments}) {
 188           if (ref($attachment) eq 'HASH') {
 189                $msg->attach(%{$attachment});
 190           }
 191           else {
 192                # This is *craptacular*, but because various MTAs
 193                # (sendmail and exim4, at least) appear to eat From
 194                # lines in message/rfc822 attachments, we need eat
 195                # the entire From line ourselves so the MTA doesn't
 196                # leave \n detrius around.
 197                if (ref($attachment) eq 'ARRAY' and $attachment->[1] =~ /^From /) {
 198                     # make a copy so that we don't screw up anything
 199                     # that is expecting this arrayref to stay constant
 200                     $attachment = [@{$attachment}];
 201                     # remove the from line
 202                     splice @$attachment, 1, 1;
 203                }
 204                elsif (not ref($attachment)) {
 205                     # It's a scalar; remove the from line
 206                     $attachment =~ s/^(Received:[^\n]+\n)(From [^\n]+\n)/$1/s;
 207                }
 208                $msg->attach(Type => 'message/rfc822',
 209                             Data => $attachment,
 210                             Encoding => '7bit',
 211                            );
 212           }
 213      }
 214      return $msg->as_string;
 215 }
 216
 217
 218 # Bug #61342 et al.
 219
 220 sub convert_to_utf8 {
 221      my ($data, $charset) = @_;
 222      # raw data just gets returned (that's the charset WordDecorder
 223      # uses when it doesn't know what to do)
 224      return $data if $charset eq 'raw' or is_utf8($data,1);
 225      my $result;
 226      eval {
 227           # this encode/decode madness is to make sure that the data
 228           # really is valid utf8 and that the is_utf8 flag is off.
 229           $result = encode("utf8",decode($charset,$data))
 230      };
 231      if ($@) {
 232           warn "Unable to decode charset; '$charset' and '$data': $@";
 233           return $data;
 234      }
 235      return $result;
 236 }
 237
 238
 239 =head2 decode_rfc1522
 240
 241     decode_rfc1522('=?iso-8859-1?Q?D=F6n_Armstr=F3ng?= <don@donarmstrong.com>')
 242
 243 Turn RFC-1522 names into the UTF-8 equivalent.
 244
 245 =cut
 246
 247 BEGIN {
 248     # Set up the default RFC1522 decoder, which turns all charsets that
 249     # are supported into the appropriate UTF-8 charset.
 250     MIME::WordDecoder->default(new MIME::WordDecoder(
 251         ['*' => \&convert_to_utf8,
 252         ]));
 253 }
 254
 255 sub decode_rfc1522 {
 256     my ($string) = @_;
 257
 258     # this is craptacular, but leading space is hacked off by unmime.
 259     # Save it.
 260     my $leading_space = '';
 261     $leading_space = $1 if $string =~ s/^(\s+)//;
 262     # unmime calls the default MIME::WordDecoder handler set up at
 263     # initialization time.
 264     return $leading_space . MIME::WordDecoder::unmime($string);
 265 }
 266
 267 =head2 encode_rfc1522
 268
 269      encode_rfc1522('Dön Armströng <don@donarmstrong.com>')
 270
 271 Encodes headers according to the RFC1522 standard by calling
 272 MIME::Words::encode_mimeword on distinct words as appropriate.
 273
 274 =cut
 275
 276 # We cannot use MIME::Words::encode_mimewords because that function
 277 # does not handle spaces properly at all.
 278
 279 sub encode_rfc1522 {
 280      my ($rawstr) = @_;
 281
 282      # handle being passed undef properly
 283      return undef if not defined $rawstr;
 284      # We process words in reverse so we can preserve spacing between
 285      # encoded words. This regex splits on word|nonword boundaries and
 286      # nonword|nonword boundaries.
 287      my @words = reverse split /(?:(?<=[\s\n])|(?=[\s\n]))/m, $rawstr;
 288
 289      my $previous_word_encoded = 0;
 290      my $string = '';
 291      for my $word (@words) {
 292           if ($word !~ m#[\x00-\x1F\x7F-\xFF]#o and $word ne ' ') {
 293                $string = $word.$string;
 294                $previous_word_encoded=0;
 295           }
 296           elsif ($word =~ /^[\s\n]$/) {
 297                $string = $word.$string;
 298                $previous_word_encoded = 0 if $word eq "\n";
 299           }
 300           else {
 301                my $encoded = MIME::Words::encode_mimeword($word, 'q', 'UTF-8');
 302                # RFC 1522 mandates that segments be at most 76 characters
 303                # long. If that's the case, we split the word up into 10
 304                # character pieces and encode it. We must use the Encode
 305                # magic here to avoid breaking on bit boundaries here.
 306                if (length $encoded > 75) {
 307                     # Turn utf8 into the internal perl representation
 308                     # so . is a character, not a byte.
 309                     my $tempstr = decode_utf8($word,Encode::FB_DEFAULT);
 310                     my @encoded;
 311                     # Strip it into 10 character long segments, and encode
 312                     # the segments
 313                     # XXX It's possible that these segments are > 76 characters
 314                     while ($tempstr =~ s/(.{1,10})$//) {
 315                          # turn the character back into the utf8 representation.
 316                          my $tempword = encode_utf8($1);
 317                          # It may actually be better to eventually use
 318                          # the base64 encoding here, but I'm not sure
 319                          # if that's as widely supported as quoted
 320                          # printable.
 321                          unshift @encoded, MIME::Words::encode_mimeword($tempword,'q','UTF-8');
 322                     }
 323                     $encoded = join(" ",@encoded);
 324                     # If the previous word was encoded, we must
 325                     # include a trailing _ that gets encoded as a
 326                     # space.
 327                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 328                     $string = $encoded.$string;
 329                }
 330                else {
 331                     # If the previous word was encoded, we must
 332                     # include a trailing _ that gets encoded as a
 333                     # space.
 334                     $encoded =~ s/\?\=$/_\?\=/ if $previous_word_encoded;
 335                     $string = $encoded.$string;
 336                }
 337                $previous_word_encoded = 1;
 338           }
 339      }
 340      return $string;
 341 }
 342
 343 1;