2 # convert_to_maildir converts mboxes to maildir, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2011 by Don Armstrong <don@donarmstrong.com>.
6 # $Id: perl_script 1825 2011-01-02 01:53:43Z don $
17 convert_to_maildir mailbox maildir - convert a mailbox to a maildir
21 convert_to_maildir [options] mailbox maildir
22 convert_to_maildir -m maildir mailbox [mailbox2..]
26 --maildir,-m maildir destination
27 --debug, -d debugging level (Default 0)
28 --help, -h display this help
29 --man, -m display manual
37 Maildir destination; useful if converting multiple mailboxes
41 Debug verbosity. (Default 0)
45 Display brief usage information.
65 my %options = (debug => 0,
72 'debug|d+','help|h|?','man|m');
74 pod2usage() if $options{help};
75 pod2usage({verbose=>2}) if $options{man};
77 $DEBUG = $options{debug};
80 if (@ARGV != 2 and not defined $options{maildir}) {
81 push @USAGE_ERRORS,"You must either give one mailbox and one maildir, or use the -m option";
84 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
86 if (not defined $options{maildir}) {
87 $options{maildir} = pop @ARGV;
91 maildirmake($options{maildir});
92 for my $mailbox (@ARGV) {
93 convert($mailbox,$options{maildir});
96 # code below comes from mb2md
98 # mb2md-3.20.pl Converts Mbox mailboxes to Maildir format.
102 # currently maintained by:
103 # Juri Haberland <juri@koschikode.com>
104 # initially written by:
107 # This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ .
108 # For a changelog see http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt
110 # The maildirmake function
111 # ------------------------
113 # It does the same thing as the maildirmake binary that
114 # comes with the courier-imap distribution
119 -d $_ or mkdir $_,0700 or die("Fatal: Directory $_ doesn't exist and can't be created.\n");
120 -d "$_/tmp" or mkdir("$_/tmp",0700) or die("Fatal: Unable to make $_/tmp/ subdirectory.\n");
121 -d "$_/new" or mkdir("$_/new",0700) or die("Fatal: Unable to make $_/new/ subdirectory.\n");
122 -d "$_/cur" or mkdir("$_/cur",0700) or die("Fatal: Unable to make $_/cur/ subdirectory.\n");
126 # The convert function
127 # ---------------------
129 # This function does the down and dirty work of
130 # actually converting the mbox to a maildir
134 # get the source and destination as arguments
135 my ($mbox, $maildir) = @_;
137 printf("Source Mbox is $mbox\n");
138 printf("Target Maildir is $maildir \n") ;
140 # create the directories for the new maildir
142 # if it is the root maildir (ie. converting the inbox)
143 # these already exist but that's not a big issue
145 &maildirmake($maildir);
147 # Change to the target mailbox directory.
151 # Converts a Mbox to multiple files
153 # This is adapted from mbox2maildir.
155 # Open the Mbox mailbox file.
158 if (sysopen(MBOX, "$mbox", O_RDONLY))
160 #printf("Converting Mbox $mbox . . . \n");
164 die("Fatal: unable to open input mailbox file: $mbox ! \n");
167 # This loop scans the input mailbox for
168 # a line starting with "From ". The
169 # "^" before it is pattern-matching
170 # lingo for it being at the start of a
173 # Each email in Mbox mailbox starts
174 # with such a line, which is why any
175 # such line in the body of the email
176 # has to have a ">" put in front of it.
178 # This is not required in a Maildir
179 # mailbox, and some majik below
180 # finds any such quoted "> From"s and
181 # gets rid of the "> " quote.
183 # Each email is put in a file
184 # in the cur/ subdirectory with a
187 # nnnnnnnnn.cccc.mbox:2,XXXX
190 # "nnnnnnnnn" is the Unix time since
191 # 1970 when this script started
192 # running, incremented by 1 for
193 # every email. This is to ensure
194 # unique names for each message
197 # ".cccc" is the message count of
198 # messages from this mbox.
200 # ".mbox" is just to indicate that
201 # this message was converted from
204 # ":2," is the start of potentially
205 # multiple IMAP flag characters
206 # "XXXX", but may be followed by
209 # This is sort-of compliant with
210 # the Maildir naming conventions
213 # http://www.qmail.org/man/man5/maildir.html
215 # This approach does not involve the
216 # process ID or the hostname, but it is
217 # probably good enough.
219 # When the IMAP server looks at this
220 # mailbox, it will move the files to
221 # the cur/ directory and change their
222 # names as it pleases. In the case
223 # of Courier IMAP, the names will
226 # 995096541.25351.mbox:2,S
228 # with 25351 being Courier IMAP's
229 # process ID. The :2, is the start
230 # of the flags, and the "S" means
231 # that this one has been seen by
232 # the user. (But is this the same
233 # meaning as the user actually
234 # having opened the message to see
235 # its contents, rather than just the
236 # IMAP server having been asked to
237 # list the message's Subject etc.
238 # so the client could list it in the
241 # This contrasts with a message
242 # created by Courier IMAP, say with
243 # a message copy, which is like:
245 # 995096541.25351.zair,S=14285:2,S
247 # where ",S=14285" is the size of the
250 # Courier Maildrop's names are similar
251 # but lack the ":2,XXXX" flags . . .
252 # except for my modified Maildrop
253 # which can deliver them with a
254 # ":2,T" - flagged for deletion.
256 # I have extended the logic of the
257 # per-message inner loop to stop
258 # saving a file for a message with:
260 # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA
262 # This is the dummy message, always
263 # at the start of an Mbox format
264 # mailbox file - and is put there
265 # by UW IMAPD. Since quite a few
266 # people will use this for
267 # converting from a UW system,
268 # I figure it is worth it.
270 # I will not save any such message
271 # file for the dummy message.
276 # We want to read the entire Mbox file, whilst
277 # going through a loop for each message we find.
279 # We want to read all the headers of the message,
280 # starting with the "From " line. For that "From "
281 # line we want to get a date.
283 # For all other header lines, we want to store them
284 # in $headers whilst parsing them to find:
286 # 1 - Any flags in the "Status: " or "X-Status: " or
287 # "X-Mozilla-Status: " lines.
289 # 2 - A subject line indicating this is the dummy message
290 # at the start (typically, but not necessarily) of
293 # Once we reach the end of the headers, we will crunch any
294 # flags we found to create a file name. Then, unless this is
295 # the dummy message we create that file and write all the
298 # Then we continue reading the Mbox, converting ">From " to
299 # "From " and writing it to the file, until we reach one of:
301 # 1 - Another "From " line (indicating the start of another
306 # 2 - The end of the Mbox.
308 # In the former case, which we detect at the start of the loop
309 # we need to close the file and touch it to alter its date-time.
311 # In the latter case, we also need to close the file and touch
312 # it to alter its date-time - but this is beyond the end of the
319 my $messagecount = 0;
321 # For generating unique filenames for
322 # each message. Initialise it here with
323 # numeric time in seconds since 1970.
326 # Name of message file to delete if we found that
327 # it was created by reading the Mbox dummy message.
329 my $deletedummy = '';
331 # To store the complete "From (address) (date-time)"
332 # which delineates the start of each message
337 # Set to 1 when we are reading the header lines,
338 # including the "From " line.
340 # 0 means we are reading the message body and looking
341 # for another "From " line.
345 # Variable to hold all headers (apart from
346 # the first line "From ...." which is not
347 # part of the message itself).
350 # Variable to hold the accumulated characters
351 # we find in header lines of the type:
359 # To build the file name for the message in.
363 # The date string from the "From " line of each
364 # message will be written here - and used by
365 # touch to alter the date-time of each message
366 # file. Put non-date text here to make it
367 # spit the dummy if my code fails to find a
368 # date to write into this.
370 my $receivedate = 'Bogus';
372 # The subject of the message
375 my $previous_line_was_empty = 1;
377 # We record the message start line here, for error
381 # If defined, we use this as the number of bytes in the
382 # message body rather than looking for a /^From / line.
385 # A From line can occur either as the first
386 # line of a file, or after an empty line.
387 # Most mail systems will quote all From lines
388 # appearing in the message, but some will only
389 # do it when necessary.
390 # Since we initialise the variable to true,
391 # we don't need to check for beginning of file.
395 # exchange possible Windows EOL (CRLF) with Unix EOL (LF)
399 && $previous_line_was_empty
400 && (!defined $contentlength)
403 # We are reading the "From " line which has an
404 # email address followed by a receive date.
405 # Turn on the $inheaders flag until we reach
406 # the end of the headers.
410 # record the message start line
414 # If this is not the first run through the loop
415 # then this means we have already been working
418 if ($messagecount > 0)
420 # If so, then close that message file and then
421 # use utime to change its date-time.
423 # Note this code should be duplicated to do
424 # the same thing at the end of the while loop
425 # since we must close and touch the final message
426 # file we were writing when we hit the end of the
430 if ($messagefn ne '') {
431 my $t = str2time($receivedate);
432 utime $t, $t, $messagefn if defined $t;
436 # Because we opened the Mbox file without any
437 # variable, I think this means that we have its
438 # current line in Perl's default variable "$_".
439 # So all sorts of pattern matching magic works
442 # We are currently reading the first line starting with
443 # "From " which contains the date we want.
445 # This will be of the form:
447 # From dduck@test.org Wed Nov 24 11:05:35 1999
449 # at least with UW-IMAP.
451 # However, I did find a nasty exception to this in my
452 # tests, of the form:
454 # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com
456 # This makes it trickier to get rid of the email address,
457 # but I did find a way. I can't rule out that there would
458 # be some address like this with an "@" in the quoted
461 # Unfortunately, testing with an old Inbox Mbox file,
462 # I also found an instance where the email address
463 # had no @ sign at all. It was just an email
464 # account name, with no host.
466 # I could search for the day of the week. If I skipped
467 # at least one word of non-whitespace (1 or more contiguous
468 # non-whitespace characters) then searched for a day of
469 # the week, then I should be able to avoid almost
470 # every instance of a day of the week appearing in
473 # Do I need a failsafe arrangement to provide some
474 # other date to touch if I don't get what seems like
475 # a date in my resulting string? For now, no.
477 # I will take one approach if there is an @ in the
478 # "From " line and another (just skip the first word
479 # after "From ") if there is no @ in the line.
481 # If I knew more about Perl I would probably do it in
482 # a more elegant way.
484 # Copy the current line into $fromline.
488 # Now get rid of the "From ". " =~ s" means substitute.
489 # Find the word "From " at the start of the line and
490 # replace it with nothing. The nothing is what is
491 # between the second and third slash.
493 $fromline =~ s/^From // ;
496 # Likewise get rid of the email address.
497 # This first section is if we determine there is one
498 # (or more . . . ) "@" characters in the line, which
499 # would normally be the case.
501 if ($fromline =~ m/@/)
503 # The line has at least one "@" in it, so we assume
504 # this is in the middle of an email address.
506 # If the email address had no spaces, then we could
507 # get rid of the whole thing by searching for any number
508 # of non-whitespace characters (\S) contiguously, and
509 # then I think a space. Substitute nothing for this.
511 # $fromline =~ s/(\S)+ // ;
513 # But we need something to match any number of non-@
514 # characters, then the "@" and then all the non-whitespace
515 # characters from there (which takes us to the end of
516 # "test.org") and then the space following that.
518 # A tutorial on regular expressions is:
520 # http://www.perldoc.com/perl5.6.1/pod/perlretut.html
522 # Get rid of all non-@ characters up to the first "@":
524 $fromline =~ s/[^@]+//;
527 # Get rid of the "@".
531 # If there was an "@" in the line, then we have now
532 # removed the first one (let's hope there aren't more!)
533 # and everything which preceded it.
535 # we now remove either something like
536 # '(foo bar)'. eg. '(no mail address)',
537 # or everything after the '@' up to the trailing
540 # FIXME: all those regexps should be combined into a single one
542 $fromline =~ s/(\((\S*| )+\)|\S+) *//;
546 # Stash the date-time for later use. We will use it
547 # to touch the file after we have closed it.
549 $receivedate = $fromline;
553 # print "$receivedate is the receivedate of message $messagecount.\n";
554 # $receivedate = "Wed Nov 24 11:05:35 1999";
556 # To look at the exact date-time of files:
558 # ls -lFa --full-time
560 # End of handling the "From " line.
564 # Now process header lines which are not the "From " line.
566 if ( ($inheaders eq 1)
570 # Now we are reading the header lines after the "From " line.
571 # Keep looking for the blank line which indicates the end of the
575 # ".=" means append the current line to the $headers
578 # For some reason, I was getting two blank lines
579 # at the end of the headers, rather than one,
580 # so I decided not to read in the blank line
581 # which terminates the headers.
583 # Delete the "unless ($_ eq "\n")" to get rid
586 $headers .= $_ unless ($_ eq "\n");
588 # Now scan the line for various status flags
589 # and to find the Subject line.
591 $flags .= $1 if /^Status: ([A-Z]+)/;
592 $flags .= $1 if /^X-Status: ([A-Z]+)/;
593 if (/^X-Mozilla-Status: ([0-9a-f]{4})/i)
595 $flags .= 'R' if (hex($1) & 0x0001);
596 $flags .= 'A' if (hex($1) & 0x0002);
597 $flags .= 'D' if (hex($1) & 0x0008);
599 if(/^X\-Evolution:\s+\w{8}\-(\w{4})/oi)
601 $b = pack("H4", $1); #pack it as 4 digit hex (0x0000)
602 $b = unpack("B32", $b); #unpack into bit string
604 # "usually" only the right most six bits are used
605 # however, I have come across a seventh bit in
606 # about 15 (out of 10,000) messages with this bit
608 # I have not found any documentation in the source.
609 # If you find out what it does, please let me know.
612 # Evolution 1.4 does mark forwarded messages.
613 # The sixth bit is to denote an attachment
615 $flags .= 'A' if($b =~ /[01]{15}1/); #replied
616 $flags .= 'D' if($b =~ /[01]{14}1[01]{1}/); #deleted
617 $flags .= 'T' if($b =~ /[01]{13}1[01]{2}/); #draft
618 $flags .= 'F' if($b =~ /[01]{12}1[01]{3}/); #flagged
619 $flags .= 'R' if($b =~ /[01]{11}1[01]{4}/); #seen/read
621 $subject = $1 if /^Subject: (.*)$/;
622 $contentlength = $1 if /^Content-Length: (\d+)$/;
624 # Now look out for the end of the headers - a blank
625 # line. When we find it, create the file name and
626 # analyse the Subject line.
630 # We are at the end of the headers. Set the
631 # $inheaders flag back to 0.
635 # Include the current newline in the content length
637 ++$contentlength if defined $contentlength;
639 # Create the file name for the current message.
641 # A simple version of this would be:
643 # $messagefn = "cur/$unique.$messagecount.mbox:2,";
645 # This would create names with $messagecount values of
646 # 1, 2, etc. But for neatness when looking at a
647 # directory of such messages, sorted by filename,
648 # I want to have leading zeroes on message count, so
649 # that they would be 000001 etc. This makes them
650 # appear in message order rather than 1 being after
651 # 19 etc. So this is good for up to 999,999 messages
652 # in a mailbox. It is a cosmetic matter for a person
653 # looking into the Maildir directory manually.
654 # To do this, use sprintf instead with "%06d" for
655 # 6 characters of zero-padding:
657 $messagefn = sprintf ("cur/%d.%06d.mbox:2,", $unique, $messagecount) ;
660 # Append flag characters to the end of the
661 # filename, according to flag characters
662 # collected from the message headers
664 $messagefn .= 'F' if $flags =~ /F/; # Flagged.
665 $messagefn .= 'R' if $flags =~ /A/; # Replied to.
666 $messagefn .= 'S' if $flags =~ /R/; # Seen or Read.
667 $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion.
670 # Opens filename $messagefn for output (>) with filehandle OUT.
672 open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn");
674 # Count the messages.
678 # Only for the first message,
679 # check to see if it is a dummy.
680 # Delete the message file we
681 # just created if it was for the
682 # dummy message at the start
685 # Add search terms as required.
686 # The last 2 lines are for rent.
688 # "m" means match the regular expression,
689 # but we can do without it.
691 # Do I need to escape the ' in "DON'T"?
692 # I didn't in the original version.
694 if ( (($messagecount == 1) && defined($subject))
695 && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/)
698 # Stash the file name of the dummy message so we
699 # can delete it later.
701 $deletedummy = "$messagefn";
704 # Print the collected headers to the message file.
706 print OUT "$headers";
709 # Clear $headers and $flags ready for the next message.
714 # End of processing the headers once we found the
715 # blank line which terminated them
718 # End of dealing with the headers.
722 if ( $inheaders eq 0)
725 # We are now processing the message body.
727 # Now we have passed the headers to the
728 # output file, we scan until the while
729 # loop finds another "From " line.
731 # Decrement our content length if we're
732 # using it to find the end of the message
735 if (defined $contentlength) {
737 # Decrement our $contentlength variable
739 $contentlength -= length($_);
741 # The proper end for a message with Content-Length
742 # specified is the $contentlength variable should
743 # be exactly -1 and we should be on a bare
744 # newline. Note that the bare newline is not
745 # printed to the end of the current message as
746 # it's actually a message separator in the mbox
747 # format rather than part of the message. The
748 # next line _should_ be a From_ line, but just in
749 # case the Content-Length header is incorrect
750 # (e.g. a corrupt mailbox), we just continue
751 # putting lines into the current message until we
752 # see the next From_ line.
754 if ($contentlength < 0) {
755 if ($contentlength == -1 && $_ eq "\n") {
756 $contentlength = undef;
759 $contentlength = undef;
764 # We want to copy every part of the message
765 # body to the output file, except for the
766 # quoted ">From " lines, which was the
767 # way the IMAP server encoded body lines
768 # starting with "From ".
770 # Pattern matching Perl majik to
771 # get rid of an Mbox quoted From.
773 # This works on the default variable "$_" which
774 # contains the text from the Mbox mailbox - I
775 # guess this is the case because of our
776 # (open(MBOX ....) line above, which did not
777 # assign this to anything else, so it would go
778 # to the default variable. This enables
779 # inscrutably terse Perlisms to follow.
781 # "s" means "Substitute" and it looks for any
782 # occurrence of ">From" starting at the start
783 # of the line. When it finds this, it replaces
786 # So this finds all instances in the Mbox message
787 # where the original line started with the word
788 # "From" but was converted to ">From" in order to
789 # not be mistaken for the "From ..." line which
790 # is used to demark each message in the Mbox.
791 # This was a destructive conversion because
792 # any message which originally had ">From" at the
793 # start of the line, before being put into the
794 # Mbox, will now have that line without the ">".
798 # Glorious terseness here. Thanks Simon for
801 # "print OUT" means print the default variable to
802 # the file of file handle OUT. This is where
803 # the bulk of the message text is written to
806 print OUT or die("Fatal: unable to write to new message to $messagefn");
809 # End of the if statement dealing with message body.
812 $previous_line_was_empty = ( $_ eq "\n" );
814 # End of while (MBOX) loop.
816 # Close the input file.
820 # Close the output file, and duplicate the code
821 # from the start of the while loop which touches
822 # the date-time of the most recent message file.
825 if ($messagefn ne '') {
826 my $t = str2time($receivedate);
827 utime $t, $t, $messagefn;
830 # After all the messages have been
831 # converted, check to see if the
832 # first one was a dummy.
833 # if so, delete it and make
834 # the message count one less.
836 if ($deletedummy ne "")
838 printf("Dummy mail system first message detected and not saved.\n");
845 printf("$messagecount messages.\n\n");