#! /usr/bin/perl # convert_to_maildir converts mboxes to maildir, and is released # under the terms of the GPL version 2, or any later version, at your # option. See the file README and COPYING for more information. # Copyright 2011 by Don Armstrong . # $Id: perl_script 1825 2011-01-02 01:53:43Z don $ use warnings; use strict; use Getopt::Long; use Pod::Usage; =head1 NAME convert_to_maildir mailbox maildir - convert a mailbox to a maildir =head1 SYNOPSIS convert_to_maildir [options] mailbox maildir convert_to_maildir -m maildir mailbox [mailbox2..] Options: --maildir,-m maildir destination --debug, -d debugging level (Default 0) --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--maildir,-m> Maildir destination; useful if converting multiple mailboxes =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief usage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES =cut use vars qw($DEBUG); use Date::Parse; use IO::Handle; use Fcntl; my %options = (debug => 0, help => 0, man => 0, ); GetOptions(\%options, 'maildir|m=s', 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my @USAGE_ERRORS; if (@ARGV != 2 and not defined $options{maildir}) { push @USAGE_ERRORS,"You must either give one mailbox and one maildir, or use the -m option"; } pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; if (not defined $options{maildir}) { $options{maildir} = pop @ARGV; } maildirmake($options{maildir}); for my $mailbox (@ARGV) { convert($mailbox,$options{maildir}); } # code below comes from mb2md # # mb2md-3.20.pl Converts Mbox mailboxes to Maildir format. # # Public domain. # # currently maintained by: # Juri Haberland # initially wrote by: # Robin Whittle # # This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ . # For a changelog see http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt # The maildirmake function # ------------------------ # # It does the same thing that the maildirmake binary that # comes with courier-imap distribution # sub maildirmake { foreach(@_) { -d $_ or mkdir $_,0700 or die("Fatal: Directory $_ doesn't exist and can't be created.\n"); -d "$_/tmp" or mkdir("$_/tmp",0700) or die("Fatal: Unable to make $_/tmp/ subdirectory.\n"); -d "$_/new" or mkdir("$_/new",0700) or die("Fatal: Unable to make $_/new/ subdirectory.\n"); -d "$_/cur" or mkdir("$_/cur",0700) or die("Fatal: Unable to make $_/cur/ subdirectory.\n"); } } # The convert function # --------------------- # # This function does the down and dirty work of # actually converting the mbox to a maildir # sub convert { # get the source and destination as arguments my ($mbox, $maildir) = @_; printf("Source Mbox is $mbox\n"); printf("Target Maildir is $maildir \n") ; # create the directories for the new maildir # # if it is the root maildir (ie. converting the inbox) # these already exist but thats not a big issue &maildirmake($maildir); # Change to the target mailbox directory. chdir "$maildir" ; # Converts a Mbox to multiple files # in a Maildir. # This is adapted from mbox2maildir. # # Open the Mbox mailbox file. if (sysopen(MBOX, "$mbox", O_RDONLY)) { #printf("Converting Mbox $mbox . . . \n"); } else { die("Fatal: unable to open input mailbox file: $mbox ! \n"); } # This loop scans the input mailbox for # a line starting with "From ". The # "^" before it is pattern-matching # lingo for it being at the start of a # line. # # Each email in Mbox mailbox starts # with such a line, which is why any # such line in the body of the email # has to have a ">" put in front of it. # # This is not required in a Maildir # mailbox, and some majik below # finds any such quoted "> From"s and # gets rid of the "> " quote. # # Each email is put in a file # in the cur/ subdirectory with a # name of the form: # # nnnnnnnnn.cccc.mbox:2,XXXX # # where: # "nnnnnnnnn" is the Unix time since # 1970 when this script started # running, incremented by 1 for # every email. This is to ensure # unique names for each message # file. # # ".cccc" is the message count of # messages from this mbox. # # ".mbox" is just to indicate that # this message was converted from # an Mbox mailbox. # # ":2," is the start of potentially # multiple IMAP flag characters # "XXXX", but may be followed by # nothing. # # This is sort-of compliant with # the Maildir naming conventions # specified at: # # http://www.qmail.org/man/man5/maildir.html # # This approach does not involve the # process ID or the hostname, but it is # probably good enough. # # When the IMAP server looks at this # mailbox, it will move the files to # the cur/ directory and change their # names as it pleases. In the case # of Courier IMAP, the names will # become like: # # 995096541.25351.mbox:2,S # # with 25351 being Courier IMAP's # process ID. The :2, is the start # of the flags, and the "S" means # that this one has been seen by # the user. (But is this the same # meaning as the user actually # having opened the message to see # its contents, rather than just the # IMAP server having been asked to # list the message's Subject etc. # so the client could list it in the # visible Inbox?) # # This contrasts with a message # created by Courier IMAP, say with # a message copy, which is like: # # 995096541.25351.zair,S=14285:2,S # # where ",S=14285" is the size of the # message in bytes. # # Courier Maildrop's names are similar # but lack the ":2,XXXX" flags . . . # except for my modified Maildrop # which can deliver them with a # ":2,T" - flagged for deletion. # # I have extended the logic of the # per-message inner loop to stop # saving a file for a message with: # # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA # # This is the dummy message, always # at the start of an Mbox format # mailbox file - and is put there # by UW IMAPD. Since quite a few # people will use this for # converting from a UW system, # I figure it is worth it. # # I will not save any such message # file for the dummy message. # # Plan # ---- # # We want to read the entire Mbox file, whilst # going through a loop for each message we find. # # We want to read all the headers of the message, # starting with the "From " line. For that "From " # line we want to get a date. # # For all other header lines, we want to store them # in $headers whilst parsing them to find: # # 1 - Any flags in the "Status: " or "X-Status: " or # "X-Mozilla-Status: " lines. # # 2 - A subject line indicating this is the dummy message # at the start (typically, but not necessarily) of # the Mbox. # # Once we reach the end of the headers, we will crunch any # flags we found to create a file name. Then, unless this is # the dummy message we create that file and write all the # headers to it. # # Then we continue reading the Mbox, converting ">From " to # "From " and writing it to the file, until we reach one of: # # 1 - Another "From " line (indicating the start of another # message). # # or # # 2 - The end of the Mbox. # # In the former case, which we detect at the start of the loop # we need to close the file and touch it to alter its date-time. # # In the later case, we also need to close the file and touch # it to alter its date-time - but this is beyond the end of the # loop. # Variables # --------- my $messagecount = 0; # For generating unique filenames for # each message. Initialise it here with # numeric time in seconds since 1970. my $unique = time; # Name of message file to delete if we found that # it was created by reading the Mbox dummy message. my $deletedummy = ''; # To store the complete "From (address) (date-time) # which delineates the start of each message # in the Mbox my $fromline = ''; # Set to 1 when we are reading the header lines, # including the "From " line. # # 0 means we are reading the message body and looking # for another "From " line. my $inheaders = 0; # Variable to hold all headers (apart from # the first line "From ...." which is not # part of the message itself. my $headers = ''; # Variable to hold the accumulated characters # we find in header lines of the type: # # Status: # X-Status: # X-Mozilla-Status: # X-Evolution: my $flags = ''; # To build the file name for the message in. my $messagefn = ''; # The date string from the "From " line of each # message will be written here - and used by # touch to alter the date-time of each message # file. Put non-date text here to make it # spit the dummy if my code fails to find a # date to write into this. my $receivedate = 'Bogus'; # The subject of the message my $subject = ''; my $previous_line_was_empty = 1; # We record the message start line here, for error # reporting. my $startline; # If defined, we use this as the number of bytes in the # message body rather than looking for a /^From / line. my $contentlength; # A From lines can either occur as the first # line of a file, or after an empty line. # Most mail systems will quote all From lines # appearing in the message, but some will only # do it when necessary. # Since we initialise the variable to true, # we don't need to check for beginning of file. while() { # exchange possible Windows EOL (CRLF) with Unix EOL (LF) $_ =~ s/\r\n$/\n/; if ( /^From / && $previous_line_was_empty && (!defined $contentlength) ) { # We are reading the "From " line which has an # email address followed by a receive date. # Turn on the $inheaders flag until we reach # the end of the headers. $inheaders = 1; # record the message start line $startline = $.; # If this is not the first run through the loop # then this means we have already been working # on a message. if ($messagecount > 0) { # If so, then close that message file and then # use utime to change its date-time. # # Note this code should be duplicated to do # the same thing at the end of the while loop # since we must close and touch the final message # file we were writing when we hit the end of the # Mbox file. close (OUT); if ($messagefn ne '') { my $t = str2time($receivedate); utime $t, $t, $messagefn if defined $t; } } # Because we opened the Mbox file without any # variable, I think this means that we have its # current line in Perl's default variable "$_". # So all sorts of pattern matching magic works # directly on it. # We are currently reading the first line starting with # "From " which contains the date we want. # # This will be of the form: # # From dduck@test.org Wed Nov 24 11:05:35 1999 # # at least with UW-IMAP. # # However, I did find a nasty exception to this in my # tests, of the form: # # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com # # This makes it trickier to get rid of the email address, # but I did find a way. I can't rule out that there would # be some address like this with an "@" in the quoted # portion too. # # Unfortunately, testing with an old Inbox Mbox file, # I also found an instance where the email address # had no @ sign at all. It was just an email # account name, with no host. # # I could search for the day of the week. If I skipped # at least one word of non-whitespace (1 or more contiguous # non-whitespace characters) then searched for a day of # the week, then I should be able to avoid almost # every instance of a day of the week appearing in # the email address. # # Do I need a failsafe arrangement to provide some # other date to touch if I don't get what seems like # a date in my resulting string? For now, no. # # I will take one approach if there is an @ in the # "From " line and another (just skip the first word # after "From ") if there is no @ in the line. # # If I knew more about Perl I would probably do it in # a more elegant way. # Copy the current line into $fromline. $fromline = $_; # Now get rid of the "From ". " =~ s" means substitute. # Find the word "From " at the start of the line and # replace it with nothing. The nothing is what is # between the second and third slash. $fromline =~ s/^From // ; # Likewise get rid of the email address. # This first section is if we determine there is one # (or more . . . ) "@" characters in the line, which # would normally be the case. if ($fromline =~ m/@/) { # The line has at least one "@" in it, so we assume # this is in the middle of an email address. # # If the email address had no spaces, then we could # get rid of the whole thing by searching for any number # of non-whitespace characters (\S) contiguously, and # then I think a space. Subsitute nothing for this. # # $fromline =~ s/(\S)+ // ; # # But we need something to match any number of non-@ # characters, then the "@" and then all the non-whitespace # characters from there (which takes us to the end of # "test.org") and then the space following that. # # A tutorial on regular expressions is: # # http://www.perldoc.com/perl5.6.1/pod/perlretut.html # # Get rid of all non-@ characters up to the first "@": $fromline =~ s/[^@]+//; # Get rid of the "@". $fromline =~ s/@//; } # If there was an "@" in the line, then we have now # removed the first one (lets hope there aren't more!) # and everything which preceded it. # # we now remove either something like # '(foo bar)'. eg. '(no mail address)', # or everything after the '@' up to the trailing # timezone # # FIXME: all those regexp should be combined to just one single one $fromline =~ s/(\((\S*| )+\)|\S+) *//; chomp $fromline; # Stash the date-time for later use. We will use it # to touch the file after we have closed it. $receivedate = $fromline; # Debugging lines: # # print "$receivedate is the receivedate of message $messagecount.\n"; # $receivedate = "Wed Nov 24 11:05:35 1999"; # # To look at the exact date-time of files: # # ls -lFa --full-time # # End of handling the "From " line. } # Now process header lines which are not the "From " line. if ( ($inheaders eq 1) && (! /^From /) ) { # Now we are reading the header lines after the "From " line. # Keep looking for the blank line which indicates the end of the # headers. # ".=" means append the current line to the $headers # variable. # # For some reason, I was getting two blank lines # at the end of the headers, rather than one, # so I decided not to read in the blank line # which terminates the headers. # # Delete the "unless ($_ eq "\n")" to get rid # of this kludge. $headers .= $_ unless ($_ eq "\n"); # Now scan the line for various status flags # and to fine the Subject line. $flags .= $1 if /^Status: ([A-Z]+)/; $flags .= $1 if /^X-Status: ([A-Z]+)/; if (/^X-Mozilla-Status: ([0-9a-f]{4})/i) { $flags .= 'R' if (hex($1) & 0x0001); $flags .= 'A' if (hex($1) & 0x0002); $flags .= 'D' if (hex($1) & 0x0008); } if(/^X\-Evolution:\s+\w{8}\-(\w{4})/oi) { $b = pack("H4", $1); #pack it as 4 digit hex (0x0000) $b = unpack("B32", $b); #unpack into bit string # "usually" only the right most six bits are used # however, I have come across a seventh bit in # about 15 (out of 10,000) messages with this bit # activated. # I have not found any documentation in the source. # If you find out what it does, please let me know. # Notes: # Evolution 1.4 does mark forwarded messages. # The sixth bit is to denote an attachment $flags .= 'A' if($b =~ /[01]{15}1/); #replied $flags .= 'D' if($b =~ /[01]{14}1[01]{1}/); #deleted $flags .= 'T' if($b =~ /[01]{13}1[01]{2}/); #draft $flags .= 'F' if($b =~ /[01]{12}1[01]{3}/); #flagged $flags .= 'R' if($b =~ /[01]{11}1[01]{4}/); #seen/read } $subject = $1 if /^Subject: (.*)$/; $contentlength = $1 if /^Content-Length: (\d+)$/; # Now look out for the end of the headers - a blank # line. When we find it, create the file name and # analyse the Subject line. if ($_ eq "\n") { # We are at the end of the headers. Set the # $inheaders flag back to 0. $inheaders = 0; # Include the current newline in the content length ++$contentlength if defined $contentlength; # Create the file name for the current message. # # A simple version of this would be: # # $messagefn = "cur/$unique.$messagecount.mbox:2,"; # # This would create names with $messagecount values of # 1, 2, etc. But for neatness when looking at a # directory of such messages, sorted by filename, # I want to have leading zeroes on message count, so # that they would be 000001 etc. This makes them # appear in message order rather than 1 being after # 19 etc. So this is good for up to 999,999 messages # in a mailbox. It is a cosmetic matter for a person # looking into the Maildir directory manually. # To do this, use sprintf instead with "%06d" for # 6 characters of zero-padding: $messagefn = sprintf ("cur/%d.%06d.mbox:2,", $unique, $messagecount) ; # Append flag characters to the end of the # filename, according to flag characters # collected from the message headers $messagefn .= 'F' if $flags =~ /F/; # Flagged. $messagefn .= 'R' if $flags =~ /A/; # Replied to. $messagefn .= 'S' if $flags =~ /R/; # Seen or Read. $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion. # Opens filename $messagefn for output (>) with filehandle OUT. open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn"); # Count the messages. $messagecount++; # Only for the first message, # check to see if it is a dummy. # Delete the message file we # just created if it was for the # dummy message at the start # of the Mbox. # # Add search terms as required. # The last 2 lines are for rent. # # "m" means match the regular expression, # but we can do without it. # # Do I need to escape the ' in "DON'T"? # I didn't in the original version. if ( (($messagecount == 1) && defined($subject)) && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/) ) { # Stash the file name of the dummy message so we # can delete it later. $deletedummy = "$messagefn"; } # Print the collected headers to the message file. print OUT "$headers"; # Clear $headers and $flags ready for the next message. $headers = ''; $flags = ''; # End of processing the headers once we found the # blank line which terminated them } # End of dealing with the headers. } if ( $inheaders eq 0) { # We are now processing the message body. # # Now we have passed the headers to the # output file, we scan until the while # loop finds another "From " line. # Decrement our content length if we're # using it to find the end of the message # body if (defined $contentlength) { # Decrement our $contentlength variable $contentlength -= length($_); # The proper end for a message with Content-Length # specified is the $contentlength variable should # be exactly -1 and we should be on a bare # newline. Note that the bare newline is not # printed to the end of the current message as # it's actually a message separator in the mbox # format rather than part of the message. The # next line _should_ be a From_ line, but just in # case the Content-Length header is incorrect # (e.g. a corrupt mailbox), we just continue # putting lines into the current message until we # see the next From_ line. if ($contentlength < 0) { if ($contentlength == -1 && $_ eq "\n") { $contentlength = undef; next; } $contentlength = undef; } } # # We want to copy every part of the message # body to the output file, except for the # quoted ">From " lines, which was the # way the IMAP server encoded body lines # starting with "From ". # # Pattern matching Perl majik to # get rid of an Mbox quoted From. # # This works on the default variable "$_" which # contains the text from the Mbox mailbox - I # guess this is the case because of our # (open(MBOX ....) line above, which did not # assign this to anything else, so it would go # to the default variable. This enables # inscrutably terse Perlisms to follow. # # "s" means "Subsitute" and it looks for any # occurrence of ">From" starting at the start # of the line. When it finds this, it replaces # it with "From". # # So this finds all instances in the Mbox message # where the original line started with the word # "From" but was converted to ">From" in order to # not be mistaken for the "From ..." line which # is used to demark each message in the Mbox. # This was was a destructive conversion because # any message which originally had ">From" at the # start of the line, before being put into the # Mbox, will now have that line without the ">". s/^>From /From /; # Glorious tersness here. Thanks Simon for # explaining this. # # "print OUT" means print the default variable to # the file of file handle OUT. This is where # the bulk of the message text is written to # the output file. print OUT or die("Fatal: unable to write to new message to $messagefn"); # End of the if statement dealing with message body. } $previous_line_was_empty = ( $_ eq "\n" ); # End of while (MBOX) loop. } # Close the input file. close(MBOX); # Close the output file, and duplicate the code # from the start of the while loop which touches # the date-time of the most recent message file. close(OUT); if ($messagefn ne '') { my $t = str2time($receivedate); utime $t, $t, $messagefn; } # After all the messages have been # converted, check to see if the # first one was a dummy. # if so, delete it and make # the message count one less. if ($deletedummy ne "") { printf("Dummy mail system first message detected and not saved.\n"); unlink $deletedummy; $messagecount--; } printf("$messagecount messages.\n\n"); }