From: Don Armstrong Date: Sat, 22 Dec 2012 00:11:21 +0000 (-0800) Subject: add convert_to_maildir X-Git-Url: https://git.donarmstrong.com/?p=bin.git;a=commitdiff_plain;h=9b1126827881a7a752b58a7b497e8291565730be add convert_to_maildir --- diff --git a/convert_to_maildir b/convert_to_maildir new file mode 100755 index 0000000..62e57a8 --- /dev/null +++ b/convert_to_maildir @@ -0,0 +1,846 @@ +#! /usr/bin/perl +# convert_to_maildir converts mboxes to maildir, and is released +# under the terms of the GPL version 2, or any later version, at your +# option. See the file README and COPYING for more information. +# Copyright 2011 by Don Armstrong . +# $Id: perl_script 1825 2011-01-02 01:53:43Z don $ + + +use warnings; +use strict; + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + +convert_to_maildir mailbox maildir - convert a mailbox to a maildir + +=head1 SYNOPSIS + +convert_to_maildir [options] mailbox maildir +convert_to_maildir -m maildir mailbox [mailbox2..] + + + Options: + --maildir,-m maildir destination + --debug, -d debugging level (Default 0) + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--maildir,-m> + +Maildir destination; useful if converting multiple mailboxes + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + +=cut + + +use vars qw($DEBUG); +use Date::Parse; +use IO::Handle; +use Fcntl; + + +my %options = (debug => 0, + help => 0, + man => 0, + ); + +GetOptions(\%options, + 'maildir|m=s', + 'debug|d+','help|h|?','man|m'); + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +my @USAGE_ERRORS; +if (@ARGV != 2 and not defined $options{maildir}) { + push @USAGE_ERRORS,"You must either give one mailbox and one maildir, or use the -m option"; +} + +pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; + +if (not defined $options{maildir}) { + $options{maildir} = pop @ARGV; +} + + +maildirmake($options{maildir}); +for my $mailbox (@ARGV) { + convert($mailbox,$options{maildir}); +} + +# code below comes from mb2md +# +# mb2md-3.20.pl Converts Mbox mailboxes to Maildir format. +# +# Public domain. +# +# currently maintained by: +# Juri Haberland +# initially wrote by: +# Robin Whittle +# +# This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ . +# For a changelog see http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt + +# The maildirmake function +# ------------------------ +# +# It does the same thing that the maildirmake binary that +# comes with courier-imap distribution +# +sub maildirmake +{ + foreach(@_) { + -d $_ or mkdir $_,0700 or die("Fatal: Directory $_ doesn't exist and can't be created.\n"); + -d "$_/tmp" or mkdir("$_/tmp",0700) or die("Fatal: Unable to make $_/tmp/ subdirectory.\n"); + -d "$_/new" or mkdir("$_/new",0700) or die("Fatal: Unable to make $_/new/ subdirectory.\n"); + -d "$_/cur" or mkdir("$_/cur",0700) or die("Fatal: Unable to make $_/cur/ subdirectory.\n"); + } +} + +# The convert function +# --------------------- +# +# This function does the down and dirty work of +# actually converting the mbox to a maildir +# +sub convert +{ + # get the source and destination as arguments + my ($mbox, $maildir) = @_; + + printf("Source Mbox is $mbox\n"); + printf("Target Maildir is $maildir \n") ; + + # create the directories for the new maildir + # + # if it is the root maildir (ie. converting the inbox) + # these already exist but thats not a big issue + + &maildirmake($maildir); + + # Change to the target mailbox directory. + + chdir "$maildir" ; + + # Converts a Mbox to multiple files + # in a Maildir. + # This is adapted from mbox2maildir. + # + # Open the Mbox mailbox file. + + + if (sysopen(MBOX, "$mbox", O_RDONLY)) + { + #printf("Converting Mbox $mbox . . . \n"); + } + else + { + die("Fatal: unable to open input mailbox file: $mbox ! \n"); + } + + # This loop scans the input mailbox for + # a line starting with "From ". The + # "^" before it is pattern-matching + # lingo for it being at the start of a + # line. + # + # Each email in Mbox mailbox starts + # with such a line, which is why any + # such line in the body of the email + # has to have a ">" put in front of it. + # + # This is not required in a Maildir + # mailbox, and some majik below + # finds any such quoted "> From"s and + # gets rid of the "> " quote. + # + # Each email is put in a file + # in the cur/ subdirectory with a + # name of the form: + # + # nnnnnnnnn.cccc.mbox:2,XXXX + # + # where: + # "nnnnnnnnn" is the Unix time since + # 1970 when this script started + # running, incremented by 1 for + # every email. This is to ensure + # unique names for each message + # file. + # + # ".cccc" is the message count of + # messages from this mbox. + # + # ".mbox" is just to indicate that + # this message was converted from + # an Mbox mailbox. + # + # ":2," is the start of potentially + # multiple IMAP flag characters + # "XXXX", but may be followed by + # nothing. + # + # This is sort-of compliant with + # the Maildir naming conventions + # specified at: + # + # http://www.qmail.org/man/man5/maildir.html + # + # This approach does not involve the + # process ID or the hostname, but it is + # probably good enough. + # + # When the IMAP server looks at this + # mailbox, it will move the files to + # the cur/ directory and change their + # names as it pleases. In the case + # of Courier IMAP, the names will + # become like: + # + # 995096541.25351.mbox:2,S + # + # with 25351 being Courier IMAP's + # process ID. The :2, is the start + # of the flags, and the "S" means + # that this one has been seen by + # the user. (But is this the same + # meaning as the user actually + # having opened the message to see + # its contents, rather than just the + # IMAP server having been asked to + # list the message's Subject etc. + # so the client could list it in the + # visible Inbox?) + # + # This contrasts with a message + # created by Courier IMAP, say with + # a message copy, which is like: + # + # 995096541.25351.zair,S=14285:2,S + # + # where ",S=14285" is the size of the + # message in bytes. + # + # Courier Maildrop's names are similar + # but lack the ":2,XXXX" flags . . . + # except for my modified Maildrop + # which can deliver them with a + # ":2,T" - flagged for deletion. + # + # I have extended the logic of the + # per-message inner loop to stop + # saving a file for a message with: + # + # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA + # + # This is the dummy message, always + # at the start of an Mbox format + # mailbox file - and is put there + # by UW IMAPD. Since quite a few + # people will use this for + # converting from a UW system, + # I figure it is worth it. + # + # I will not save any such message + # file for the dummy message. + # + # Plan + # ---- + # + # We want to read the entire Mbox file, whilst + # going through a loop for each message we find. + # + # We want to read all the headers of the message, + # starting with the "From " line. For that "From " + # line we want to get a date. + # + # For all other header lines, we want to store them + # in $headers whilst parsing them to find: + # + # 1 - Any flags in the "Status: " or "X-Status: " or + # "X-Mozilla-Status: " lines. + # + # 2 - A subject line indicating this is the dummy message + # at the start (typically, but not necessarily) of + # the Mbox. + # + # Once we reach the end of the headers, we will crunch any + # flags we found to create a file name. Then, unless this is + # the dummy message we create that file and write all the + # headers to it. + # + # Then we continue reading the Mbox, converting ">From " to + # "From " and writing it to the file, until we reach one of: + # + # 1 - Another "From " line (indicating the start of another + # message). + # + # or + # + # 2 - The end of the Mbox. + # + # In the former case, which we detect at the start of the loop + # we need to close the file and touch it to alter its date-time. + # + # In the later case, we also need to close the file and touch + # it to alter its date-time - but this is beyond the end of the + # loop. + + + # Variables + # --------- + + my $messagecount = 0; + + # For generating unique filenames for + # each message. Initialise it here with + # numeric time in seconds since 1970. + my $unique = time; + + # Name of message file to delete if we found that + # it was created by reading the Mbox dummy message. + + my $deletedummy = ''; + + # To store the complete "From (address) (date-time) + # which delineates the start of each message + # in the Mbox + my $fromline = ''; + + + # Set to 1 when we are reading the header lines, + # including the "From " line. + # + # 0 means we are reading the message body and looking + # for another "From " line. + + my $inheaders = 0; + + # Variable to hold all headers (apart from + # the first line "From ...." which is not + # part of the message itself. + my $headers = ''; + + # Variable to hold the accumulated characters + # we find in header lines of the type: + # + # Status: + # X-Status: + # X-Mozilla-Status: + # X-Evolution: + my $flags = ''; + + # To build the file name for the message in. + my $messagefn = ''; + + + # The date string from the "From " line of each + # message will be written here - and used by + # touch to alter the date-time of each message + # file. Put non-date text here to make it + # spit the dummy if my code fails to find a + # date to write into this. + + my $receivedate = 'Bogus'; + + # The subject of the message + my $subject = ''; + + my $previous_line_was_empty = 1; + + # We record the message start line here, for error + # reporting. + my $startline; + + # If defined, we use this as the number of bytes in the + # message body rather than looking for a /^From / line. + my $contentlength; + + # A From lines can either occur as the first + # line of a file, or after an empty line. + # Most mail systems will quote all From lines + # appearing in the message, but some will only + # do it when necessary. + # Since we initialise the variable to true, + # we don't need to check for beginning of file. + + while() + { + # exchange possible Windows EOL (CRLF) with Unix EOL (LF) + $_ =~ s/\r\n$/\n/; + + if ( /^From / + && $previous_line_was_empty + && (!defined $contentlength) + ) + { + # We are reading the "From " line which has an + # email address followed by a receive date. + # Turn on the $inheaders flag until we reach + # the end of the headers. + + $inheaders = 1; + + # record the message start line + + $startline = $.; + + # If this is not the first run through the loop + # then this means we have already been working + # on a message. + + if ($messagecount > 0) + { + # If so, then close that message file and then + # use utime to change its date-time. + # + # Note this code should be duplicated to do + # the same thing at the end of the while loop + # since we must close and touch the final message + # file we were writing when we hit the end of the + # Mbox file. + + close (OUT); + if ($messagefn ne '') { + my $t = str2time($receivedate); + utime $t, $t, $messagefn if defined $t; + } + } + + # Because we opened the Mbox file without any + # variable, I think this means that we have its + # current line in Perl's default variable "$_". + # So all sorts of pattern matching magic works + # directly on it. + + # We are currently reading the first line starting with + # "From " which contains the date we want. + # + # This will be of the form: + # + # From dduck@test.org Wed Nov 24 11:05:35 1999 + # + # at least with UW-IMAP. + # + # However, I did find a nasty exception to this in my + # tests, of the form: + # + # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com + # + # This makes it trickier to get rid of the email address, + # but I did find a way. I can't rule out that there would + # be some address like this with an "@" in the quoted + # portion too. + # + # Unfortunately, testing with an old Inbox Mbox file, + # I also found an instance where the email address + # had no @ sign at all. It was just an email + # account name, with no host. + # + # I could search for the day of the week. If I skipped + # at least one word of non-whitespace (1 or more contiguous + # non-whitespace characters) then searched for a day of + # the week, then I should be able to avoid almost + # every instance of a day of the week appearing in + # the email address. + # + # Do I need a failsafe arrangement to provide some + # other date to touch if I don't get what seems like + # a date in my resulting string? For now, no. + # + # I will take one approach if there is an @ in the + # "From " line and another (just skip the first word + # after "From ") if there is no @ in the line. + # + # If I knew more about Perl I would probably do it in + # a more elegant way. + + # Copy the current line into $fromline. + + $fromline = $_; + + # Now get rid of the "From ". " =~ s" means substitute. + # Find the word "From " at the start of the line and + # replace it with nothing. The nothing is what is + # between the second and third slash. + + $fromline =~ s/^From // ; + + + # Likewise get rid of the email address. + # This first section is if we determine there is one + # (or more . . . ) "@" characters in the line, which + # would normally be the case. + + if ($fromline =~ m/@/) + { + # The line has at least one "@" in it, so we assume + # this is in the middle of an email address. + # + # If the email address had no spaces, then we could + # get rid of the whole thing by searching for any number + # of non-whitespace characters (\S) contiguously, and + # then I think a space. Subsitute nothing for this. + # + # $fromline =~ s/(\S)+ // ; + # + # But we need something to match any number of non-@ + # characters, then the "@" and then all the non-whitespace + # characters from there (which takes us to the end of + # "test.org") and then the space following that. + # + # A tutorial on regular expressions is: + # + # http://www.perldoc.com/perl5.6.1/pod/perlretut.html + # + # Get rid of all non-@ characters up to the first "@": + + $fromline =~ s/[^@]+//; + + + # Get rid of the "@". + + $fromline =~ s/@//; + } + # If there was an "@" in the line, then we have now + # removed the first one (lets hope there aren't more!) + # and everything which preceded it. + # + # we now remove either something like + # '(foo bar)'. eg. '(no mail address)', + # or everything after the '@' up to the trailing + # timezone + # + # FIXME: all those regexp should be combined to just one single one + + $fromline =~ s/(\((\S*| )+\)|\S+) *//; + + chomp $fromline; + + # Stash the date-time for later use. We will use it + # to touch the file after we have closed it. + + $receivedate = $fromline; + + # Debugging lines: + # + # print "$receivedate is the receivedate of message $messagecount.\n"; + # $receivedate = "Wed Nov 24 11:05:35 1999"; + # + # To look at the exact date-time of files: + # + # ls -lFa --full-time + # + # End of handling the "From " line. + } + + + # Now process header lines which are not the "From " line. + + if ( ($inheaders eq 1) + && (! /^From /) + ) + { + # Now we are reading the header lines after the "From " line. + # Keep looking for the blank line which indicates the end of the + # headers. + + + # ".=" means append the current line to the $headers + # variable. + # + # For some reason, I was getting two blank lines + # at the end of the headers, rather than one, + # so I decided not to read in the blank line + # which terminates the headers. + # + # Delete the "unless ($_ eq "\n")" to get rid + # of this kludge. + + $headers .= $_ unless ($_ eq "\n"); + + # Now scan the line for various status flags + # and to fine the Subject line. + + $flags .= $1 if /^Status: ([A-Z]+)/; + $flags .= $1 if /^X-Status: ([A-Z]+)/; + if (/^X-Mozilla-Status: ([0-9a-f]{4})/i) + { + $flags .= 'R' if (hex($1) & 0x0001); + $flags .= 'A' if (hex($1) & 0x0002); + $flags .= 'D' if (hex($1) & 0x0008); + } + if(/^X\-Evolution:\s+\w{8}\-(\w{4})/oi) + { + $b = pack("H4", $1); #pack it as 4 digit hex (0x0000) + $b = unpack("B32", $b); #unpack into bit string + + # "usually" only the right most six bits are used + # however, I have come across a seventh bit in + # about 15 (out of 10,000) messages with this bit + # activated. + # I have not found any documentation in the source. + # If you find out what it does, please let me know. + + # Notes: + # Evolution 1.4 does mark forwarded messages. + # The sixth bit is to denote an attachment + + $flags .= 'A' if($b =~ /[01]{15}1/); #replied + $flags .= 'D' if($b =~ /[01]{14}1[01]{1}/); #deleted + $flags .= 'T' if($b =~ /[01]{13}1[01]{2}/); #draft + $flags .= 'F' if($b =~ /[01]{12}1[01]{3}/); #flagged + $flags .= 'R' if($b =~ /[01]{11}1[01]{4}/); #seen/read + } + $subject = $1 if /^Subject: (.*)$/; + $contentlength = $1 if /^Content-Length: (\d+)$/; + + # Now look out for the end of the headers - a blank + # line. When we find it, create the file name and + # analyse the Subject line. + + if ($_ eq "\n") + { + # We are at the end of the headers. Set the + # $inheaders flag back to 0. + + $inheaders = 0; + + # Include the current newline in the content length + + ++$contentlength if defined $contentlength; + + # Create the file name for the current message. + # + # A simple version of this would be: + # + # $messagefn = "cur/$unique.$messagecount.mbox:2,"; + # + # This would create names with $messagecount values of + # 1, 2, etc. But for neatness when looking at a + # directory of such messages, sorted by filename, + # I want to have leading zeroes on message count, so + # that they would be 000001 etc. This makes them + # appear in message order rather than 1 being after + # 19 etc. So this is good for up to 999,999 messages + # in a mailbox. It is a cosmetic matter for a person + # looking into the Maildir directory manually. + # To do this, use sprintf instead with "%06d" for + # 6 characters of zero-padding: + + $messagefn = sprintf ("cur/%d.%06d.mbox:2,", $unique, $messagecount) ; + + + # Append flag characters to the end of the + # filename, according to flag characters + # collected from the message headers + + $messagefn .= 'F' if $flags =~ /F/; # Flagged. + $messagefn .= 'R' if $flags =~ /A/; # Replied to. + $messagefn .= 'S' if $flags =~ /R/; # Seen or Read. + $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion. + + + # Opens filename $messagefn for output (>) with filehandle OUT. + + open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn"); + + # Count the messages. + + $messagecount++; + + # Only for the first message, + # check to see if it is a dummy. + # Delete the message file we + # just created if it was for the + # dummy message at the start + # of the Mbox. + # + # Add search terms as required. + # The last 2 lines are for rent. + # + # "m" means match the regular expression, + # but we can do without it. + # + # Do I need to escape the ' in "DON'T"? + # I didn't in the original version. + + if ( (($messagecount == 1) && defined($subject)) + && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/) + ) + { + # Stash the file name of the dummy message so we + # can delete it later. + + $deletedummy = "$messagefn"; + } + + # Print the collected headers to the message file. + + print OUT "$headers"; + + + # Clear $headers and $flags ready for the next message. + + $headers = ''; + $flags = ''; + + # End of processing the headers once we found the + # blank line which terminated them + } + + # End of dealing with the headers. + } + + + if ( $inheaders eq 0) + { + + # We are now processing the message body. + # + # Now we have passed the headers to the + # output file, we scan until the while + # loop finds another "From " line. + + # Decrement our content length if we're + # using it to find the end of the message + # body + + if (defined $contentlength) { + + # Decrement our $contentlength variable + + $contentlength -= length($_); + + # The proper end for a message with Content-Length + # specified is the $contentlength variable should + # be exactly -1 and we should be on a bare + # newline. Note that the bare newline is not + # printed to the end of the current message as + # it's actually a message separator in the mbox + # format rather than part of the message. The + # next line _should_ be a From_ line, but just in + # case the Content-Length header is incorrect + # (e.g. a corrupt mailbox), we just continue + # putting lines into the current message until we + # see the next From_ line. + + if ($contentlength < 0) { + if ($contentlength == -1 && $_ eq "\n") { + $contentlength = undef; + next; + } + $contentlength = undef; + } + } + + # + # We want to copy every part of the message + # body to the output file, except for the + # quoted ">From " lines, which was the + # way the IMAP server encoded body lines + # starting with "From ". + # + # Pattern matching Perl majik to + # get rid of an Mbox quoted From. + # + # This works on the default variable "$_" which + # contains the text from the Mbox mailbox - I + # guess this is the case because of our + # (open(MBOX ....) line above, which did not + # assign this to anything else, so it would go + # to the default variable. This enables + # inscrutably terse Perlisms to follow. + # + # "s" means "Subsitute" and it looks for any + # occurrence of ">From" starting at the start + # of the line. When it finds this, it replaces + # it with "From". + # + # So this finds all instances in the Mbox message + # where the original line started with the word + # "From" but was converted to ">From" in order to + # not be mistaken for the "From ..." line which + # is used to demark each message in the Mbox. + # This was was a destructive conversion because + # any message which originally had ">From" at the + # start of the line, before being put into the + # Mbox, will now have that line without the ">". + + s/^>From /From /; + + # Glorious tersness here. Thanks Simon for + # explaining this. + # + # "print OUT" means print the default variable to + # the file of file handle OUT. This is where + # the bulk of the message text is written to + # the output file. + + print OUT or die("Fatal: unable to write to new message to $messagefn"); + + + # End of the if statement dealing with message body. + } + + $previous_line_was_empty = ( $_ eq "\n" ); + + # End of while (MBOX) loop. + } + # Close the input file. + + close(MBOX); + + # Close the output file, and duplicate the code + # from the start of the while loop which touches + # the date-time of the most recent message file. + + close(OUT); + if ($messagefn ne '') { + my $t = str2time($receivedate); + utime $t, $t, $messagefn; + } + + # After all the messages have been + # converted, check to see if the + # first one was a dummy. + # if so, delete it and make + # the message count one less. + + if ($deletedummy ne "") + { + printf("Dummy mail system first message detected and not saved.\n"); + unlink $deletedummy; + + $messagecount--; + + } + + printf("$messagecount messages.\n\n"); +}