From 1640c547e5b4c3b24e5c658c82ef6937a20cf3a4 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Mon, 21 Jan 2013 11:51:46 -0800 Subject: [PATCH] use mutt to actually do the conversion for us --- convert_to_maildir | 761 +-------------------------------------------- 1 file changed, 9 insertions(+), 752 deletions(-) diff --git a/convert_to_maildir b/convert_to_maildir index 62e57a8..1841851 100755 --- a/convert_to_maildir +++ b/convert_to_maildir @@ -87,760 +87,17 @@ if (not defined $options{maildir}) { $options{maildir} = pop @ARGV; } - -maildirmake($options{maildir}); for my $mailbox (@ARGV) { convert($mailbox,$options{maildir}); } -# code below comes from mb2md -# -# mb2md-3.20.pl Converts Mbox mailboxes to Maildir format. -# -# Public domain. -# -# currently maintained by: -# Juri Haberland -# initially wrote by: -# Robin Whittle -# -# This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ . -# For a changelog see http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt - -# The maildirmake function -# ------------------------ -# -# It does the same thing that the maildirmake binary that -# comes with courier-imap distribution -# -sub maildirmake -{ - foreach(@_) { - -d $_ or mkdir $_,0700 or die("Fatal: Directory $_ doesn't exist and can't be created.\n"); - -d "$_/tmp" or mkdir("$_/tmp",0700) or die("Fatal: Unable to make $_/tmp/ subdirectory.\n"); - -d "$_/new" or mkdir("$_/new",0700) or die("Fatal: Unable to make $_/new/ subdirectory.\n"); - -d "$_/cur" or mkdir("$_/cur",0700) or die("Fatal: Unable to make $_/cur/ subdirectory.\n"); - } -} - -# The convert function -# --------------------- -# -# This function does the down and dirty work of -# actually converting the mbox to a maildir -# -sub convert -{ - # get the source and destination as arguments - my ($mbox, $maildir) = @_; - - printf("Source Mbox is $mbox\n"); - printf("Target Maildir is $maildir \n") ; - - # create the directories for the new maildir - # - # if it is the root maildir (ie. converting the inbox) - # these already exist but thats not a big issue - - &maildirmake($maildir); - - # Change to the target mailbox directory. - - chdir "$maildir" ; - - # Converts a Mbox to multiple files - # in a Maildir. - # This is adapted from mbox2maildir. - # - # Open the Mbox mailbox file. - - - if (sysopen(MBOX, "$mbox", O_RDONLY)) - { - #printf("Converting Mbox $mbox . . . \n"); - } - else - { - die("Fatal: unable to open input mailbox file: $mbox ! \n"); - } - - # This loop scans the input mailbox for - # a line starting with "From ". The - # "^" before it is pattern-matching - # lingo for it being at the start of a - # line. - # - # Each email in Mbox mailbox starts - # with such a line, which is why any - # such line in the body of the email - # has to have a ">" put in front of it. - # - # This is not required in a Maildir - # mailbox, and some majik below - # finds any such quoted "> From"s and - # gets rid of the "> " quote. - # - # Each email is put in a file - # in the cur/ subdirectory with a - # name of the form: - # - # nnnnnnnnn.cccc.mbox:2,XXXX - # - # where: - # "nnnnnnnnn" is the Unix time since - # 1970 when this script started - # running, incremented by 1 for - # every email. This is to ensure - # unique names for each message - # file. - # - # ".cccc" is the message count of - # messages from this mbox. - # - # ".mbox" is just to indicate that - # this message was converted from - # an Mbox mailbox. - # - # ":2," is the start of potentially - # multiple IMAP flag characters - # "XXXX", but may be followed by - # nothing. - # - # This is sort-of compliant with - # the Maildir naming conventions - # specified at: - # - # http://www.qmail.org/man/man5/maildir.html - # - # This approach does not involve the - # process ID or the hostname, but it is - # probably good enough. - # - # When the IMAP server looks at this - # mailbox, it will move the files to - # the cur/ directory and change their - # names as it pleases. In the case - # of Courier IMAP, the names will - # become like: - # - # 995096541.25351.mbox:2,S - # - # with 25351 being Courier IMAP's - # process ID. The :2, is the start - # of the flags, and the "S" means - # that this one has been seen by - # the user. (But is this the same - # meaning as the user actually - # having opened the message to see - # its contents, rather than just the - # IMAP server having been asked to - # list the message's Subject etc. - # so the client could list it in the - # visible Inbox?) - # - # This contrasts with a message - # created by Courier IMAP, say with - # a message copy, which is like: - # - # 995096541.25351.zair,S=14285:2,S - # - # where ",S=14285" is the size of the - # message in bytes. - # - # Courier Maildrop's names are similar - # but lack the ":2,XXXX" flags . . . - # except for my modified Maildrop - # which can deliver them with a - # ":2,T" - flagged for deletion. - # - # I have extended the logic of the - # per-message inner loop to stop - # saving a file for a message with: - # - # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA - # - # This is the dummy message, always - # at the start of an Mbox format - # mailbox file - and is put there - # by UW IMAPD. Since quite a few - # people will use this for - # converting from a UW system, - # I figure it is worth it. - # - # I will not save any such message - # file for the dummy message. - # - # Plan - # ---- - # - # We want to read the entire Mbox file, whilst - # going through a loop for each message we find. - # - # We want to read all the headers of the message, - # starting with the "From " line. For that "From " - # line we want to get a date. - # - # For all other header lines, we want to store them - # in $headers whilst parsing them to find: - # - # 1 - Any flags in the "Status: " or "X-Status: " or - # "X-Mozilla-Status: " lines. - # - # 2 - A subject line indicating this is the dummy message - # at the start (typically, but not necessarily) of - # the Mbox. - # - # Once we reach the end of the headers, we will crunch any - # flags we found to create a file name. Then, unless this is - # the dummy message we create that file and write all the - # headers to it. - # - # Then we continue reading the Mbox, converting ">From " to - # "From " and writing it to the file, until we reach one of: - # - # 1 - Another "From " line (indicating the start of another - # message). - # - # or - # - # 2 - The end of the Mbox. - # - # In the former case, which we detect at the start of the loop - # we need to close the file and touch it to alter its date-time. - # - # In the later case, we also need to close the file and touch - # it to alter its date-time - but this is beyond the end of the - # loop. - - - # Variables - # --------- - - my $messagecount = 0; - - # For generating unique filenames for - # each message. Initialise it here with - # numeric time in seconds since 1970. - my $unique = time; - - # Name of message file to delete if we found that - # it was created by reading the Mbox dummy message. - - my $deletedummy = ''; - - # To store the complete "From (address) (date-time) - # which delineates the start of each message - # in the Mbox - my $fromline = ''; - - - # Set to 1 when we are reading the header lines, - # including the "From " line. - # - # 0 means we are reading the message body and looking - # for another "From " line. - - my $inheaders = 0; - - # Variable to hold all headers (apart from - # the first line "From ...." which is not - # part of the message itself. - my $headers = ''; - - # Variable to hold the accumulated characters - # we find in header lines of the type: - # - # Status: - # X-Status: - # X-Mozilla-Status: - # X-Evolution: - my $flags = ''; - - # To build the file name for the message in. - my $messagefn = ''; - - - # The date string from the "From " line of each - # message will be written here - and used by - # touch to alter the date-time of each message - # file. Put non-date text here to make it - # spit the dummy if my code fails to find a - # date to write into this. - - my $receivedate = 'Bogus'; - - # The subject of the message - my $subject = ''; - - my $previous_line_was_empty = 1; - - # We record the message start line here, for error - # reporting. - my $startline; - - # If defined, we use this as the number of bytes in the - # message body rather than looking for a /^From / line. - my $contentlength; - - # A From lines can either occur as the first - # line of a file, or after an empty line. - # Most mail systems will quote all From lines - # appearing in the message, but some will only - # do it when necessary. - # Since we initialise the variable to true, - # we don't need to check for beginning of file. - - while() - { - # exchange possible Windows EOL (CRLF) with Unix EOL (LF) - $_ =~ s/\r\n$/\n/; - - if ( /^From / - && $previous_line_was_empty - && (!defined $contentlength) - ) - { - # We are reading the "From " line which has an - # email address followed by a receive date. - # Turn on the $inheaders flag until we reach - # the end of the headers. - - $inheaders = 1; - - # record the message start line - - $startline = $.; - - # If this is not the first run through the loop - # then this means we have already been working - # on a message. - - if ($messagecount > 0) - { - # If so, then close that message file and then - # use utime to change its date-time. - # - # Note this code should be duplicated to do - # the same thing at the end of the while loop - # since we must close and touch the final message - # file we were writing when we hit the end of the - # Mbox file. - - close (OUT); - if ($messagefn ne '') { - my $t = str2time($receivedate); - utime $t, $t, $messagefn if defined $t; - } - } - - # Because we opened the Mbox file without any - # variable, I think this means that we have its - # current line in Perl's default variable "$_". - # So all sorts of pattern matching magic works - # directly on it. - - # We are currently reading the first line starting with - # "From " which contains the date we want. - # - # This will be of the form: - # - # From dduck@test.org Wed Nov 24 11:05:35 1999 - # - # at least with UW-IMAP. - # - # However, I did find a nasty exception to this in my - # tests, of the form: - # - # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com - # - # This makes it trickier to get rid of the email address, - # but I did find a way. I can't rule out that there would - # be some address like this with an "@" in the quoted - # portion too. - # - # Unfortunately, testing with an old Inbox Mbox file, - # I also found an instance where the email address - # had no @ sign at all. It was just an email - # account name, with no host. - # - # I could search for the day of the week. If I skipped - # at least one word of non-whitespace (1 or more contiguous - # non-whitespace characters) then searched for a day of - # the week, then I should be able to avoid almost - # every instance of a day of the week appearing in - # the email address. - # - # Do I need a failsafe arrangement to provide some - # other date to touch if I don't get what seems like - # a date in my resulting string? For now, no. - # - # I will take one approach if there is an @ in the - # "From " line and another (just skip the first word - # after "From ") if there is no @ in the line. - # - # If I knew more about Perl I would probably do it in - # a more elegant way. - - # Copy the current line into $fromline. - - $fromline = $_; - - # Now get rid of the "From ". " =~ s" means substitute. - # Find the word "From " at the start of the line and - # replace it with nothing. The nothing is what is - # between the second and third slash. - - $fromline =~ s/^From // ; - - - # Likewise get rid of the email address. - # This first section is if we determine there is one - # (or more . . . ) "@" characters in the line, which - # would normally be the case. - - if ($fromline =~ m/@/) - { - # The line has at least one "@" in it, so we assume - # this is in the middle of an email address. - # - # If the email address had no spaces, then we could - # get rid of the whole thing by searching for any number - # of non-whitespace characters (\S) contiguously, and - # then I think a space. Subsitute nothing for this. - # - # $fromline =~ s/(\S)+ // ; - # - # But we need something to match any number of non-@ - # characters, then the "@" and then all the non-whitespace - # characters from there (which takes us to the end of - # "test.org") and then the space following that. - # - # A tutorial on regular expressions is: - # - # http://www.perldoc.com/perl5.6.1/pod/perlretut.html - # - # Get rid of all non-@ characters up to the first "@": - - $fromline =~ s/[^@]+//; - - - # Get rid of the "@". - - $fromline =~ s/@//; - } - # If there was an "@" in the line, then we have now - # removed the first one (lets hope there aren't more!) - # and everything which preceded it. - # - # we now remove either something like - # '(foo bar)'. eg. '(no mail address)', - # or everything after the '@' up to the trailing - # timezone - # - # FIXME: all those regexp should be combined to just one single one - - $fromline =~ s/(\((\S*| )+\)|\S+) *//; - - chomp $fromline; - - # Stash the date-time for later use. We will use it - # to touch the file after we have closed it. - - $receivedate = $fromline; - - # Debugging lines: - # - # print "$receivedate is the receivedate of message $messagecount.\n"; - # $receivedate = "Wed Nov 24 11:05:35 1999"; - # - # To look at the exact date-time of files: - # - # ls -lFa --full-time - # - # End of handling the "From " line. - } - - - # Now process header lines which are not the "From " line. - - if ( ($inheaders eq 1) - && (! /^From /) - ) - { - # Now we are reading the header lines after the "From " line. - # Keep looking for the blank line which indicates the end of the - # headers. - - - # ".=" means append the current line to the $headers - # variable. - # - # For some reason, I was getting two blank lines - # at the end of the headers, rather than one, - # so I decided not to read in the blank line - # which terminates the headers. - # - # Delete the "unless ($_ eq "\n")" to get rid - # of this kludge. - - $headers .= $_ unless ($_ eq "\n"); - - # Now scan the line for various status flags - # and to fine the Subject line. - - $flags .= $1 if /^Status: ([A-Z]+)/; - $flags .= $1 if /^X-Status: ([A-Z]+)/; - if (/^X-Mozilla-Status: ([0-9a-f]{4})/i) - { - $flags .= 'R' if (hex($1) & 0x0001); - $flags .= 'A' if (hex($1) & 0x0002); - $flags .= 'D' if (hex($1) & 0x0008); - } - if(/^X\-Evolution:\s+\w{8}\-(\w{4})/oi) - { - $b = pack("H4", $1); #pack it as 4 digit hex (0x0000) - $b = unpack("B32", $b); #unpack into bit string - - # "usually" only the right most six bits are used - # however, I have come across a seventh bit in - # about 15 (out of 10,000) messages with this bit - # activated. - # I have not found any documentation in the source. - # If you find out what it does, please let me know. - - # Notes: - # Evolution 1.4 does mark forwarded messages. - # The sixth bit is to denote an attachment - - $flags .= 'A' if($b =~ /[01]{15}1/); #replied - $flags .= 'D' if($b =~ /[01]{14}1[01]{1}/); #deleted - $flags .= 'T' if($b =~ /[01]{13}1[01]{2}/); #draft - $flags .= 'F' if($b =~ /[01]{12}1[01]{3}/); #flagged - $flags .= 'R' if($b =~ /[01]{11}1[01]{4}/); #seen/read - } - $subject = $1 if /^Subject: (.*)$/; - $contentlength = $1 if /^Content-Length: (\d+)$/; - - # Now look out for the end of the headers - a blank - # line. When we find it, create the file name and - # analyse the Subject line. - - if ($_ eq "\n") - { - # We are at the end of the headers. Set the - # $inheaders flag back to 0. - - $inheaders = 0; - - # Include the current newline in the content length - - ++$contentlength if defined $contentlength; - - # Create the file name for the current message. - # - # A simple version of this would be: - # - # $messagefn = "cur/$unique.$messagecount.mbox:2,"; - # - # This would create names with $messagecount values of - # 1, 2, etc. But for neatness when looking at a - # directory of such messages, sorted by filename, - # I want to have leading zeroes on message count, so - # that they would be 000001 etc. This makes them - # appear in message order rather than 1 being after - # 19 etc. So this is good for up to 999,999 messages - # in a mailbox. It is a cosmetic matter for a person - # looking into the Maildir directory manually. - # To do this, use sprintf instead with "%06d" for - # 6 characters of zero-padding: - - $messagefn = sprintf ("cur/%d.%06d.mbox:2,", $unique, $messagecount) ; - - - # Append flag characters to the end of the - # filename, according to flag characters - # collected from the message headers - - $messagefn .= 'F' if $flags =~ /F/; # Flagged. - $messagefn .= 'R' if $flags =~ /A/; # Replied to. - $messagefn .= 'S' if $flags =~ /R/; # Seen or Read. - $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion. - - - # Opens filename $messagefn for output (>) with filehandle OUT. - - open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn"); - - # Count the messages. - - $messagecount++; - - # Only for the first message, - # check to see if it is a dummy. - # Delete the message file we - # just created if it was for the - # dummy message at the start - # of the Mbox. - # - # Add search terms as required. - # The last 2 lines are for rent. - # - # "m" means match the regular expression, - # but we can do without it. - # - # Do I need to escape the ' in "DON'T"? - # I didn't in the original version. - - if ( (($messagecount == 1) && defined($subject)) - && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/) - ) - { - # Stash the file name of the dummy message so we - # can delete it later. - - $deletedummy = "$messagefn"; - } - - # Print the collected headers to the message file. - - print OUT "$headers"; - - - # Clear $headers and $flags ready for the next message. - - $headers = ''; - $flags = ''; - - # End of processing the headers once we found the - # blank line which terminated them - } - - # End of dealing with the headers. - } - - - if ( $inheaders eq 0) - { - - # We are now processing the message body. - # - # Now we have passed the headers to the - # output file, we scan until the while - # loop finds another "From " line. - - # Decrement our content length if we're - # using it to find the end of the message - # body - - if (defined $contentlength) { - - # Decrement our $contentlength variable - - $contentlength -= length($_); - - # The proper end for a message with Content-Length - # specified is the $contentlength variable should - # be exactly -1 and we should be on a bare - # newline. Note that the bare newline is not - # printed to the end of the current message as - # it's actually a message separator in the mbox - # format rather than part of the message. The - # next line _should_ be a From_ line, but just in - # case the Content-Length header is incorrect - # (e.g. a corrupt mailbox), we just continue - # putting lines into the current message until we - # see the next From_ line. - - if ($contentlength < 0) { - if ($contentlength == -1 && $_ eq "\n") { - $contentlength = undef; - next; - } - $contentlength = undef; - } - } - - # - # We want to copy every part of the message - # body to the output file, except for the - # quoted ">From " lines, which was the - # way the IMAP server encoded body lines - # starting with "From ". - # - # Pattern matching Perl majik to - # get rid of an Mbox quoted From. - # - # This works on the default variable "$_" which - # contains the text from the Mbox mailbox - I - # guess this is the case because of our - # (open(MBOX ....) line above, which did not - # assign this to anything else, so it would go - # to the default variable. This enables - # inscrutably terse Perlisms to follow. - # - # "s" means "Subsitute" and it looks for any - # occurrence of ">From" starting at the start - # of the line. When it finds this, it replaces - # it with "From". - # - # So this finds all instances in the Mbox message - # where the original line started with the word - # "From" but was converted to ">From" in order to - # not be mistaken for the "From ..." line which - # is used to demark each message in the Mbox. - # This was was a destructive conversion because - # any message which originally had ">From" at the - # start of the line, before being put into the - # Mbox, will now have that line without the ">". - - s/^>From /From /; - - # Glorious tersness here. Thanks Simon for - # explaining this. - # - # "print OUT" means print the default variable to - # the file of file handle OUT. This is where - # the bulk of the message text is written to - # the output file. - - print OUT or die("Fatal: unable to write to new message to $messagefn"); - - - # End of the if statement dealing with message body. - } - - $previous_line_was_empty = ( $_ eq "\n" ); - - # End of while (MBOX) loop. - } - # Close the input file. - - close(MBOX); - - # Close the output file, and duplicate the code - # from the start of the while loop which touches - # the date-time of the most recent message file. - - close(OUT); - if ($messagefn ne '') { - my $t = str2time($receivedate); - utime $t, $t, $messagefn; - } - - # After all the messages have been - # converted, check to see if the - # first one was a dummy. - # if so, delete it and make - # the message count one less. - - if ($deletedummy ne "") - { - printf("Dummy mail system first message detected and not saved.\n"); - unlink $deletedummy; - - $messagecount--; - - } - - printf("$messagecount messages.\n\n"); +# The idea to do this came from +# http://blog.sandipb.net/2009/01/07/one-liner-to-convert-maildir-to-mbox-using-mutt/ +# and talking with Mako. Lets just call mutt, it'll know what to do! +sub convert { + my ($mailbox,$maildir) = @_; + system('mutt','-m','maildir','-F/etc/Muttrc', + '-n','-R','-f', $mailbox,'-e', + 'set confirmcreate=no; set delete=no; set confirmappend=no; set quit=yes; push T.*\;C'. + $maildir.';'); } -- 2.39.2