#! /usr/bin/perl
# txt2xls turns text files into excel workbooks, and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
# $Id: perl_script 1153 2008-04-08 00:04:20Z don $


use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

=head1 NAME

txt2xls - Turns a (set of) text file(s) into an excel workbook

=head1 SYNOPSIS

 [options]

 Options:
  --tsv, -t tab separated value mode (Default)
  --ssv, -s space separated value mode
  --csv, -c comma separated value mode
  --r-mode, -r R mode (Default)
  --auto-format Auto format (Default)
  --sci-format Scientific format formatting string (Default 0.00E+0)
  --max-digits Maximum digits to use for auto-format (Default 4)
  --debug, -d debugging level (Default 0)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--auto-format>

Attempt to automatically format the excel file. Currently, this does
nothing for non-numeric entries. For numeric entries, if the number is
very large (> 9999), or less than 0.001, but not 0, the cell is put
into scientific format (B<--sci-format>, default C<0.00E+0>).
Otherwise, the cell is formatted to use at maximum B<--max-digits>
(default 4) digits.

To disable, use B<--no-auto-format>

=item B<--sci-format>

Excel format string to use for scientific format. See
L<http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx>
for details. (Default C<'0.00E+0'>)

=item B<--max-digits>

Maximum number digits to display for non-scientific formats. (Default 4)

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief useage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES


=cut


use vars qw($DEBUG);

use Text::CSV;
use Spreadsheet::WriteExcel;
use Scalar::Util qw(looks_like_number);
use POSIX qw(floor);

my %options = (debug           => 0,
	       help            => 0,
	       man             => 0,
               auto_format     => 1,
               sci_format      => '0.00E+0',
               max_digits      => 4,
	       remove_name     => [],
	       );

GetOptions(\%options,
	   'tsv|t',
	   'ssv|s',
	   'csv|c',
           'auto_format|auto-format!',
           'sci_format|sci-format=s',
           'max_digits|max-digits=i',
	   'rmode|r-mode|r!',
           'remove_name|remove-name=s@',
           'output|output_file|output-file=s',
	   'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

my @USAGE_ERRORS;
if (0 == grep {exists $options{$_}} qw(tsv ssv csv)) {
    $options{tsv} = 1
}
if (1 < grep {exists $options{$_}} qw(tsv ssv csv)) {
     push @USAGE_ERRORS,"You can only pass one of --tsv, --ssv, or --csv";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

if (not @ARGV) {
    # we'll use this as a special indicator to read stdin
    push @ARGV,undef;
}

my $sep_char = "\t";
if ($options{csv}) {
    $sep_char = ',';
}
elsif ($options{ssv}) {
    $sep_char = ' ';
}

if (not @{$options{remove_name}}) {
    $options{remove_name} = ['.+\/',
                            ];
}

my %wb_formats = ();
my $csv = Text::CSV->new({sep_char=>$sep_char});
my $wb;
if (defined $options{output}) {
    $wb =  Spreadsheet::WriteExcel->new($options{output});
} else {
    $wb = Spreadsheet::WriteExcel->new(\*STDOUT);
}

for my $file (@ARGV) {
    my $fh;
    if (not defined $file) {
	$fh = \*STDIN;
	$file = "STDIN";
    }
    else {
	open($fh,'<:encoding(utf8)',$file) or
	    die "Unable to open $file for reading: $!";
    }
    my $ws_name = $file;
    foreach my $remove (@{$options{remove_name}}) {
        $ws_name =~ s{$remove}{}g;
    }
    $ws_name =~ s{\.[^\.]+$}{}g;
    $ws_name =~ s/_+/ /g;
    $ws_name =~ s{[\]:*?\/\] ]+}{ }g;
    $ws_name =~ s{(?:^\s+|\s+$)}{}g;
    $ws_name =~ s{^(.{0,31}).*$}{$1};
    my $ws = $wb->add_worksheet($ws_name) or
	die "Unable to add worksheet to workbook";
    my $row = 1;
    my @header_row;
    my $overflow = 0;
    my $r_mode = $options{r_mode} // 0;
    # set to 1 if we have attempted to autodetect R mode
    my $r_mode_autodetected = 0;
    while (<$fh>) {
	chomp;
	# parse the line
	die "Unable to parse line $. of $file: ".$csv->error_diag() unless $csv->parse($_);
	my @row = $csv->fields();
	if ($row==1 and not $r_mode_autodetected) {
	    @header_row = @row;
	    $row++;
	    next;
	}
	if ($row==2 and not $r_mode_autodetected) {
	    $r_mode_autodetected = 1;
	    if (@row == (@header_row+1)) {
		$r_mode = 1 unless exists $options{r_mode} and defined $options{r_mode};
	    }
	    if ($r_mode) {
		# R doesn't output headers for rownames
		unshift @header_row,'';
	    }
	    output_row(\@header_row,1,$ws,$wb,\%wb_formats,\%options);
	}
	if ($row > 65536) { # ok, we're going to overflow here
	    my $t_ws_name = $ws_name;
	    my $maxlen = 31-length('.'.$overflow);
	    $t_ws_name =~ s{^(.{0,$maxlen}).*$}{$1};
	    $ws = $wb->add_worksheet($t_ws_name.'.'.$overflow);
            $overflow++;
	    $row=1;
	    output_row(\@header_row,$row,$ws,$wb,\%wb_formats,\%options);
	    $row++;
	}
	if ($row==1) {
	    @header_row = @row;
	}
	output_row(\@row,$row,$ws,$wb,\%wb_formats,\%options);
	$row++;
    }
}

sub output_row{
    my ($data,$row,$ws,$wb,$formats,$options) = @_;
    my @columns = ('A'..'Z','AA'..'ZZ');
    for my $i (0..$#{$data}) {
        my $format;
        if ($options->{auto_format}) {
            if (looks_like_number($data->[$i])) {
                my $format_string;
                # use scientific format?
                if ($data->[$i] != 0 && (abs($data->[$i]) > 9999 ||
                                         abs($data->[$i]) < 0.001)) {
                    $format_string = $options->{sci_format};
                } else {
                    $format_string = '0';
                    my $digits = length(floor(abs($data->[$i])));
                    if ($options->{max_digits} - $digits > 0 and
                        abs($data->[$i]) != abs(floor($data->[$i]))) {
                        # if there are digits left over, use them for
                        # decimal places, but don't require them
                        $format_string = $format_string.'.'.('#'x($options->{max_digits} - $digits))
                    }
                }
                if (not exists $formats->{$format_string}) {
                    $formats->{$format_string} =
                        $wb->add_format();
                    $formats->{$format_string}->
                        set_num_format($format_string);
                }
                $format = $formats->{$format_string};
            }
        }
        $ws->write($columns[$i].$row,$data->[$i],defined $format?$format:())
    }
}

__END__