#! /usr/bin/perl # txt2xls turns text files into excel workbooks, and is released # under the terms of the GPL version 2, or any later version, at your # option. See the file README and COPYING for more information. # Copyright 2008 by Don Armstrong . # $Id: perl_script 1153 2008-04-08 00:04:20Z don $ use warnings; use strict; use Getopt::Long; use Pod::Usage; =head1 NAME txt2xls - Turns a (set of) text file(s) into an excel workbook =head1 SYNOPSIS [options] Options: --tsv, -t tab separated value mode (Default) --ssv, -s space separated value mode --csv, -c comma separated value mode --r-mode, -r R mode (Default) --auto-format Auto format (Default) --sci-format Scientific format formatting string (Default 0.00E+0) --max-digits Maximum digits to use for auto-format (Default 4) --debug, -d debugging level (Default 0) --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--auto-format> Attempt to automatically format the excel file. Currently, this does nothing for non-numeric entries. For numeric entries, if the number is very large (> 9999), or less than 0.001, but not 0, the cell is put into scientific format (B<--sci-format>, default C<0.00E+0>). Otherwise, the cell is formatted to use at maximum B<--max-digits> (default 4) digits. To disable, use B<--no-auto-format> =item B<--sci-format> Excel format string to use for scientific format. See L for details. (Default C<'0.00E+0'>) =item B<--max-digits> Maximum number digits to display for non-scientific formats. (Default 4) =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief useage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES =cut use vars qw($DEBUG); use Text::CSV; use Spreadsheet::WriteExcel; use Scalar::Util qw(looks_like_number); use POSIX qw(floor); my %options = (debug => 0, help => 0, man => 0, auto_format => 1, sci_format => '0.00E+0', max_digits => 4, remove_name => [], ); GetOptions(\%options, 'tsv|t', 'ssv|s', 'csv|c', 'auto_format|auto-format!', 'sci_format|sci-format=s', 'max_digits|max-digits=i', 'rmode|r-mode|r!', 'remove_name|remove-name=s@', 'output|output_file|output-file=s', 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my @USAGE_ERRORS; if (0 == grep {exists $options{$_}} qw(tsv ssv csv)) { $options{tsv} = 1 } if (1 < grep {exists $options{$_}} qw(tsv ssv csv)) { push @USAGE_ERRORS,"You can only pass one of --tsv, --ssv, or --csv"; } pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS; if (not @ARGV) { # we'll use this as a special indicator to read stdin push @ARGV,undef; } my $sep_char = "\t"; if ($options{csv}) { $sep_char = ','; } elsif ($options{ssv}) { $sep_char = ' '; } if (not @{$options{remove_name}}) { $options{remove_name} = ['.+\/', ]; } my %wb_formats = (); my $csv = Text::CSV->new({sep_char=>$sep_char}); my $wb; if (defined $options{output}) { $wb = Spreadsheet::WriteExcel->new($options{output}); } else { $wb = Spreadsheet::WriteExcel->new(\*STDOUT); } for my $file (@ARGV) { my $fh; if (not defined $file) { $fh = \*STDIN; $file = "STDIN"; } else { open($fh,'<:encoding(utf8)',$file) or die "Unable to open $file for reading: $!"; } my $ws_name = $file; foreach my $remove (@{$options{remove_name}}) { $ws_name =~ s{$remove}{}g; } $ws_name =~ s{\.[^\.]+$}{}g; $ws_name =~ s/_+/ /g; $ws_name =~ s{[\]:*?\/\] ]+}{ }g; $ws_name =~ s{(?:^\s+|\s+$)}{}g; $ws_name =~ s{^(.{0,31}).*$}{$1}; my $ws = $wb->add_worksheet($ws_name) or die "Unable to add worksheet to workbook"; my $row = 1; my @header_row; my $overflow = 0; my $r_mode = $options{r_mode} // 0; # set to 1 if we have attempted to autodetect R mode my $r_mode_autodetected = 0; while (<$fh>) { chomp; # parse the line die "Unable to parse line $. of $file: ".$csv->error_diag() unless $csv->parse($_); my @row = $csv->fields(); if ($row==1 and not $r_mode_autodetected) { @header_row = @row; $row++; next; } if ($row==2 and not $r_mode_autodetected) { $r_mode_autodetected = 1; if (@row == (@header_row+1)) { $r_mode = 1 unless exists $options{r_mode} and defined $options{r_mode}; } if ($r_mode) { # R doesn't output headers for rownames unshift @header_row,''; } output_row(\@header_row,1,$ws,$wb,\%wb_formats,\%options); } if ($row > 65536) { # ok, we're going to overflow here my $t_ws_name = $ws_name; my $maxlen = 31-length('.'.$overflow); $t_ws_name =~ s{^(.{0,$maxlen}).*$}{$1}; $ws = $wb->add_worksheet($t_ws_name.'.'.$overflow); $overflow++; $row=1; output_row(\@header_row,$row,$ws,$wb,\%wb_formats,\%options); $row++; } if ($row==1) { @header_row = @row; } output_row(\@row,$row,$ws,$wb,\%wb_formats,\%options); $row++; } } sub output_row{ my ($data,$row,$ws,$wb,$formats,$options) = @_; my @columns = ('A'..'Z','AA'..'ZZ'); for my $i (0..$#{$data}) { my $format; if ($options->{auto_format}) { if (looks_like_number($data->[$i])) { my $format_string; # use scientific format? if ($data->[$i] != 0 && (abs($data->[$i]) > 9999 || abs($data->[$i]) < 0.001)) { $format_string = $options->{sci_format}; } else { $format_string = '0'; my $digits = length(floor(abs($data->[$i]))); if ($options->{max_digits} - $digits > 0 and abs($data->[$i]) != abs(floor($data->[$i]))) { # if there are digits left over, use them for # decimal places, but don't require them $format_string = $format_string.'.'.('#'x($options->{max_digits} - $digits)) } } if (not exists $formats->{$format_string}) { $formats->{$format_string} = $wb->add_format(); $formats->{$format_string}-> set_num_format($format_string); } $format = $formats->{$format_string}; } } $ws->write($columns[$i].$row,$data->[$i],defined $format?$format:()) } } __END__