supress outputing filenames

[bin.git] / txt2xls
diff --git a/txt2xls b/txt2xls

index 3ede2f4ea84846e3c1677d574833d0619ca66761..c79fb53b2d1c2ff63594602f14c280b34ac97cbc 100755 (executable)
--- a/txt2xls
+++ b/txt2xls
@@ -25,6 +25,9 @@ txt2xls - Turns a (set of) text file(s) into an excel workbook
    --ssv, -s space separated value mode
    --csv, -c comma separated value mode
    --r-mode, -r R mode (Default)
+  --auto-format Auto format (Default)
+  --sci-format Scientific format formatting string (Default 0.00E+0)
+  --max-digits Maximum digits to use for auto-format (Default 4)
    --debug, -d debugging level (Default 0)
    --help, -h display this help
    --man, -m display manual
@@ -33,6 +36,27 @@ txt2xls - Turns a (set of) text file(s) into an excel workbook
  
  =over
  
+=item B<--auto-format>
+
+Attempt to automatically format the excel file. Currently, this does
+nothing for non-numeric entries. For numeric entries, if the number is
+very large (> 9999), or less than 0.001, but not 0, the cell is put
+into scientific format (B<--sci-format>, default C<0.00E+0>).
+Otherwise, the cell is formatted to use at maximum B<--max-digits>
+(default 4) digits.
+
+To disable, use B<--no-auto-format>
+
+=item B<--sci-format>
+
+Excel format string to use for scientific format. See
+L<http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx>
+for details. (Default C<'0.00E+0'>)
+
+=item B<--max-digits>
+
+Maximum number digits to display for non-scientific formats. (Default 4)
+
  =item B<--debug, -d>
  
  Debug verbosity. (Default 0)
@@ -57,11 +81,15 @@ use vars qw($DEBUG);
  
  use Text::CSV;
  use Spreadsheet::WriteExcel;
+use Scalar::Util qw(looks_like_number);
+use POSIX qw(floor);
  
  my %options = (debug           => 0,
                help            => 0,
                man             => 0,
-              rmode           => 1,
+               auto_format     => 1,
+               sci_format      => '0.00E+0',
+               max_digits      => 4,
                remove_name     => [],
                );
  
@@ -69,8 +97,12 @@ GetOptions(\%options,
            'tsv|t',
            'ssv|s',
            'csv|c',
+           'auto_format|auto-format!',
+           'sci_format|sci-format=s',
+           'max_digits|max-digits=i',
            'rmode|r-mode|r!',
-          'remove_name|remove-name=s@',
+           'remove_name|remove-name=s@',
+           'output|output_file|output-file=s',
            'debug|d+','help|h|?','man|m');
  
  pod2usage() if $options{help};
@@ -88,8 +120,6 @@ if (1 < grep {exists $options{$_}} qw(tsv ssv csv)) {
  
  pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
  
-my @columns = ('A'..'Z','AA'..'ZZ');
-
  if (not @ARGV) {
      # we'll use this as a special indicator to read stdin
      push @ARGV,undef;
@@ -103,8 +133,20 @@ elsif ($options{ssv}) {
      $sep_char = ' ';
  }
  
+if (not @{$options{remove_name}}) {
+    $options{remove_name} = ['.+\/',
+                            ];
+}
+
+my %wb_formats = ();
  my $csv = Text::CSV->new({sep_char=>$sep_char});
-my $wb = Spreadsheet::WriteExcel->new(\*STDOUT);
+my $wb;
+if (defined $options{output}) {
+    $wb =  Spreadsheet::WriteExcel->new($options{output});
+} else {
+    $wb = Spreadsheet::WriteExcel->new(\*STDOUT);
+}
+
  for my $file (@ARGV) {
      my $fh;
      if (not defined $file) {
@@ -112,12 +154,12 @@ for my $file (@ARGV) {
         $file = "STDIN";
      }
      else {
-       $fh = IO::File->new($file,'r') or
+       open($fh,'<:encoding(utf8)',$file) or
             die "Unable to open $file for reading: $!";
      }
      my $ws_name = $file;
      foreach my $remove (@{$options{remove_name}}) {
-       $ws_name =~ s{\Q$remove\E}{}g;
+        $ws_name =~ s{$remove}{}g;
      }
      $ws_name =~ s{\.[^\.]+$}{}g;
      $ws_name =~ s/_+/ /g;
@@ -129,35 +171,81 @@ for my $file (@ARGV) {
      my $row = 1;
      my @header_row;
      my $overflow = 0;
+    my $r_mode = $options{r_mode} // 0;
+    # set to 1 if we have attempted to autodetect R mode
+    my $r_mode_autodetected = 0;
      while (<$fh>) {
         chomp;
         # parse the line
-       my @row;
-       die "Unable to parse line $. of $file" unless $csv->parse($_);
-       if ($row==1 and $options{rmode}) {
-           # R doesn't output headers for rownames
-           push @row,'';
+       die "Unable to parse line $. of $file: ".$csv->error_diag() unless $csv->parse($_);
+       my @row = $csv->fields();
+       if ($row==1 and not $r_mode_autodetected) {
+           @header_row = @row;
+           $row++;
+           next;
+       }
+       if ($row==2 and not $r_mode_autodetected) {
+           $r_mode_autodetected = 1;
+           if (@row == (@header_row+1)) {
+               $r_mode = 1 unless exists $options{r_mode} and defined $options{r_mode};
+           }
+           if ($r_mode) {
+               # R doesn't output headers for rownames
+               unshift @header_row,'';
+           }
+           output_row(\@header_row,1,$ws,$wb,\%wb_formats,\%options);
         }
         if ($row > 65536) { # ok, we're going to overflow here
             my $t_ws_name = $ws_name;
             my $maxlen = 31-length('.'.$overflow);
             $t_ws_name =~ s{^(.{0,$maxlen}).*$}{$1};
-           $ws = $wb->add_worksheet($ws_name.'.'.$overflow);
+           $ws = $wb->add_worksheet($t_ws_name.'.'.$overflow);
+            $overflow++;
             $row=1;
-           for my $i (0..@header_row) {
-               $ws->write($columns[$i].$row,$header_row[$i]);
-           }
+           output_row(\@header_row,$row,$ws,$wb,\%wb_formats,\%options);
             $row++;
         }
-       push @row,$csv->fields();
         if ($row==1) {
             @header_row = @row;
         }
-       for my $i (0..$#row) {
-           $ws->write($columns[$i].$row,$row[$i]);
-       }
+       output_row(\@row,$row,$ws,$wb,\%wb_formats,\%options);
         $row++;
      }
  }
  
+sub output_row{
+    my ($data,$row,$ws,$wb,$formats,$options) = @_;
+    my @columns = ('A'..'Z','AA'..'ZZ');
+    for my $i (0..$#{$data}) {
+        my $format;
+        if ($options->{auto_format}) {
+            if (looks_like_number($data->[$i])) {
+                my $format_string;
+                # use scientific format?
+                if ($data->[$i] != 0 && (abs($data->[$i]) > 9999 ||
+                                         abs($data->[$i]) < 0.001)) {
+                    $format_string = $options->{sci_format};
+                } else {
+                    $format_string = '0';
+                    my $digits = length(floor(abs($data->[$i])));
+                    if ($options->{max_digits} - $digits > 0 and
+                        abs($data->[$i]) != abs(floor($data->[$i]))) {
+                        # if there are digits left over, use them for
+                        # decimal places, but don't require them
+                        $format_string = $format_string.'.'.('#'x($options->{max_digits} - $digits))
+                    }
+                }
+                if (not exists $formats->{$format_string}) {
+                    $formats->{$format_string} =
+                        $wb->add_format();
+                    $formats->{$format_string}->
+                        set_num_format($format_string);
+                }
+                $format = $formats->{$format_string};
+            }
+        }
+        $ws->write($columns[$i].$row,$data->[$i],defined $format?$format:())
+    }
+}
+
  __END__