From: Don Armstrong Date: Sat, 10 Mar 2012 00:13:21 +0000 (+0000) Subject: * use on-file transposing X-Git-Url: https://git.donarmstrong.com/?p=bin.git;a=commitdiff_plain;h=1aef3a4383e8483e3b3f09c46d70a19271bf8475 * use on-file transposing --- diff --git a/transpose_table b/transpose_table index 1232fe2..2cc7f3c 100755 --- a/transpose_table +++ b/transpose_table @@ -55,12 +55,13 @@ Display this manual. use vars qw($DEBUG); -use Text::CSV; +use Fcntl qw(:seek); use File::Temp qw(tempdir); use MLDBM qw(DB_File Storable); -use Fcntl; + +use Devel::Peek; use List::Util qw(max); @@ -108,52 +109,94 @@ elsif ($options{ssv}) { $sep_char = ' '; } -my $csv = Text::CSV->new({sep_char=>$sep_char}); - -my %rows; -my $n_rows = 0; - -if ($options{bigfile}) { - my $temp_dir = tempdir(CLEANUP=>1); - tie %rows,"MLDBM","$temp_dir/bigfile", O_CREAT|O_RDWR, 0600 or - die "Unable to tie '$temp_dir/bigfile': $!"; -} -my $cols = 0; -my $rows = 0; +my @row; for my $file (@ARGV) { - my $fh; + my $in; if (not defined $file) { - $fh = \*STDIN; + ...; # not supported yet; STDIN isn't seekable + $in = \*STDIN; $file = "STDIN"; } else { - $fh = IO::File->new($file,'r') or + $in = IO::File->new($file,'r') or die "Unable to open $file for reading: $!"; } - while (<$fh>) { - chomp; - # parse the line - die "Unable to parse line $. of $file" unless $csv->parse($_); - my @row = $csv->fields(); - $cols = max(scalar @row,$cols); - for my $i (0..$#row) { - $rows{$rows.'-'.$i} = $row[$i]; + my $out = \*STDOUT; + my $first_time = 1; + my $cur_row = 0; + my $end; + my $first_row_end; + do { + if (not defined $end) { + $in->seek(0,SEEK_END); + $end = $in->tell; + $in->seek(0,SEEK_SET); } - $rows++; - print STDERR "\rInput $rows rows";# if not ($rows-1) % 50; - } - print STDERR "\n"; - $fh = \*STDOUT; - for my $i (0..($cols-1)) { - my @row; - for my $j (0..($rows-1)) { - push @row,$rows{$j.'-'.$i}; + + # from the current position, advance to complete the next field + my $next_field = advance_to_field($in,$sep_char); + + # if we're at the end of the file, stop. + if (not $first_time and + $cur_row == 0) { + print {$out} "\n"; } - $csv->print($fh,\@row); - print "\n"; - print STDERR "\rOutput ".($i+1)."/$cols rows"; - } + elsif ($cur_row != 0) { + print {$out} $sep_char; + } + # write it to the output file + print {$out} $next_field; + if ($in->eof) { + # avoid writing out a newline if the file was totally empty to start with + print {$out} "\n" if not $first_time; + last; + } + + # if this is the first time through, store this position for + # this row, then find the end of the row [field with a \n as a + # terminator] and do the next loop; if we hit the end of the + # file, we are no longer the first time through. + $row[$cur_row] = $in->tell; + if ($first_time) { + print STDERR "\r$cur_row"; + $cur_row++; + advance_to_field($in,"\n"); + $first_row_end = $in->tell if not defined $first_row_end; + if ($in->eof()) { + $first_time = 0; + $in->seek($row[0],SEEK_SET); + $cur_row = 0; + } + } + else { + # otherwise, advance to the next row's position + $cur_row = ($cur_row + 1) % @row; + $in->seek($row[$cur_row],SEEK_SET); + } + if ($cur_row == 0) { + print STDERR "\r".$in->tell."/$first_row_end"; + } + } while (1); print STDERR "\n"; } +sub advance_to_field { + my ($fh,$sep) = @_; + + my $escaped = 0; + my $char; + my $return; + do { + $char = $fh->getc(); + if ($char eq '"') { + $escaped = $escaped ? 0 : 1; + } + if (not $escaped and ($char eq $sep or $char eq "\n")) { + return $return; + } + $return .= $char; + } while (1); +} + + __END__