use vars qw($DEBUG);
-use Text::CSV;
+use Fcntl qw(:seek);
use File::Temp qw(tempdir);
use MLDBM qw(DB_File Storable);
-use Fcntl;
+
+use Devel::Peek;
use List::Util qw(max);
$sep_char = ' ';
}
-my $csv = Text::CSV->new({sep_char=>$sep_char});
-
-my %rows;
-my $n_rows = 0;
-
-if ($options{bigfile}) {
- my $temp_dir = tempdir(CLEANUP=>1);
- tie %rows,"MLDBM","$temp_dir/bigfile", O_CREAT|O_RDWR, 0600 or
- die "Unable to tie '$temp_dir/bigfile': $!";
-}
-my $cols = 0;
-my $rows = 0;
+# Transpose each input file by seeking around inside it instead of
+# loading it into memory.  @row holds, for every input row, the byte
+# offset just past the last field consumed from that row, so each
+# output row is assembled by visiting the rows in turn and reading one
+# field from each.
+my @row;
for my $file (@ARGV) {
- my $fh;
+ my $in;
+ # offsets belong to a single file; drop those left by the previous one
+ @row = ();
if (not defined $file) {
- $fh = \*STDIN;
+ ...; # not supported yet; STDIN isn't seekable
+ $in = \*STDIN;
$file = "STDIN";
}
else {
- $fh = IO::File->new($file,'r') or
+ $in = IO::File->new($file,'r') or
die "Unable to open $file for reading: $!";
}
- while (<$fh>) {
- chomp;
- # parse the line
- die "Unable to parse line $. of $file" unless $csv->parse($_);
- my @row = $csv->fields();
- $cols = max(scalar @row,$cols);
- for my $i (0..$#row) {
- $rows{$rows.'-'.$i} = $row[$i];
+ my $out = \*STDOUT;
+ # true while the first pass is still discovering where each row starts
+ my $first_time = 1;
+ # index of the input row the next field is read from
+ my $cur_row = 0;
+ # size of the input in bytes; computed once, not otherwise used below
+ my $end;
+ # offset just past the first row's terminator, shown in the progress line
+ my $first_row_end;
+ # This has to be a real while loop: do {} while is not a loop as far
+ # as "last" is concerned, so "last" inside it would leave the
+ # enclosing for loop and silently skip every remaining file.
+ while (1) {
+ if (not defined $end) {
+ $in->seek(0,SEEK_END);
+ $end = $in->tell;
+ $in->seek(0,SEEK_SET);
}
- $rows++;
- print STDERR "\rInput $rows rows";# if not ($rows-1) % 50;
- }
- print STDERR "\n";
- $fh = \*STDOUT;
- for my $i (0..($cols-1)) {
- my @row;
- for my $j (0..($rows-1)) {
- push @row,$rows{$j.'-'.$i};
+
+ # from the current position, read the next field of the current row
+ my $next_field = advance_to_field($in,$sep_char);
+
+ # after the first pass, coming back round to row 0 starts a new
+ # output row; any other row just continues the current output row
+ if (not $first_time and
+ $cur_row == 0) {
+ print {$out} "\n";
}
- $csv->print($fh,\@row);
- print "\n";
- print STDERR "\rOutput ".($i+1)."/$cols rows";
- }
+ elsif ($cur_row != 0) {
+ print {$out} $sep_char;
+ }
+ # write it to the output file
+ print {$out} $next_field;
+ # reading that field exhausted the input: finish this file
+ if ($in->eof) {
+ # avoid writing out a newline if the file was totally empty to start with
+ print {$out} "\n" if not $first_time;
+ last;
+ }
+
+ # Remember where this row's next unread field starts.  On the
+ # first pass, also skip the rest of the row [up to its "\n"
+ # terminator] so that the next iteration reads the first field
+ # of the following row; hitting the end of the file there ends
+ # the first pass and rewinds to row 0.
+ $row[$cur_row] = $in->tell;
+ if ($first_time) {
+ print STDERR "\r$cur_row";
+ $cur_row++;
+ advance_to_field($in,"\n");
+ $first_row_end = $in->tell if not defined $first_row_end;
+ if ($in->eof()) {
+ $first_time = 0;
+ $in->seek($row[0],SEEK_SET);
+ $cur_row = 0;
+ }
+ }
+ else {
+ # otherwise, move on to the next row (wrapping to row 0) and
+ # resume at that row's saved position
+ $cur_row = ($cur_row + 1) % @row;
+ $in->seek($row[$cur_row],SEEK_SET);
+ }
+ # progress: how far through the first row we are
+ if ($cur_row == 0) {
+ print STDERR "\r".$in->tell."/$first_row_end";
+ }
+ }
print STDERR "\n";
}
+# Read characters from $fh up to the next unquoted field terminator.
+#
+#   $fh  - input handle positioned at the start of a field
+#   $sep - field separator; an unquoted "\n" always terminates as well
+#
+# Returns the text read, without the terminator (which is consumed).
+# A double quote toggles a quoted state in which separators and
+# newlines are kept as data; the quote characters themselves remain in
+# the returned text.  End of file also ends the field, so input that
+# lacks a final newline (or has an unterminated quote) cannot loop
+# forever.  An empty field is returned as ''.
+sub advance_to_field {
+ my ($fh,$sep) = @_;
+
+ my $in_quotes = 0;
+ my $field = '';
+ # getc returns undef once the input is exhausted
+ while (defined(my $char = $fh->getc())) {
+ if ($char eq '"') {
+ $in_quotes = $in_quotes ? 0 : 1;
+ }
+ if (not $in_quotes and ($char eq $sep or $char eq "\n")) {
+ return $field;
+ }
+ $field .= $char;
+ }
+ # ran out of input before seeing a terminator: return what we have
+ return $field;
+}
+
+
__END__