#! /usr/bin/perl
# col_grep greps a column matching a pattern, and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
# $Id: perl_script 1153 2008-04-08 00:04:20Z don $


use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

=head1 NAME

col_grep - grep for a column matching a pattern

=head1 SYNOPSIS

col_grep [options] PATTERN [FILE...]
col_grep [options] [-e PATTERN] [FILE...]

 Options:
  --delimiter, -d Delimiter to use (default tab)
  --regexp, -e pattern to match
  --fields, -f field number to match (default 1)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--delimiter,-d>

Delimiter to use, defaults to tab

=item B<--regexp, -e>

Pattern to match

=item B<--fields, -f>

Field number to match; starts at 1

=item B<--help, -h>

Display brief useage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES


=cut

use Scalar::Util qw(looks_like_number);
use Text::CSV;
use vars qw($DEBUG);

my %options = (debug            => 0,
               help             => 0,
               man              => 0,
               delimiter        => "\t",
               field            => [],
               regexp           => [],
               has_header       => 0,
	      );

GetOptions(\%options,
           'delimiter=s',
           'field|f=s@',
           'regexp|e=s@',
           'has_header|has-header!',
           'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

my @USAGE_ERRORS;
# if (1) {
#      push @USAGE_ERRORS,"You must pass something";
# }

if (not @{$options{field}}) {
    $options{field} = [1];
}

my @header;
my %header;
if ($options{has_header}) {
    # we allow for a header
}

my @field_indexes = map { looks_like_number($_) && $_ > 0 ? $_ - 1 : $_;}
    map {split /,/} @{$options{field}};
my %field_indexes;
@field_indexes{@field_indexes} = @field_indexes;
@field_indexes = values %field_indexes;

if (grep {not /-?\d+/} @field_indexes and not $options{has_header}) {
    push @USAGE_ERRORS,"Invalid field index(es)";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

if (not @{$options{regexp}} and @ARGV) {
    push @{$options{regexp}}, shift @ARGV;
}

if (not @ARGV) {
    # we'll use this as a special indicator to read stdin
    push @ARGV,undef;
}
my %compiled_regexes;
my $csv = Text::CSV->new({sep_char=>$options{delimiter}});
FILE: for my $file (@ARGV) {
    my $fh;
    my $headers_updated = 0;
    if (not defined $file) {
        $fh = \*STDIN;
        $file = "STDIN";
    } else {
        $fh = IO::File->new($file,'r') or
            die "Unable to open $file for reading: $!";
    }
 LINE: while (<$fh>) {
        my $chomped = chomp;
        my $line = $_;
        die "Unable to parse line $. of $file: ".$csv->error_diag() unless $csv->parse($_);
        my @fields = $csv->fields();
        # skip lines which don't have enough fields
        if ($options{has_header} and not @header) {
            @header = @fields;
            @header{@header} = 0..$#fields;
            print $line;
            print $/ if $chomped;
            next LINE;
        }
        if ($options{has_header} and not $headers_updated) {
            $headers_updated = 0;
            if (@header < @fields) {
                @header{@header} = 1..@fields;
            }
            my @new_indexes;
            for my $index (@field_indexes) {
                push @new_indexes,$index and next if $index =~ /^-?\d+$/;
                if (not exists $header{$index}) {
                    use Data::Dumper;
                    print STDERR Dumper(\%header);
                    print STDERR "Invalid header $index\n";
                    exit 1;
                } else {
                    push @new_indexes,$header{$index};
                }
            }
            @field_indexes = @new_indexes;
        }
        next LINE if grep {not defined $_} @fields[@field_indexes];
    REGEX: for my $regex (@{$options{regexp}}) {
        FIELDS: for my $field (@fields[@field_indexes]) {
                if (length $regex) {
                    if (not exists $compiled_regexes{$regex}) {
                        $compiled_regexes{$regex} = qr/$regex/;
                    }
                    $field =~ $compiled_regexes{$regex} or next LINE;
                }
            }
        }
        print $line;
        print $/ if $chomped;
    }
}

__END__