col_grep

   1 #! /usr/bin/perl
   2 # col_grep greps a column matching a pattern, and is released
   3 # under the terms of the GPL version 2, or any later version, at your
   4 # option. See the file README and COPYING for more information.
   5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
   6 # $Id: perl_script 1153 2008-04-08 00:04:20Z don $
   7
   8
   9 use warnings;
  10 use strict;
  11
  12 use Getopt::Long;
  13 use Pod::Usage;
  14
  15 =head1 NAME
  16
  17 col_grep - grep for a column matching a pattern
  18
  19 =head1 SYNOPSIS
  20
  21 col_grep [options] PATTERN [FILE...]
  22 col_grep [options] [-e PATTERN] [FILE...]
  23
  24  Options:
  25   --delimiter, -d Delimiter to use (default tab)
  26   --regexp, -e pattern to match
  27   --fields, -f field number to match (default 1)
  28   --help, -h display this help
  29   --man, -m display manual
  30
  31 =head1 OPTIONS
  32
  33 =over
  34
  35 =item B<--delimiter,-d>
  36
  37 Delimiter to use, defaults to tab
  38
  39 =item B<--regexp, -e>
  40
  41 Pattern to match
  42
  43 =item B<--fields, -f>
  44
  45 Field number to match; starts at 1
  46
  47 =item B<--help, -h>
  48
  49 Display brief useage information.
  50
  51 =item B<--man, -m>
  52
  53 Display this manual.
  54
  55 =back
  56
  57 =head1 EXAMPLES
  58
  59
  60 =cut
  61
  62 use Scalar::Util qw(looks_like_number);
  63 use Text::CSV;
  64 use vars qw($DEBUG);
  65
  66 my %options = (debug            => 0,
  67                help             => 0,
  68                man              => 0,
  69                delimiter        => "\t",
  70                field            => [],
  71                regexp           => [],
  72                has_header       => 0,
  73               );
  74
  75 GetOptions(\%options,
  76            'delimiter=s',
  77            'field|f=s@',
  78            'regexp|e=s@',
  79            'has_header|has-header!',
  80            'debug|d+','help|h|?','man|m');
  81
  82 pod2usage() if $options{help};
  83 pod2usage({verbose=>2}) if $options{man};
  84
  85 $DEBUG = $options{debug};
  86
  87 my @USAGE_ERRORS;
  88 # if (1) {
  89 #      push @USAGE_ERRORS,"You must pass something";
  90 # }
  91
  92 if (not @{$options{field}}) {
  93     $options{field} = [1];
  94 }
  95
  96 my @header;
  97 my %header;
  98 if ($options{has_header}) {
  99     # we allow for a header
 100 }
 101
 102 my @field_indexes = map { looks_like_number($_) && $_ > 0 ? $_ - 1 : $_;}
 103     map {split /,/} @{$options{field}};
 104 my %field_indexes;
 105 @field_indexes = map {++$field_indexes{$_} > 1 ? ():$_} @field_indexes;
 106
 107 if (grep {not /-?\d+/} @field_indexes and not $options{has_header}) {
 108     push @USAGE_ERRORS,"Invalid field index(es)";
 109 }
 110
 111 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
 112
 113 if (not @{$options{regexp}} and @ARGV) {
 114     push @{$options{regexp}}, shift @ARGV;
 115 }
 116
 117 if (not @ARGV) {
 118     # we'll use this as a special indicator to read stdin
 119     push @ARGV,undef;
 120 }
 121 my %compiled_regexes;
 122 my $csv;
 123 if (length($options{delimiter}) > 1) { # delimiter is a regex, then
 124     #csv will be undef
 125 } else {
 126     $csv = Text::CSV->new({sep_char=>$options{delimiter}}) or
 127         die "Unable to start Text::CSV";
 128 }
 129 FILE: for my $file (@ARGV) {
 130     my $fh;
 131     my $headers_updated = 0;
 132     if (not defined $file) {
 133         $fh = \*STDIN;
 134         $file = "STDIN";
 135     } else {
 136         $fh = IO::File->new($file,'r') or
 137             die "Unable to open $file for reading: $!";
 138     }
 139  LINE: while (<$fh>) {
 140         my $chomped = chomp;
 141         my $line = $_;
 142         my @fields;
 143         if (defined $csv) {
 144             die "Unable to parse line $. of $file: ".$csv->error_diag() unless $csv->parse($_);
 145             @fields = $csv->fields();
 146         } else {
 147             @fields = split /$options{delimiter}/o,$_;
 148         }
 149         # skip lines which don't have enough fields
 150         if ($options{has_header} and not @header) {
 151             @header = @fields;
 152             @header{@header} = 0..$#fields;
 153             print $line;
 154             print $/ if $chomped;
 155             next LINE;
 156         }
 157         if ($options{has_header} and not $headers_updated) {
 158             $headers_updated = 1;
 159             if (@header < @fields) {
 160                 @header{@header} = 1..@fields;
 161             }
 162             my @new_indexes;
 163             for my $index (@field_indexes) {
 164                 push @new_indexes,$index and next if $index =~ /^-?\d+$/;
 165                 if (not exists $header{$index}) {
 166                     use Data::Dumper;
 167                     print STDERR Data::Dumper->Dump([\%header],[qw(*header)]);
 168                     print STDERR "Invalid header $index\n";
 169                     exit 1;
 170                 } else {
 171                     push @new_indexes,$header{$index};
 172                 }
 173             }
 174             @field_indexes = @new_indexes;
 175             print STDERR Data::Dumper->Dump([\@field_indexes],[qw(*field_indexes)]) if $DEBUG;
 176         }
 177         next LINE if grep {not defined $_} @fields[@field_indexes];
 178         my $i = -1;
 179     REGEX: for my $regex (@{$options{regexp}}) {
 180             $i++;
 181             if (length $regex) {
 182                 my @fields_to_examine = map {$fields[$_]} @field_indexes;
 183                 if (@{$options{regexp}} > 1) {
 184                     @fields_to_examine = $fields_to_examine[$i];
 185                 }
 186             FIELDS: for my $field (@fields_to_examine) {
 187                     if (not exists $compiled_regexes{$regex}) {
 188                         $compiled_regexes{$regex} = qr/$regex/;
 189                     }
 190                     print STDERR "regex: $regex field: $field\n" if $DEBUG;
 191                     $field =~ $compiled_regexes{$regex} and next REGEX;
 192                 }
 193                 next LINE;
 194             }
 195         }
 196         print $line;
 197         print $/ if $chomped;
 198     }
 199 }
 200
 201 __END__