X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bin%2Fparse_genecard_results;fp=bin%2Fparse_genecard_results;h=4c00d9b47b85b44ef07adb66042f224cd6513e45;hb=d09b67e0af77d6f2818e41d6b4d648cff651c79d;hp=0000000000000000000000000000000000000000;hpb=5d4602c246e5d2c22435bda4c07116251f1fa546;p=function2gene.git diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results new file mode 100755 index 0000000..4c00d9b --- /dev/null +++ b/bin/parse_genecard_results @@ -0,0 +1,180 @@ +#! /usr/bin/perl + +# parse_genecard_results retrieves files of search results from ncbi, +# and is released under the terms of the GPL version 2, or any later +# version, at your option. See the file README and COPYING for more +# information. + +# Copyright 2004 by Don Armstrong . + +# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $ + + +use warnings; +use strict; + + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + + parse_genecard_results [options] + +=head1 SYNOPSIS + + + Options: + --dir, -D directory to stick results into [default .] + --name, -n file naming scheme [default ${search}_results.$format] + --terms, -t file of search terms [default -] + --debug, -d debugging level [default 0] + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. 
+ +=back + +=head1 EXAMPLES + + parse_harvester_results -D ./harvester_results/ -n '${search}_name.html' < search_parameters + +Will pretty much do what you want + +=cut + + + +use vars qw($DEBUG $REVISION); + +BEGIN{ + ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; + $DEBUG = 0 unless defined $DEBUG; +} + +use IO::File; +use IO::Dir; + +# XXX parse config file + +my %options = (debug => 0, + help => 0, + man => 0, + dir => '.', + keyword => undef, + ); + +GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m'); + + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +# CSV columns +use constant {NAME => 0, + REFSEQ => 1, + LOCATION => 2, + ALIAS => 3, + FUNCTION => 4, + DESCRIPTION => 5, + KEYWORD => 6, + DBNAME => 7, + FILENAME => 8, + }; + +if (not -d $options{dir}) { + die "$options{dir} does not exist or is not a directory"; +} + +my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $!"; + +print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); + +while ($_ = $dir->read) { + my $file_name = $_; + next if $file_name =~ /^\./; + next unless -f "$options{dir}/$file_name" and -r "$options{dir}/$file_name"; + + my $file = new IO::File "$options{dir}/$file_name", 'r' or die "Unable to open file $file_name"; + + local $/; + my $result = <$file>; + + my @results; + + # Find gene name + ($results[NAME]) = $result =~ m&(?:Lean|Gene)Card\s+for\s+(?:(?:disorder\s+locus|uncategorized| + hugo\s*reserved\s*symbol|cluster| + potentially\s*expressed\s*sequence)|(?:predicted\s+|pseudo|rna\s+|)gene) + \s*(?:with\s*support\s*|)\s*\s*([^\s]+)\s*&xis; + + $results[NAME] ||= 'NO NAME'; + # Find REF SEQ number + ($results[REFSEQ]) = $result =~ m|http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\? 
+ cmd=Search\&db=nucleotide\&doptcmdl=GenBank\&term=([^\"]+)\"|xis; + + $results[REFSEQ] ||= 'NO REFSEQ'; + + # Find Gene Location + ($results[LOCATION]) = $result =~ m&LocusLink\s+cytogenetic\s+band:\s+ + \s*([^\<]+?)\s*&xis; + + $results[LOCATION] ||= 'NO LOCATION'; + + # Find gene aliases + my ($alias_table) = $result =~ m|Aliases and Descriptions(.+?)|is; + $alias_table ||=''; + + my @gene_aliases = $alias_table =~ m|
  • \s*([^\(]{0,20}?)\s*\(Function:\s+(.+?)(?:
  • )|(?:)&gis; + + # GO Functions + push @functions, (map {s#\s*\s*# #g; $_;} $result =~ m&(GO:\d+\s*.+?)(?:
    |

    )&gis); + $results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions); + $results[FUNCTION] ||= 'NO FUNCTION'; + + # Figure out the keyword used + ($results[KEYWORD]) = $file_name =~ /search=([^&]+)/; + + $results[KEYWORD] ||= 'NO KEYWORD'; + + # Figure out what the description is + $results[DESCRIPTION] = ''; + + # Database searched + $results[DBNAME] = 'genecard'; + $results[FILENAME] = $file_name; + + print join(',',map {qq("$_")} @results),qq(\n); +} + + + + + + +__END__