#! /usr/bin/perl

# parse_harvester_results parses a directory of saved harvester result
# pages into CSV, and is released under the terms of the GPL version 2,
# or any later version, at your option. See the file README and COPYING
# for more information.

# Copyright 2004 by Don Armstrong.

# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $

# Provenance: reconstructed from a flattened gitweb blobdiff of
# bin/parse_harvester_results (function2gene.git, blob 29d0719, new file).
#
# NOTE(review): the copy this was reconstructed from had been rendered as
# HTML, so text between angle brackets was dropped. That appears to have
# removed the HTML tag portions of most of the regular expressions below
# (and the author's e-mail address above). The patterns are reproduced
# exactly as they survived; every affected one carries its own NOTE(review)
# marker. They must be restored from the upstream repository before the
# output of this script can be trusted.

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;
use IO::File;
use IO::Dir;

=head1 NAME

parse_harvester_results - turn a directory of harvester result files into CSV

=head1 SYNOPSIS

 parse_harvester_results [options] > results.csv

 Options:
  --dir, -D      directory containing the result files [default .]
  --keyword, -k  keyword to record in the Keyword column
                 [default: taken from the directory name]
  --debug, -d    debugging level [default 0]
  --help, -h     display this help
  --man, -m      display manual

=head1 OPTIONS

=over

=item B<--dir, -D>

Directory whose files are parsed. Entries whose names start with a dot,
and entries that are not readable plain files, are skipped. (Default .)

=item B<--keyword, -k>

Value written to the Keyword column of every row. When it is not given,
the keyword is taken from a directory name of the form
F<KEYWORD_results_harvester>; if that does not match either, the column
contains C<NO KEYWORD>.

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 OUTPUT

A header row followed by one row per parsed file, written to standard
output. The columns are Name, RefSeq, Location, Alias, Function,
Description, Keyword, DBName and Filename. Every field is enclosed in
double quotes.

=head1 EXAMPLES

 parse_harvester_results -D ./kinase_results_harvester/ > kinase.csv

Parses every file in F<./kinase_results_harvester/> and records C<kinase>
as the keyword of each row.

=cut


our ($DEBUG, $REVISION);

BEGIN{
    ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
    $DEBUG = 0 unless defined $DEBUG;
}

# XXX parse config file

my %options = (debug   => 0,
               help    => 0,
               man     => 0,
               dir     => '.',
               keyword => undef,
              );

# Unknown or malformed options are a usage error rather than something to
# warn about and then ignore.
GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m')
    or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

# CSV columns
use constant {NAME        => 0,
              REFSEQ      => 1,
              LOCATION    => 2,
              ALIAS       => 3,
              FUNCTION    => 4,
              DESCRIPTION => 5,
              KEYWORD     => 6,
              DBNAME      => 7,
              FILENAME    => 8,
             };

# Format a list of fields as one CSV line: every field is double-quoted,
# embedded double quotes are doubled, undefined fields become empty.
sub csv_line {
    my @fields = @_;
    my @quoted;
    for my $field (@fields) {
        $field = '' unless defined $field;
        $field =~ s/"/""/g;
        push @quoted, qq("$field");
    }
    return join(',', @quoted) . "\n";
}

if (not -d $options{dir}) {
    die "$options{dir} does not exist or is not a directory";
}

my $dir = IO::Dir->new($options{dir})
    or die "Unable to open dir $options{dir}: $!";

print csv_line(qw(Name RefSeq Location Alias Function Description Keyword DBName Filename));

# An explicit --keyword wins; otherwise fall back to the directory name.
my $keyword = $options{keyword};
if (not $keyword) {
    ($keyword) = $options{dir} =~ m#(?:^|/)([^\/]+)_results_harvester#;
}

# "defined" keeps an entry literally named "0" from ending the loop early.
while (defined(my $file_name = $dir->read)) {
    next if $file_name =~ /^\./;

    my $path = "$options{dir}/$file_name";
    next unless -f $path and -r $path;

    my $file = IO::File->new($path, 'r')
        or die "Unable to open file $file_name: $!";

    # Slurp the whole file; the record separator is only undefined for
    # the duration of this one read.
    my $result = do { local $/; <$file> };
    $file->close;
    $result = '' unless defined $result;

    my @results;

    # Find gene name
    # NOTE(review): tag text lost; as it stands this captures the first
    # run of characters that are not "<".
    ($results[NAME]) = $result =~ m&
                                    ([^<]+)
                                   &xis;

    if (not defined $results[NAME]) {
        # NOTE(review): tag text lost between the \s* groups.
        ($results[NAME]) = $result =~ m&\s*Entry\s*name\s*
                                        \s*\s*([^<]+?)\s*\s*&xis;
    }

    $results[NAME] ||= 'NO NAME';

    # Find REF SEQ number
    # NOTE(review): the whole pattern was lost. An empty pattern makes
    # Perl reuse the last successfully matched pattern, so this cannot
    # be extracting a RefSeq id as written.
    ($results[REFSEQ]) = $result =~ m&&xis;

    $results[REFSEQ] ||= 'NO REFSEQ';

    # Find Chromosomal Location
    # NOTE(review): tag text lost between the literal words.
    ($results[LOCATION]) = $result =~ m&Chromosomal\s+Location\s+
                                        Chromosome/Cytoband
                                        \s*([^\<]+?)\s*
                                       &xis;

    $results[LOCATION] ||= 'NO LOCATION';

    # Find gene aliases
    # SOURCE ALIASES
    # NOTE(review): tag text lost around the capture.
    my ($alias_table) = $result =~ m|Aliases\s*
                                     \s+
                                     (.+?)
                                    |xis;
    $alias_table ||= '';

    # NOTE(review): this pattern has no /x, so the line break, the two
    # spaces and the bullet are matched literally. The bullet looks like
    # the text rendering of an HTML list-item tag -- TODO confirm against
    # upstream and restore.
    my @gene_aliases = $alias_table =~ m&
  • \s*([^\(\<]{0,30}?)\s*(?:\<|\()&gis;

    # UNIPROT ALIASES
    # NOTE(review): tag text lost between the \s* groups in both patterns.
    push @gene_aliases, $result =~ m&\s*\s*Synonym\(s\)\s*\s*
                                     \s*([^<]+?)\s*\s*&xis;
    push @gene_aliases, $result =~ m&\s*Description\s*\s*
                                     \s*([^<]+?)\s*\s*&xis;

    $results[ALIAS] = join('; ', @gene_aliases);
    $results[ALIAS] ||= 'NO ALIASES';

    # Find gene function(s)

    # Stanford GO functions
    # NOTE(review): tag text lost before "Ontology" and after the capture.
    my ($gene_ontology) = $result =~ m&\s*
                                       Ontology\s*(.+?)&xis;

    my @functions;
    # NOTE(review): the opening of the capture group was lost together
    # with the tag text, leaving an unbalanced ")". This statement will
    # not compile until the pattern is restored from upstream.
    push @functions, map {s#\s*\"\>\s*# #g; $_;} $gene_ontology =~ m&[^\<]+)&gxis
        if defined $gene_ontology;

    # UNIPROT GO Functions
    # The match is bound to the page text; without a binding operator it
    # would run against $_, which is not the file content.
    # NOTE(review): tag text lost in both the substitution and the match.
    push @functions, map {s#\s*\;?\s*# #g; $_;} $result =~ m&
                                                             (GO\:\d+\;\s+[^\<]+?)\s*&xgis;

    $results[FUNCTION] = join('; ', grep { defined $_ } @functions);
    $results[FUNCTION] ||= 'NO FUNCTION';

    # Figure out the keyword used
    $results[KEYWORD] = $keyword;

    $results[KEYWORD] ||= 'NO KEYWORD';

    # Figure out what the description is; newlines are flattened to
    # spaces so each record stays on one CSV line.
    # NOTE(review): tag text lost after the capture.
    ($results[DESCRIPTION]) = map{s#\n# #g; $_;} $result =~ m&Locus\s+Link\s+Summary(.+?)\s*\s*&is;
    if (not defined $results[DESCRIPTION]) {
        # NOTE(review): tag text lost around "FUNCTION".
        ($results[DESCRIPTION]) = map{s#\n# #g; $_;} $result =~ m&
                                                                  FUNCTION\s*
                                                                  ([^\<]+)\s*&xis;
    }
    $results[DESCRIPTION] ||= '';

    # Database searched
    $results[DBNAME]   = 'harvester';
    $results[FILENAME] = $file_name;

    print csv_line(@results);
}

$dir->close;


__END__