X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bin%2Fparse_ncbi_results;fp=bin%2Fparse_ncbi_results;h=51d339db66e38bb02918c5d0c3ce94f760558b30;hb=d09b67e0af77d6f2818e41d6b4d648cff651c79d;hp=0000000000000000000000000000000000000000;hpb=5d4602c246e5d2c22435bda4c07116251f1fa546;p=function2gene.git diff --git a/bin/parse_ncbi_results b/bin/parse_ncbi_results new file mode 100755 index 0000000..51d339d --- /dev/null +++ b/bin/parse_ncbi_results @@ -0,0 +1,198 @@ +#! /usr/bin/perl + +# parse_ncbi_results retreives files of search results from ncbi, +# and is released under the terms of the GPL version 2, or any later +# version, at your option. See the file README and COPYING for more +# information. + +# Copyright 2004 by Don Armstrong . + +# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $ + + +use warnings; +use strict; + + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + + parse_ncbi_results [options] + +=head1 SYNOPSIS + + + Options: + --dir, -D directory to stick results into [default .] + --name, -n file naming scheme [default ${search}_results.$format] + --terms, -t file of search terms [default -] + --debug, -d debugging level [default 0] + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief useage information. + +=item B<--man, -m> + +Display this manual. + +=back + +=head1 EXAMPLES + + parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters + +Will pretty much do what you want + +=cut + + + +use vars qw($DEBUG $REVISION); + +BEGIN{ + ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; + $DEBUG = 0 unless defined $DEBUG; +} + +use XML::Parser::Expat; +use IO::File; + +# XXX parse config file + +my %options = (debug => 0, + help => 0, + man => 0, + dir => '.', + keyword => undef, + ); + +GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m'); + + +pod2usage() if $options{help}; +pod2usage({verbose=>2}) if $options{man}; + +$DEBUG = $options{debug}; + +# CSV columns +use constant {NAME => 0, + REFSEQ => 1, + LOCATION => 2, + ALIAS => 3, + FUNCTION => 4, + DESCRIPTION => 5, + KEYWORD => 6, + DBNAME => 7, + FILENAME => 8, + }; + +my $current_gene = undef; +my $keyword = undef; +my $file_name = undef; +my ($within_GO,$mrna_ref_seq) = 0,0; + +sub tag_start{ + my ($expat, $element, %attr) = @_; + + local $_ = lc $element; + if ($_ eq 'entrezgene') { + $current_gene = []; + $$current_gene[KEYWORD] = $keyword; + $$current_gene[DBNAME] = 'ncbi'; + $$current_gene[FILENAME] = $file_name; + } +} + +sub tag_content { + my ($expat, $string) = @_; + + return unless defined $current_gene; + + local $_ = lc $expat->current_element; + + if ($_ eq 'gene-ref_locus') { + $$current_gene[NAME] = $string; + } + elsif ($_ eq 'gene-ref_maploc') { + $$current_gene[LOCATION] = $string; + } + elsif ($_ eq 'gene-ref_desc') { + push @{$$current_gene[ALIAS]}, $string; + } + elsif ($_ eq 'prot-ref_name_e' or $_ eq 'gene-ref_syn_e') { + push @{$$current_gene[ALIAS]}, $string; + } + elsif ($_ eq 'entrezgene_summary') { + $$current_gene[DESCRIPTION] = $string; + } + elsif ($_ eq 'gene-commentary_heading') { + $within_GO = 0; + $mrna_ref_seq = 0; + $within_GO = 1 if $string =~ /GeneOntology/; + $mrna_ref_seq = 1 if $string =~ /mRNA Sequence/i; + } + elsif ($_ eq 'other-source_anchor') { + return unless $within_GO; + push @{$$current_gene[FUNCTION]}, $string; + } + elsif ($_ eq 'gene-commentary_accession') { + return unless $expat->within_element('Gene-commentary_products'); + $$current_gene[REFSEQ] ||= $string; + } +} + +sub tag_stop { + my ($expat, $element) = @_; + + local $_ = lc $element; + if ($_ eq 'entrezgene') { + # If current_gene is defined, output the current gene information + if (defined $current_gene and @$current_gene) { + $$current_gene[NAME] ||= ${$$current_gene[ALIAS]}[1] if defined $$current_gene[ALIAS]; + for (qw(NAME REFSEQ LOCATION ALIAS + FUNCTION DESCRIPTION KEYWORD DBNAME + FILENAME)) { + $$current_gene[eval "$_"] ||= "NO $_"; + } + print STDOUT join(',', map {$_ = join('; ', @$_) if ref $_; qq("$_");} @$current_gene),qq(\n); + undef $current_gene; + } + } +} + +my $parser = new XML::Parser::Expat; +$parser->setHandlers('Start' => \&tag_start, + 'End' => \&tag_stop, + 'Char' => \&tag_content + ); + +for (@ARGV) { + $file_name = $_; + ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results.xml$#; + print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); + my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; + + $parser->parse($file); + + undef $file; +} + + + + + + +__END__