3 # parse_ncbi_results retrieves files of search results from ncbi,
4 # and is released under the terms of the GPL version 2, or any later
5 # version, at your option. See the file README and COPYING for more
8 # Copyright 2004 by Don Armstrong <don@donarmstrong.com>.
10 # $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
22 parse_ncbi_results [options]
28 --dir, -D directory to stick results into [default .]
29 --name, -n file naming scheme [default ${search}_results.$format]
30 --terms, -t file of search terms [default -]
31 --debug, -d debugging level [default 0]
32 --help, -h display this help
33 --man, -m display manual
41 Debug verbosity. (Default 0)
45 Display brief usage information.
55 parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters
57 Will pretty much do what you want
63 use vars qw($DEBUG $REVISION); # package globals: the debug level and the revision string
66 ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; # q$...$ is a '$'-delimited literal; the match captures the token after "LastChangedRevision:"
67 $DEBUG = 0 unless defined $DEBUG; # only supply the default when nothing has set $DEBUG already
70 use XML::Parser::Expat;
73 # XXX parse config file
75 my %options = (debug => 0, # option defaults; the rest of this list is on lines not shown here
83 GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m', # results land in %options under the first name of each spec; 'd+' is presumably Getopt::Long's incremental type (repeat -d to raise the level) -- confirm
88 pod2usage() if $options{help}; # --help: hand off to pod2usage for the brief usage text
89 pod2usage({verbose=>2}) if $options{man}; # --man: ask pod2usage for verbosity level 2 (the manual)
91 $DEBUG = $options{debug}; # publish the requested debug level through the package global
94 use constant {NAME => 0, # named indexes into the per-gene array (used as $$current_gene[NAME] etc.); remaining constants are on lines not shown here
105 my $current_gene = undef; # array ref for the gene record being collected; the text handler ignores input while this is undef
107 my $file_name = undef; # name of the XML file that is opened and handed to the parser
108 my ($within_GO,$mrna_ref_seq) = (0,0); # right-hand side must be parenthesised: "= 0,0" parses as "(... = 0), 0" and leaves $mrna_ref_seq undef
111 my ($expat, $element, %attr) = @_; # start-of-element callback arguments: parser object, element name, attribute name/value pairs (%attr is not used in the lines shown)
113 local $_ = lc $element; # lower-case the tag so the comparison below ignores case
114 if ($_ eq 'entrezgene') { # an <Entrezgene> element opens a new gene record
116 $$current_gene[KEYWORD] = $keyword; # stamp the record with the search keyword currently in effect
117 $$current_gene[DBNAME] = 'ncbi'; # database name is fixed for every record
118 $$current_gene[FILENAME] = $file_name; # remember which input file the record came from
123 my ($expat, $string) = @_; # character-data callback arguments: parser object and the text seen
125 return unless defined $current_gene; # text with no gene record defined is ignored
127 local $_ = lc $expat->current_element; # dispatch on the lower-cased name of the element this text belongs to
129 if ($_ eq 'gene-ref_locus') {
130 $$current_gene[NAME] = $string; # NOTE(review): Expat may deliver one text node in several calls; a plain assignment keeps only the last piece -- confirm against the XML::Parser::Expat docs
132 elsif ($_ eq 'gene-ref_maploc') {
133 $$current_gene[LOCATION] = $string; # map location text
135 elsif ($_ eq 'gene-ref_desc') {
136 push @{$$current_gene[ALIAS]}, $string; # the description text is collected as an alias
138 elsif ($_ eq 'prot-ref_name_e' or $_ eq 'gene-ref_syn_e') {
139 push @{$$current_gene[ALIAS]}, $string; # protein names and gene synonyms go into the same alias list
141 elsif ($_ eq 'entrezgene_summary') {
142 $$current_gene[DESCRIPTION] = $string; # summary text becomes the record's description
144 elsif ($_ eq 'gene-commentary_heading') {
147 $within_GO = 1 if $string =~ /GeneOntology/; # heading text switches on the GeneOntology flag; NOTE(review): no reset of this flag is visible in the lines shown
148 $mrna_ref_seq = 1 if $string =~ /mRNA Sequence/i; # NOTE(review): this flag is set here but never read in the lines shown
150 elsif ($_ eq 'other-source_anchor') {
151 return unless $within_GO; # anchors only count once a GeneOntology heading has been seen
152 push @{$$current_gene[FUNCTION]}, $string; # anchor text is collected as a function entry
154 elsif ($_ eq 'gene-commentary_accession') {
155 return unless $expat->within_element('Gene-commentary_products'); # only accessions nested inside a Gene-commentary_products element are considered
156 $$current_gene[REFSEQ] ||= $string; # ||= keeps the first accession seen and ignores later ones
161 my ($expat, $element) = @_; # end-of-element callback arguments: parser object and the closing element's name
163 local $_ = lc $element; # lower-case the tag so the comparison below ignores case
164 if ($_ eq 'entrezgene') { # closing </Entrezgene>: the record is complete
165 # Emit the collected gene as one CSV row, but only when a record exists and is non-empty
166 if (defined $current_gene and @$current_gene) {
167 $$current_gene[NAME] ||= ${$$current_gene[ALIAS]}[1] if defined $$current_gene[ALIAS]; # fall back to an alias when no name was found; NOTE(review): index [1] is the SECOND alias, not the first -- confirm this is intended
168 for (qw(NAME REFSEQ LOCATION ALIAS
169 FUNCTION DESCRIPTION KEYWORD DBNAME
171 $$current_gene[eval "$_"] ||= "NO $_"; # string eval turns the field name into its constant index; false fields (including "0") become "NO <FIELD>"; NOTE(review): a name-to-index hash would avoid the eval
173 print STDOUT join(',', map {$_ = join('; ', @$_) if ref $_; qq("$_");} @$current_gene),qq(\n); # list fields are flattened with '; ' (assigning to $_ rewrites the record in place), each field is double-quoted, fields are comma-joined; NOTE(review): embedded double quotes are not escaped
179 my $parser = XML::Parser::Expat->new; # direct method call instead of the indirect-object form "new Class"
180 $parser->setHandlers('Start' => \&tag_start, # register the element-start callback
182 'Char' => \&tag_content # register the character-data callback
185 print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); # CSV header row: nine quoted column names
188 if ($options{keywords}) { # NOTE(review): the option spec visible in this file stores its value under 'keyword' (singular) -- confirm 'keywords' is ever set
190 $file_name = "ncbi_${keyword}_results.xml"; # build the input file name from the keyword
193 ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results\.xml$#; # prefer --keyword, else take the leading term of a "<term> AND ..._results.xml" file name; the dot is escaped so only a literal ".xml" matches
195 my $file = IO::File->new($file_name, 'r') or die "Unable to open file $file_name $!"; # open read-only; abort with the OS error if that fails
197 $parser->parse($file); # stream the file through the registered handlers