3 # parse_ncbi_results retrieves files of search results from ncbi,
4 # and is released under the terms of the GPL version 2, or any later
5 # version, at your option. See the file README and COPYING for more
8 # Copyright 2004 by Don Armstrong <don@donarmstrong.com>.
10 # $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
22 parse_ncbi_results [options]
28 --dir, -D directory to stick results into [default .]
29 --name, -n file naming scheme [default ${search}_results.$format]
30 --terms, -t file of search terms [default -]
31 --debug, -d debugging level [default 0]
32 --help, -h display this help
33 --man, -m display manual
41 Debug verbosity. (Default 0)
45 Display brief usage information.
55 parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters
57 Will pretty much do what you want
63 use vars qw($DEBUG $REVISION);
66 ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
67 $DEBUG = 0 unless defined $DEBUG;
70 use XML::Parser::Expat;
73 # XXX parse config file
75 my %options = (debug => 0,
82 GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m');
85 pod2usage() if $options{help};
86 pod2usage({verbose=>2}) if $options{man};
88 $DEBUG = $options{debug};
91 use constant {NAME => 0,
102 my $current_gene = undef;
104 my $file_name = undef;
# Parser state flags, both starting at 0. The right-hand side must be a
# parenthesised list: without the parentheses the assignment binds tighter
# than the comma, so only $within_GO would be set and $mrna_ref_seq would
# stay undef.
105 my ($within_GO,$mrna_ref_seq) = (0,0);
# Element-start callback body (presumably tag_start, which is registered as
# the 'Start' handler -- confirm against the full file).
# Arguments: the parser object, the element name, and the element's
# attributes as a flat key/value list.
108 my ($expat, $element, %attr) = @_;
# Element names are compared case-insensitively via a localized, lower-cased $_.
110 local $_ = lc $element;
# On the start of an 'entrezgene' element, stamp the gene record with the
# search keyword, the fixed database name 'ncbi', and the input file name.
111 if ($_ eq 'entrezgene') {
113 $$current_gene[KEYWORD] = $keyword;
114 $$current_gene[DBNAME] = 'ncbi';
115 $$current_gene[FILENAME] = $file_name;
# Character-data callback body (presumably tag_content, which is registered
# as the 'Char' handler -- confirm against the full file).
# Arguments: the parser object and a piece of text from the current element.
# NOTE(review): scalar fields below are assigned, not appended, so this
# assumes an element's text arrives in a single call -- confirm.
120 my ($expat, $string) = @_;
# Text seen while no gene record is active is ignored.
122 return unless defined $current_gene;
# Dispatch on the lower-cased name of the element that encloses this text.
124 local $_ = lc $expat->current_element;
# Locus text becomes the record's NAME.
126 if ($_ eq 'gene-ref_locus') {
127 $$current_gene[NAME] = $string;
# Map location text becomes LOCATION.
129 elsif ($_ eq 'gene-ref_maploc') {
130 $$current_gene[LOCATION] = $string;
# The gene description is collected into the ALIAS list.
132 elsif ($_ eq 'gene-ref_desc') {
133 push @{$$current_gene[ALIAS]}, $string;
# Protein names and gene synonyms are collected into the same ALIAS list.
135 elsif ($_ eq 'prot-ref_name_e' or $_ eq 'gene-ref_syn_e') {
136 push @{$$current_gene[ALIAS]}, $string;
# Summary text becomes DESCRIPTION.
138 elsif ($_ eq 'entrezgene_summary') {
139 $$current_gene[DESCRIPTION] = $string;
# Commentary headings switch on the state flags when their text matches;
# neither flag is cleared in the lines shown here.
141 elsif ($_ eq 'gene-commentary_heading') {
144 $within_GO = 1 if $string =~ /GeneOntology/;
145 $mrna_ref_seq = 1 if $string =~ /mRNA Sequence/i;
# Anchor text is recorded as a FUNCTION entry only while $within_GO is set.
147 elsif ($_ eq 'other-source_anchor') {
148 return unless $within_GO;
149 push @{$$current_gene[FUNCTION]}, $string;
# Accessions count only inside a Gene-commentary_products element; ||= keeps
# the first true value seen as REFSEQ.
151 elsif ($_ eq 'gene-commentary_accession') {
152 return unless $expat->within_element('Gene-commentary_products');
153 $$current_gene[REFSEQ] ||= $string;
# Element-end callback body (presumably the 'End' handler -- confirm against
# the full file).
# Arguments: the parser object and the name of the element being closed.
158 my ($expat, $element) = @_;
# Element names are compared case-insensitively via a localized, lower-cased $_.
160 local $_ = lc $element;
161 if ($_ eq 'entrezgene') {
162 # If current_gene is defined, output the current gene information
163 if (defined $current_gene and @$current_gene) {
# When NAME is still empty and aliases were collected, fall back to the alias
# at index 1 (the second entry pushed).
# NOTE(review): confirm that index 1, not 0, is intended.
164 $$current_gene[NAME] ||= ${$$current_gene[ALIAS]}[1] if defined $$current_gene[ALIAS];
165 for (qw(NAME REFSEQ LOCATION ALIAS
166 FUNCTION DESCRIPTION KEYWORD DBNAME
168 $$current_gene[__PACKAGE__->can($_)->()] ||= "NO $_"; # "use constant" names are subs: look the index up by name instead of string-eval, then default empty fields to "NO <FIELD>"
# Emit the gene record as one CSV row. List-valued fields are flattened in
# place with '; ' (as before); each field is then copied and any embedded
# double quote is doubled so the quoted field remains valid CSV.
170 print STDOUT join(',', map {$_ = join('; ', @$_) if ref $_; (my $field = $_) =~ s/"/""/g; qq("$field");} @$current_gene),qq(\n);
# Construct the Expat parser with an explicit class-method call; the
# indirect-object form ("new Class") is ambiguous to the Perl parser and
# discouraged.
176 my $parser = XML::Parser::Expat->new;
177 $parser->setHandlers('Start' => \&tag_start,
179 'Char' => \&tag_content
# Use the --keyword option when given; otherwise take the keyword from a file
# name of the form "<keyword> AND <terms>_results.xml". The right operand of
# || is evaluated in list context here, so the capture (not a match count) is
# assigned. The dot before "xml" is escaped so only a literal ".xml" matches.
184 ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results\.xml$#;
185 print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n);
# Open the input file read-only and die with the OS error if that fails. An
# explicit IO::File->new(...) call replaces the indirect-object form, whose
# parsing depends on which names are already known to the compiler.
186 my $file = IO::File->new($file_name, 'r') or die "Unable to open file $file_name $!";
188 $parser->parse($file);