3 # parse_ncbi_results parses search results retrieved from ncbi,
4 # and is released under the terms of the GPL version 2, or any later
5 # version, at your option. See the file README and COPYING for more
8 # Copyright 2005,7 by Don Armstrong <don@donarmstrong.com>.
21 parse_ncbi_results [options]
27 --dir, -D directory to stick results into [default .]
28 --name, -n file naming scheme [default ${search}_results.$format]
29 --terms, -t file of search terms [default -]
30 --debug, -d debugging level [default 0]
31 --help, -h display this help
32 --man, -m display manual
40 Debug verbosity. (Default 0)
44 Display brief usage information.
54 parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters
56 Will pretty much do what you want
62 use vars qw($DEBUG $REVISION); # declare $DEBUG and $REVISION as package globals
65 ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; # extract the revision token from the keyword string
66 $DEBUG = 0 unless defined $DEBUG; # default debug level when nothing set it earlier
69 use XML::Parser::Expat;
72 # XXX parse config file
74 my %options = (debug => 0,
82 GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m',
87 pod2usage() if $options{help}; # help requested: show usage via Pod::Usage
88 pod2usage({verbose=>2}) if $options{man}; # manual requested: verbose level 2 output
90 $DEBUG = $options{debug}; # the parsed option value replaces the default assigned above
93 use constant {NAME => 0,
104 my $current_gene = undef; # record being filled in for the gene element currently open
106 my $file_name = undef; # name of the results file being parsed
107 my ($within_GO,$mrna_ref_seq) = (0,0); # parenthesised list so both flags start at 0; a bare "= 0,0" initialises only the first
110 my ($expat, $element, %attr) = @_; # parser object, element name, and the element's attributes
112 local $_ = lc $element; # compare element names case-insensitively
113 if ($_ eq 'entrezgene') {
115 $$current_gene[KEYWORD] = $keyword; # tag the record with the current search keyword
116 $$current_gene[DBNAME] = 'ncbi'; # database name is always recorded as 'ncbi'
117 $$current_gene[FILENAME] = $file_name; # remember which file the record came from
122 my ($expat, $string) = @_; # parser object and the chunk of character data
124 return unless defined $current_gene; # nothing to fill in until a gene record exists
126 local $_ = lc $expat->current_element; # dispatch on the lower-cased name of the enclosing element
128 if ($_ eq 'gene-ref_locus') {
129 $$current_gene[NAME] = $string; # NOTE(review): plain assignment keeps only this chunk; Expat may deliver one text run in several calls -- confirm that is acceptable
131 elsif ($_ eq 'gene-ref_maploc') {
132 $$current_gene[LOCATION] = $string;
134 elsif ($_ eq 'gene-ref_desc') {
135 push @{$$current_gene[ALIAS]}, $string; # the description text is stored in the alias list
137 elsif ($_ eq 'prot-ref_name_e' or $_ eq 'gene-ref_syn_e') {
138 push @{$$current_gene[ALIAS]}, $string; # both element types feed the same alias list
140 elsif ($_ eq 'entrezgene_summary') {
141 $$current_gene[DESCRIPTION] = $string;
143 elsif ($_ eq 'gene-commentary_heading') {
146 $within_GO = 1 if $string =~ /GeneOntology/; # flag consulted by the 'other-source_anchor' branch below
147 $mrna_ref_seq = 1 if $string =~ /mRNA Sequence/i; # set here; no reader of this flag is visible in this excerpt
149 elsif ($_ eq 'other-source_anchor') {
150 return unless $within_GO; # anchors are recorded only after a GeneOntology heading has set the flag
151 push @{$$current_gene[FUNCTION]}, $string;
153 elsif ($_ eq 'gene-commentary_accession') {
154 return unless $expat->within_element('Gene-commentary_products'); # only accessions nested inside a products element are used
155 $$current_gene[REFSEQ] ||= $string; # ||= keeps the first true value seen
160 my ($expat, $element) = @_; # parser object and element name (presumably the end-of-element callback -- its sub header is outside this excerpt)
162 local $_ = lc $element; # compare element names case-insensitively
163 if ($_ eq 'entrezgene') {
164 # If current_gene is defined, output the current gene information
165 if (defined $current_gene and @$current_gene) {
166 $$current_gene[NAME] ||= ${$$current_gene[ALIAS]}[1] if defined $$current_gene[ALIAS]; # fall back to an alias when NAME is unset. NOTE(review): index 1 is the second alias collected -- confirm 0 was not intended
167 for (qw(NAME REFSEQ LOCATION ALIAS
168 FUNCTION DESCRIPTION KEYWORD DBNAME
170 $$current_gene[eval "$_"] ||= "NO $_"; # string eval resolves the field name to its constant index; unset or false fields become "NO <FIELD>"
172 print STDOUT join(',', map {$_ = join('; ', @$_) if ref $_; qq("$_");} @$current_gene),qq(\n); # one CSV row: reference fields are joined with '; ' in place, every value is wrapped in double quotes (embedded quotes are not escaped)
178 my $parser = new XML::Parser::Expat; # create the Expat parser
179 $parser->setHandlers('Start' => \&tag_start,
181 'Char' => \&tag_content
184 print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); # CSV header row, double-quoted like the data rows
187 if ($options{keywords}) { # NOTE(review): tests 'keywords' while the visible option spec and the lookup below use 'keyword' -- confirm
189 $file_name = "ncbi_${keyword}_results.xml";
192 ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results.xml$#; # an explicit keyword option wins; otherwise capture the file-name text preceding the AND separator
194 my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; # open read-only; die with the OS error on failure
196 $parser->parse($file);