#! /usr/bin/perl # parse_ncbi_results parses search results retrieved from ncbi, # and is released under the terms of the GPL version 2, or any later # version, at your option. See the file README and COPYING for more # information. # Copyright 2005,7 by Don Armstrong . use warnings; use strict; use Getopt::Long; use Pod::Usage; =head1 NAME parse_ncbi_results [options] =head1 SYNOPSIS Options: --dir, -D directory to stick results into [default .] --name, -n file naming scheme [default ${search}_results.$format] --terms, -t file of search terms [default -] --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief useage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters Will pretty much do what you want =cut use vars qw($DEBUG $REVISION); BEGIN{ ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; $DEBUG = 0 unless defined $DEBUG; } use XML::Parser::Expat; use IO::File; # XXX parse config file my %options = (debug => 0, help => 0, man => 0, dir => '.', keyword => undef, keywords => 0, ); GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m', 'keywords' ); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; # CSV columns use constant {NAME => 0, REFSEQ => 1, LOCATION => 2, ALIAS => 3, FUNCTION => 4, DESCRIPTION => 5, KEYWORD => 6, DBNAME => 7, FILENAME => 8, }; my $current_gene = undef; my $keyword = undef; my $file_name = undef; my ($within_GO,$mrna_ref_seq) = 0,0; sub tag_start{ my ($expat, $element, %attr) = @_; local $_ = lc $element; if ($_ eq 'entrezgene') { $current_gene = []; $$current_gene[KEYWORD] = $keyword; $$current_gene[DBNAME] = 'ncbi'; $$current_gene[FILENAME] = $file_name; } } sub tag_content { my ($expat, $string) = @_; return unless defined $current_gene; local $_ = lc $expat->current_element; if ($_ eq 'gene-ref_locus') { $$current_gene[NAME] = $string; } elsif ($_ eq 'gene-ref_maploc') { $$current_gene[LOCATION] = $string; } elsif ($_ eq 'gene-ref_desc') { push @{$$current_gene[ALIAS]}, $string; } elsif ($_ eq 'prot-ref_name_e' or $_ eq 'gene-ref_syn_e') { push @{$$current_gene[ALIAS]}, $string; } elsif ($_ eq 'entrezgene_summary') { $$current_gene[DESCRIPTION] = $string; } elsif ($_ eq 'gene-commentary_heading') { $within_GO = 0; $mrna_ref_seq = 0; $within_GO = 1 if $string =~ /GeneOntology/; $mrna_ref_seq = 1 if $string =~ /mRNA Sequence/i; } elsif ($_ eq 'other-source_anchor') { return unless $within_GO; push @{$$current_gene[FUNCTION]}, $string; } elsif ($_ eq 'gene-commentary_accession') { return unless $expat->within_element('Gene-commentary_products'); $$current_gene[REFSEQ] ||= $string; } } sub tag_stop { my ($expat, $element) = @_; local $_ = lc $element; if ($_ eq 'entrezgene') { # If current_gene is defined, output the current gene information if (defined $current_gene and @$current_gene) { $$current_gene[NAME] ||= ${$$current_gene[ALIAS]}[1] if defined $$current_gene[ALIAS]; for (qw(NAME REFSEQ LOCATION ALIAS FUNCTION DESCRIPTION KEYWORD DBNAME FILENAME)) { $$current_gene[eval "$_"] ||= "NO $_"; } print STDOUT join(',', map {$_ = join('; ', @$_) if ref $_; qq("$_");} @$current_gene),qq(\n); undef $current_gene; } } } my $parser = new XML::Parser::Expat; $parser->setHandlers('Start' => \&tag_start, 'End' => \&tag_stop, 'Char' => \&tag_content ); print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); for (@ARGV) { $file_name = $_; if ($options{keywords}) { $keyword = $_; $file_name = "ncbi_${keyword}_results.xml"; } else { ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results.xml$#; } my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; $parser->parse($file); undef $file; } __END__