From 8e1fe04cb712a2231b9a7112b7a701a67f9ddd31 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Tue, 28 Aug 2007 01:00:57 +0000 Subject: [PATCH] * Update ncbi results to handle the new ncbi output format git-svn-id: file:///srv/svn/function2gene/trunk@7 a0738b58-4706-0410-8799-fb830574a030 --- bin/get_ncbi_results | 60 ++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/bin/get_ncbi_results b/bin/get_ncbi_results index c104356..7a86901 100755 --- a/bin/get_ncbi_results +++ b/bin/get_ncbi_results @@ -73,8 +73,8 @@ BEGIN{ } use IO::File; -use URI::ParamMunge; -use LWP::UserAgent; +use URI; +use WWW::Mechanize; # XXX parse config file @@ -86,9 +86,10 @@ my %options = (debug => 0, dir => '.', name => 'ncbi_${search}_results.$format', terms => '-', + orgn => 'homo', pubmed_site => 'http://www.ncbi.nlm.nih.gov', - pubmed_search_url => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000', - pubmed_get_url => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML', + pubmed_search_url => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000', + pubmed_get_url => '/entrez/query.fcgi?cmd=Text', ); GetOptions(\%options,'format|f=s','database|b=s','name|n=s', @@ -120,30 +121,39 @@ while (<$terms>) { chomp; my $search = $_; my $format = $options{format}; - my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url}, - {term => $search, - db => $options{database}, - }, - ); - my $request = HTTP::Request->new('GET', $url); - my $response = $ua->request($request); - $response = $response->content; - my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g; - + my $uri = URI->new($options{pubmed_site}.$options{pubmed_search_url}); + $uri->query_form($uri->query_form(), + term => $search.' AND '.$options{orgn}.'[Orgn]', + db => $options{database}, + ); + my $url = $uri->as_string; + my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION"); + $mech->get($url); + my @gene_ids; + { + do { + my $response = $mech->content(); + push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*}mg; + last unless $mech->find_link(text => 'Next'); + $mech->follow_link(text => 'Next'); + } while (1); + } my $file_name = eval qq("$options{name}") or die $@; my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!"; # Get XML file my @current_ids; - while (@current_ids = splice(@gene_ids,0,20)) { - $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url}, - {dopt => uc($options{format}), - db => $options{database}, - }, - ) .'&' . join('&',map {qq(uid=$_)} @current_ids); - $request = HTTP::Request->new('GET', $url); - $response = $ua->request($request); - $response = $response->content; + while (@current_ids = splice(@gene_ids,0,5)) { + $uri = URI->new($options{pubmed_site}.$options{pubmed_get_url}); + $uri->query_form($uri->query_form(), + dopt => uc($options{format}), + db => $options{database}, + map {('uid',$_)} @current_ids, + ); + $url = $uri->as_string; + print STDERR "url: $url\n"; + $mech->get($url); + my $response = $mech->content; # For some dumb reason, they send us xml with html # entities. Ditch them. #$response = decode_entities($response); @@ -152,11 +162,11 @@ while (<$terms>) { $response =~ s/"/"/gso; # They also affix a
 and suffix a 
ditch them. + $response =~ s#<\?xml[^>]+>##gso; + $response =~ s#]+>##gso; $response =~ s/^\s*
//gso;
 	  $response =~ s#
\s*$##gso; - $response =~ s#<\?xml[^>]+>##gso; - $response =~ s#]+>##gso; print {$xml_file} $response; sleep 10; -- 2.39.2