X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fget_ncbi_results;h=e126d5f687fc984107aafcb1263986158ddd2086;hp=c1043562a3bb50db7244274673c465caba3e7eb6;hb=67d86ccaa9e63139266e501f33c69cd30803f6eb;hpb=2f7bfa126e44399e0d7c3f3b755afbe895ac6e3a diff --git a/bin/get_ncbi_results b/bin/get_ncbi_results index c104356..e126d5f 100755 --- a/bin/get_ncbi_results +++ b/bin/get_ncbi_results @@ -73,8 +73,8 @@ BEGIN{ } use IO::File; -use URI::ParamMunge; -use LWP::UserAgent; +use URI; +use WWW::Mechanize; # XXX parse config file @@ -86,9 +86,10 @@ my %options = (debug => 0, dir => '.', name => 'ncbi_${search}_results.$format', terms => '-', - pubmed_site => 'http://www.ncbi.nlm.nih.gov', - pubmed_search_url => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000', - pubmed_get_url => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML', + orgn => 'homo', + ncbi_site => 'http://www.ncbi.nlm.nih.gov', + ncbi_search_url => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000', + ncbi_get_url => '/entrez/query.fcgi?cmd=Text', ); GetOptions(\%options,'format|f=s','database|b=s','name|n=s', @@ -120,30 +121,49 @@ while (<$terms>) { chomp; my $search = $_; my $format = $options{format}; - my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url}, - {term => $search, - db => $options{database}, - }, - ); - my $request = HTTP::Request->new('GET', $url); - my $response = $ua->request($request); - $response = $response->content; - my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g; - + my $uri = URI->new($options{ncbi_site}.$options{ncbi_search_url}); + $uri->query_form($uri->query_form(), + term => $search.' AND '.$options{orgn}.'[Orgn]', + db => $options{database}, + ); + my $url = $uri->as_string; + my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION"); + $mech->get($url); + my @gene_ids; + { + do { + my $response = $mech->content(); + push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*}mg; + last unless $mech->find_link(text => 'Next'); + $mech->follow_link(text => 'Next'); + } while (1); + } my $file_name = eval qq("$options{name}") or die $@; my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!"; # Get XML file my @current_ids; - while (@current_ids = splice(@gene_ids,0,20)) { - $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url}, - {dopt => uc($options{format}), - db => $options{database}, - }, - ) .'&' . join('&',map {qq(uid=$_)} @current_ids); - $request = HTTP::Request->new('GET', $url); - $response = $ua->request($request); - $response = $response->content; + print {$xml_file} "\n"; + while (@current_ids = splice(@gene_ids,0,5)) { + $uri = URI->new($options{ncbi_site}.$options{ncbi_get_url}); + $uri->query_form($uri->query_form(), + dopt => uc($options{format}), + db => $options{database}, + map {('uid',$_)} @current_ids, + ); + $url = $uri->as_string; + print STDERR "url: $url\n"; + $mech->get($url); + my $response = $mech->content; + my $retry_count=5; + while ($response =~ /Error reading from remote server/ and $retry_count > 0) { + $mech->get($url); + $response = $mech->content; + $retry_count--; + } + if ($retry_count <= 0) { + die 'Unable to retreive ids ['.join(',',@current_ids).'] because of a remote server error'; + } # For some dumb reason, they send us xml with html # entities. Ditch them. #$response = decode_entities($response); @@ -152,15 +172,15 @@ while (<$terms>) { $response =~ s/"/"/gso; # They also affix a
 and suffix a 
ditch them. - $response =~ s/^\s*
//gso;
-	  $response =~ s#
\s*$##gso; - $response =~ s#<\?xml[^>]+>##gso; $response =~ s#]+>##gso; + $response =~ s/^\s*
//gso;
+	  $response =~ s#
\s*$##gso; print {$xml_file} $response; sleep 10; } + print {$xml_file} "
\n"; undef $xml_file; }