* Update ncbi results to handle the new ncbi output format

[function2gene.git] / bin / get_ncbi_results
diff --git a/bin/get_ncbi_results b/bin/get_ncbi_results

index c1043562a3bb50db7244274673c465caba3e7eb6..7a869014bc15525ab80f36c0fc02ad50ac94f4eb 100755 (executable)
--- a/bin/get_ncbi_results
+++ b/bin/get_ncbi_results
@@ -73,8 +73,8 @@ BEGIN{
  }
  
  use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
  
  # XXX parse config file
  
@@ -86,9 +86,10 @@ my %options = (debug    => 0,
                dir      => '.',
                name     => 'ncbi_${search}_results.$format',
                terms    => '-',
+              orgn     => 'homo',
                pubmed_site => 'http://www.ncbi.nlm.nih.gov',
-              pubmed_search_url  => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000',
-              pubmed_get_url     => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML',
+              pubmed_search_url  => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000',
+              pubmed_get_url     => '/entrez/query.fcgi?cmd=Text',
               );
  
  GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
@@ -120,30 +121,39 @@ while (<$terms>) {
       chomp;
       my $search = $_;
       my $format = $options{format};
-     my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url},
-                              {term => $search,
-                               db   => $options{database},
-                              },
-                             );
-     my $request = HTTP::Request->new('GET', $url);
-     my $response = $ua->request($request);
-     $response = $response->content;
-     my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g;
-
+     my $uri = URI->new($options{pubmed_site}.$options{pubmed_search_url});
+     $uri->query_form($uri->query_form(),
+                     term => $search.' AND '.$options{orgn}.'[Orgn]',
+                     db   => $options{database},
+                    );
+     my $url = $uri->as_string;
+     my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION");
+     $mech->get($url);
+     my @gene_ids;
+     {
+         do {
+              my $response = $mech->content();
+              push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*</td>}mg;
+              last unless $mech->find_link(text => 'Next');
+              $mech->follow_link(text => 'Next');
+         } while (1);
+     }
       my $file_name = eval qq("$options{name}") or die $@;
       my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!";
  
       # Get XML file
       my @current_ids;
-     while (@current_ids = splice(@gene_ids,0,20)) {
-         $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url},
-                                {dopt => uc($options{format}),
-                                 db   => $options{database},
-                                },
-                               ) .'&' . join('&',map {qq(uid=$_)} @current_ids);
-         $request = HTTP::Request->new('GET', $url);
-         $response = $ua->request($request);
-         $response = $response->content;
+     while (@current_ids = splice(@gene_ids,0,5)) {
+         $uri = URI->new($options{pubmed_site}.$options{pubmed_get_url});
+         $uri->query_form($uri->query_form(),
+                          dopt => uc($options{format}),
+                          db   => $options{database},
+                          map {('uid',$_)} @current_ids,
+                         );
+         $url = $uri->as_string;
+         print STDERR "url: $url\n";
+         $mech->get($url);
+         my $response = $mech->content;
           # For some dumb reason, they send us xml with html
           # entities. Ditch them.
           #$response = decode_entities($response);
@@ -152,11 +162,11 @@ while (<$terms>) {
           $response =~ s/&quot;/"/gso;
  
           # They also affix a <pre> and suffix a </pre> ditch them.
+         $response =~ s#<\?xml[^>]+>##gso;
+         $response =~ s#<!DOCTYPE[^>]+>##gso;
           $response =~ s/^\s*<pre>//gso;
           $response =~ s#</pre>\s*$##gso;
  
-         $response =~ s#<\?xml[^>]+>##gso;
-         $response =~ s#<!DOCTYPE[^>]+>##gso;
  
           print {$xml_file} $response;
           sleep 10;