]> git.donarmstrong.com Git - function2gene.git/blobdiff - bin/get_ncbi_results
* Fix "rety" typo
[function2gene.git] / bin / get_ncbi_results
index c1043562a3bb50db7244274673c465caba3e7eb6..e126d5f687fc984107aafcb1263986158ddd2086 100755 (executable)
@@ -73,8 +73,8 @@ BEGIN{
 }
 
 use IO::File;
 }
 
 use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
 
 # XXX parse config file
 
 
 # XXX parse config file
 
@@ -86,9 +86,10 @@ my %options = (debug    => 0,
               dir      => '.',
               name     => 'ncbi_${search}_results.$format',
               terms    => '-',
               dir      => '.',
               name     => 'ncbi_${search}_results.$format',
               terms    => '-',
-              pubmed_site => 'http://www.ncbi.nlm.nih.gov',
-              pubmed_search_url  => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000',
-              pubmed_get_url     => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML',
+              orgn     => 'homo',
+              ncbi_site => 'http://www.ncbi.nlm.nih.gov',
+              ncbi_search_url  => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000',
+              ncbi_get_url     => '/entrez/query.fcgi?cmd=Text',
              );
 
 GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
              );
 
 GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
@@ -120,30 +121,49 @@ while (<$terms>) {
      chomp;
      my $search = $_;
      my $format = $options{format};
      chomp;
      my $search = $_;
      my $format = $options{format};
-     my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url},
-                              {term => $search,
-                               db   => $options{database},
-                              },
-                             );
-     my $request = HTTP::Request->new('GET', $url);
-     my $response = $ua->request($request);
-     $response = $response->content;
-     my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g;
-
+     my $uri = URI->new($options{ncbi_site}.$options{ncbi_search_url});
+     $uri->query_form($uri->query_form(),
+                     term => $search.' AND '.$options{orgn}.'[Orgn]',
+                     db   => $options{database},
+                    );
+     my $url = $uri->as_string;
+     my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION");
+     $mech->get($url);
+     my @gene_ids;
+     {
+         do {
+              my $response = $mech->content();
+              push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*</td>}mg;
+              last unless $mech->find_link(text => 'Next');
+              $mech->follow_link(text => 'Next');
+         } while (1);
+     }
      my $file_name = eval qq("$options{name}") or die $@;
      my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!";
 
      # Get XML file
      my @current_ids;
      my $file_name = eval qq("$options{name}") or die $@;
      my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!";
 
      # Get XML file
      my @current_ids;
-     while (@current_ids = splice(@gene_ids,0,20)) {
-         $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url},
-                                {dopt => uc($options{format}),
-                                 db   => $options{database},
-                                },
-                               ) .'&' . join('&',map {qq(uid=$_)} @current_ids);
-         $request = HTTP::Request->new('GET', $url);
-         $response = $ua->request($request);
-         $response = $response->content;
+     print {$xml_file} "<opt>\n";
+     while (@current_ids = splice(@gene_ids,0,5)) {
+         $uri = URI->new($options{ncbi_site}.$options{ncbi_get_url});
+         $uri->query_form($uri->query_form(),
+                          dopt => uc($options{format}),
+                          db   => $options{database},
+                          map {('uid',$_)} @current_ids,
+                         );
+         $url = $uri->as_string;
+         print STDERR "url: $url\n";
+         $mech->get($url);
+         my $response = $mech->content;
+         my $retry_count=5;
+         while ($response =~ /Error reading from remote server/ and $retry_count > 0) {
+              $mech->get($url);
+              $response = $mech->content;
+              $retry_count--;
+         }
+         if ($retry_count <= 0) {
+              die 'Unable to retreive ids ['.join(',',@current_ids).'] because of a remote server error';
+         }
          # For some dumb reason, they send us xml with html
          # entities. Ditch them.
          #$response = decode_entities($response);
          # For some dumb reason, they send us xml with html
          # entities. Ditch them.
          #$response = decode_entities($response);
@@ -152,15 +172,15 @@ while (<$terms>) {
          $response =~ s/&quot;/"/gso;
 
          # They also affix a <pre> and suffix a </pre> ditch them.
          $response =~ s/&quot;/"/gso;
 
          # They also affix a <pre> and suffix a </pre> ditch them.
-         $response =~ s/^\s*<pre>//gso;
-         $response =~ s#</pre>\s*$##gso;
-
          $response =~ s#<\?xml[^>]+>##gso;
          $response =~ s#<!DOCTYPE[^>]+>##gso;
          $response =~ s#<\?xml[^>]+>##gso;
          $response =~ s#<!DOCTYPE[^>]+>##gso;
+         $response =~ s/^\s*<pre>//gso;
+         $response =~ s#</pre>\s*$##gso;
 
          print {$xml_file} $response;
          sleep 10;
      }
 
          print {$xml_file} $response;
          sleep 10;
      }
+     print {$xml_file} "</opt>\n";
      undef $xml_file;
 }
 
      undef $xml_file;
 }