* Update ncbi results to handle the new ncbi output format
authorDon Armstrong <don@donarmstrong.com>
Tue, 28 Aug 2007 01:00:57 +0000 (01:00 +0000)
committerDon Armstrong <don@donarmstrong.com>
Tue, 28 Aug 2007 01:00:57 +0000 (01:00 +0000)
git-svn-id: file:///srv/svn/function2gene/trunk@7 a0738b58-4706-0410-8799-fb830574a030

bin/get_ncbi_results

index c104356..7a86901 100755 (executable)
@@ -73,8 +73,8 @@ BEGIN{
 }
 
 use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
 
 # XXX parse config file
 
@@ -86,9 +86,10 @@ my %options = (debug    => 0,
               dir      => '.',
               name     => 'ncbi_${search}_results.$format',
               terms    => '-',
+              orgn     => 'homo',
               pubmed_site => 'http://www.ncbi.nlm.nih.gov',
-              pubmed_search_url  => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000',
-              pubmed_get_url     => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML',
+              pubmed_search_url  => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000',
+              pubmed_get_url     => '/entrez/query.fcgi?cmd=Text',
              );
 
 GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
@@ -120,30 +121,39 @@ while (<$terms>) {
      chomp;
      my $search = $_;
      my $format = $options{format};
-     my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url},
-                              {term => $search,
-                               db   => $options{database},
-                              },
-                             );
-     my $request = HTTP::Request->new('GET', $url);
-     my $response = $ua->request($request);
-     $response = $response->content;
-     my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g;
-
+     my $uri = URI->new($options{pubmed_site}.$options{pubmed_search_url});
+     $uri->query_form($uri->query_form(),
+                     term => $search.' AND '.$options{orgn}.'[Orgn]',
+                     db   => $options{database},
+                    );
+     my $url = $uri->as_string;
+     my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION");
+     $mech->get($url);
+     my @gene_ids;
+     {
+         do {
+              my $response = $mech->content();
+              push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*</td>}mg;
+              last unless $mech->find_link(text => 'Next');
+              $mech->follow_link(text => 'Next');
+         } while (1);
+     }
      my $file_name = eval qq("$options{name}") or die $@;
      my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!";
 
      # Get XML file
      my @current_ids;
-     while (@current_ids = splice(@gene_ids,0,20)) {
-         $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url},
-                                {dopt => uc($options{format}),
-                                 db   => $options{database},
-                                },
-                               ) .'&' . join('&',map {qq(uid=$_)} @current_ids);
-         $request = HTTP::Request->new('GET', $url);
-         $response = $ua->request($request);
-         $response = $response->content;
+     while (@current_ids = splice(@gene_ids,0,5)) {
+         $uri = URI->new($options{pubmed_site}.$options{pubmed_get_url});
+         $uri->query_form($uri->query_form(),
+                          dopt => uc($options{format}),
+                          db   => $options{database},
+                          map {('uid',$_)} @current_ids,
+                         );
+         $url = $uri->as_string;
+         print STDERR "url: $url\n";
+         $mech->get($url);
+         my $response = $mech->content;
          # For some dumb reason, they send us xml with html
          # entities. Ditch them.
          #$response = decode_entities($response);
@@ -152,11 +162,11 @@ while (<$terms>) {
          $response =~ s/&quot;/"/gso;
 
          # They also affix a <pre> and suffix a </pre> ditch them.
+         $response =~ s#<\?xml[^>]+>##gso;
+         $response =~ s#<!DOCTYPE[^>]+>##gso;
          $response =~ s/^\s*<pre>//gso;
          $response =~ s#</pre>\s*$##gso;
 
-         $response =~ s#<\?xml[^>]+>##gso;
-         $response =~ s#<!DOCTYPE[^>]+>##gso;
 
          print {$xml_file} $response;
          sleep 10;