}
use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
# XXX parse config file
dir => '.',
name => 'ncbi_${search}_results.$format',
terms => '-',
- pubmed_site => 'http://www.ncbi.nlm.nih.gov',
- pubmed_search_url => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000',
- pubmed_get_url => '/entrez/query.fcgi?db=gene&cmd=Text&dopt=XML',
+ orgn => 'homo',
+ ncbi_site => 'http://www.ncbi.nlm.nih.gov',
+ ncbi_search_url => '/entrez/query.fcgi?cmd=search&doptcmdl=Brief&dispmax=1000',
+ ncbi_get_url => '/entrez/query.fcgi?cmd=Text',
);
GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
chomp;
my $search = $_;
my $format = $options{format};
- my $url = uri_param_munge($options{pubmed_site}.$options{pubmed_search_url},
- {term => $search,
- db => $options{database},
- },
- );
- my $request = HTTP::Request->new('GET', $url);
- my $response = $ua->request($request);
- $response = $response->content;
- my @gene_ids = $response =~ m/\[GeneID\:\s+(\d+)\]/g;
-
+ my $uri = URI->new($options{ncbi_site}.$options{ncbi_search_url});
+ $uri->query_form($uri->query_form(),
+ term => $search.' AND '.$options{orgn}.'[Orgn]',
+ db => $options{database},
+ );
+ my $url = $uri->as_string;
+ my $mech = WWW::Mechanize->new(agent => "DA_get_ncbi_results/$REVISION");
+ $mech->get($url);
+ my @gene_ids;
+ {
+ do {
+ my $response = $mech->content();
+ push @gene_ids , $response =~ m{\[GeneID\:\s+(\d+)\s*\]\s*</td>}mg;
+ last unless $mech->find_link(text => 'Next');
+ $mech->follow_link(text => 'Next');
+ } while (1);
+ }
my $file_name = eval qq("$options{name}") or die $@;
my $xml_file = new IO::File "$options{dir}/$file_name", 'w' or die "Unable to open $options{dir}/$file_name: $!";
# Get XML file
my @current_ids;
- while (@current_ids = splice(@gene_ids,0,20)) {
- $url = uri_param_munge($options{pubmed_site}.$options{pubmed_get_url},
- {dopt => uc($options{format}),
- db => $options{database},
- },
- ) .'&' . join('&',map {qq(uid=$_)} @current_ids);
- $request = HTTP::Request->new('GET', $url);
- $response = $ua->request($request);
- $response = $response->content;
+ print {$xml_file} "<opt>\n";
+ while (@current_ids = splice(@gene_ids,0,5)) {
+ $uri = URI->new($options{ncbi_site}.$options{ncbi_get_url});
+ $uri->query_form($uri->query_form(),
+ dopt => uc($options{format}),
+ db => $options{database},
+ map {('uid',$_)} @current_ids,
+ );
+ $url = $uri->as_string;
+ print STDERR "url: $url\n";
+ $mech->get($url);
+ my $response = $mech->content;
+ my $retry_count=5;
+ while ($response =~ /Error reading from remote server/ and $retry_count > 0) {
+ $mech->get($url);
+ $response = $mech->content;
+ $retry_count--;
+ }
+ if ($retry_count <= 0) {
+ die 'Unable to retreive ids ['.join(',',@current_ids).'] because of a remote server error';
+ }
# For some dumb reason, they send us xml with html
# entities. Ditch them.
#$response = decode_entities($response);
$response =~ s/"/"/gso;
# They also affix a <pre> and suffix a </pre> ditch them.
- $response =~ s/^\s*<pre>//gso;
- $response =~ s#</pre>\s*$##gso;
-
$response =~ s#<\?xml[^>]+>##gso;
$response =~ s#<!DOCTYPE[^>]+>##gso;
+ $response =~ s/^\s*<pre>//gso;
+ $response =~ s#</pre>\s*$##gso;
print {$xml_file} $response;
sleep 10;
}
+ print {$xml_file} "</opt>\n";
undef $xml_file;
}