# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
+use threads;
use warnings;
use strict;
}
use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
+use Thread::Queue;
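+# Producer/consumer layout: the main thread walks the search-result pages
+# with WWW::Mechanize and feeds hit URLs through a Thread::Queue to a
+# wget worker thread, so page scraping and downloading overlap.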
# XXX parse config file
dir => '.',
name => '${search}_results_harvester',
terms => '-',
- harvester_site => 'http://www-db.embl.de',
- harvester_search_url => '/jss/servlet/de.embl.bk.htmlfind.HarvesterPageSearchOutput?search=GOLGI&chipsetID=-1&maxHits=10000&submit=search',
- );
+ orgn => 'human',
+ harvester_site => 'http://harvester.fzk.de',
+ );
-GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
-           'terms|t=s','dir|D=s','debug|d+','help|h|?','man|m');
+GetOptions(\%options,'format|f=s','database|b=s','name|n=s','orgn|o=s',
+           'terms|t=s','dir|D=s','debug|d+','help|h|?','man|m');
die "$options{dir} does not exist or is not a directory";
}
+
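+# zoom_query=golgi is only a placeholder here; it is swapped for each
+# search term before the URL is fetched (see the query_form() call below)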
+$options{harvester_search_url} = '/cgi-bin/'.$options{orgn}.'/search.cgi?zoom_query=golgi&zoom_per_page=100&zoom_and=1&zoom_sort=0';
+
#open search terms file
my $terms;
if ($options{terms} eq '-') {
-    $terms = new IO::File $options{terms}, 'r' or die "Unable to open file $options{terms}: $!";
+    $terms = \*STDIN;
}
+else {
+    $terms = IO::File->new($options{terms}, 'r')
+        or die "Unable to open file $options{terms}: $!";
+}
-my $ua = new LWP::UserAgent(agent=>"DA_get_harvester_results/$REVISION");
-
#For every term
+my @threads;
while (<$terms>) {
# Get uids to retrieve
chomp;
my $search = $_;
- my $url = uri_param_munge($options{harvester_site}.$options{harvester_search_url},
- {search => $search,
- },
- );
- my $request = HTTP::Request->new('GET', $url);
- my $response = $ua->request($request);
- $response = $response->content;
- my @result_urls = $response =~ m#<a\s+href=(http://harvester.embl.de/harvester/[^\/]+/[^.]+.htm)\s*>#g;
-
+    my $uri = URI->new($options{harvester_site}.$options{harvester_search_url});
+    # query_form() replaces the entire query string, so read the existing
+    # parameters back, swap in this search term, and set them all again
+    # (the search CGI does not care about parameter order)
+    my %params = $uri->query_form;
+    $params{zoom_query} = $search;
+    $uri->query_form(%params);
+    my $url = $uri->as_string;
+ my $queue = Thread::Queue->new();
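+    # expand the name template (e.g. '${search}_results_harvester') with a
+    # string eval so that ${search} interpolates into the directory name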
my $dir_name = eval qq("$options{name}") or die $@;
- mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
- # Get XML file
- my @current_urls;
- while (@current_urls = splice(@result_urls,0,30)) {
- system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
+ if (not -d "$options{dir}/$dir_name") {
+ mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
}
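+    # spawn a downloader thread to drain the queue while this thread
+    # keeps paging through the search results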
+ my $wget_thread = threads->new(\&get_url,"$options{dir}/$dir_name",$queue);
+ push @threads,$wget_thread;
+
+ my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");
+
+ $mech->get($url);
+ my $next_link;
+ do {
+ my @links = $mech->links;
+ $next_link = undef;
+ for my $link (@links) {
+            if (defined $link->text && $link->text =~ /Next /) {
+ $next_link = $link;
+ }
+            elsif ($link->url =~ m#\Qhttp://harvester.fzk.de/harvester/$options{orgn}\E/[^/]+/[^.]+\.htm#) {
+ $queue->enqueue($link->url());
+ }
+ }
+ $mech->follow_link(url=>$next_link->url) if defined $next_link;
+ } while ($next_link);
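+    # undef is the sentinel that tells the downloader thread to finish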
+ $queue->enqueue(undef);
+}
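+# wait for every downloader thread to flush its batch and exit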
+for my $thread (@threads) {
+ $thread->join;
}
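+# get_url: consumer run in a thread; dequeues result URLs and hands them
+# to wget in batches of 30 until it sees the undef sentinel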
+sub get_url{
+ my ($dir,$queue) = @_;
-
-
+ my @current_urls;
+ while (my $url = $queue->dequeue) {
+ push @current_urls,$url;
+ if (@current_urls >= 30) {
+ wget_urls($dir,@current_urls);
+ @current_urls = ();
+ }
+ }
+ wget_urls($dir,@current_urls) if @current_urls;
+}
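+# wget_urls: fetch a batch of URLs into $dir, politely (2s + random wait)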
+sub wget_urls{
+ my ($dir,@urls) = @_;
+ return unless @urls;
+    system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0
+        or warn "wget exited with status ".($? >> 8)."\n";
+}
__END__