From: Don Armstrong
Date: Tue, 28 Aug 2007 01:01:57 +0000 (+0000)
Subject: update get harvester results for new site
X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=commitdiff_plain;h=1c08756f612648e7c2823b0a8e9acfdbb833470e

update get harvester results for new site

git-svn-id: file:///srv/svn/function2gene/trunk@9 a0738b58-4706-0410-8799-fb830574a030
---

diff --git a/bin/get_harvester_results b/bin/get_harvester_results
index c6b4f0b..97734e9 100755
--- a/bin/get_harvester_results
+++ b/bin/get_harvester_results
@@ -10,6 +10,7 @@
 
 # $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
 
+use threads;
 use warnings;
 use strict;
 
@@ -68,8 +69,9 @@ BEGIN{
 }
 
 use IO::File;
-use URI::ParamMunge;
-use LWP::UserAgent;
+use URI;
+use WWW::Mechanize;
+use Thread::Queue;
 
 # XXX parse config file
 
@@ -81,10 +83,10 @@ my %options = (debug => 0,
                dir => '.',
                name => '${search}_results_harvester',
                terms => '-',
-               harvester_site => 'http://www-db.embl.de',
-               harvester_search_url => '/jss/servlet/de.embl.bk.htmlfind.HarvesterPageSearchOutput?search=GOLGI&chipsetID=-1&maxHits=10000&submit=search',
-              );
+               orgn => 'human',
+               harvester_site => 'http://harvester.fzk.de',
+              );
 
 GetOptions(\%options,'format|f=s','database|b=s','name|n=s',
            'terms|t=s','dir|D=s','debug|d+','help|h|?','man|m');
 
@@ -97,6 +99,9 @@ if (not -d $options{dir}) {
      die "$options{dir} does not exist or is not a directory";
 }
 
+
+$options{harvester_search_url} = '/cgi-bin/'.$options{orgn}.'/search.cgi?zoom_query=golgi&zoom_per_page=100&zoom_and=1&zoom_sort=0';
+
 #open search terms file
 my $terms;
 if ($options{terms} eq '-') {
@@ -106,34 +111,66 @@ else {
      $terms = new IO::File $options{terms}, 'r' or
           die "Unable to open file $options{terms}: $!";
 }
-my $ua = new LWP::UserAgent(agent=>"DA_get_harvester_results/$REVISION");
-
 #For every term
+my @threads;
 while (<$terms>) {
      # Get uids to retrieve
      chomp;
      my $search = $_;
-     my $url = uri_param_munge($options{harvester_site}.$options{harvester_search_url},
-                               {search => $search,
-                               },
-                              );
-     my $request = HTTP::Request->new('GET', $url);
-     my $response = $ua->request($request);
-     $response = $response->content;
-     my @result_urls = $response =~ m##g;
-
+     my $uri = URI->new($options{harvester_site}.$options{harvester_search_url});
+     $uri->query_form(zoom_query =>[],
+                     );
+     $uri->query_form(zoom_query => $search,
+                     );
+     my $url = $uri->as_string;
+     my $queue = Thread::Queue->new();
      my $dir_name = eval qq("$options{name}") or die $@;
      mkdir("$options{dir}/$dir_name") or
           die "Unable to make directory $options{dir}/$dir_name $!";
-     # Get XML file
-     my @current_urls;
-     while (@current_urls = splice(@result_urls,0,30)) {
-          system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
-     }
+     my $wget_thread = threads->new(\&get_url,"$options{dir}/$dir_name",$queue);
+     push @threads,$wget_thread;
+
+     my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");
+
+     #HTTP::Request->new('GET', $url);
+     $mech->get($url);
+     my $next_link;
+     do {
+          my @links = $mech->links;
+          $next_link = undef;
+          for my $link (@links) {
+               if ($link->text() =~ /Next /) {
+                    $next_link = $link;
+               }
+               elsif ($link->url =~ m#http://harvester.fzk.de/harvester/human/[^\/]+/[^.]+.htm#) {
+                    $queue->enqueue($link->url());
+               }
+          }
+          $mech->follow_link(url=>$next_link->url) if defined $next_link;
+     } while ($next_link);
+     $queue->enqueue(undef);
+}
+for my $thread (@threads) {
+     $thread->join;
 }
+sub get_url{
+     my ($dir,$queue) = @_;
 
-
-
+     my @current_urls;
+     while (my $url =
+            $queue->dequeue) {
+          push @current_urls,$url;
+          if (@current_urls >= 30) {
+               wget_urls($dir,@current_urls);
+               @current_urls = ();
+          }
+     }
+     wget_urls($dir,@current_urls) if @current_urls;
+}
+sub wget_urls{
+     my ($dir,@urls) = @_;
+     return unless @urls;
+     system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
+}
 __END__
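---

Note: the change above replaces a single LWP fetch plus batched wget calls with
a paginated WWW::Mechanize crawl that hands result URLs to a per-term wget
worker thread through a Thread::Queue. A minimal standalone sketch of that
producer/consumer handoff follows; the URLs, batch size of 3, and print
statements are illustrative placeholders, not code from this repository.

    #!/usr/bin/perl
    # Sketch of the Thread::Queue producer/consumer pattern used above.
    use threads;
    use strict;
    use warnings;
    use Thread::Queue;

    my $queue = Thread::Queue->new();

    # Consumer thread: dequeue URLs, flushing in batches of up to 3;
    # an undef sentinel tells it to flush the remainder and exit.
    my $worker = threads->new(sub {
         my @batch;
         while (defined(my $url = $queue->dequeue)) {
              push @batch, $url;
              if (@batch >= 3) {
                   print "would fetch: @batch\n";
                   @batch = ();
              }
         }
         print "would fetch: @batch\n" if @batch;
    });

    # Producer: enqueue work as it is discovered, then the sentinel.
    $queue->enqueue("http://example.com/page/$_") for 1..7;
    $queue->enqueue(undef);
    $worker->join;

The point of the handoff is that page-walking and downloading overlap: the
main thread keeps following "Next" links while the worker drains the queue,
instead of collecting every result URL before the first wget run starts.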