X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fget_harvester_results;fp=bin%2Fget_harvester_results;h=4d3f5932481349f61bd213508271ac2bf00728ce;hp=7446e088c245d9d58adcafef59dd04f769edb6a8;hb=af4fd770f221db1cec02393df378e079c0b9a8fc;hpb=6d24067f20698257dc1103d5c21e8a7f6a32b97b

diff --git a/bin/get_harvester_results b/bin/get_harvester_results
index 7446e08..4d3f593 100755
--- a/bin/get_harvester_results
+++ b/bin/get_harvester_results
@@ -72,6 +72,7 @@ use IO::File;
 use URI;
 use WWW::Mechanize;
 use Thread::Queue;
+use Time::HiRes qw(usleep);
 
 # XXX parse config file
 
@@ -172,7 +173,24 @@ sub get_url{
 sub wget_urls{
     my ($dir,@urls) = @_;
     return unless @urls;
-    system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
+    # replacing wget with WWW::Mechanize
+    my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");
+    for my $url (@urls) {
+        # sleep for around 2 seconds
+        usleep((0.5+rand)*2*1000000);
+        $mech->get($url);
+        my $cleaned_url = $url;
+        $cleaned_url =~ s{http://}{}g;
+        $cleaned_url =~ s/[^\w]//g;
+        eval {
+            $mech->save_content($dir.'/'.$cleaned_url);
+            print "retreived $url\n";
+        };
+        if ($@) {
+            warn $@;
+        }
+    }
+    #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
 }
 
 __END__