X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fget_harvester_results;h=4d3f5932481349f61bd213508271ac2bf00728ce;hp=97734e9348963e9ede2a8e45a00a91d74b2a3d7b;hb=3f759318c3a4c283f19113827dbaf206cb0f0252;hpb=1c08756f612648e7c2823b0a8e9acfdbb833470e

NOTE(review): this patch was recovered from a copy whose newlines and
indentation had been collapsed.  Line breaks, the position of blank context
lines and the indentation are reconstructed -- confirm against the target
file (or apply with whitespace checking relaxed).  The third hunk has been
revised, so the blob id after ".." on the index line identifies the upstream
post-image, not the result of applying this patch.

diff --git a/bin/get_harvester_results b/bin/get_harvester_results
index 97734e9..4d3f593 100755
--- a/bin/get_harvester_results
+++ b/bin/get_harvester_results
@@ -72,6 +72,7 @@ use IO::File;
 use URI;
 use WWW::Mechanize;
 use Thread::Queue;
+use Time::HiRes qw(usleep);
 
 # XXX parse config file
 
@@ -125,7 +126,9 @@ while (<$terms>) {
      my $url = $uri->as_string;
      my $queue = Thread::Queue->new();
      my $dir_name = eval qq("$options{name}") or die $@;
-     mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
+     if (not -d "$options{dir}/$dir_name") {
+          mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
+     }
      my $wget_thread = threads->new(\&get_url,"$options{dir}/$dir_name",$queue);
      push @threads,$wget_thread;
 
@@ -170,7 +173,29 @@ sub get_url{
 sub wget_urls{
      my ($dir,@urls) = @_;
      return unless @urls;
-     system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
+     # Fetch each URL with WWW::Mechanize (replacing wget) and save the
+     # response body under $dir.
+     my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");
+     for my $url (@urls) {
+          # sleep for a random 1-3 seconds (2 on average) between requests
+          usleep((0.5+rand)*2*1000000);
+          # file name: the URL minus "http://" and every non-word character
+          my $cleaned_url = $url;
+          $cleaned_url =~ s{http://}{}g;
+          $cleaned_url =~ s/[^\w]//g;
+          # WWW::Mechanize dies on a failed request when autocheck is on
+          # (its default), so the get goes inside the eval: one bad URL
+          # is reported and the remaining URLs are still fetched.
+          eval {
+               $mech->get($url);
+               $mech->save_content($dir.'/'.$cleaned_url);
+               print "retreived $url\n";
+          };
+          if ($@) {
+               warn $@;
+          }
+     }
+     #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
 }
 
 __END__