X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fget_genecard_results;h=ac38e29d6767e8cb13040eb2b591258f07edf5b2;hp=017f1e1b2b76957fbb7c67e90fd7f0d075811ede;hb=af4fd770f221db1cec02393df378e079c0b9a8fc;hpb=6d24067f20698257dc1103d5c21e8a7f6a32b97b

diff --git a/bin/get_genecard_results b/bin/get_genecard_results
index 017f1e1..ac38e29 100755
--- a/bin/get_genecard_results
+++ b/bin/get_genecard_results
@@ -70,6 +70,7 @@ BEGIN{
 use IO::File;
 use URI;
 use WWW::Mechanize;
+use Time::HiRes qw(usleep);
 
 # XXX parse config file
 
@@ -127,7 +128,29 @@ while (<$terms>) {
     # Get XML file
     my @current_urls;
     while (@current_urls = map{$options{genecard_site}.$_} splice(@result_urls,0,30)) {
-        system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
+        for my $url (@current_urls) {
+            # sleep for 1-3 seconds (2 on average) between requests
+            usleep((0.5+rand)*2*1000000);
+            # Build a flat file name from the URL: drop the scheme, then
+            # every character that is not a word character.
+            my $cleaned_url = $url;
+            $cleaned_url =~ s{http://}{}g;
+            $cleaned_url =~ s/[^\w]//g;
+            # Fetch and save inside one eval so that a single failed
+            # request is reported and skipped, instead of aborting the
+            # whole run or leaving an error page on disk.
+            eval {
+                $mech->get($url);
+                die "HTTP request failed: ".$mech->response->status_line."\n"
+                    unless $mech->success;
+                $mech->save_content($options{dir}.'/'.$dir_name.'/'.$cleaned_url);
+                print "retreived $url\n";
+            };
+            if ($@) {
+                warn "unable to retrieve $url: $@";
+            }
+        }
+        #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
     }
 }
 