use IO::File;
use URI;
use WWW::Mechanize;
use Time::HiRes qw(usleep);    # usleep() gives sub-second sleeps for polite crawling
# XXX parse config file
# Tail of the %options hash (opening is above this chunk).
dir => '.',
name => '${search}_results_genecard',
terms => '-',
# NOTE(review): host updated from the retired Weizmann mirror
# (bioinfo.weizmann.ac.il/cards-bin/) to the current genecards.org CGI path.
genecard_site => 'http://www.genecards.org/cgi-bin/',
genecard_search_url => 'cardsearch.pl?search_type=kwd&mini=no&speed=fast&matches=999999',
);
# Get XML file
# Fetch the result pages in batches of 30, saving each page under
# $options{dir}/$dir_name with a filesystem-safe name derived from its URL.
my @current_urls;
while (@current_urls = map { $options{genecard_site} . $_ } splice(@result_urls, 0, 30)) {
    for my $url (@current_urls) {
        # Sleep roughly 1-3 seconds between requests to avoid hammering the server
        # (mirrors wget's "-w 2 --random-wait" behaviour this code replaced).
        usleep((0.5 + rand) * 2 * 1_000_000);

        # Build a safe filename: strip the scheme, then drop every non-word char.
        my $cleaned_url = $url;
        $cleaned_url =~ s{http://}{}g;
        $cleaned_url =~ s/[^\w]//g;

        # The get() call must live inside the eval: with Mechanize's default
        # autocheck a failed fetch would otherwise die and abort the whole run,
        # while this loop clearly intends best-effort, per-URL error reporting.
        eval {
            $mech->get($url);
            $mech->save_content("$options{dir}/$dir_name/$cleaned_url");
            print "retrieved $url\n";
        };
        warn $@ if $@;
    }
}
}