use URI;
use WWW::Mechanize;
use Thread::Queue;
use Time::HiRes qw(usleep);
# TODO(review): add `use strict; use warnings;` once %options, $uri and
# @threads (declared outside this chunk) are confirmed to be lexicals.

# XXX parse config file
my $url = $uri->as_string;

# Work queue shared with the downloader thread.
my $queue = Thread::Queue->new();

# Expand the config-supplied name template via string eval so embedded
# variables interpolate.
# NOTE(review): string eval on config data is an injection risk -- confirm
# $options{name} comes from a trusted source.
my $dir_name = eval qq("$options{name}") or die $@;

# Create the output directory only if it does not already exist, so reruns
# against an existing directory don't die.
if (not -d "$options{dir}/$dir_name") {
    mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
}

# Spawn the downloader thread; it drains $queue and fetches into the directory.
my $wget_thread = threads->new(\&get_url,"$options{dir}/$dir_name",$queue);
push @threads,$wget_thread;
# wget_urls($dir, @urls)
#
# Fetch each URL with WWW::Mechanize and save its content into $dir,
# replacing the earlier external-wget implementation:
#   system('wget','-nd','-nH','-w','2','--random-wait','-P',$dir,@urls)
# Failures on individual URLs are warned about and skipped; returns nothing.
sub wget_urls{
    my ($dir,@urls) = @_;
    return unless @urls;

    my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");

    for my $url (@urls) {
        # Polite random delay of 1-3 seconds (mean ~2s), mirroring wget's
        # "-w 2 --random-wait" behaviour.
        usleep((0.5+rand)*2*1000000);

        # Derive a filesystem-safe filename from the URL: drop the scheme,
        # then strip every non-word character.
        my $cleaned_url = $url;
        $cleaned_url =~ s{http://}{}g;
        $cleaned_url =~ s/[^\w]//g;

        # WWW::Mechanize has autocheck on by default, so get() dies on HTTP
        # errors. The fetch therefore goes INSIDE the eval: a single bad URL
        # should produce a warning, not abort the whole batch/thread.
        eval {
            $mech->get($url);
            $mech->save_content($dir.'/'.$cleaned_url);
            print "retrieved $url\n";
        };
        if ($@) {
            warn $@;
        }
    }
    return;
}
__END__