X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fget_genecard_results;h=1b94f626c1f1bab04f51ffd9dfd68754aa375fd7;hp=051e39da7e7d54dfcdda7cac17abcb8df8a3b1ba;hb=766ad5225a78970eef161db44b8ea86f3e37bbc9;hpb=dd8ec1d4cecf282940831171ef0f796570d781fd diff --git a/bin/get_genecard_results b/bin/get_genecard_results index 051e39d..1b94f62 100755 --- a/bin/get_genecard_results +++ b/bin/get_genecard_results @@ -70,6 +70,7 @@ BEGIN{ use IO::File; use URI; use WWW::Mechanize; +use Time::HiRes qw(usleep); # XXX parse config file @@ -81,7 +82,7 @@ my %options = (debug => 0, dir => '.', name => '${search}_results_genecard', terms => '-', - genecard_site => 'http://bioinfo.weizmann.ac.il/cards-bin/', + genecard_site => 'http://www.genecards.org/cgi-bin/', genecard_search_url => 'cardsearch.pl?search_type=kwd&mini=no&speed=fast&matches=999999', ); @@ -120,6 +121,7 @@ while (<$terms>) { $mech->get($url); my $response = $mech->content(); my @result_urls = $response =~ m##sg; + @result_urls = grep {$_ !~ /gene=ENSG\d+\&/i} @result_urls; my $dir_name = eval qq("$options{name}") or die $@; if (not -d "$options{dir}/$dir_name") { mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!"; @@ -127,7 +129,22 @@ while (<$terms>) { # Get XML file my @current_urls; while (@current_urls = map{$options{genecard_site}.$_} splice(@result_urls,0,30)) { - system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!"; + for my $url (@current_urls) { + # sleep for around 2 seconds + usleep((0.5+rand)*2*1000000); + $mech->get($url); + my $cleaned_url = $url; + $cleaned_url =~ s{http://}{}g; + $cleaned_url =~ s/[^\w]//g; + eval { + $mech->save_content($options{dir}.'/'.$dir_name.'/'.$cleaned_url); + print "retreived $url\n"; + }; + if ($@) { + warn $@; + } + } + #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!"; } }