git.donarmstrong.com Git - function2gene.git/blobdiff - bin/get_genecard_results
skip ENSG results, and use average weight
[function2gene.git] / bin / get_genecard_results
index 051e39da7e7d54dfcdda7cac17abcb8df8a3b1ba..1b94f626c1f1bab04f51ffd9dfd68754aa375fd7 100755 (executable)
@@ -70,6 +70,7 @@ BEGIN{
 use IO::File;
 use URI;
 use WWW::Mechanize;
+use Time::HiRes qw(usleep);
 
 # XXX parse config file
 
@@ -81,7 +82,7 @@ my %options = (debug    => 0,
               dir      => '.',
               name     => '${search}_results_genecard',
               terms    => '-',
-              genecard_site => 'http://bioinfo.weizmann.ac.il/cards-bin/',
+              genecard_site => 'http://www.genecards.org/cgi-bin/',
               genecard_search_url  => 'cardsearch.pl?search_type=kwd&mini=no&speed=fast&matches=999999',
              );
 
@@ -120,6 +121,7 @@ while (<$terms>) {
      $mech->get($url);
      my $response = $mech->content();
      my @result_urls = $response =~ m#<a\s+target\=\'card\'\s+href=\"(carddisp\.pl\?[^\"]+)\"\s*>#sg;
+     @result_urls = grep {$_ !~ /gene=ENSG\d+\&/i} @result_urls;
      my $dir_name = eval qq("$options{name}") or die $@;
      if (not -d "$options{dir}/$dir_name") {
          mkdir("$options{dir}/$dir_name") or die "Unable to make directory $options{dir}/$dir_name $!";
@@ -127,7 +129,22 @@ while (<$terms>) {
      # Get XML file
      my @current_urls;
      while (@current_urls = map{$options{genecard_site}.$_} splice(@result_urls,0,30)) {
-         system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
+         for my $url (@current_urls) {
+              # sleep a random 1-3 seconds (2 on average), like wget -w 2 --random-wait
+              usleep((0.5+rand)*2*1000000);
+              $mech->get($url);
+              my $cleaned_url = $url;
+              $cleaned_url =~ s{http://}{}g;
+              $cleaned_url =~ s/[^\w]//g;
+              eval {
+                   $mech->save_content($options{dir}.'/'.$dir_name.'/'.$cleaned_url);
+                   print "retreived $url\n";
+              };
+              if ($@) {
+                   warn $@;
+              }
+         }
+         #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
      }
 }