* Stop requiring wget

author Don Armstrong <don@donarmstrong.com>

Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)

committer Don Armstrong <don@donarmstrong.com>

Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)
author Don Armstrong <don@donarmstrong.com>
Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)
committer Don Armstrong <don@donarmstrong.com>
Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)
diff --git a/bin/combine_results b/bin/combine_results

index e49de8f9bb20c1b730bf091dee5ac7c9d338a00c..666fe7c81a70808c6bf01b69ed4952f0d6a5a156 100755 (executable)
--- a/bin/combine_results
+++ b/bin/combine_results
@@ -23,6 +23,9 @@ use Pod::Usage;
   combine_results parsed_results_1.txt [parsedresultfiles ...]
  
   Options:
+  --keywords file with a newline-delimited list of keywords to search for
+  --results file to store results in
+  --results-table optional file to write result summary table to
    --debug, -d debugging level [default 0]
    --help, -h display this help
    --man, -m display manual
@@ -31,6 +34,23 @@ use Pod::Usage;
  
  =over
  
+=item B<--keywords>
+
+A file which contains a newline-delimited list of keywords to search
+for. Can be specified multiple times. Lines starting with # or ; are
+ignored. An optional weight can be specified after the keyword,
+separated from the keyword by a tab. (If not specified, 1 is
+assumed.)
+
+=item B<--results>
+
+A file in which to store the combined results; defaults to STDOUT.
+
+=item B<--results-table>
+
+A file in which to store a summary table of results. If not provided,
+no summary table is written.
+
  =item B<--debug, -d>
  
  Debug verbosity. (Default 0)
@@ -63,6 +83,7 @@ BEGIN{
  
  use XML::Parser::Expat;
  use IO::File;
+use List::Util qw(sum max);
  
  # XXX parse config file
  
@@ -70,9 +91,13 @@ my %options = (debug    => 0,
                help     => 0,
                man      => 0,
                keywords => [],
+              results  => undef,
+              results_table => undef,
               );
  
-GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m');
+GetOptions(\%options,'keywords|k=s@',
+          'results=s','results_table|results-table=s',
+          'debug|d+','help|h|?','man|m');
  
  
  pod2usage() if $options{help};
@@ -80,6 +105,18 @@ pod2usage({verbose=>2}) if $options{man};
  
  $DEBUG = $options{debug};
  
+my $results_fh = \*STDOUT;
+my $results_table_fh = undef;
+
+if ($options{results}) { # --results given: write the combined CSV there instead of STDOUT
+     $results_fh = IO::File->new($options{results},'w') or
+        die "Unable to open results file $options{results} for writing: $!";
+}
+if ($options{results_table}) { # --results-table given: enable the summary table output
+     $results_table_fh = IO::File->new($options{results_table},'w') or
+        die "Unable to open results table file $options{results_table} for writing: $!";
+}
+
  # CSV columns
  use constant {NAME        => 0,
               REFSEQ      => 1,
@@ -92,10 +129,29 @@ use constant {NAME        => 0,
               FILENAME    => 8,
              };
  
-my @csv_fields = qw(name hits rzscore refseq location alias database terms description function);
+my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
  
  my %genes;
  
+my %keyword_weight;
+
+if (@{$options{keywords}}) {
+     for my $keyword_file (@{$options{keywords}}) {
+         my $keyword_fh = IO::File->new($keyword_file,'r') or
+              die "Unable to open $keyword_file for reading: $!";
+         while (<$keyword_fh>) {
+              next if /^\s*[#;]/;
+              next unless /\w+/;
+              chomp;
+              my ($keyword,$weight) = split /\t/, $_;
+              $weight = 1 if not defined $weight;
+              $keyword_weight{$keyword} = $weight;
+         }
+     }
+}
+
+
+
  for my $file_name (@ARGV) {
       my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
       while (<$file>) {
@@ -114,10 +170,93 @@ for my $file_name (@ARGV) {
       }
  }
  
-print join(',',map {qq("$_")} @csv_fields),qq(\n);
+my %databases;
+my %terms;
+my %auto_weight;
+my %keyword_keyword;
+for my $gene (keys %genes) {
+     my %term_temp;
+     my %db_temp;
+     my %gene_temp;
+     my %gene_temp2;
+     for my $term (keys %{$genes{$gene}{terms}}) {
+         if ($term =~ /\[/) {
+              my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
+              my $hits = $genes{$gene}{terms}{$term};
+              $keyword =~ s/[-+_]/ /g;
+              $keyword =~ s/\s*$//;
+              $keyword =~ s/[*]//;
+              $gene_temp{$keyword}{$database} = 1;
+              $gene_temp2{$database}{$keyword} = 1;
+              $databases{$database}{$keyword}{count}++;
+              $db_temp{$database}++;
+              $terms{$keyword}{$database}{count}++;
+         }
+         else {
+              my $keyword = $term;
+              my $hits = $genes{$gene}{terms}{$term};
+              $keyword =~ s/[-+_]/ /g;
+              $keyword =~ s/\s*$//;
+              $keyword =~ s/[*]//;
+              $terms{$keyword}{total}{count}++;
+         }
+     }
+     if (keys %gene_temp == 1) {
+         $terms{[keys %gene_temp]->[0]}{total}{unique}++;
+         if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
+              $databases{total}{total}{unique}++
+         }
+     }
+     if (keys %gene_temp2 == 1) {
+         $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
+     }
+     for my $keyword (keys %gene_temp) {
+         if (keys %{$gene_temp{$keyword}} == 1) {
+              $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
+         }
+         for my $keyword2 (keys %gene_temp) {
+              $keyword_keyword{$keyword}{$keyword2}++
+         }
+     }
+     for my $database (keys %db_temp) {
+         $databases{$database}{total}{count}++;
+     }
+     $databases{total}{total}{count}++;
+
+}
+
+for my $keyword (keys %keyword_keyword) {
+     # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
+     # we use max here to avoid 0/0 problems.
+     my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
+     my $results_combined = max(1,grep {defined $_}
+                               sum(map {$keyword_keyword{$keyword}{$_}}
+                                   grep {$_ ne $keyword}
+                                   keys %{$keyword_keyword{$keyword}}
+                                  )
+                              );
+     $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
+}
+
+print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
  for my $gene (keys %genes) {
       $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
-     print STDOUT join (',',
+     $genes{$gene}{weightedscore}= sum(0,
+                                      map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
+                                      grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+                                     );
+     $genes{$gene}{autoscore}= sum(0,
+                                  map {defined $auto_weight{$_}?$auto_weight{$_}:1}
+                                  grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+                                 );
+}
+
+my $sort = 'autoscore';
+if (scalar grep {$_ != 1 } values %keyword_weight) {
+     $sort='weightedscore';
+}
+for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
+     print {$results_fh} join (',',
                         map {s/"//g; qq("$_")}
                         map {
                              my $value = $_;
@@ -159,6 +298,47 @@ sub add_if_better{
  }
  
  
+if (defined $results_table_fh) {
+     our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total');
+     format RESULTS_TABLE =
+@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
+$keyword,                 $weight,      $autoweight,  $gct,         $hvt,         $nct,         $t
+.
+     $results_table_fh->format_name('RESULTS_TABLE');
+     write $results_table_fh;
+
+     for $keyword (sort keys %terms) {
+         ($gct,$hvt,$nct,$t) =
+              map {
+                   if (not defined $_) {
+                        '$-$';
+                   }
+                   else {
+                        $_->{unique} ||= 0;
+                        "$_->{count} ($_->{unique})";
+                   }
+              } @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
+         $weight = $keyword_weight{$keyword} || 1;
+         $autoweight = $auto_weight{$keyword};
+         write $results_table_fh;
+
+     }
  
+     $keyword = 'Total';
+     ($gct,$hvt,$nct,$t) =
+         map {
+              if (not defined $_) {
+                   '$-$';
+              }
+              else {
+                   $_->{unique} ||= 0;
+                   "$_->{count} ($_->{unique})";
+              }
+         } map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
+     #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
+     $weight = '';
+     $autoweight = '';
+     write $results_table_fh;
+}
  
  __END__
diff --git a/bin/function2gene b/bin/function2gene

index f3bd3ddfd7364a89afac2c4d391e43b3787e8b8d..de4915bc4f95ecbdd9c564ffb7de88938658d4c0 100755 (executable)
--- a/bin/function2gene
+++ b/bin/function2gene
@@ -339,13 +339,18 @@ if ($actions{combine}) {
                            } keys %{$state{done_keywords}{parse}};
  
       # create temporary file to store keyword weights
-
-     write_command_to_file('combined_results.txt',
-                          "$base_dir/combine_results",
-                          '--keywords',
-                          
-                          @parsed_results,
-                         );
+     my $file = IO::File->new('combined_keywords.txt','w') or
+         die "Unable to open combined_keywords.txt for writing: $!";
+     print {$file} "$_\t$state{keyword_weight}{$_}\n"
+         for keys %{$state{keyword_weight}};
+     $file->close or die "Unable to write combined_keywords.txt: $!"; # flush before combine_results reads it
+     system("$base_dir/combine_results",
+           '--keywords','combined_keywords.txt',
+           '--results','combined_results.txt',
+           '--results-table','combined_results_table.txt',
+           @parsed_results,
+          ) == 0
+               or die "combine_results failed with ".($?>>8);
       for my $result (@parsed_results) {
           $result =~ s/^parsed_results_//;
           $result =~ s/\.txt$//;
@@ -353,10 +358,6 @@ if ($actions{combine}) {
           $state{done_keywords}{combined}{$db}{$keyword} = 1;
       }
       save_state(\%state);
-     write_command_to_file('combined_results_table.txt',
-                          "$base_dir/results_to_table",
-                          'combined_results.txt',
-                         );
       ADVISE("Finished; results in $options{results}/combined_results.txt");
  }
  else {
diff --git a/bin/get_genecard_results b/bin/get_genecard_results

index 017f1e1b2b76957fbb7c67e90fd7f0d075811ede..ac38e29d6767e8cb13040eb2b591258f07edf5b2 100755 (executable)
--- a/bin/get_genecard_results
+++ b/bin/get_genecard_results
@@ -70,6 +70,7 @@ BEGIN{
  use IO::File;
  use URI;
  use WWW::Mechanize;
+use Time::HiRes qw(usleep);
  
  # XXX parse config file
  
@@ -127,7 +128,22 @@ while (<$terms>) {
       # Get XML file
       my @current_urls;
       while (@current_urls = map{$options{genecard_site}.$_} splice(@result_urls,0,30)) {
-         system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
+         for my $url (@current_urls) {
+              # sleep for 1-3 seconds (2 on average) between requests
+              usleep((0.5+rand)*2*1000000);
+              my $cleaned_url = $url;
+              $cleaned_url =~ s{http://}{}g;
+              $cleaned_url =~ s/[^\w]//g;
+              eval {
+                   $mech->get($url); # inside eval: a fetch that dies must not abort the remaining URLs
+                   $mech->save_content($options{dir}.'/'.$dir_name.'/'.$cleaned_url);
+                   print "retreived $url\n";
+              };
+              if ($@) {
+                   warn $@;
+              }
+         }
+         #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!";
       }
  }
  
diff --git a/bin/get_harvester_results b/bin/get_harvester_results

index 7446e088c245d9d58adcafef59dd04f769edb6a8..4d3f5932481349f61bd213508271ac2bf00728ce 100755 (executable)
--- a/bin/get_harvester_results
+++ b/bin/get_harvester_results
@@ -72,6 +72,7 @@ use IO::File;
  use URI;
  use WWW::Mechanize;
  use Thread::Queue;
+use Time::HiRes qw(usleep);
  
  # XXX parse config file
  
@@ -172,7 +173,24 @@ sub get_url{
  sub wget_urls{
       my ($dir,@urls) = @_;
       return unless @urls;
-     system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
+     # replacing wget with WWW::Mechanize
+     my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION");
+     for my $url (@urls) {
+         # sleep for 1-3 seconds (2 on average) between requests
+         usleep((0.5+rand)*2*1000000);
+         my $cleaned_url = $url;
+         $cleaned_url =~ s{http://}{}g;
+         $cleaned_url =~ s/[^\w]//g;
+         eval {
+              $mech->get($url); # inside eval: a fetch that dies must not abort the remaining URLs
+              $mech->save_content($dir.'/'.$cleaned_url);
+              print "retreived $url\n";
+         };
+         if ($@) {
+              warn $@;
+         }
+     }
+     #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!";
  }
  
  __END__
diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results

index b42371957a229b4423cd8b89ac152eb2094f7134..fbd98851a4a8d27f88cb7e9eee567b4ebda35643 100755 (executable)
--- a/bin/parse_genecard_results
+++ b/bin/parse_genecard_results
@@ -118,6 +118,8 @@ my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $!
  
  print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n);
  
+my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#;
+
  while ($_ = $dir->read) {
       my $file_name = $_;
       next if $file_name =~ /^\./;
@@ -167,9 +169,9 @@ while ($_ = $dir->read) {
       $results[FUNCTION] ||= 'NO FUNCTION';
  
       # Figure out the keyword used
-     ($results[KEYWORD]) = $file_name =~ /search=([^&]+)/;
+     ($results[KEYWORD]) = $file_name =~ /search=?([^&]+)$/;
  
-     $results[KEYWORD] ||= 'NO KEYWORD';
+     $results[KEYWORD] ||= $keyword || 'NO KEYWORD';
  
       # Swiss prot functions
       my @description = (map {s/<[^>]+>/ /g;
author	Don Armstrong <don@donarmstrong.com>
	Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)
committer	Don Armstrong <don@donarmstrong.com>
	Tue, 22 Jan 2008 23:04:20 +0000 (23:04 +0000)
bin/combine_results		patch \| blob \| history
bin/function2gene		patch \| blob \| history
bin/get_genecard_results		patch \| blob \| history
bin/get_harvester_results		patch \| blob \| history
bin/parse_genecard_results		patch \| blob \| history