X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fcombine_results;fp=bin%2Fcombine_results;h=666fe7c81a70808c6bf01b69ed4952f0d6a5a156;hp=e49de8f9bb20c1b730bf091dee5ac7c9d338a00c;hb=af4fd770f221db1cec02393df378e079c0b9a8fc;hpb=6d24067f20698257dc1103d5c21e8a7f6a32b97b

diff --git a/bin/combine_results b/bin/combine_results
index e49de8f..666fe7c 100755
--- a/bin/combine_results
+++ b/bin/combine_results
@@ -23,6 +23,9 @@ use Pod::Usage;
  combine_results parsed_results_1.txt [parsedresultfiles ...]
 
  Options:
+  --keywords newline delineated list of keywords to search for
+  --results file to store results in
+  --results-table optional file to write result summary table to
   --debug, -d debugging level [default 0]
   --help, -h display this help
   --man, -m display manual
@@ -31,6 +34,23 @@ use Pod::Usage;
 
 =over
 
+=item B<--keywords>
+
+A file which contains a newline delinated list of keywords to search
+for. Can be specified multiple times. Lines starting with # or ; are
+ignored. An optional weight can be specified after the keyword, which
+is separated from the keyword by a tab. (If not specified, 1 is
+assumed.)
+
+=item B<--results>
+
+A file in which to store the combined results; defaults to STDOUT
+
+=item B<--results-table>
+
+A file in which to store a summary table of results. If not provided,
+no summary table is written.
+
 =item B<--debug, -d>
 
 Debug verbosity. (Default 0)
@@ -63,6 +83,7 @@ BEGIN{
 
 use XML::Parser::Expat;
 use IO::File;
+use List::Util qw(sum max);
 
 # XXX parse config file
 
@@ -70,9 +91,13 @@ my %options = (debug    => 0,
 	       help     => 0,
 	       man      => 0,
 	       keywords => [],
+	       results  => undef,
+	       results_table => undef,
 	      );
 
-GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m');
+GetOptions(\%options,'keywords|k=s@',
+	   'results=s','results_table|results-table=s',
+	   'debug|d+','help|h|?','man|m');
 
 
 pod2usage() if $options{help};
@@ -80,6 +105,18 @@ pod2usage({verbose=>2}) if $options{man};
 
 $DEBUG = $options{debug};
 
+my $results_fh = \*STDOUT;
+my $results_table_fh = undef;
+
+if ($options{results}) {
+     $results_fh = IO::File->new($options{results},'w') or
+	 die "Unable to open results file $options{results} for writing";
+}
+if ($options{results_table}) {
+     $results_table_fh = IO::File->new($options{results_table},'w') or
+	 die "Unable to open results table file $options{results_table} for writing";
+}
+
 # CSV columns
 use constant {NAME        => 0,
 	      REFSEQ      => 1,
@@ -92,10 +129,29 @@ use constant {NAME        => 0,
 	      FILENAME    => 8,
 	     };
 
-my @csv_fields = qw(name hits rzscore refseq location alias database terms description function);
+my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
 
 my %genes;
 
+my %keyword_weight;
+
+if (@{$options{keywords}}) {
+     for my $keyword_file (@{$options{keywords}}) {
+	  my $keyword_fh = IO::File->new($keyword_file,'r') or
+	       die "Unable to open $keyword_file for reading: $!";
+	  while (<$keyword_fh>) {
+	       next if /^\s*[#;]/;
+	       next unless /\w+/;
+	       chomp;
+	       my ($keyword,$weight) = split /\t/, $_;
+	       $weight = 1 if not defined $weight;
+	       $keyword_weight{$keyword} = $weight;
+	  }
+     }
+}
+
+
+
 for my $file_name (@ARGV) {
      my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
      while (<$file>) {
@@ -114,10 +170,93 @@ for my $file_name (@ARGV) {
      }
 }
 
-print join(',',map {qq("$_")} @csv_fields),qq(\n);
+my %databases;
+my %terms;
+my %auto_weight;
+my %keyword_keyword;
+for my $gene (keys %genes) {
+     my %term_temp;
+     my %db_temp;
+     my %gene_temp;
+     my %gene_temp2;
+     for my $term (keys %{$genes{$gene}{terms}}) {
+	  if ($term =~ /\[/) {
+	       my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
+	       my $hits = $genes{$gene}{terms}{$term};
+	       $keyword =~ s/[-+_]/ /g;
+	       $keyword =~ s/\s*$//;
+	       $keyword =~ s/[*]//;
+	       $gene_temp{$keyword}{$database} = 1;
+	       $gene_temp2{$database}{$keyword} = 1;
+	       $databases{$database}{$keyword}{count}++;
+	       $db_temp{$database}++;
+	       $terms{$keyword}{$database}{count}++;
+	  }
+	  else {
+	       my $keyword = $term;
+	       my $hits = $genes{$gene}{terms}{$term};
+	       $keyword =~ s/[-+_]/ /g;
+	       $keyword =~ s/\s*$//;
+	       $keyword =~ s/[*]//;
+	       $terms{$keyword}{total}{count}++;
+	  }
+     }
+     if (keys %gene_temp == 1) {
+	  $terms{[keys %gene_temp]->[0]}{total}{unique}++;
+	  if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
+	       $databases{total}{total}{unique}++
+	  }
+     }
+     if (keys %gene_temp2 == 1) {
+	  $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
+     }
+     for my $keyword (keys %gene_temp) {
+	  if (keys %{$gene_temp{$keyword}} == 1) {
+	       $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
+	  }
+	  for my $keyword2 (keys %gene_temp) {
+	       $keyword_keyword{$keyword}{$keyword2}++
+	  }
+     }
+     for my $database (keys %db_temp) {
+	  $databases{$database}{total}{count}++;
+     }
+     $databases{total}{total}{count}++;
+
+}
+
+for my $keyword (keys %keyword_keyword) {
+     # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
+     # we use max here to avoid 0/0 problems.
+     my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
+     my $results_combined = max(1,grep {defined $_}
+				sum(map {$keyword_keyword{$keyword}{$_}}
+				    grep {$_ ne $keyword}
+				    keys %{$keyword_keyword{$keyword}}
+				   )
+			       );
+     $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
+}
+
+print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
 for my $gene (keys %genes) {
      $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
-     print STDOUT join (',',
+     $genes{$gene}{weightedscore}= sum(0,
+				       map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
+				       grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+				      );
+     $genes{$gene}{autoscore}= sum(0,
+				   map {defined $auto_weight{$_}?$auto_weight{$_}:1}
+				   grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+				  );
+}
+
+my $sort = 'autoscore';
+if (scalar grep {$_ != 1 } values %keyword_weight) {
+     $sort='weightedscore';
+}
+for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
+     print {$results_fh} join (',',
 			map {s/"//g; qq("$_")}
 			map {
 			     my $value = $_;
@@ -159,6 +298,47 @@ sub add_if_better{
 }
 
 
+if (defined $results_table_fh) {
+     our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total');
+     format RESULTS_TABLE =
+@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
+$keyword,                 $weight,      $autoweight,  $gct,         $hvt,         $nct,         $t
+.
+     $results_table_fh->format_name('RESULTS_TABLE');
+     write $results_table_fh;
+
+     for $keyword (sort keys %terms) {
+	  ($gct,$hvt,$nct,$t) =
+	       map {
+		    if (not defined $_) {
+			 '$-$';
+		    }
+		    else {
+			 $_->{unique} ||= 0;
+			 "$_->{count} ($_->{unique})";
+		    }
+	       } @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
+	  $weight = $keyword_weight{$keyword} || 1;
+	  $autoweight = $auto_weight{$keyword};
+	  write $results_table_fh;
+
+     }
 
+     $keyword = 'Total';
+     ($gct,$hvt,$nct,$t) =
+	  map {
+	       if (not defined $_) {
+		    '$-$';
+	       }
+	       else {
+		    $_->{unique} ||= 0;
+		    "$_->{count} ($_->{unique})";
+	       }
+	  } map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
+     #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
+     $weight = '';
+     $autoweight = '';
+     write $results_table_fh;
+}
 
 __END__