X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fcombine_results;h=49c1ca6da1b32aea859b0daa132d1648888c0ed5;hp=666fe7c81a70808c6bf01b69ed4952f0d6a5a156;hb=HEAD;hpb=af4fd770f221db1cec02393df378e079c0b9a8fc

diff --git a/bin/combine_results b/bin/combine_results
index 666fe7c..49c1ca6 100755
--- a/bin/combine_results
+++ b/bin/combine_results
@@ -152,11 +152,37 @@ if (@{$options{keywords}}) {
 
 
 
+my %alias_reverse;
+
 for my $file_name (@ARGV) {
      my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
      while (<$file>) {
 	  next if /^"Name"/;
 	  my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_;
+	  # check to see if there's a different name we should be using
+	  if (not exists $genes{$gene[NAME]}) {
+	       # if the gene has a valid name, we do at least one test.
+	       my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0;
+	       my %candidates;
+	       if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') {
+		    $candidates{$alias_reverse{$gene[NAME]}}++;
+	       }
+	       else {
+		    for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) {
+			 if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') {
+			      $candidates{$alias_reverse{$alias}}++;
+			 }
+			 $num_tested++;
+		    }
+	       }
+	       #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n";
+	       for my $candidate (keys %candidates) {
+		    if ($candidates{$candidate} > ($num_tested/2)) {
+			 print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n";
+			 $gene[NAME] = $candidate;
+		    }
+	       }
+	  }
 	  $genes{$gene[NAME]}{name} = $gene[NAME];
 	  $genes{$gene[NAME]}{database}{$gene[DBNAME]}++;
 	  $genes{$gene[NAME]}{hits}++;
@@ -166,7 +192,19 @@ for my $file_name (@ARGV) {
 	  add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]);
 	  add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]);
 	  add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION]));
-	  add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS]));
+	  my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]);
+	  add_unique_parts($genes{$gene[NAME]},'alias', @aliases);
+	  if ($gene[NAME] ne 'NO NAME') {
+	       for my $alias (@aliases) {
+		    if (not exists $alias_reverse{$alias}) {
+			 $alias_reverse{$alias} = $gene[NAME];
+		    }
+		    elsif ($alias_reverse{$alias} ne $gene[NAME]) {
+			 print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n";
+			 $alias_reverse{$alias} = '';
+		    }
+	       }
+	  }
      }
 }
 
@@ -238,6 +276,11 @@ for my $keyword (keys %keyword_keyword) {
      $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
 }
 
+# Rescale the automatic weights so that their mean is 1.  With no weights, or
+# a mean of zero, there is nothing to rescale and dividing would be fatal.
+my $avg_weight = %auto_weight ? sum(values %auto_weight) / scalar keys %auto_weight : 0;
+if ($avg_weight) { $_ /= $avg_weight for values %auto_weight; }
+
 print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
 for my $gene (keys %genes) {
      $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
@@ -280,6 +323,7 @@ sub add_unique_parts{
 	  $$hr{$key} = [@values];
      }
      else {
+	  return unless @values;
 	  my %temp_hash;
 	  @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}};
 	  $temp_hash{@values} = (1) x scalar @values;
@@ -297,18 +341,48 @@ sub add_if_better{
      }
 }
 
+sub space_fill{
+     # Force $value to exactly $length characters so that table columns
+     # line up: longer values are truncated, shorter ones are padded with
+     # spaces.
+     #
+     #   $value  - string to fit
+     #   $length - target width in characters
+     #   $right  - true to right-justify (pad on the left); the default
+     #             is left-justified (pad on the right)
+     #
+     # Returns the fitted string.
+     my ($value,$length,$right) = @_;
+     if (length($value) > $length) {
+	  # substr rather than m/(.{$length})/: '.' does not match a newline,
+	  # so the regex could fail and hand back a stale $1.
+	  return substr($value,0,$length);
+     }
+     # A value that is already $length wide gets an empty pad.
+     my $padding = ' ' x ($length - length($value));
+     return $right
+	  ? $padding.$value
+	  : $value.$padding;
+}
 
+sub results_table_line {
+     # Build one ' & '-separated row: 23-wide label, then 11-wide right-justified cells.
+     my ($label,@cells) = @_;
+     my @columns = (space_fill($label,23));
+     push @columns, map {space_fill($_,11,1)} @cells;
+     return join(' & ',@columns)."\n";
+}
+
+my @database_order = grep {lc($_) ne 'total'} keys %databases;
 if (defined $results_table_fh) {
-     our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total');
-     format RESULTS_TABLE =
-@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
-$keyword,                 $weight,      $autoweight,  $gct,         $hvt,         $nct,         $t
-.
-     $results_table_fh->format_name('RESULTS_TABLE');
-     write $results_table_fh;
+     my $keyword;
+     print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
+						  map {ucfirst($_)} @database_order,
+						  'Total',
+						 );
 
      for $keyword (sort keys %terms) {
-	  ($gct,$hvt,$nct,$t) =
+	  my @fields =
 	       map {
 		    if (not defined $_) {
 			 '$-$';
@@ -317,15 +391,17 @@ $keyword,                 $weight,      $autoweight,  $gct,         $hvt,
 			 $_->{unique} ||= 0;
 			 "$_->{count} ($_->{unique})";
 		    }
-	       } @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
-	  $weight = $keyword_weight{$keyword} || 1;
-	  $autoweight = $auto_weight{$keyword};
-	  write $results_table_fh;
+	       } @{$terms{$keyword}}{@database_order,'total'};
+	  unshift @fields, $auto_weight{$keyword};
+	  unshift @fields, $keyword_weight{$keyword} || 1;
+	  print {$results_table_fh} results_table_line($keyword,
+						       @fields
+						      );
 
      }
-
      $keyword = 'Total';
-     ($gct,$hvt,$nct,$t) =
+     my @fields = ('','');
+     push @fields,
 	  map {
 	       if (not defined $_) {
 		    '$-$';
@@ -334,11 +410,10 @@ $keyword,                 $weight,      $autoweight,  $gct,         $hvt,
 		    $_->{unique} ||= 0;
 		    "$_->{count} ($_->{unique})";
 	       }
-	  } map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
-     #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
-     $weight = '';
-     $autoweight = '';
-     write $results_table_fh;
+	  } map {$_->{total}} @databases{@database_order,'total'};
+     print {$results_table_fh} results_table_line($keyword,
+						  @fields
+						 );
 }
 
 __END__