+my %alias_reverse; # alias => gene name it was first seen under; reset to '' once the alias turns up under a second name
+
for my $file_name (@ARGV) {
my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
while (<$file>) {
next if /^"Name"/;
my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_;
+ # No %genes entry exists for this name yet: use the aliases recorded so far to see if there's a different name we should be using
+ if (not exists $genes{$gene[NAME]}) {
+ # a name other than 'NO NAME' counts as one test (the name itself); every alias looked up below adds one more.
+ my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0;
+ my %candidates; # candidate gene name => number of tests that pointed at it
+ if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') { # the name itself is recorded as a non-ambiguous alias
+ $candidates{$alias_reverse{$gene[NAME]}}++;
+ }
+ else { # otherwise look up each alias, skipping the 'NO ALIASES' / 'NO NAME' placeholders
+ for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) {
+ if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') {
+ $candidates{$alias_reverse{$alias}}++;
+ }
+ $num_tested++; # counted whether or not the alias was known
+ }
+ }
+ #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n";
+ for my $candidate (keys %candidates) {
+ if ($candidates{$candidate} > ($num_tested/2)) { # needs a strict majority of the tests
+ print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n";
+ $gene[NAME] = $candidate; # everything below files this record under the chosen name
+ }
+ }
+ }
$genes{$gene[NAME]}{name} = $gene[NAME];
$genes{$gene[NAME]}{database}{$gene[DBNAME]}++;
$genes{$gene[NAME]}{hits}++;
add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]);
add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]);
add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION]));
- add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS]));
+ my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]); # drop the 'NO ALIASES' placeholder
+ add_unique_parts($genes{$gene[NAME]},'alias', @aliases);
+ if ($gene[NAME] ne 'NO NAME') { # only named genes are entered in the reverse alias map
+ for my $alias (@aliases) {
+ if (not exists $alias_reverse{$alias}) { # first time this alias has been seen
+ $alias_reverse{$alias} = $gene[NAME];
+ }
+ elsif ($alias_reverse{$alias} ne $gene[NAME]) { # alias already recorded for a different name (or already marked ambiguous)
+ print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n";
+ $alias_reverse{$alias} = ''; # '' marks the alias as ambiguous; the lookups above ignore it
+ }
+ }
+ }
}
}
$auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
}
+# Rescale the automatic weights so that they average 1.  The rescale is
+# skipped when there are no keywords or when every weight is 0, because the
+# division would otherwise die with "Illegal division by zero".
+my $avg_weight = %auto_weight ? sum(values %auto_weight) / scalar(keys %auto_weight) : 0;
+if ($avg_weight) {
+    for my $keyword (keys %auto_weight) {
+        $auto_weight{$keyword} /= $avg_weight;
+    }
+}
+
print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
for my $gene (keys %genes) {
$genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
$$hr{$key} = [@values];
}
else {
+ return unless @values;
my %temp_hash;
@temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}};
$temp_hash{@values} = (1) x scalar @values;
}
}
+# Pad or truncate $value to exactly $length characters.
+#   $value  - string to fit
+#   $length - target width in characters (negative widths are treated as 0)
+#   $right  - optional; true right-justifies (pads on the left), false or
+#             omitted left-justifies (pads on the right)
+# Returns the fitted string; a value longer than $length keeps only its
+# first $length characters.
+sub space_fill{
+    my ($value,$length,$right) = @_;
+    $right ||= 0;
+    $length = 0 if $length < 0;
+    # substr rather than m/(.{$length})/: that pattern was unanchored and '.'
+    # cannot cross a newline, so it could return a later run of characters
+    # (or a stale $1) instead of the leading $length characters.
+    return substr($value, 0, $length) if length($value) > $length;
+    # An exact fit yields empty padding, so the value comes back unchanged.
+    my $padding = ' ' x ($length - length($value));
+    return $right ? $padding . $value : $value . $padding;
+}
+# Build one row of the results table: the keyword left-justified in a
+# 23-character cell, every remaining field right-justified in an
+# 11-character cell, cells joined with ' & ' and the row ended by a newline.
+# NOTE(review): the RESULTS_TABLE format this sub replaces ended each row
+# with " \\"; confirm that dropping that row terminator is intended.
+sub results_table_line {
+    my ($keyword,@fields) = @_;
+    my @cells = (space_fill($keyword,23));
+    push @cells, space_fill($_,11,1) for @fields;
+    return join(' & ', @cells)."\n";
+}
+
+# Database columns for the results table, leaving out the 'total' entry that
+# the table code appends last.  Sorted because the order returned by keys is
+# not stable between runs; alphabetical order also matches the column order
+# of the format being replaced (genecard, harvester, ncbi).
+my @database_order = sort { $a cmp $b } grep {lc($_) ne 'total'} keys %databases;
if (defined $results_table_fh) {
- our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total');
- format RESULTS_TABLE =
-@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
-$keyword, $weight, $autoweight, $gct, $hvt, $nct, $t
-.
- $results_table_fh->format_name('RESULTS_TABLE');
- write $results_table_fh;
+ my $keyword;
+ print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
+ map {ucfirst($_)} @database_order,
+ 'Total',
+ );
for $keyword (sort keys %terms) {
- ($gct,$hvt,$nct,$t) =
+ my @fields =
map {
if (not defined $_) {
'$-$';
$_->{unique} ||= 0;
"$_->{count} ($_->{unique})";
}
- } @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
- $weight = $keyword_weight{$keyword} || 1;
- $autoweight = $auto_weight{$keyword};
- write $results_table_fh;
+ } @{$terms{$keyword}}{@database_order,'total'};
+ unshift @fields, $auto_weight{$keyword};
+ unshift @fields, $keyword_weight{$keyword} || 1;
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
}
-
$keyword = 'Total';
- ($gct,$hvt,$nct,$t) =
+ my @fields = ('','');
+ push @fields,
map {
if (not defined $_) {
'$-$';
$_->{unique} ||= 0;
"$_->{count} ($_->{unique})";
}
- } map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
- #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
- $weight = '';
- $autoweight = '';
- write $results_table_fh;
+ } map {$_->{total}} @databases{@database_order,'total'};
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
}
__END__