X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fcombine_results;h=49c1ca6da1b32aea859b0daa132d1648888c0ed5;hp=666fe7c81a70808c6bf01b69ed4952f0d6a5a156;hb=HEAD;hpb=af4fd770f221db1cec02393df378e079c0b9a8fc diff --git a/bin/combine_results b/bin/combine_results index 666fe7c..49c1ca6 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -152,11 +152,37 @@ if (@{$options{keywords}}) { +my %alias_reverse; + for my $file_name (@ARGV) { my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; while (<$file>) { next if /^"Name"/; my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_; + # check to see if there's a different name we should be using + if (not exists $genes{$gene[NAME]}) { + # if the gene has a valid name, we do at least one test. + my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0; + my %candidates; + if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') { + $candidates{$alias_reverse{$gene[NAME]}}++; + } + else { + for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) { + if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') { + $candidates{$alias_reverse{$alias}}++; + } + $num_tested++; + } + } + #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n"; + for my $candidate (keys %candidates) { + if ($candidates{$candidate} > ($num_tested/2)) { + print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n"; + $gene[NAME] = $candidate; + } + } + } $genes{$gene[NAME]}{name} = $gene[NAME]; $genes{$gene[NAME]}{database}{$gene[DBNAME]}++; $genes{$gene[NAME]}{hits}++; @@ -166,7 +192,19 @@ for my $file_name (@ARGV) { add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]); add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]); add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION])); - add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS])); + my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]); + add_unique_parts($genes{$gene[NAME]},'alias', @aliases); + if ($gene[NAME] ne 'NO NAME') { + for my $alias (@aliases) { + if (not exists $alias_reverse{$alias}) { + $alias_reverse{$alias} = $gene[NAME]; + } + elsif ($alias_reverse{$alias} ne $gene[NAME]) { + print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n"; + $alias_reverse{$alias} = ''; + } + } + } } } @@ -238,6 +276,11 @@ for my $keyword (keys %keyword_keyword) { $auto_weight{$keyword} = $results_by_this_keyword/$results_combined; } +my $avg_weight = sum(values %auto_weight) / scalar keys %auto_weight; +for my $keyword (keys %auto_weight) { + $auto_weight{$keyword} = $auto_weight{$keyword}/$avg_weight; +} + print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n); for my $gene (keys %genes) { $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}; @@ -280,6 +323,7 @@ sub add_unique_parts{ $$hr{$key} = [@values]; } else { + return unless @values; my %temp_hash; @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}}; $temp_hash{@values} = (1) x scalar @values; @@ -297,18 +341,48 @@ sub add_if_better{ } } +sub space_fill{ + my ($value,$length,$right) = @_; + $right ||= 0; + if (length($value) > $length) { + $value =~ m/(.{$length})/; + return $1; + } + if (length($value) == $length) { + return $value + } + if ($right) { + return join('', + ' ' x ($length - length($value)), + $value, + ); + } + else { + return join('', + $value, + ' ' x ($length - length($value)), + ); + } +} +sub results_table_line { + my ($keyword,@fields) = @_; + return join( ' & ', + space_fill($keyword,23), + map {space_fill($_,11,1)} @fields + )."\n"; +} + +my @database_order = grep {lc($_) ne 'total'} keys %databases; if (defined $results_table_fh) { - our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total'); - format RESULTS_TABLE = -@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ -$keyword, $weight, $autoweight, $gct, $hvt, $nct, $t -. - $results_table_fh->format_name('RESULTS_TABLE'); - write $results_table_fh; + my $keyword; + print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight', + map {ucfirst($_)} @database_order, + 'Total', + ); for $keyword (sort keys %terms) { - ($gct,$hvt,$nct,$t) = + my @fields = map { if (not defined $_) { '$-$'; @@ -317,15 +391,17 @@ $keyword, $weight, $autoweight, $gct, $hvt, $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } - } @{$terms{$keyword}}{qw(genecard harvester ncbi total)}; - $weight = $keyword_weight{$keyword} || 1; - $autoweight = $auto_weight{$keyword}; - write $results_table_fh; + } @{$terms{$keyword}}{@database_order,'total'}; + unshift @fields, $auto_weight{$keyword}; + unshift @fields, $keyword_weight{$keyword} || 1; + print {$results_table_fh} results_table_line($keyword, + @fields + ); } - $keyword = 'Total'; - ($gct,$hvt,$nct,$t) = + my @fields = ('',''); + push @fields, map { if (not defined $_) { '$-$'; @@ -334,11 +410,10 @@ $keyword, $weight, $autoweight, $gct, $hvt, $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } - } map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; - #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; - $weight = ''; - $autoweight = ''; - write $results_table_fh; + } map {$_->{total}} @databases{@database_order,'total'}; + print {$results_table_fh} results_table_line($keyword, + @fields + ); } __END__