#! /usr/bin/perl # combine_results, is part of the gene search suite, and is released # under the terms of the GPL version 2, or any later version, at your # option. See the file README and COPYING for more information. # Copyright 2006,2007 by Don Armstrong . use warnings; use strict; use Getopt::Long; use Pod::Usage; =head1 NAME combine_results -- combines parsed result files; outputs to stdout. =head1 SYNOPSIS combine_results parsed_results_1.txt [parsedresultfiles ...] Options: --keywords newline delineated list of keywords to search for --results file to store results in --results-table optional file to write result summary table to --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual =head1 OPTIONS =over =item B<--keywords> A file which contains a newline delinated list of keywords to search for. Can be specified multiple times. Lines starting with # or ; are ignored. An optional weight can be specified after the keyword, which is separated from the keyword by a tab. (If not specified, 1 is assumed.) =item B<--results> A file in which to store the combined results; defaults to STDOUT =item B<--results-table> A file in which to store a summary table of results. If not provided, no summary table is written. =item B<--debug, -d> Debug verbosity. (Default 0) =item B<--help, -h> Display brief useage information. =item B<--man, -m> Display this manual. =back =head1 EXAMPLES combine_results foo_1.txt Will pretty much do what you want =cut use vars qw($DEBUG); BEGIN{ $DEBUG = 0 unless defined $DEBUG; } use XML::Parser::Expat; use IO::File; use List::Util qw(sum max); # XXX parse config file my %options = (debug => 0, help => 0, man => 0, keywords => [], results => undef, results_table => undef, ); GetOptions(\%options,'keywords|k=s@', 'results=s','results_table|results-table=s', 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; my $results_fh = \*STDOUT; my $results_table_fh = undef; if ($options{results}) { $results_fh = IO::File->new($options{results},'w') or die "Unable to open results file $options{results} for writing"; } if ($options{results_table}) { $results_table_fh = IO::File->new($options{results_table},'w') or die "Unable to open results table file $options{results_table} for writing"; } # CSV columns use constant {NAME => 0, REFSEQ => 1, LOCATION => 2, ALIAS => 3, FUNCTION => 4, DESCRIPTION => 5, KEYWORD => 6, DBNAME => 7, FILENAME => 8, }; my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function); my %genes; my %keyword_weight; if (@{$options{keywords}}) { for my $keyword_file (@{$options{keywords}}) { my $keyword_fh = IO::File->new($keyword_file,'r') or die "Unable to open $keyword_file for reading: $!"; while (<$keyword_fh>) { next if /^\s*[#;]/; next unless /\w+/; chomp; my ($keyword,$weight) = split /\t/, $_; $weight = 1 if not defined $weight; $keyword_weight{$keyword} = $weight; } } } my %alias_reverse; for my $file_name (@ARGV) { my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; while (<$file>) { next if /^"Name"/; my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_; # check to see if there's a different name we should be using if (not exists $genes{$gene[NAME]}) { # if the gene has a valid name, we do at least one test. my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0; my %candidates; if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') { $candidates{$alias_reverse{$gene[NAME]}}++; } else { for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) { if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') { $candidates{$alias_reverse{$alias}}++; } $num_tested++; } } #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n"; for my $candidate (keys %candidates) { if ($candidates{$candidate} > ($num_tested/2)) { print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n"; $gene[NAME] = $candidate; } } } $genes{$gene[NAME]}{name} = $gene[NAME]; $genes{$gene[NAME]}{database}{$gene[DBNAME]}++; $genes{$gene[NAME]}{hits}++; $genes{$gene[NAME]}{terms}{$gene[KEYWORD]}++; $genes{$gene[NAME]}{terms}{$gene[KEYWORD].'['.$gene[DBNAME].']'}++; add_unique_parts($genes{$gene[NAME]},'refseq',$gene[REFSEQ]); add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]); add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]); add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION])); my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]); add_unique_parts($genes{$gene[NAME]},'alias', @aliases); if ($gene[NAME] ne 'NO NAME') { for my $alias (@aliases) { if (not exists $alias_reverse{$alias}) { $alias_reverse{$alias} = $gene[NAME]; } elsif ($alias_reverse{$alias} ne $gene[NAME]) { print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n"; $alias_reverse{$alias} = ''; } } } } } my %databases; my %terms; my %auto_weight; my %keyword_keyword; for my $gene (keys %genes) { my %term_temp; my %db_temp; my %gene_temp; my %gene_temp2; for my $term (keys %{$genes{$gene}{terms}}) { if ($term =~ /\[/) { my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/; my $hits = $genes{$gene}{terms}{$term}; $keyword =~ s/[-+_]/ /g; $keyword =~ s/\s*$//; $keyword =~ s/[*]//; $gene_temp{$keyword}{$database} = 1; $gene_temp2{$database}{$keyword} = 1; $databases{$database}{$keyword}{count}++; $db_temp{$database}++; $terms{$keyword}{$database}{count}++; } else { my $keyword = $term; my $hits = $genes{$gene}{terms}{$term}; $keyword =~ s/[-+_]/ /g; $keyword =~ s/\s*$//; $keyword =~ s/[*]//; $terms{$keyword}{total}{count}++; } } if (keys %gene_temp == 1) { $terms{[keys %gene_temp]->[0]}{total}{unique}++; if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) { $databases{total}{total}{unique}++ } } if (keys %gene_temp2 == 1) { $databases{[keys %gene_temp2]->[0]}{total}{unique}++; } for my $keyword (keys %gene_temp) { if (keys %{$gene_temp{$keyword}} == 1) { $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++; } for my $keyword2 (keys %gene_temp) { $keyword_keyword{$keyword}{$keyword2}++ } } for my $database (keys %db_temp) { $databases{$database}{total}{count}++; } $databases{total}{total}{count}++; } for my $keyword (keys %keyword_keyword) { # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table # we use max here to avoid 0/0 problems. my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword}); my $results_combined = max(1,grep {defined $_} sum(map {$keyword_keyword{$keyword}{$_}} grep {$_ ne $keyword} keys %{$keyword_keyword{$keyword}} ) ); $auto_weight{$keyword} = $results_by_this_keyword/$results_combined; } my $max_weight = max(values %auto_weight); for my $keyword (keys %auto_weight) { $auto_weight{$keyword} = $auto_weight{$keyword}/$max_weight; } print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n); for my $gene (keys %genes) { $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}; $genes{$gene}{weightedscore}= sum(0, map {defined $keyword_weight{$_}?$keyword_weight{$_}:1} grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} ); $genes{$gene}{autoscore}= sum(0, map {defined $auto_weight{$_}?$auto_weight{$_}:1} grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} ); } my $sort = 'autoscore'; if (scalar grep {$_ != 1 } values %keyword_weight) { $sort='weightedscore'; } for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) { print {$results_fh} join (',', map {s/"//g; qq("$_")} map { my $value = $_; if (ref $value eq 'HASH') { join('; ',map {qq($_:$$value{$_})} keys %$value); } elsif (ref $value eq 'ARRAY') { join('; ', @$value); } else { $value; } } @{$genes{$gene}}{@csv_fields} ), qq(\n); } sub add_unique_parts{ my ($hr,$key,@values) = @_; if (not defined $$hr{key}) { $$hr{$key} = [@values]; } else { return unless @values; my %temp_hash; @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}}; $temp_hash{@values} = (1) x scalar @values; $$hr{$key} = [keys %temp_hash]; } } sub add_if_better{ my ($hash, $key, $value) = @_; if (not defined $$hash{$key}) { $$hash{$key} = $value; } elsif (length $$hash{$key} < length $value and $value !~ /^NO\s+(LOCATION|DESCRIPTION)+$/) { $$hash{$key} = $value; } } sub space_fill{ my ($value,$length,$right) = @_; $right ||= 0; if (length($value) > $length) { $value =~ m/(.{$length})/; return $1; } if (length($value) == $length) { return $value } if ($right) { return join('', ' ' x ($length - length($value)), $value, ); } else { return join('', $value, ' ' x ($length - length($value)), ); } } sub results_table_line { my ($keyword,@fields) = @_; return join( ' & ', space_fill($keyword,23), map {space_fill($_,11,1)} @fields )."\n"; } my @database_order = grep {lc($_) ne 'total'} keys %databases; if (defined $results_table_fh) { my $keyword; print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight', map {ucfirst($_)} @database_order, 'Total', ); for $keyword (sort keys %terms) { my @fields = map { if (not defined $_) { '$-$'; } else { $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } } @{$terms{$keyword}}{@database_order,'total'}; unshift @fields, $auto_weight{$keyword}; unshift @fields, $keyword_weight{$keyword} || 1; print {$results_table_fh} results_table_line($keyword, @fields ); } $keyword = 'Total'; my @fields = ('',''); push @fields, map { if (not defined $_) { '$-$'; } else { $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } } map {$_->{total}} @databases{@database_order,'total'}; print {$results_table_fh} results_table_line($keyword, @fields ); } __END__