X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fcombine_results;h=49c1ca6da1b32aea859b0daa132d1648888c0ed5;hp=e49de8f9bb20c1b730bf091dee5ac7c9d338a00c;hb=HEAD;hpb=27d14c1110035a9e8425c100f587792517d1f19b diff --git a/bin/combine_results b/bin/combine_results index e49de8f..49c1ca6 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -23,6 +23,9 @@ use Pod::Usage; combine_results parsed_results_1.txt [parsedresultfiles ...] Options: + --keywords newline delineated list of keywords to search for + --results file to store results in + --results-table optional file to write result summary table to --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual @@ -31,6 +34,23 @@ use Pod::Usage; =over +=item B<--keywords> + +A file which contains a newline delinated list of keywords to search +for. Can be specified multiple times. Lines starting with # or ; are +ignored. An optional weight can be specified after the keyword, which +is separated from the keyword by a tab. (If not specified, 1 is +assumed.) + +=item B<--results> + +A file in which to store the combined results; defaults to STDOUT + +=item B<--results-table> + +A file in which to store a summary table of results. If not provided, +no summary table is written. + =item B<--debug, -d> Debug verbosity. (Default 0) @@ -63,6 +83,7 @@ BEGIN{ use XML::Parser::Expat; use IO::File; +use List::Util qw(sum max); # XXX parse config file @@ -70,9 +91,13 @@ my %options = (debug => 0, help => 0, man => 0, keywords => [], + results => undef, + results_table => undef, ); -GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m'); +GetOptions(\%options,'keywords|k=s@', + 'results=s','results_table|results-table=s', + 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; @@ -80,6 +105,18 @@ pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; +my $results_fh = \*STDOUT; +my $results_table_fh = undef; + +if ($options{results}) { + $results_fh = IO::File->new($options{results},'w') or + die "Unable to open results file $options{results} for writing"; +} +if ($options{results_table}) { + $results_table_fh = IO::File->new($options{results_table},'w') or + die "Unable to open results table file $options{results_table} for writing"; +} + # CSV columns use constant {NAME => 0, REFSEQ => 1, @@ -92,15 +129,60 @@ use constant {NAME => 0, FILENAME => 8, }; -my @csv_fields = qw(name hits rzscore refseq location alias database terms description function); +my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function); my %genes; +my %keyword_weight; + +if (@{$options{keywords}}) { + for my $keyword_file (@{$options{keywords}}) { + my $keyword_fh = IO::File->new($keyword_file,'r') or + die "Unable to open $keyword_file for reading: $!"; + while (<$keyword_fh>) { + next if /^\s*[#;]/; + next unless /\w+/; + chomp; + my ($keyword,$weight) = split /\t/, $_; + $weight = 1 if not defined $weight; + $keyword_weight{$keyword} = $weight; + } + } +} + + + +my %alias_reverse; + for my $file_name (@ARGV) { my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; while (<$file>) { next if /^"Name"/; my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_; + # check to see if there's a different name we should be using + if (not exists $genes{$gene[NAME]}) { + # if the gene has a valid name, we do at least one test. + my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0; + my %candidates; + if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') { + $candidates{$alias_reverse{$gene[NAME]}}++; + } + else { + for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) { + if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') { + $candidates{$alias_reverse{$alias}}++; + } + $num_tested++; + } + } + #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n"; + for my $candidate (keys %candidates) { + if ($candidates{$candidate} > ($num_tested/2)) { + print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n"; + $gene[NAME] = $candidate; + } + } + } $genes{$gene[NAME]}{name} = $gene[NAME]; $genes{$gene[NAME]}{database}{$gene[DBNAME]}++; $genes{$gene[NAME]}{hits}++; @@ -110,14 +192,114 @@ for my $file_name (@ARGV) { add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]); add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]); add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION])); - add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS])); + my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]); + add_unique_parts($genes{$gene[NAME]},'alias', @aliases); + if ($gene[NAME] ne 'NO NAME') { + for my $alias (@aliases) { + if (not exists $alias_reverse{$alias}) { + $alias_reverse{$alias} = $gene[NAME]; + } + elsif ($alias_reverse{$alias} ne $gene[NAME]) { + print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n"; + $alias_reverse{$alias} = ''; + } + } + } + } +} + +my %databases; +my %terms; +my %auto_weight; +my %keyword_keyword; +for my $gene (keys %genes) { + my %term_temp; + my %db_temp; + my %gene_temp; + my %gene_temp2; + for my $term (keys %{$genes{$gene}{terms}}) { + if ($term =~ /\[/) { + my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $gene_temp{$keyword}{$database} = 1; + $gene_temp2{$database}{$keyword} = 1; + $databases{$database}{$keyword}{count}++; + $db_temp{$database}++; + $terms{$keyword}{$database}{count}++; + } + else { + my $keyword = $term; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $terms{$keyword}{total}{count}++; + } + } + if (keys %gene_temp == 1) { + $terms{[keys %gene_temp]->[0]}{total}{unique}++; + if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) { + $databases{total}{total}{unique}++ + } } + if (keys %gene_temp2 == 1) { + $databases{[keys %gene_temp2]->[0]}{total}{unique}++; + } + for my $keyword (keys %gene_temp) { + if (keys %{$gene_temp{$keyword}} == 1) { + $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++; + } + for my $keyword2 (keys %gene_temp) { + $keyword_keyword{$keyword}{$keyword2}++ + } + } + for my $database (keys %db_temp) { + $databases{$database}{total}{count}++; + } + $databases{total}{total}{count}++; + +} + +for my $keyword (keys %keyword_keyword) { + # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table + # we use max here to avoid 0/0 problems. + my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword}); + my $results_combined = max(1,grep {defined $_} + sum(map {$keyword_keyword{$keyword}{$_}} + grep {$_ ne $keyword} + keys %{$keyword_keyword{$keyword}} + ) + ); + $auto_weight{$keyword} = $results_by_this_keyword/$results_combined; } -print join(',',map {qq("$_")} @csv_fields),qq(\n); +my $avg_weight = sum(values %auto_weight) / scalar keys %auto_weight; +for my $keyword (keys %auto_weight) { + $auto_weight{$keyword} = $auto_weight{$keyword}/$avg_weight; +} + +print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n); for my $gene (keys %genes) { $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}; - print STDOUT join (',', + $genes{$gene}{weightedscore}= sum(0, + map {defined $keyword_weight{$_}?$keyword_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); + $genes{$gene}{autoscore}= sum(0, + map {defined $auto_weight{$_}?$auto_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); +} + +my $sort = 'autoscore'; +if (scalar grep {$_ != 1 } values %keyword_weight) { + $sort='weightedscore'; +} +for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) { + print {$results_fh} join (',', map {s/"//g; qq("$_")} map { my $value = $_; @@ -141,6 +323,7 @@ sub add_unique_parts{ $$hr{$key} = [@values]; } else { + return unless @values; my %temp_hash; @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}}; $temp_hash{@values} = (1) x scalar @values; @@ -158,7 +341,79 @@ sub add_if_better{ } } +sub space_fill{ + my ($value,$length,$right) = @_; + $right ||= 0; + if (length($value) > $length) { + $value =~ m/(.{$length})/; + return $1; + } + if (length($value) == $length) { + return $value + } + if ($right) { + return join('', + ' ' x ($length - length($value)), + $value, + ); + } + else { + return join('', + $value, + ' ' x ($length - length($value)), + ); + } +} + +sub results_table_line { + my ($keyword,@fields) = @_; + return join( ' & ', + space_fill($keyword,23), + map {space_fill($_,11,1)} @fields + )."\n"; +} +my @database_order = grep {lc($_) ne 'total'} keys %databases; +if (defined $results_table_fh) { + my $keyword; + print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight', + map {ucfirst($_)} @database_order, + 'Total', + ); + + for $keyword (sort keys %terms) { + my @fields = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } @{$terms{$keyword}}{@database_order,'total'}; + unshift @fields, $auto_weight{$keyword}; + unshift @fields, $keyword_weight{$keyword} || 1; + print {$results_table_fh} results_table_line($keyword, + @fields + ); + } + $keyword = 'Total'; + my @fields = ('',''); + push @fields, + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } map {$_->{total}} @databases{@database_order,'total'}; + print {$results_table_fh} results_table_line($keyword, + @fields + ); +} __END__