X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=blobdiff_plain;f=bin%2Fcombine_results;fp=bin%2Fcombine_results;h=666fe7c81a70808c6bf01b69ed4952f0d6a5a156;hp=e49de8f9bb20c1b730bf091dee5ac7c9d338a00c;hb=af4fd770f221db1cec02393df378e079c0b9a8fc;hpb=6d24067f20698257dc1103d5c21e8a7f6a32b97b diff --git a/bin/combine_results b/bin/combine_results index e49de8f..666fe7c 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -23,6 +23,9 @@ use Pod::Usage; combine_results parsed_results_1.txt [parsedresultfiles ...] Options: + --keywords newline delineated list of keywords to search for + --results file to store results in + --results-table optional file to write result summary table to --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual @@ -31,6 +34,23 @@ use Pod::Usage; =over +=item B<--keywords> + +A file which contains a newline delinated list of keywords to search +for. Can be specified multiple times. Lines starting with # or ; are +ignored. An optional weight can be specified after the keyword, which +is separated from the keyword by a tab. (If not specified, 1 is +assumed.) + +=item B<--results> + +A file in which to store the combined results; defaults to STDOUT + +=item B<--results-table> + +A file in which to store a summary table of results. If not provided, +no summary table is written. + =item B<--debug, -d> Debug verbosity. (Default 0) @@ -63,6 +83,7 @@ BEGIN{ use XML::Parser::Expat; use IO::File; +use List::Util qw(sum max); # XXX parse config file @@ -70,9 +91,13 @@ my %options = (debug => 0, help => 0, man => 0, keywords => [], + results => undef, + results_table => undef, ); -GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m'); +GetOptions(\%options,'keywords|k=s@', + 'results=s','results_table|results-table=s', + 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; @@ -80,6 +105,18 @@ pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; +my $results_fh = \*STDOUT; +my $results_table_fh = undef; + +if ($options{results}) { + $results_fh = IO::File->new($options{results},'w') or + die "Unable to open results file $options{results} for writing"; +} +if ($options{results_table}) { + $results_table_fh = IO::File->new($options{results_table},'w') or + die "Unable to open results table file $options{results_table} for writing"; +} + # CSV columns use constant {NAME => 0, REFSEQ => 1, @@ -92,10 +129,29 @@ use constant {NAME => 0, FILENAME => 8, }; -my @csv_fields = qw(name hits rzscore refseq location alias database terms description function); +my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function); my %genes; +my %keyword_weight; + +if (@{$options{keywords}}) { + for my $keyword_file (@{$options{keywords}}) { + my $keyword_fh = IO::File->new($keyword_file,'r') or + die "Unable to open $keyword_file for reading: $!"; + while (<$keyword_fh>) { + next if /^\s*[#;]/; + next unless /\w+/; + chomp; + my ($keyword,$weight) = split /\t/, $_; + $weight = 1 if not defined $weight; + $keyword_weight{$keyword} = $weight; + } + } +} + + + for my $file_name (@ARGV) { my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; while (<$file>) { @@ -114,10 +170,93 @@ for my $file_name (@ARGV) { } } -print join(',',map {qq("$_")} @csv_fields),qq(\n); +my %databases; +my %terms; +my %auto_weight; +my %keyword_keyword; +for my $gene (keys %genes) { + my %term_temp; + my %db_temp; + my %gene_temp; + my %gene_temp2; + for my $term (keys %{$genes{$gene}{terms}}) { + if ($term =~ /\[/) { + my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $gene_temp{$keyword}{$database} = 1; + $gene_temp2{$database}{$keyword} = 1; + $databases{$database}{$keyword}{count}++; + $db_temp{$database}++; + $terms{$keyword}{$database}{count}++; + } + else { + my $keyword = $term; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $terms{$keyword}{total}{count}++; + } + } + if (keys %gene_temp == 1) { + $terms{[keys %gene_temp]->[0]}{total}{unique}++; + if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) { + $databases{total}{total}{unique}++ + } + } + if (keys %gene_temp2 == 1) { + $databases{[keys %gene_temp2]->[0]}{total}{unique}++; + } + for my $keyword (keys %gene_temp) { + if (keys %{$gene_temp{$keyword}} == 1) { + $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++; + } + for my $keyword2 (keys %gene_temp) { + $keyword_keyword{$keyword}{$keyword2}++ + } + } + for my $database (keys %db_temp) { + $databases{$database}{total}{count}++; + } + $databases{total}{total}{count}++; + +} + +for my $keyword (keys %keyword_keyword) { + # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table + # we use max here to avoid 0/0 problems. + my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword}); + my $results_combined = max(1,grep {defined $_} + sum(map {$keyword_keyword{$keyword}{$_}} + grep {$_ ne $keyword} + keys %{$keyword_keyword{$keyword}} + ) + ); + $auto_weight{$keyword} = $results_by_this_keyword/$results_combined; +} + +print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n); for my $gene (keys %genes) { $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}; - print STDOUT join (',', + $genes{$gene}{weightedscore}= sum(0, + map {defined $keyword_weight{$_}?$keyword_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); + $genes{$gene}{autoscore}= sum(0, + map {defined $auto_weight{$_}?$auto_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); +} + +my $sort = 'autoscore'; +if (scalar grep {$_ != 1 } values %keyword_weight) { + $sort='weightedscore'; +} +for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) { + print {$results_fh} join (',', map {s/"//g; qq("$_")} map { my $value = $_; @@ -159,6 +298,47 @@ sub add_if_better{ } +if (defined $results_table_fh) { + our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total'); + format RESULTS_TABLE = +@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ +$keyword, $weight, $autoweight, $gct, $hvt, $nct, $t +. + $results_table_fh->format_name('RESULTS_TABLE'); + write $results_table_fh; + + for $keyword (sort keys %terms) { + ($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } @{$terms{$keyword}}{qw(genecard harvester ncbi total)}; + $weight = $keyword_weight{$keyword} || 1; + $autoweight = $auto_weight{$keyword}; + write $results_table_fh; + + } + $keyword = 'Total'; + ($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; + #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; + $weight = ''; + $autoweight = ''; + write $results_table_fh; +} __END__