3 # combine_results, is part of the gene search suite, and is released
4 # under the terms of the GPL version 2, or any later version, at your
5 # option. See the file README and COPYING for more information.
7 # Copyright 2006,2007 by Don Armstrong <don@donarmstrong.com>.
19 combine_results -- combines parsed result files; outputs to stdout.
23 combine_results parsed_results_1.txt [parsedresultfiles ...]
26 --keywords newline delineated list of keywords to search for
27 --results file to store results in
28 --results-table optional file to write result summary table to
29 --debug, -d debugging level [default 0]
30 --help, -h display this help
31 --man, -m display manual
39 A file which contains a newline delinated list of keywords to search
40 for. Can be specified multiple times. Lines starting with # or ; are
41 ignored. An optional weight can be specified after the keyword, which
42 is separated from the keyword by a tab. (If not specified, 1 is
47 A file in which to store the combined results; defaults to STDOUT
49 =item B<--results-table>
51 A file in which to store a summary table of results. If not provided,
52 no summary table is written.
56 Debug verbosity. (Default 0)
60 Display brief useage information.
70 combine_results foo_1.txt
72 Will pretty much do what you want
81 $DEBUG = 0 unless defined $DEBUG;
84 use XML::Parser::Expat;
86 use List::Util qw(sum max);
88 # XXX parse config file
90 my %options = (debug => 0,
95 results_table => undef,
98 GetOptions(\%options,'keywords|k=s@',
99 'results=s','results_table|results-table=s',
100 'debug|d+','help|h|?','man|m');
103 pod2usage() if $options{help};
104 pod2usage({verbose=>2}) if $options{man};
106 $DEBUG = $options{debug};
108 my $results_fh = \*STDOUT;
109 my $results_table_fh = undef;
111 if ($options{results}) {
112 $results_fh = IO::File->new($options{results},'w') or
113 die "Unable to open results file $options{results} for writing";
115 if ($options{results_table}) {
116 $results_table_fh = IO::File->new($options{results_table},'w') or
117 die "Unable to open results table file $options{results_table} for writing";
121 use constant {NAME => 0,
132 my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
138 if (@{$options{keywords}}) {
139 for my $keyword_file (@{$options{keywords}}) {
140 my $keyword_fh = IO::File->new($keyword_file,'r') or
141 die "Unable to open $keyword_file for reading: $!";
142 while (<$keyword_fh>) {
146 my ($keyword,$weight) = split /\t/, $_;
147 $weight = 1 if not defined $weight;
148 $keyword_weight{$keyword} = $weight;
157 for my $file_name (@ARGV) {
158 my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
161 my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_;
162 # check to see if there's a different name we should be using
163 if (not exists $genes{$gene[NAME]}) {
164 # if the gene has a valid name, we do at least one test.
165 my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0;
167 if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') {
168 $candidates{$alias_reverse{$gene[NAME]}}++;
171 for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) {
172 if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') {
173 $candidates{$alias_reverse{$alias}}++;
178 #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n";
179 for my $candidate (keys %candidates) {
180 if ($candidates{$candidate} > ($num_tested/2)) {
181 print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n";
182 $gene[NAME] = $candidate;
186 $genes{$gene[NAME]}{name} = $gene[NAME];
187 $genes{$gene[NAME]}{database}{$gene[DBNAME]}++;
188 $genes{$gene[NAME]}{hits}++;
189 $genes{$gene[NAME]}{terms}{$gene[KEYWORD]}++;
190 $genes{$gene[NAME]}{terms}{$gene[KEYWORD].'['.$gene[DBNAME].']'}++;
191 add_unique_parts($genes{$gene[NAME]},'refseq',$gene[REFSEQ]);
192 add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]);
193 add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]);
194 add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION]));
195 my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]);
196 add_unique_parts($genes{$gene[NAME]},'alias', @aliases);
197 if ($gene[NAME] ne 'NO NAME') {
198 for my $alias (@aliases) {
199 if (not exists $alias_reverse{$alias}) {
200 $alias_reverse{$alias} = $gene[NAME];
202 elsif ($alias_reverse{$alias} ne $gene[NAME]) {
203 print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n";
204 $alias_reverse{$alias} = '';
215 for my $gene (keys %genes) {
220 for my $term (keys %{$genes{$gene}{terms}}) {
222 my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
223 my $hits = $genes{$gene}{terms}{$term};
224 $keyword =~ s/[-+_]/ /g;
225 $keyword =~ s/\s*$//;
227 $gene_temp{$keyword}{$database} = 1;
228 $gene_temp2{$database}{$keyword} = 1;
229 $databases{$database}{$keyword}{count}++;
230 $db_temp{$database}++;
231 $terms{$keyword}{$database}{count}++;
235 my $hits = $genes{$gene}{terms}{$term};
236 $keyword =~ s/[-+_]/ /g;
237 $keyword =~ s/\s*$//;
239 $terms{$keyword}{total}{count}++;
242 if (keys %gene_temp == 1) {
243 $terms{[keys %gene_temp]->[0]}{total}{unique}++;
244 if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
245 $databases{total}{total}{unique}++
248 if (keys %gene_temp2 == 1) {
249 $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
251 for my $keyword (keys %gene_temp) {
252 if (keys %{$gene_temp{$keyword}} == 1) {
253 $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
255 for my $keyword2 (keys %gene_temp) {
256 $keyword_keyword{$keyword}{$keyword2}++
259 for my $database (keys %db_temp) {
260 $databases{$database}{total}{count}++;
262 $databases{total}{total}{count}++;
266 for my $keyword (keys %keyword_keyword) {
267 # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
268 # we use max here to avoid 0/0 problems.
269 my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
270 my $results_combined = max(1,grep {defined $_}
271 sum(map {$keyword_keyword{$keyword}{$_}}
272 grep {$_ ne $keyword}
273 keys %{$keyword_keyword{$keyword}}
276 $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
279 my $avg_weight = sum(values %auto_weight) / scalar keys %auto_weight;
280 for my $keyword (keys %auto_weight) {
281 $auto_weight{$keyword} = $auto_weight{$keyword}/$avg_weight;
284 print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
285 for my $gene (keys %genes) {
286 $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
287 $genes{$gene}{weightedscore}= sum(0,
288 map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
289 grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
291 $genes{$gene}{autoscore}= sum(0,
292 map {defined $auto_weight{$_}?$auto_weight{$_}:1}
293 grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
297 my $sort = 'autoscore';
298 if (scalar grep {$_ != 1 } values %keyword_weight) {
299 $sort='weightedscore';
301 for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
302 print {$results_fh} join (',',
303 map {s/"//g; qq("$_")}
306 if (ref $value eq 'HASH') {
307 join('; ',map {qq($_:$$value{$_})} keys %$value);
309 elsif (ref $value eq 'ARRAY') {
315 } @{$genes{$gene}}{@csv_fields}
320 sub add_unique_parts{
321 my ($hr,$key,@values) = @_;
322 if (not defined $$hr{key}) {
323 $$hr{$key} = [@values];
326 return unless @values;
328 @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}};
329 $temp_hash{@values} = (1) x scalar @values;
330 $$hr{$key} = [keys %temp_hash];
335 my ($hash, $key, $value) = @_;
336 if (not defined $$hash{$key}) {
337 $$hash{$key} = $value;
339 elsif (length $$hash{$key} < length $value and $value !~ /^NO\s+(LOCATION|DESCRIPTION)+$/) {
340 $$hash{$key} = $value;
345 my ($value,$length,$right) = @_;
347 if (length($value) > $length) {
348 $value =~ m/(.{$length})/;
351 if (length($value) == $length) {
356 ' ' x ($length - length($value)),
363 ' ' x ($length - length($value)),
368 sub results_table_line {
369 my ($keyword,@fields) = @_;
371 space_fill($keyword,23),
372 map {space_fill($_,11,1)} @fields
376 my @database_order = grep {lc($_) ne 'total'} keys %databases;
377 if (defined $results_table_fh) {
379 print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
380 map {ucfirst($_)} @database_order,
384 for $keyword (sort keys %terms) {
387 if (not defined $_) {
392 "$_->{count} ($_->{unique})";
394 } @{$terms{$keyword}}{@database_order,'total'};
395 unshift @fields, $auto_weight{$keyword};
396 unshift @fields, $keyword_weight{$keyword} || 1;
397 print {$results_table_fh} results_table_line($keyword,
403 my @fields = ('','');
406 if (not defined $_) {
411 "$_->{count} ($_->{unique})";
413 } map {$_->{total}} @databases{@database_order,'total'};
414 print {$results_table_fh} results_table_line($keyword,