3 # combine_results, is part of the gene search suite, and is released
4 # under the terms of the GPL version 2, or any later version, at your
5 # option. See the file README and COPYING for more information.
7 # Copyright 2006,2007 by Don Armstrong <don@donarmstrong.com>.
19 combine_results -- combines parsed result files; outputs to stdout.
23 combine_results parsed_results_1.txt [parsedresultfiles ...]
26 --keywords newline delineated list of keywords to search for
27 --results file to store results in
28 --results-table optional file to write result summary table to
29 --debug, -d debugging level [default 0]
30 --help, -h display this help
31 --man, -m display manual
39 A file which contains a newline delinated list of keywords to search
40 for. Can be specified multiple times. Lines starting with # or ; are
41 ignored. An optional weight can be specified after the keyword, which
42 is separated from the keyword by a tab. (If not specified, 1 is
47 A file in which to store the combined results; defaults to STDOUT
49 =item B<--results-table>
51 A file in which to store a summary table of results. If not provided,
52 no summary table is written.
56 Debug verbosity. (Default 0)
60 Display brief useage information.
70 combine_results foo_1.txt
72 Will pretty much do what you want
81 $DEBUG = 0 unless defined $DEBUG;
84 use XML::Parser::Expat;
86 use List::Util qw(sum max);
88 # XXX parse config file
90 my %options = (debug => 0,
95 results_table => undef,
98 GetOptions(\%options,'keywords|k=s@',
99 'results=s','results_table|results-table=s',
100 'debug|d+','help|h|?','man|m');
103 pod2usage() if $options{help};
104 pod2usage({verbose=>2}) if $options{man};
106 $DEBUG = $options{debug};
108 my $results_fh = \*STDOUT;
109 my $results_table_fh = undef;
111 if ($options{results}) {
112 $results_fh = IO::File->new($options{results},'w') or
113 die "Unable to open results file $options{results} for writing";
115 if ($options{results_table}) {
116 $results_table_fh = IO::File->new($options{results_table},'w') or
117 die "Unable to open results table file $options{results_table} for writing";
121 use constant {NAME => 0,
132 my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
138 if (@{$options{keywords}}) {
139 for my $keyword_file (@{$options{keywords}}) {
140 my $keyword_fh = IO::File->new($keyword_file,'r') or
141 die "Unable to open $keyword_file for reading: $!";
142 while (<$keyword_fh>) {
146 my ($keyword,$weight) = split /\t/, $_;
147 $weight = 1 if not defined $weight;
148 $keyword_weight{$keyword} = $weight;
155 for my $file_name (@ARGV) {
156 my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
159 my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_;
160 $genes{$gene[NAME]}{name} = $gene[NAME];
161 $genes{$gene[NAME]}{database}{$gene[DBNAME]}++;
162 $genes{$gene[NAME]}{hits}++;
163 $genes{$gene[NAME]}{terms}{$gene[KEYWORD]}++;
164 $genes{$gene[NAME]}{terms}{$gene[KEYWORD].'['.$gene[DBNAME].']'}++;
165 add_unique_parts($genes{$gene[NAME]},'refseq',$gene[REFSEQ]);
166 add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]);
167 add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]);
168 add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION]));
169 add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS]));
177 for my $gene (keys %genes) {
182 for my $term (keys %{$genes{$gene}{terms}}) {
184 my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
185 my $hits = $genes{$gene}{terms}{$term};
186 $keyword =~ s/[-+_]/ /g;
187 $keyword =~ s/\s*$//;
189 $gene_temp{$keyword}{$database} = 1;
190 $gene_temp2{$database}{$keyword} = 1;
191 $databases{$database}{$keyword}{count}++;
192 $db_temp{$database}++;
193 $terms{$keyword}{$database}{count}++;
197 my $hits = $genes{$gene}{terms}{$term};
198 $keyword =~ s/[-+_]/ /g;
199 $keyword =~ s/\s*$//;
201 $terms{$keyword}{total}{count}++;
204 if (keys %gene_temp == 1) {
205 $terms{[keys %gene_temp]->[0]}{total}{unique}++;
206 if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
207 $databases{total}{total}{unique}++
210 if (keys %gene_temp2 == 1) {
211 $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
213 for my $keyword (keys %gene_temp) {
214 if (keys %{$gene_temp{$keyword}} == 1) {
215 $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
217 for my $keyword2 (keys %gene_temp) {
218 $keyword_keyword{$keyword}{$keyword2}++
221 for my $database (keys %db_temp) {
222 $databases{$database}{total}{count}++;
224 $databases{total}{total}{count}++;
228 for my $keyword (keys %keyword_keyword) {
229 # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
230 # we use max here to avoid 0/0 problems.
231 my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
232 my $results_combined = max(1,grep {defined $_}
233 sum(map {$keyword_keyword{$keyword}{$_}}
234 grep {$_ ne $keyword}
235 keys %{$keyword_keyword{$keyword}}
238 $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
241 print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
242 for my $gene (keys %genes) {
243 $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
244 $genes{$gene}{weightedscore}= sum(0,
245 map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
246 grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
248 $genes{$gene}{autoscore}= sum(0,
249 map {defined $auto_weight{$_}?$auto_weight{$_}:1}
250 grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
254 my $sort = 'autoscore';
255 if (scalar grep {$_ != 1 } values %keyword_weight) {
256 $sort='weightedscore';
258 for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
259 print {$results_fh} join (',',
260 map {s/"//g; qq("$_")}
263 if (ref $value eq 'HASH') {
264 join('; ',map {qq($_:$$value{$_})} keys %$value);
266 elsif (ref $value eq 'ARRAY') {
272 } @{$genes{$gene}}{@csv_fields}
277 sub add_unique_parts{
278 my ($hr,$key,@values) = @_;
279 if (not defined $$hr{key}) {
280 $$hr{$key} = [@values];
284 @temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}};
285 $temp_hash{@values} = (1) x scalar @values;
286 $$hr{$key} = [keys %temp_hash];
291 my ($hash, $key, $value) = @_;
292 if (not defined $$hash{$key}) {
293 $$hash{$key} = $value;
295 elsif (length $$hash{$key} < length $value and $value !~ /^NO\s+(LOCATION|DESCRIPTION)+$/) {
296 $$hash{$key} = $value;
301 my ($value,$length,$right) = @_;
303 if (length($value) > $length) {
304 $value =~ m/(.{$length})/;
307 if (length($value) == $length) {
312 ' ' x ($length - length($value)),
319 ' ' x ($length - length($value)),
324 sub results_table_line {
325 my ($keyword,@fields) = @_;
327 space_fill($keyword,23),
328 map {space_fill($_,11,1)} @fields
332 my @database_order = grep {lc($_) ne 'total'} keys %databases;
333 if (defined $results_table_fh) {
335 print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
336 map {ucfirst($_)} @database_order,
340 for $keyword (sort keys %terms) {
343 if (not defined $_) {
348 "$_->{count} ($_->{unique})";
350 } @{$terms{$keyword}}{@database_order,'total'};
351 unshift @fields, $auto_weight{$keyword};
352 unshift @fields, $keyword_weight{$keyword} || 1;
353 print {$results_table_fh} results_table_line($keyword,
359 my @fields = ('','');
362 if (not defined $_) {
367 "$_->{count} ($_->{unique})";
369 } map {$_->{total}} @databases{@database_order,'total'};
370 print {$results_table_fh} results_table_line($keyword,