combine_results parsed_results_1.txt [parsedresultfiles ...]
Options:
+ --keywords newline delineated list of keywords to search for
+ --results file to store results in
+ --results-table optional file to write result summary table to
--debug, -d debugging level [default 0]
--help, -h display this help
--man, -m display manual
=over
+=item B<--keywords>
+
+A file which contains a newline delinated list of keywords to search
+for. Can be specified multiple times. Lines starting with # or ; are
+ignored. An optional weight can be specified after the keyword, which
+is separated from the keyword by a tab. (If not specified, 1 is
+assumed.)
+
+=item B<--results>
+
+A file in which to store the combined results; defaults to STDOUT
+
+=item B<--results-table>
+
+A file in which to store a summary table of results. If not provided,
+no summary table is written.
+
=item B<--debug, -d>
Debug verbosity. (Default 0)
use XML::Parser::Expat;
use IO::File;
+use List::Util qw(sum max);
# XXX parse config file
help => 0,
man => 0,
keywords => [],
+ results => undef,
+ results_table => undef,
);
-GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m');
+GetOptions(\%options,'keywords|k=s@',
+ 'results=s','results_table|results-table=s',
+ 'debug|d+','help|h|?','man|m');
pod2usage() if $options{help};
$DEBUG = $options{debug};
+my $results_fh = \*STDOUT;
+my $results_table_fh = undef;
+
+if ($options{results}) {
+ $results_fh = IO::File->new($options{results},'w') or
+ die "Unable to open results file $options{results} for writing";
+}
+if ($options{results_table}) {
+ $results_table_fh = IO::File->new($options{results_table},'w') or
+ die "Unable to open results table file $options{results_table} for writing";
+}
+
# CSV columns
use constant {NAME => 0,
REFSEQ => 1,
FILENAME => 8,
};
-my @csv_fields = qw(name hits rzscore refseq location alias database terms description function);
+my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
my %genes;
+my %keyword_weight;
+
+if (@{$options{keywords}}) {
+ for my $keyword_file (@{$options{keywords}}) {
+ my $keyword_fh = IO::File->new($keyword_file,'r') or
+ die "Unable to open $keyword_file for reading: $!";
+ while (<$keyword_fh>) {
+ next if /^\s*[#;]/;
+ next unless /\w+/;
+ chomp;
+ my ($keyword,$weight) = split /\t/, $_;
+ $weight = 1 if not defined $weight;
+ $keyword_weight{$keyword} = $weight;
+ }
+ }
+}
+
+
+
for my $file_name (@ARGV) {
my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
while (<$file>) {
}
}
-print join(',',map {qq("$_")} @csv_fields),qq(\n);
+my %databases;
+my %terms;
+my %auto_weight;
+my %keyword_keyword;
+for my $gene (keys %genes) {
+ my %term_temp;
+ my %db_temp;
+ my %gene_temp;
+ my %gene_temp2;
+ for my $term (keys %{$genes{$gene}{terms}}) {
+ if ($term =~ /\[/) {
+ my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
+ my $hits = $genes{$gene}{terms}{$term};
+ $keyword =~ s/[-+_]/ /g;
+ $keyword =~ s/\s*$//;
+ $keyword =~ s/[*]//;
+ $gene_temp{$keyword}{$database} = 1;
+ $gene_temp2{$database}{$keyword} = 1;
+ $databases{$database}{$keyword}{count}++;
+ $db_temp{$database}++;
+ $terms{$keyword}{$database}{count}++;
+ }
+ else {
+ my $keyword = $term;
+ my $hits = $genes{$gene}{terms}{$term};
+ $keyword =~ s/[-+_]/ /g;
+ $keyword =~ s/\s*$//;
+ $keyword =~ s/[*]//;
+ $terms{$keyword}{total}{count}++;
+ }
+ }
+ if (keys %gene_temp == 1) {
+ $terms{[keys %gene_temp]->[0]}{total}{unique}++;
+ if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
+ $databases{total}{total}{unique}++
+ }
+ }
+ if (keys %gene_temp2 == 1) {
+ $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
+ }
+ for my $keyword (keys %gene_temp) {
+ if (keys %{$gene_temp{$keyword}} == 1) {
+ $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
+ }
+ for my $keyword2 (keys %gene_temp) {
+ $keyword_keyword{$keyword}{$keyword2}++
+ }
+ }
+ for my $database (keys %db_temp) {
+ $databases{$database}{total}{count}++;
+ }
+ $databases{total}{total}{count}++;
+
+}
+
+for my $keyword (keys %keyword_keyword) {
+ # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
+ # we use max here to avoid 0/0 problems.
+ my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
+ my $results_combined = max(1,grep {defined $_}
+ sum(map {$keyword_keyword{$keyword}{$_}}
+ grep {$_ ne $keyword}
+ keys %{$keyword_keyword{$keyword}}
+ )
+ );
+ $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
+}
+
+print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
for my $gene (keys %genes) {
$genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
- print STDOUT join (',',
+ $genes{$gene}{weightedscore}= sum(0,
+ map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
+ grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+ );
+ $genes{$gene}{autoscore}= sum(0,
+ map {defined $auto_weight{$_}?$auto_weight{$_}:1}
+ grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+ );
+}
+
+my $sort = 'autoscore';
+if (scalar grep {$_ != 1 } values %keyword_weight) {
+ $sort='weightedscore';
+}
+for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
+ print {$results_fh} join (',',
map {s/"//g; qq("$_")}
map {
my $value = $_;
}
}
+sub space_fill{
+ my ($value,$length,$right) = @_;
+ $right ||= 0;
+ if (length($value) > $length) {
+ $value =~ m/(.{$length})/;
+ return $1;
+ }
+ if (length($value) == $length) {
+ return $value
+ }
+ if ($right) {
+ return join('',
+ ' ' x ($length - length($value)),
+ $value,
+ );
+ }
+ else {
+ return join('',
+ $value,
+ ' ' x ($length - length($value)),
+ );
+ }
+}
+sub results_table_line {
+ my ($keyword,@fields) = @_;
+ return join( ' & ',
+ space_fill($keyword,23),
+ map {space_fill($_,11,1)} @fields
+ )."\n";
+}
+
+my @database_order = grep {lc($_) ne 'total'} keys %databases;
+if (defined $results_table_fh) {
+ my $keyword;
+ print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
+ map {ucfirst($_)} @database_order,
+ 'Total',
+ );
+
+ for $keyword (sort keys %terms) {
+ my @fields =
+ map {
+ if (not defined $_) {
+ '$-$';
+ }
+ else {
+ $_->{unique} ||= 0;
+ "$_->{count} ($_->{unique})";
+ }
+ } @{$terms{$keyword}}{@database_order,'total'};
+ unshift @fields, $auto_weight{$keyword};
+ unshift @fields, $keyword_weight{$keyword} || 1;
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
+ }
+ $keyword = 'Total';
+ my @fields = ('','');
+ push @fields,
+ map {
+ if (not defined $_) {
+ '$-$';
+ }
+ else {
+ $_->{unique} ||= 0;
+ "$_->{count} ($_->{unique})";
+ }
+ } map {$_->{total}} @databases{@database_order,'total'};
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
+}
__END__