#! /usr/bin/perl
-# parse_ncbi_results retreives files of search results from ncbi,
-# and is released under the terms of the GPL version 2, or any later
-# version, at your option. See the file README and COPYING for more
-# information.
+# combine_results, is part of the gene search suite, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
-# Copyright 2004 by Don Armstrong <don@donarmstrong.com>.
-
-# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
+# Copyright 2006,2007 by Don Armstrong <don@donarmstrong.com>.
use warnings;
=head1 NAME
- parse_ncbi_results [options]
+ combine_results -- combines parsed result files; outputs to stdout.
=head1 SYNOPSIS
+ combine_results parsed_results_1.txt [parsedresultfiles ...]
Options:
- --dir, -D directory to stick results into [default .]
- --name, -n file naming scheme [default ${search}_results.$format]
- --terms, -t file of search terms [default -]
+ --keywords newline delineated list of keywords to search for
+ --results file to store results in
+ --results-table optional file to write result summary table to
--debug, -d debugging level [default 0]
--help, -h display this help
--man, -m display manual
=over
+=item B<--keywords>
+
+A file which contains a newline delinated list of keywords to search
+for. Can be specified multiple times. Lines starting with # or ; are
+ignored. An optional weight can be specified after the keyword, which
+is separated from the keyword by a tab. (If not specified, 1 is
+assumed.)
+
+=item B<--results>
+
+A file in which to store the combined results; defaults to STDOUT
+
+=item B<--results-table>
+
+A file in which to store a summary table of results. If not provided,
+no summary table is written.
+
=item B<--debug, -d>
Debug verbosity. (Default 0)
=head1 EXAMPLES
- parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters
+ combine_results foo_1.txt
Will pretty much do what you want
-use vars qw($DEBUG $REVISION);
+use vars qw($DEBUG);
BEGIN{
- ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
$DEBUG = 0 unless defined $DEBUG;
}
use XML::Parser::Expat;
use IO::File;
+use List::Util qw(sum max);
# XXX parse config file
my %options = (debug => 0,
help => 0,
man => 0,
- dir => '.',
- keyword => undef,
+ keywords => [],
+ results => undef,
+ results_table => undef,
);
-GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m');
+GetOptions(\%options,'keywords|k=s@',
+ 'results=s','results_table|results-table=s',
+ 'debug|d+','help|h|?','man|m');
pod2usage() if $options{help};
$DEBUG = $options{debug};
+my $results_fh = \*STDOUT;
+my $results_table_fh = undef;
+
+if ($options{results}) {
+ $results_fh = IO::File->new($options{results},'w') or
+ die "Unable to open results file $options{results} for writing";
+}
+if ($options{results_table}) {
+ $results_table_fh = IO::File->new($options{results_table},'w') or
+ die "Unable to open results table file $options{results_table} for writing";
+}
+
# CSV columns
use constant {NAME => 0,
REFSEQ => 1,
FILENAME => 8,
};
-my @csv_fields = qw(name hits rzscore refseq location alias database terms description function);
+my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function);
my %genes;
+my %keyword_weight;
+
+if (@{$options{keywords}}) {
+ for my $keyword_file (@{$options{keywords}}) {
+ my $keyword_fh = IO::File->new($keyword_file,'r') or
+ die "Unable to open $keyword_file for reading: $!";
+ while (<$keyword_fh>) {
+ next if /^\s*[#;]/;
+ next unless /\w+/;
+ chomp;
+ my ($keyword,$weight) = split /\t/, $_;
+ $weight = 1 if not defined $weight;
+ $keyword_weight{$keyword} = $weight;
+ }
+ }
+}
+
+
+
+my %alias_reverse;
+
for my $file_name (@ARGV) {
my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!";
while (<$file>) {
next if /^"Name"/;
my @gene = map {s/^\"//; s/\"$//; $_;} split /(?<=\")\,(?=\")/, $_;
+ # check to see if there's a different name we should be using
+ if (not exists $genes{$gene[NAME]}) {
+ # if the gene has a valid name, we do at least one test.
+ my $num_tested = $gene[NAME] ne 'NO NAME' ? 1 : 0;
+ my %candidates;
+ if ($gene[NAME] ne 'NO NAME' and exists $alias_reverse{$gene[NAME]} and $alias_reverse{$gene[NAME]} ne '') {
+ $candidates{$alias_reverse{$gene[NAME]}}++;
+ }
+ else {
+ for my $alias (grep {$_ !~ /^NO (ALIASES|NAME)$/} split(/; /, $gene[ALIAS])) {
+ if (exists $alias_reverse{$alias} and $alias_reverse{$alias} ne '') {
+ $candidates{$alias_reverse{$alias}}++;
+ }
+ $num_tested++;
+ }
+ }
+ #print STDERR "Choosing $alias_reverse{$gene[NAME]} for $gene[NAME]\n";
+ for my $candidate (keys %candidates) {
+ if ($candidates{$candidate} > ($num_tested/2)) {
+ print STDERR "Choosing $candidate for '$gene[NAME]', as it matched $candidates{$candidate} of $num_tested tests\n";
+ $gene[NAME] = $candidate;
+ }
+ }
+ }
$genes{$gene[NAME]}{name} = $gene[NAME];
$genes{$gene[NAME]}{database}{$gene[DBNAME]}++;
$genes{$gene[NAME]}{hits}++;
$genes{$gene[NAME]}{terms}{$gene[KEYWORD]}++;
+ $genes{$gene[NAME]}{terms}{$gene[KEYWORD].'['.$gene[DBNAME].']'}++;
add_unique_parts($genes{$gene[NAME]},'refseq',$gene[REFSEQ]);
add_if_better($genes{$gene[NAME]},'description',$gene[DESCRIPTION]);
add_if_better($genes{$gene[NAME]},'location',$gene[LOCATION]);
add_unique_parts($genes{$gene[NAME]},'function',split(/; /, $gene[FUNCTION]));
- add_unique_parts($genes{$gene[NAME]},'alias', split(/; /, $gene[ALIAS]));
+ my @aliases = grep {$_ ne 'NO ALIASES'} split(/; /, $gene[ALIAS]);
+ add_unique_parts($genes{$gene[NAME]},'alias', @aliases);
+ if ($gene[NAME] ne 'NO NAME') {
+ for my $alias (@aliases) {
+ if (not exists $alias_reverse{$alias}) {
+ $alias_reverse{$alias} = $gene[NAME];
+ }
+ elsif ($alias_reverse{$alias} ne $gene[NAME]) {
+ print STDERR "Alias $alias for $gene[NAME] also points at $alias_reverse{$alias} [".join(',',@aliases).".]\n";
+ $alias_reverse{$alias} = '';
+ }
+ }
+ }
}
}
-print join(',',map {qq("$_")} @csv_fields),qq(\n);
+my %databases;
+my %terms;
+my %auto_weight;
+my %keyword_keyword;
for my $gene (keys %genes) {
- $genes{$gene}{rzscore} = scalar keys %{$genes{$gene}{terms}};
- next if $genes{$gene}{rzscore} == 1 and exists $genes{$gene}{terms}{antigen};
- $genes{$gene}{rzscore} -= 1 if exists $genes{$gene}{terms}{antigen};
- print STDOUT join (',',
+ my %term_temp;
+ my %db_temp;
+ my %gene_temp;
+ my %gene_temp2;
+ for my $term (keys %{$genes{$gene}{terms}}) {
+ if ($term =~ /\[/) {
+ my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/;
+ my $hits = $genes{$gene}{terms}{$term};
+ $keyword =~ s/[-+_]/ /g;
+ $keyword =~ s/\s*$//;
+ $keyword =~ s/[*]//;
+ $gene_temp{$keyword}{$database} = 1;
+ $gene_temp2{$database}{$keyword} = 1;
+ $databases{$database}{$keyword}{count}++;
+ $db_temp{$database}++;
+ $terms{$keyword}{$database}{count}++;
+ }
+ else {
+ my $keyword = $term;
+ my $hits = $genes{$gene}{terms}{$term};
+ $keyword =~ s/[-+_]/ /g;
+ $keyword =~ s/\s*$//;
+ $keyword =~ s/[*]//;
+ $terms{$keyword}{total}{count}++;
+ }
+ }
+ if (keys %gene_temp == 1) {
+ $terms{[keys %gene_temp]->[0]}{total}{unique}++;
+ if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) {
+ $databases{total}{total}{unique}++
+ }
+ }
+ if (keys %gene_temp2 == 1) {
+ $databases{[keys %gene_temp2]->[0]}{total}{unique}++;
+ }
+ for my $keyword (keys %gene_temp) {
+ if (keys %{$gene_temp{$keyword}} == 1) {
+ $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++;
+ }
+ for my $keyword2 (keys %gene_temp) {
+ $keyword_keyword{$keyword}{$keyword2}++
+ }
+ }
+ for my $database (keys %db_temp) {
+ $databases{$database}{total}{count}++;
+ }
+ $databases{total}{total}{count}++;
+
+}
+
+for my $keyword (keys %keyword_keyword) {
+ # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table
+ # we use max here to avoid 0/0 problems.
+ my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword});
+ my $results_combined = max(1,grep {defined $_}
+ sum(map {$keyword_keyword{$keyword}{$_}}
+ grep {$_ ne $keyword}
+ keys %{$keyword_keyword{$keyword}}
+ )
+ );
+ $auto_weight{$keyword} = $results_by_this_keyword/$results_combined;
+}
+
+my $avg_weight = sum(values %auto_weight) / scalar keys %auto_weight;
+for my $keyword (keys %auto_weight) {
+ $auto_weight{$keyword} = $auto_weight{$keyword}/$avg_weight;
+}
+
+print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n);
+for my $gene (keys %genes) {
+ $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}};
+ $genes{$gene}{weightedscore}= sum(0,
+ map {defined $keyword_weight{$_}?$keyword_weight{$_}:1}
+ grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+ );
+ $genes{$gene}{autoscore}= sum(0,
+ map {defined $auto_weight{$_}?$auto_weight{$_}:1}
+ grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}
+ );
+}
+
+my $sort = 'autoscore';
+if (scalar grep {$_ != 1 } values %keyword_weight) {
+ $sort='weightedscore';
+}
+for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) {
+ print {$results_fh} join (',',
map {s/"//g; qq("$_")}
map {
my $value = $_;
$$hr{$key} = [@values];
}
else {
+ return unless @values;
my %temp_hash;
@temp_hash{@{$$hr{$key}}} = (1) x scalar @{$$hr{$key}};
$temp_hash{@values} = (1) x scalar @values;
}
}
+sub space_fill{
+ my ($value,$length,$right) = @_;
+ $right ||= 0;
+ if (length($value) > $length) {
+ $value =~ m/(.{$length})/;
+ return $1;
+ }
+ if (length($value) == $length) {
+ return $value
+ }
+ if ($right) {
+ return join('',
+ ' ' x ($length - length($value)),
+ $value,
+ );
+ }
+ else {
+ return join('',
+ $value,
+ ' ' x ($length - length($value)),
+ );
+ }
+}
+
+sub results_table_line {
+ my ($keyword,@fields) = @_;
+ return join( ' & ',
+ space_fill($keyword,23),
+ map {space_fill($_,11,1)} @fields
+ )."\n";
+}
+my @database_order = grep {lc($_) ne 'total'} keys %databases;
+if (defined $results_table_fh) {
+ my $keyword;
+ print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
+ map {ucfirst($_)} @database_order,
+ 'Total',
+ );
+
+ for $keyword (sort keys %terms) {
+ my @fields =
+ map {
+ if (not defined $_) {
+ '$-$';
+ }
+ else {
+ $_->{unique} ||= 0;
+ "$_->{count} ($_->{unique})";
+ }
+ } @{$terms{$keyword}}{@database_order,'total'};
+ unshift @fields, $auto_weight{$keyword};
+ unshift @fields, $keyword_weight{$keyword} || 1;
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
+ }
+ $keyword = 'Total';
+ my @fields = ('','');
+ push @fields,
+ map {
+ if (not defined $_) {
+ '$-$';
+ }
+ else {
+ $_->{unique} ||= 0;
+ "$_->{count} ($_->{unique})";
+ }
+ } map {$_->{total}} @databases{@database_order,'total'};
+ print {$results_table_fh} results_table_line($keyword,
+ @fields
+ );
+}
__END__