+#! /usr/bin/perl
+# function2gene, is part of the function2gene suite and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2007 by Don Armstrong <don@donarmstrong.com>.
+
+
+use threads;
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+use Storable;
+
+=head1 NAME
+
+ function2gene - Call out to each of the search modules to search for
+ each of the terms
+
+=head1 SYNOPSIS
+
+ function2gene --keywords keywords.txt --results gene_search_results
+
+ Options:
+ --keywords newline delineated list of keywords to search for
+ --results directory to store results in
+ --database databases to search
+ --restart-at mode to start searching at
+ --debug, -d debugging level (Default 0)
+ --help, -h display this help
+ --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--keywords>
+
A file which contains a newline-delineated list of keywords to search
for. Can be specified multiple times. Lines starting with # or ; are
ignored.
+
+=item B<--results>
+
+Directory in which to store results; also stores the current state of
+the system
+
+=item B<--database>
+
Databases to search; can be specified multiple times. [Defaults to
ncbi, genecard and harvester, the only currently supported
databases.]
+
+=item B<--restart-at>
+
+If you need to restart the process at a particular state (which has
+already been completed) specify this option.
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ # Search all databases for transferrin
+ echo 'transferrin' > keywords.txt
+ function2gene --keywords keywords.txt --results keyword_results
+
+=cut
+
+
+use vars qw($DEBUG);
+use Cwd qw(abs_path);
+use IO::File;
+use Storable qw(thaw freeze);
+use File::Basename qw(basename dirname);
+use Thread::Queue;
+
# Defaults for every command-line option; GetOptions fills this in place.
my %options = (
    databases => [],
    keywords  => [],
    debug     => 0,
    help      => 0,
    man       => 0,
    results   => '',
);

# Parse the command line directly into %options.
GetOptions(\%options,
           'keywords=s@',
           'databases=s@',
           'restart_at|restart-at=s',
           'results=s',
           'debug|d+',
           'help|h|?',
           'man|m',
          );

# Short help or the full manual, then exit.
pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};
+
# Directory holding the get_*/parse_*/combine helper scripts.
my $base_dir = dirname(abs_path($0));

my $ERRORS='';

$ERRORS.="restart-at must be one of get, parse or combine\n" if
    exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/;

# Fix: terminate every message with a newline; previously only the first
# message had one, so multiple errors ran together in the pod2usage output.
$ERRORS.="unknown database(s)\n" if
    @{$options{databases}} and
    grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};

if (not length $options{results}) {
    $ERRORS.="results directory not specified\n";
}
elsif (not -d $options{results} or not -w $options{results}) {
    $ERRORS.="results directory $options{results} does not exist or is not writeable\n";
}

pod2usage($ERRORS) if length $ERRORS;

# No databases requested: search all supported ones.
if (not @{$options{databases}}) {
    $options{databases} = [qw(ncbi genecard harvester)]
}

$DEBUG = $options{debug};
+
# There are three states for our engine
# Getting results
# Parsing them
# Combining results

# first, check to see if the state in the result directory exists

# %state persists between runs (Storable file in the results directory):
#   keywords      => arrayref of every keyword ever requested
#   databases     => arrayref of lowercased database names
#   done_keywords => {get|parse|combine}{$database}{$keyword} = 1 when finished
my %state;

# Resolve keyword files to absolute paths now, before we chdir below.
$options{keywords} = [map {abs_path($_)} @{$options{keywords}}];

chdir $options{results} or die "Unable to chdir to $options{results}";

if (-e "do_it_all_state") {
    ADVISE("Using existing state information");
    my $state_fh = IO::File->new("do_it_all_state",'r') or die
        "Unable to open state file for reading: $!";
    # Slurp mode; $/ is localized so the global record separator is untouched.
    local $/;
    my $state_file = <$state_fh>;
    # NOTE(review): if thaw() returns undef, %{undef} dies under strict refs
    # before the "or die" fires, so that message is likely unreachable.
    %state = %{thaw($state_file)} or die "Unable to thaw state file";
}
else {
    ADVISE("Starting new run");
    %state = (keywords => [],
              databases => [map {lc($_)} @{$options{databases}}],
              done_keywords => {
                                get => {},
                                parse => {},
                                combine => {},
                               },
              );
}
+
# Read keyword files, skipping comments (#/;) and blank lines, and append
# only keywords not already known to the saved state.
my @new_keywords;
if (@{$options{keywords}}) {
    # uniqify keywords
    my %old_keywords;
    @old_keywords{@{$state{keywords}}} = (1) x @{$state{keywords}};
    for my $keyword_file (@{$options{keywords}}) {
        my $keyword_fh = IO::File->new($keyword_file,'r') or die
            "Unable to open $keyword_file for reading: $!";
        while (<$keyword_fh>) {
            next if /^\s*[#;]/;
            next unless /\w+/;
            chomp;
            if (not $old_keywords{$_}) {
                DEBUG("Adding new keyword '$_'");
                push @new_keywords, $_;
                # Fix: record the keyword as seen so a duplicate later in
                # this file (or in another --keywords file) is not pushed
                # a second time; previously only keywords already in the
                # saved state were deduplicated.
                $old_keywords{$_} = 1;
            }
            else {
                DEBUG("Not adding duplicate keyword '$_'");
            }
        }
    }
    push @{$state{keywords}},@new_keywords;
}
+
# Restarting at a stage throws away that stage's completion records and
# those of every later stage, forcing them all to rerun.
if (exists $options{restart_at} and length $options{restart_at}) {
    my %stages_cleared_by = (
        get     => [qw(get parse combine)],
        parse   => [qw(parse combine)],
        combine => [qw(combine)],
    );
    my $stages = $stages_cleared_by{lc $options{restart_at}} || [];
    delete $state{done_keywords}{$_} for @{$stages};
}
+
# now we need to figure out what has to happen
# for each keyword, we check to see if we've got results, parsed
# results, and combined it. If not, we queue up those actions.

# %actions: get/parse map $database => [keywords still to do]; combine is a
# single flag because combining runs once over all parsed results.
my %actions = (combine => 0,
               get => {},
               parse => {},
              );

if (not @{$state{keywords}}) {
    ADVISE("There are no keywords specified");
}

for my $keyword (@{$state{keywords}}) {
    for my $database (@{$state{databases}}) {
        # A keyword that must be (re)fetched invalidates its later stages,
        # so parse and combine are forced to rerun for it as well.
        if (not exists $state{done_keywords}{get}{$database}{$keyword}) {
            push @{$actions{get}{$database}}, $keyword;
            delete $state{done_keywords}{parse}{$database}{$keyword} if
                exists $state{done_keywords}{parse}{$database}{$keyword};
            delete $state{done_keywords}{combine}{$database}{$keyword} if
                exists $state{done_keywords}{combine}{$database}{$keyword};
        }
        if (not exists $state{done_keywords}{parse}{$database}{$keyword}) {
            push @{$actions{parse}{$database}},$keyword;
            delete $state{done_keywords}{combine}{$database}{$keyword} if
                exists $state{done_keywords}{combine}{$database}{$keyword};
        }
        # Any keyword not yet combined triggers a full combine pass.
        if (not exists $state{done_keywords}{combine}{$database}{$keyword}) {
            $actions{combine} = 1;
        }
    }
}
+
+
# Run the "get" and "parse" stages, one worker thread per database. Each
# thread drains a queue of keywords and returns which keywords succeeded
# and which failed; state is saved after every stage.
for my $state (qw(get parse)) {
    my %databases;
    for my $database (keys %{$actions{$state}}) {
        next unless @{$actions{$state}{$database}};
        $databases{$database}{queue} = Thread::Queue->new
            or die "Unable to create new thread queue";
        $databases{$database}{thread} = threads->create(\&handle_action,$state,$database,$databases{$database}{queue})
            or die "Unable to create new thread";
        $databases{$database}{queue}->enqueue(@{$actions{$state}{$database}});
        # undef is the end-of-work sentinel handle_action's dequeue loop stops on.
        $databases{$database}{queue}->enqueue(undef);
    }
    my $ERRORS=0;
    for my $database (keys %databases) {
        # join() yields [\@actioned,\@failed], or undef if the thread died.
        my ($actioned_keywords,$failed_keywords) = @{$databases{$database}{thread}->join||[]};
        if (not defined $failed_keywords) {
            ADVISE("Something bad happened during '$state' of '$database'");
            $ERRORS = 1;
            # Fix: fall back to empty lists; previously the undefs were
            # dereferenced in the slice updates below, which dies under
            # strict refs before the state could be saved.
            $actioned_keywords = [];
            $failed_keywords = [];
        }
        elsif (@{$failed_keywords}) {
            ADVISE("These keywords failed during '$state' of '$database':",@{$failed_keywords});
            $ERRORS=1;
        }
        # Record successes; $state the loop string indexes %state the hash.
        @{$state{done_keywords}{$state}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords};
        delete @{$state{done_keywords}{$state}{$database}}{@{$failed_keywords}};
    }
    save_state(\%state);
    if ($ERRORS) {
        WARN("Stoping, as there are errors");
        exit 1;
    }
}
+
if ($actions{combine}) {
    save_state(\%state);
    # deal with combining results: every parsed result file is fed to a
    # single combine_results invocation.
    my @parsed_results = map { my $db = $_;
                               map {
                                   "parsed_results_${db}_${_}.txt"
                               } keys %{$state{done_keywords}{parse}{$db}}
                           } keys %{$state{done_keywords}{parse}};

    write_command_to_file('combined_results.txt',
                          "$base_dir/combine_results",
                          @parsed_results,
                         );
    for my $result (@parsed_results) {
        # Fix: the substitutions previously operated on $_, not the loop
        # variable $result, so no filename was ever parsed; work on a copy
        # so @parsed_results itself is left untouched.
        my $name = $result;
        $name =~ s/^parsed_results_//;
        $name =~ s/\.txt$//;
        my ($db,$keyword) = split /_/, $name, 2;
        # Fix: the stage key is 'combine' everywhere else in this program;
        # records filed under 'combined' were invisible to later runs.
        $state{done_keywords}{combine}{$db}{$keyword} = 1;
    }
    save_state(\%state);
    # Fix: name the file actually written above (combined_results.txt).
    ADVISE("Finished; results in $options{results}/combined_results.txt");
}
else {
    ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
}
+
# Thread worker for one (stage, database) pair. Dequeues keywords until it
# sees the undef sentinel, runs get_${database}_results (feeding the keyword
# on stdin) or parse_${database}_results (capturing output to a file), and
# returns [\@actioned_keywords, \@failed_keywords].
# NOTE(review): the "while (... = $queue->dequeue)" loop would also stop on
# a keyword of "0" or "" — confirm such keywords cannot occur.
sub handle_action{
    my ($state,$database,$queue) = @_;
    my $keyword;
    my $actioned_keywords = [];
    my $failed_keywords = [];
    DEBUG("Beginning to handle actions for state '$state' database '$database'");
    while ($keyword = $queue->dequeue) {
        DEBUG("Handling state '$state' database '$database' keyword '$keyword'");
        # handle the action, baybee
        if ($state eq 'get') {
            # Pipe the keyword into the fetch helper; any failure (exec,
            # write, close, or nonzero exit) marks the keyword as failed.
            my $command_fh;
            eval {
                open($command_fh,'|-',
                     "$base_dir/get_${database}_results",
                    ) or die "unable to execute '$base_dir/get_${database}_results'";
                print {$command_fh} "$keyword\n" or die "unable to print $keyword to 'get_${database}_results'";
                close($command_fh) or die "Unable to close filehandle";
                # $? holds the child's wait status after close on a pipe.
                if ($? != 0) {
                    die "get_${database}_results with keyword $keyword failed with error code ".($?>>8);
                }
            };
            if ($@) {
                WARN($@);
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        elsif ($state eq 'parse') {
            # Capture the parser's stdout into the per-keyword results file.
            eval {
                write_command_to_file("parsed_results_${database}_${keyword}.txt",
                                      "$base_dir/parse_${database}_results",
                                      '--keywords',
                                      $keyword,
                                     );
            };
            if ($@) {
                WARN("parse_${database}_results failed with $@");
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        else {
            die "I don't know how to handle state $state";
        }
        ADVISE("$state results from '$database' for '$keyword'");
        push @{$actioned_keywords},$keyword;
    }
    return [$actioned_keywords,$failed_keywords];
}
+
# Serialize the full state hashref with Storable into "do_it_all_state"
# in the current directory (we chdir to the results dir at startup), so
# an interrupted run can resume where it left off.
sub save_state{
    my ($state) = @_;
    my $fh = IO::File->new('do_it_all_state','w')
        or die "Unable to open state file for writing: $!";
    print {$fh} freeze($state)
        or die "Unable to freeze state file";
    close $fh
        or die "Unable to close state file: $!";
}
+
# Run @command (list-form pipe open, so no shell is involved) and write
# everything it prints on stdout into $file. Dies if the command cannot
# be started or exits nonzero.
sub write_command_to_file{
    my ($file,@command) = @_;
    my $out_fh = IO::File->new($file,'w')
        or die "Unable to open $file for writing: $!";
    open(my $pipe_fh, '-|', @command)
        or die "Unable to execute $command[0] $!";
    # Slurp the child's entire output and copy it to the target file.
    print {$out_fh} <$pipe_fh>;
    close $out_fh;
    close $pipe_fh
        or die "$command[0] failed with ".($?>>8);
}
+
+
# User-facing informational output: one message per line on STDOUT.
sub ADVISE{
    print STDOUT join('', map { "$_\n" } @_);
}
+
# Debug output: one message per line on STDERR.
# NOTE(review): prints unconditionally — it does not consult the $DEBUG
# level set from --debug; confirm whether gating was intended.
sub DEBUG{
    print STDERR join('', map { "$_\n" } @_);
}
+
+
# Warning output: one message per line on STDERR.
sub WARN {
    print STDERR join('', map { "$_\n" } @_);
}
+
+__END__