#! /usr/bin/perl
# function2gene, is part of the function2gene suite and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2007 by Don Armstrong .

use threads;
use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

use Storable;

=head1 NAME

function2gene - Call out to each of the search modules to search for
each of the terms

=head1 SYNOPSIS

 function2gene --keywords keywords.txt --results gene_search_results

 Options:
  --keywords newline delimited list of keywords to search for
  --results directory to store results in
  --database databases to search
  --restart-at mode to start searching at
  --invalidate-state state to invalidate
  --debug, -d debugging level (Default 0)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--keywords>

A file which contains a newline delimited list of keywords to search
for. Can be specified multiple times. Lines starting with # or ; are
ignored. An optional weight can be specified after the keyword, which
is separated from the keyword by a tab. (If not specified, 1 is
assumed.)

=item B<--results>

Directory in which to store results; also stores the current state of
the system

=item B<--database>

Databases to search, can be specified multiple times. [Defaults to
NCBI, GeneCards and Harvester, the only currently supported
databases.]

=item B<--restart-at>

If you need to restart the process at a particular state (which has
already been completed) specify this option. Valid values are get,
parse, or combine.

=item B<--invalidate-state>

This is a more powerful version of --restart-at, which can
specifically invalidate a certain method,database,keyword
combination. For example, you can request that the keyword foo be
retrieved again from ncbi using --invalidate-state 'get,ncbi,foo'

Leaving the keyword empty (for example 'get,ncbi,') invalidates every
keyword of that method and database.

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES

 # Search all databases for transferrin
 echo 'transferrin' > keywords.txt
 function2gene --keywords keywords.txt --results keyword_results

 # reparse the results
 function2gene --keywords keywords.txt --results keyword_results \
     --restart-at parse

=cut


use vars qw($DEBUG);

use Cwd qw(abs_path);
use IO::File;
use Storable qw(thaw freeze);
use File::Basename qw(basename dirname);
use Thread::Queue;

my %options = (databases        => [],
               keywords         => [],
               debug            => 0,
               help             => 0,
               man              => 0,
               results          => '',
               invalidate_state => [],
              );

GetOptions(\%options,'keywords=s@','databases=s@',
           'restart_at|restart-at=s','results=s',
           'invalidate_state|invalidate-state=s@',
           'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

# The helper programs (get_*_results, parse_*_results, combine_results)
# are run from the directory that contains this script.
my $base_dir = dirname(abs_path($0));

# Collect every option problem, one per line, and report them together.
my $ERRORS='';

$ERRORS.="restart-at must be one of get, parse or combine\n" if
    exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/;

$ERRORS.="unknown database(s)\n" if
    @{$options{databases}} and
    grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};

if (not length $options{results}) {
    $ERRORS.="results directory not specified\n";
}
elsif (not -d $options{results} or not -w $options{results}) {
    $ERRORS.="results directory $options{results} does not exist or is not writeable\n";
}

pod2usage($ERRORS) if length $ERRORS;

if (not @{$options{databases}}) {
    $options{databases} = [qw(ncbi genecard harvester)]
}

$DEBUG = $options{debug};

# There are three states for our engine
# Getting results
# Parsing them
# Combining results

# first, check to see if the state in the result directory exists

my %state;

# Keyword files are resolved before the chdir below so that relative
# paths keep working; if abs_path cannot resolve a path, the path is
# kept as given so the later open failure names it.
$options{keywords} = [map {
                          my $path = abs_path($_);
                          defined $path ? $path : $_;
                      } @{$options{keywords}}];

chdir $options{results} or die "Unable to chdir to $options{results}: $!";

if (-e "function2gene_state") {
    ADVISE("Using existing state information");
    my $state_fh = IO::File->new("function2gene_state",'r')
        or die "Unable to open state file for reading: $!";
    binmode $state_fh;
    my $state_file = do { local $/; <$state_fh> };
    # thaw may return undef or throw on a damaged file; check the
    # result before dereferencing it.
    my $thawed = defined $state_file ? eval { thaw($state_file) } : undef;
    die "Unable to thaw state file" unless ref $thawed eq 'HASH';
    %state = %{$thawed};
    $state{keywords}  ||= [];
    $state{databases} ||= [];
}
else {
    ADVISE("Starting new run");
    %state = (keywords      => [],
              databases     => [map {lc($_)} @{$options{databases}}],
              done_keywords => {
                                get     => {},
                                parse   => {},
                                combine => {},
                               },
             );
}

my @new_keywords;
if (@{$options{keywords}}) {
    # uniqify keywords
    my %old_keywords;
    @old_keywords{@{$state{keywords}}} = (1) x @{$state{keywords}};
    for my $keyword_file (@{$options{keywords}}) {
        my $keyword_fh = IO::File->new($keyword_file,'r')
            or die "Unable to open $keyword_file for reading: $!";
        while (my $line = <$keyword_fh>) {
            next if $line =~ /^\s*[#;]/;
            next unless $line =~ /\w+/;
            chomp $line;
            my ($keyword,$weight) = split /\t/, $line;
            next unless defined $keyword and length $keyword;
            $weight = 1 if not defined $weight;
            $state{keyword_weight}{$keyword} = $weight;
            # Compare and store the keyword itself, not the whole
            # line, so an attached weight never becomes part of it.
            if (not $old_keywords{$keyword}) {
                DEBUG("Adding new keyword '$keyword'");
                push @new_keywords, $keyword;
                # also catches repeats later in this same run
                $old_keywords{$keyword} = 1;
            }
            else {
                DEBUG("Not adding duplicate keyword '$keyword'");
            }
        }
    }
    push @{$state{keywords}},@new_keywords;
}

# --restart-at forgets the requested stage and every stage after it.
if (exists $options{restart_at} and length $options{restart_at}) {
    if (lc($options{restart_at}) eq 'get') {
        delete $state{done_keywords}{get};
        delete $state{done_keywords}{parse};
        delete $state{done_keywords}{combine};
    }
    elsif (lc($options{restart_at}) eq 'parse') {
        delete $state{done_keywords}{parse};
        delete $state{done_keywords}{combine};
    }
    elsif (lc($options{restart_at}) eq 'combine') {
        delete $state{done_keywords}{combine};
    }
}

if (exists $options{invalidate_state}) {
    for my $invalidate_state (@{$options{invalidate_state}}) {
        # The limit of 3 preserves an empty trailing keyword field
        # ('get,ncbi,') and any commas inside the keyword.
        my ($method,$database,$keyword) = split /,/, $invalidate_state, 3;
        if (grep {not defined $_ } ($method,$database,$keyword)
           ) {
            print STDERR "The invalidate state option '$invalidate_state' is invalid.\n";
            next;
        }
        if (not exists $state{done_keywords}{$method}) {
            print STDERR "Method '$method' does not exist, and cannot be invalidated\n";
            next;
        }
        if (not exists $state{done_keywords}{$method}{$database}) {
            print STDERR "Database '$database' does not exist for method '$method', and cannot be invalidated\n";
            next;
        }
        if (not length $keyword) {
            # empty keyword: invalidate the whole database for this
            # method and for the stages that depend on it
            delete $state{done_keywords}{$method}{$database};
            if ($method eq 'get') {
                delete $state{done_keywords}{parse}{$database};
                delete $state{done_keywords}{combine}{$database};
            }
            if ($method eq 'parse') {
                delete $state{done_keywords}{combine}{$database};
            }
            next;
        }
        if (not exists $state{done_keywords}{$method}{$database}{$keyword}) {
            print STDERR "Keyword '$keyword' does not exist for database '$database' and method '$method', and cannot be invalidated\n";
            next;
        }
        delete $state{done_keywords}{$method}{$database}{$keyword};
        if ($method eq 'get') {
            delete $state{done_keywords}{parse}{$database}{$keyword};
            delete $state{done_keywords}{combine}{$database}{$keyword};
        }
        if ($method eq 'parse') {
            delete $state{done_keywords}{combine}{$database}{$keyword};
        }
    }
}

# now we need to figure out what has to happen
# for each keyword, we check to see if we've got results, parsed
# results, and combined it. If not, we queue up those actions.

my %actions = (combine => 0,
               get     => {},
               parse   => {},
              );

if (not @{$state{keywords}}) {
    ADVISE("There are no keywords specified");
}

for my $keyword (@{$state{keywords}}) {
    for my $database (@{$state{databases}}) {
        if (not exists $state{done_keywords}{get}{$database}{$keyword}) {
            push @{$actions{get}{$database}}, $keyword;
            delete $state{done_keywords}{parse}{$database}{$keyword} if
                exists $state{done_keywords}{parse}{$database}{$keyword};
            delete $state{done_keywords}{combine}{$database}{$keyword} if
                exists $state{done_keywords}{combine}{$database}{$keyword};
        }
        if (not exists $state{done_keywords}{parse}{$database}{$keyword}) {
            push @{$actions{parse}{$database}},$keyword;
            delete $state{done_keywords}{combine}{$database}{$keyword} if
                exists $state{done_keywords}{combine}{$database}{$keyword};
        }
        if (not exists $state{done_keywords}{combine}{$database}{$keyword}) {
            $actions{combine} = 1;
        }
    }
}

# Run the get stage, then the parse stage, with one worker thread per
# database; each worker is fed its keywords through a queue that is
# terminated by undef.
for my $state (qw(get parse)) {
    my %databases;
    for my $database (keys %{$actions{$state}}) {
        next unless @{$actions{$state}{$database}};
        $databases{$database}{queue} = Thread::Queue->new
            or die "Unable to create new thread queue";
        $databases{$database}{thread} = threads->create(\&handle_action,$state,$database,$databases{$database}{queue})
            or die "Unable to create new thread";
        $databases{$database}{queue}->enqueue(@{$actions{$state}{$database}});
        $databases{$database}{queue}->enqueue(undef);
    }
    my $had_errors = 0;
    for my $database (keys %databases) {
        my ($actioned_keywords,$failed_keywords) =
            @{$databases{$database}{thread}->join||[]};
        if (not defined $failed_keywords) {
            ADVISE("Something bad happened during '$state' of '$database'");
            $had_errors = 1;
        }
        elsif (@{$failed_keywords}) {
            ADVISE("These keywords failed during '$state' of '$database':",@{$failed_keywords});
            $had_errors = 1;
        }
        # A worker that returned nothing must not stop the state from
        # being recorded and saved for the other databases.
        $actioned_keywords = [] if not defined $actioned_keywords;
        $failed_keywords   = [] if not defined $failed_keywords;
        @{$state{done_keywords}{$state}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords};
        delete @{$state{done_keywords}{$state}{$database}}{@{$failed_keywords}};
    }
    save_state(\%state);
    if ($had_errors) {
        WARN("Stopping, as there are errors");
        exit 1;
    }
}

if ($actions{combine}) {
    save_state(\%state);
    # deal with combining results
    # each entry is [database, keyword, parsed result file]
    my @parsed_results;
    for my $db (keys %{$state{done_keywords}{parse}}) {
        for my $keyword (keys %{$state{done_keywords}{parse}{$db}}) {
            push @parsed_results,
                [$db, $keyword, "parsed_results_${db}_${keyword}.txt"];
        }
    }

    # create temporary file to store keyword weights
    my $file = IO::File->new('combined_keywords.txt','w')
        or die "Unable to open combined_keywords.txt for writing: $!";
    for my $keyword (keys %{$state{keyword_weight}}) {
        print {$file} "$keyword\t$state{keyword_weight}{$keyword}\n"
            or die "Unable to write to combined_keywords.txt: $!";
    }
    # The file has to be closed (and so flushed) before
    # combine_results is started, or it may read an incomplete file.
    close $file
        or die "Unable to close combined_keywords.txt: $!";

    system("$base_dir/combine_results",
           '--keywords','combined_keywords.txt',
           '--results','combined_results.txt',
           '--results-table','combined_results_table.txt',
           (map {$_->[2]} @parsed_results),
          ) == 0
              or die "combine_results failed with ".($?>>8);
    for my $result (@parsed_results) {
        my ($db,$keyword) = @{$result};
        # must be the same key ('combine') that is tested when the
        # actions are worked out above
        $state{done_keywords}{combine}{$db}{$keyword} = 1;
    }
    save_state(\%state);
    ADVISE("Finished; results in $options{results}/combined_results.txt");
}
else {
    ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
}

# handle_action($state,$database,$queue)
#
# Thread body. Dequeues keywords from $queue until an undef is
# dequeued and runs the 'get' or 'parse' helper program of $database
# for each of them; dies on any other $state.
#
# Returns an array reference [$actioned_keywords,$failed_keywords],
# both array references of keywords.
sub handle_action{
    my ($state,$database,$queue) = @_;
    my $actioned_keywords = [];
    my $failed_keywords = [];
    DEBUG("Beginning to handle actions for state '$state' database '$database'");
    # test definedness, so that a keyword like '0' does not end the loop
    while (defined(my $keyword = $queue->dequeue)) {
        DEBUG("Handling state '$state' database '$database' keyword '$keyword'");
        # handle the action, baybee
        if ($state eq 'get') {
            eval {
                open(my $command_fh,'|-',
                     "$base_dir/get_${database}_results",
                    ) or die "unable to execute '$base_dir/get_${database}_results'";
                print {$command_fh} "$keyword\n"
                    or die "unable to print $keyword to 'get_${database}_results'";
                if (not close($command_fh)) {
                    # close of a pipe is false for a system error ($!
                    # set) as well as for a non-zero exit ($! is 0)
                    die "Unable to close filehandle: $!" if $!;
                    die "get_${database}_results with keyword $keyword failed with error code ".($?>>8);
                }
            };
            if ($@) {
                WARN($@);
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        elsif ($state eq 'parse') {
            eval {
                write_command_to_file("parsed_results_${database}_${keyword}.txt",
                                      "$base_dir/parse_${database}_results",
                                      '--keywords',
                                      $keyword,
                                     );
            };
            if ($@) {
                WARN("parse_${database}_results failed with $@");
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        else {
            die "I don't know how to handle state $state";
        }
        ADVISE("$state results from '$database' for '$keyword'");
        push @{$actioned_keywords},$keyword;
    }
    return [$actioned_keywords,$failed_keywords];
}

# save_state(\%state)
#
# Freezes the state hash with Storable and writes it to the file
# function2gene_state in the current directory; dies on failure.
sub save_state{
    my ($state) = @_;
    my $state_fh = IO::File->new("function2gene_state",'w')
        or die "Unable to open state file for writing: $!";
    binmode $state_fh;
    print {$state_fh} freeze($state)
        or die "Unable to freeze state file";
    close $state_fh or die "Unable to close state file: $!";
}

# write_command_to_file($file,@command)
#
# Runs @command and writes everything it prints on standard output to
# $file. Dies if the file cannot be written or closed, or if the
# command cannot be started or its pipe does not close cleanly.
sub write_command_to_file{
    my ($file,@command) = @_;
    my $fh = IO::File->new($file,'w')
        or die "Unable to open $file for writing: $!";
    open(my $command_fh,'-|',
         @command,
        ) or die "Unable to execute $command[0] $!";
    while (my $line = <$command_fh>) {
        print {$fh} $line
            or die "Unable to write to $file: $!";
    }
    close $fh
        or die "Unable to close $file: $!";
    close $command_fh
        or die "$command[0] failed with ".($?>>8);
}

# Print each argument on its own line to standard output.
sub ADVISE{
    print STDOUT map {($_,qq(\n))} @_;
}

# Print each argument on its own line to standard error, but only
# when a debugging level was requested with --debug.
sub DEBUG{
    return unless $DEBUG;
    print STDERR map {($_,qq(\n))} @_;
}

# Print each argument on its own line to standard error.
sub WARN {
    print STDERR map {($_,qq(\n))} @_;
}

__END__