2 # function2gene is part of the function2gene suite and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2007 by Don Armstrong <don@donarmstrong.com>.
19 function2gene - Call out to each of the search modules to search for
24 function2gene --keywords keywords.txt --results gene_search_results
27 --keywords newline-delimited list of keywords to search for
28 --results directory to store results in
29 --database databases to search
30 --restart-at mode to start searching at
31 --invalidate-state state to invalidate
32 --debug, -d debugging level (Default 0)
33 --help, -h display this help
34 --man, -m display manual
42 A file which contains a newline-delimited list of keywords to search
43 for. Can be specified multiple times. Lines starting with # or ; are
48 Directory in which to store results; also stores the current state of
53 Databases to search, can be specified multiple times. [Defaults to
54 NCBI, GeneCards and Harvester, the only currently supported
59 If you need to restart the process at a particular state (which has
60 already been completed) specify this option. Valid values are get,
63 =item B<--invalidate-state>
65 This is a more powerful version of --restart-at, which can
66 specifically invalidate a certain method,database,keyword combination.
68 For example, you can request that the keyword foo be retrieved again
69 from ncbi using --invalidate-state 'get,ncbi,foo'
73 Debug verbosity. (Default 0)
77 Display brief usage information.
87 # Search all databases for transferrin
88 echo 'transferrin' > keywords.txt
89 function2gene --keywords keywords.txt --results keyword_results
92 function2gene --keywords keywords.txt --results keyword_results \
101 use Storable qw(thaw freeze);
102 use File::Basename qw(basename dirname);
# Option defaults. The list-valued options start out as empty array refs so
# that they can be dereferenced even when not given on the command line.
105 my %options = (databases => [],
111 invalidate_state => [],
# Parse the command line into %options. keywords, databases and
# invalidate_state may be given several times (=s@); debug is a counter (+).
114 GetOptions(\%options,'keywords=s@','databases=s@',
115 'restart_at|restart-at=s','results=s',
116 'invalidate_state|invalidate-state=s@',
117 'debug|d+','help|h|?','man|m');
# --help prints the usage message; --man asks pod2usage for verbose level 2.
119 pod2usage() if $options{help};
120 pod2usage({verbose=>2}) if $options{man};
# Directory containing this script; the helper programs (get_*_results,
# parse_*_results, combine_results, results_to_table) are run from here.
122 my $base_dir = dirname(abs_path($0));
# Validate the command line, collecting every problem in $ERRORS before
# handing the whole text to pod2usage. Each message ends in a newline so that
# several errors stay readable when they are concatenated.
# --restart-at must name one of the three stages.
126 $ERRORS.="restart-at must be one of get, parse or combine\n" if
127 exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/;
# Any --database value outside the supported set (case-insensitive) is an error.
129 $ERRORS.="unknown database(s)\n" if
130 @{$options{databases}} and
131 grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};
# The results directory is mandatory and must be an existing, writable directory.
133 if (not length $options{results}) {
134 $ERRORS.="results directory not specified\n";
136 elsif (not -d $options{results} or not -w $options{results}) {
137 $ERRORS.="results directory $options{results} does not exist or is not writeable\n";
140 pod2usage($ERRORS) if length $ERRORS;
# No --database given: search every supported database.
142 if (not @{$options{databases}}) {
143 $options{databases} = [qw(ncbi genecard harvester)]
146 $DEBUG = $options{debug};
148 # There are three states for our engine
153 # first, check to see if the state in the result directory exists
# Keyword files are resolved to absolute paths before the chdir below, so
# relative paths given on the command line keep pointing at the same files.
157 $options{keywords} = [map {abs_path($_)} @{$options{keywords}}];
# From here on the state file and all result files are addressed relative to
# the results directory.
159 chdir $options{results} or die "Unable to chdir to $options{results}";
# Resume from the state file left in the results directory, if there is one.
161 if (-e "do_it_all_state") {
162 ADVISE("Using existing state information");
163 my $state_fh = IO::File->new("do_it_all_state",'r') or die
164 "Unable to open state file for reading: $!";
# NOTE(review): this reads a single record in scalar context. A frozen
# Storable image is binary, so this presumably relies on $/ having been
# undefined by a statement not shown here -- confirm.
166 my $state_file = <$state_fh>;
167 %state = %{thaw($state_file)} or die "Unable to thaw state file";
# Otherwise start from an empty state; database names are stored lower-cased.
170 ADVISE("Starting new run");
171 %state = (keywords => [],
172 databases => [map {lc($_)} @{$options{databases}}],
# Merge the keywords read from the --keywords files into the state, skipping
# any keyword the state already contains.
182 if (@{$options{keywords}}) {
# Lookup set of the keywords already recorded in the state.
185 @old_keywords{@{$state{keywords}}} = (1) x @{$state{keywords}};
186 for my $keyword_file (@{$options{keywords}}) {
187 my $keyword_fh = IO::File->new($keyword_file,'r') or die
188 "Unable to open $keyword_file for reading: $!";
189 while (<$keyword_fh>) {
# Only keywords not already present in the state are collected.
193 if (not $old_keywords{$_}) {
194 DEBUG("Adding new keyword '$_'");
195 push @new_keywords, $_;
198 DEBUG("Not adding duplicate keyword '$_'");
# Append the newly seen keywords to the state's keyword list.
202 push @{$state{keywords}},@new_keywords;
# --restart-at: forget the completion records of the named stage and of every
# later stage (get -> parse -> combine), so that the planning code queues
# them again.
205 if (exists $options{restart_at} and length $options{restart_at}) {
206 if (lc($options{restart_at}) eq 'get') {
207 delete $state{done_keywords}{get};
208 delete $state{done_keywords}{parse};
209 delete $state{done_keywords}{combine};
211 elsif (lc($options{restart_at}) eq 'parse') {
212 delete $state{done_keywords}{parse};
213 delete $state{done_keywords}{combine};
215 elsif (lc($options{restart_at}) eq 'combine') {
216 delete $state{done_keywords}{combine};
# --invalidate-state: each value is a 'method,database,keyword' triple. The
# matching completion record is deleted; if any level of the triple is not
# present in the state, a message is printed to STDERR instead.
# NOTE(review): invalidate_state defaults to [] in %options, so this exists
# test appears to be always true; the loop is simply empty when unused.
220 if (exists $options{invalidate_state}) {
221 for my $invalidate_state (@{$options{invalidate_state}}) {
# split has no limit here, so anything after a third comma-separated field is
# discarded -- NOTE(review): confirm keywords never contain commas.
222 my ($method,$database,$keyword) = split /,/, $invalidate_state;
223 if (not exists $state{done_keywords}{$method}) {
224 print STDERR "Method '$method' does not exist, and cannot be invalidated\n";
226 elsif (not exists $state{done_keywords}{$method}{$database}) {
227 print STDERR "Database '$database' does not exist for method '$method', and cannot be invalidated\n";
229 elsif (not exists $state{done_keywords}{$method}{$database}{$keyword}) {
230 print STDERR "Keyword '$keyword' does not exist for database '$database' and method '$method', and cannot be invalidated\n";
233 delete $state{done_keywords}{$method}{$database}{$keyword};
238 # Now we need to figure out what has to happen.
239 # For each keyword, we check whether we have fetched results, parsed
240 # them, and combined them. If not, we queue up those actions.
242 my %actions = (combine => 0,
247 if (not @{$state{keywords}}) {
248 ADVISE("There are no keywords specified");
251 for my $keyword (@{$state{keywords}}) {
252 for my $database (@{$state{databases}}) {
# A keyword that still has to be fetched must also be parsed and combined
# again, so any stale records of the later stages are dropped.
253 if (not exists $state{done_keywords}{get}{$database}{$keyword}) {
254 push @{$actions{get}{$database}}, $keyword;
255 delete $state{done_keywords}{parse}{$database}{$keyword} if
256 exists $state{done_keywords}{parse}{$database}{$keyword};
257 delete $state{done_keywords}{combine}{$database}{$keyword} if
258 exists $state{done_keywords}{combine}{$database}{$keyword};
# Likewise, a keyword that has to be parsed must be combined again.
260 if (not exists $state{done_keywords}{parse}{$database}{$keyword}) {
261 push @{$actions{parse}{$database}},$keyword;
262 delete $state{done_keywords}{combine}{$database}{$keyword} if
263 exists $state{done_keywords}{combine}{$database}{$keyword};
# Combining is only flagged, not queued per keyword or database.
265 if (not exists $state{done_keywords}{combine}{$database}{$keyword}) {
266 $actions{combine} = 1;
# Run the 'get' stage and then the 'parse' stage. Within a stage, every
# database with queued keywords gets its own worker thread running
# handle_action, fed from its own Thread::Queue.
# Note that the scalar $state (the stage name) and the hash %state (the
# persistent state) are distinct variables; $state{...} below indexes the hash.
272 for my $state (qw(get parse)) {
274 for my $database (keys %{$actions{$state}}) {
275 next unless @{$actions{$state}{$database}};
276 $databases{$database}{queue} = Thread::Queue->new
277 or die "Unable to create new thread queue";
278 $databases{$database}{thread} = threads->create(\&handle_action,$state,$database,$databases{$database}{queue})
279 or die "Unable to create new thread";
280 $databases{$database}{queue}->enqueue(@{$actions{$state}{$database}});
# A trailing undef marks the end of the queue for the worker's dequeue loop.
281 $databases{$database}{queue}->enqueue(undef);
# Wait for every worker and fold its [actioned, failed] result into the state.
284 for my $database (keys %databases) {
# If join yields nothing, both lists stay undefined (presumably the worker
# died -- confirm against the threads documentation).
285 my ($actioned_keywords,$failed_keywords) = @{$databases{$database}{thread}->join||[]};
286 if (not defined $failed_keywords) {
287 ADVISE("Something bad happened during '$state' of '$database'");
290 elsif (@{$failed_keywords}) {
291 ADVISE("These keywords failed during '$state' of '$database':",@{$failed_keywords});
# Record every handled keyword as done, then remove those that failed.
294 @{$state{done_keywords}{$state}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords};
295 delete @{$state{done_keywords}{$state}{$database}}{@{$failed_keywords}};
299 WARN("Stoping, as there are errors");
# Combine step: runs only when the planning code found a keyword without a
# combine record. The name of every parsed result file is rebuilt from the
# parse records, combine_results writes combined_results.txt, and
# results_to_table then writes combined_results_table.txt.
304 if ($actions{combine}) {
306 # deal with combining results
307 my @parsed_results = map { my $db = $_;
309 "parsed_results_${db}_${_}.txt"
310 } keys %{$state{done_keywords}{parse}{$db}}
311 } keys %{$state{done_keywords}{parse}};
313 write_command_to_file('combined_results.txt',
314 "$base_dir/combine_results",
# Recover (database, keyword) from each parsed-result file name and mark the
# pair as combined. The split limit of 2 keeps underscores inside the keyword.
# The record must live under the 'combine' key: that is the key the planning
# code tests and the key --restart-at deletes. Storing it under any other key
# would make every later run combine again.
317 for my $result (@parsed_results) {
318 $result =~ s/^parsed_results_//;
319 $result =~ s/\.txt$//;
320 my ($db,$keyword) = split /_/, $result, 2;
321 $state{done_keywords}{combine}{$db}{$keyword} = 1;
324 write_command_to_file('combined_results_table.txt',
325 "$base_dir/results_to_table",
326 'combined_results.txt',
328 ADVISE("Finished; results in $options{results}/combined_results.txt");
331 ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
# Worker body run in one thread per database.
# Arguments: the stage name ('get' or 'parse'), the database name, and the
# queue of keywords to process.
# Returns an array ref [ \@actioned_keywords, \@failed_keywords ].
335 my ($state,$database,$queue) = @_;
337 my $actioned_keywords = [];
338 my $failed_keywords = [];
339 DEBUG("Beginning to handle actions for state '$state' database '$database'");
# Stop only on the undef end-of-queue marker. Testing plain truth would also
# stop at a keyword that is false as a string (for example "0") and silently
# drop everything queued after it.
340 while (defined($keyword = $queue->dequeue)) {
341 DEBUG("Handling state '$state' database '$database' keyword '$keyword'");
342 # handle the action, baybee
# get: pipe the keyword to the database's get_*_results helper on its stdin.
343 if ($state eq 'get') {
346 open($command_fh,'|-',
347 "$base_dir/get_${database}_results",
348 ) or die "unable to execute '$base_dir/get_${database}_results'";
349 print {$command_fh} "$keyword\n" or die "unable to print $keyword to 'get_${database}_results'";
350 close($command_fh) or die "Unable to close filehandle";
352 die "get_${database}_results with keyword $keyword failed with error code ".($?>>8);
357 push @{$failed_keywords}, $keyword;
# parse: capture the parse_*_results helper's output in a per-keyword file.
361 elsif ($state eq 'parse') {
363 write_command_to_file("parsed_results_${database}_${keyword}.txt",
364 "$base_dir/parse_${database}_results",
370 WARN("parse_${database}_results failed with $@");
371 push @{$failed_keywords}, $keyword;
376 die "I don't know how to handle state $state";
378 ADVISE("$state results from '$database' for '$keyword'");
379 push @{$actioned_keywords},$keyword;
381 return [$actioned_keywords,$failed_keywords];
# Serialise $state with Storable's freeze and write it to do_it_all_state in
# the current directory; dies if the file cannot be opened, written or closed.
386 my $state_fh = IO::File->new("do_it_all_state",'w') or die
387 "Unable to open state file for writing: $!";
# NOTE(review): the 'or die' tests the return value of print, not of freeze,
# so this message is what a failed write would report -- confirm the wording.
388 print {$state_fh} freeze($state) or die "Unable to freeze state file";
389 close $state_fh or die "Unable to close state file: $!";
# write_command_to_file($file, @command)
# Opens $file for writing, opens a pipe that reads a command's output ('-|'
# mode; presumably running @command, whose use is not visible in these lines),
# and copies everything read from the pipe into $file.
# Dies if the file or the pipe cannot be opened, or if closing the pipe fails.
392 sub write_command_to_file{
393 my ($file,@command) = @_;
394 my $fh = IO::File->new($file,'w') or
395 die "Unable to open $file for writing: $!";
397 open($command_fh,'-|',
399 ) or die "Unable to execute $command[0] $!";
# The readline is in list context, so all output is read before it is printed.
400 print {$fh} <$command_fh>;
# The upper byte of $? is reported as the command's exit code.
402 close $command_fh or die "$command[0] failed with ".($?>>8);
# Bodies of three output helpers whose sub headers are not shown here. Each
# prints every argument followed by a newline.
# First helper: writes to STDOUT.
407 print STDOUT map {($_,qq(\n))} @_;
# Second helper: writes to STDERR.
411 print STDERR map {($_,qq(\n))} @_;
# Third helper: writes to STDERR.
416 print STDERR map {($_,qq(\n))} @_;