2 # function2gene, is part of the function2gene suite and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2007 by Don Armstrong <don@donarmstrong.com>.
19 function2gene - Call out to each of the search modules to search for
24 function2gene --keywords keywords.txt --results gene_search_results
27 --keywords newline delineated list of keywords to search for
28 --results directory to store results in
29 --database databases to search
30 --restart-at mode to start searching at
31 --invalidate-state state to invalidate
32 --debug, -d debugging level (Default 0)
33 --help, -h display this help
34 --man, -m display manual
42 A file which contains a newline delinated list of keywords to search
43 for. Can be specified multiple times. Lines starting with # or ; are
44 ignored. An optional weight can be specified after the keyword, which
45 is separated from the keyword by a tab. (If not specified, 1 is
50 Directory in which to store results; also stores the current state of
55 Databases to search, can be specified multiple times. [Defaults to
56 NCBI, GeneCards and Harvester, the only currently supported
61 If you need to restart the process at a particular state (which has
62 already been completed) specify this option. Valid values are get,
65 =item B<--invalidate-state>
67 This is a more powerful version of --restart-at, which can
68 specifically invalidate a certain method,database,keyword combination.
70 For example, you can request that the keyword foo be retreived again
71 from ncbi using --invalidate-state 'get,ncbi,foo'
75 Debug verbosity. (Default 0)
79 Display brief useage information.
89 # Search all databases for transferrin
90 echo 'transferrin' > keywords.txt
91 function2gene --keywords keywords.txt --results keyword_results
94 function2gene --keywords keywords.txt --results keyword_results \
101 use Cwd qw(abs_path);
103 use Storable qw(thaw freeze);
104 use File::Basename qw(basename dirname);
107 my %options = (databases => [],
113 invalidate_state => [],
116 GetOptions(\%options,'keywords=s@','databases=s@',
117 'restart_at|restart-at=s','results=s',
118 'invalidate_state|invalidate-state=s@',
119 'debug|d+','help|h|?','man|m');
121 pod2usage() if $options{help};
122 pod2usage({verbose=>2}) if $options{man};
124 my $base_dir = dirname(abs_path($0));
128 $ERRORS.="restart-at must be one of get, parse or combine\n" if
129 exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/;
131 $ERRORS.="unknown database(s)" if
132 @{$options{databases}} and
133 grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};
135 if (not length $options{results}) {
136 $ERRORS.="results directory not specified";
138 elsif (not -d $options{results} or not -w $options{results}) {
139 $ERRORS.="results directory $options{results} does not exist or is not writeable";
142 pod2usage($ERRORS) if length $ERRORS;
144 if (not @{$options{databases}}) {
145 $options{databases} = [qw(ncbi genecard harvester)]
148 $DEBUG = $options{debug};
150 # There are three states for our engine
155 # first, check to see if the state in the result directory exists
159 $options{keywords} = [map {abs_path($_)} @{$options{keywords}}];
161 chdir $options{results} or die "Unable to chdir to $options{results}";
163 if (-e "function2gene_state") {
164 ADVISE("Using existing state information");
165 my $state_fh = IO::File->new("function2gene_state",'r') or die
166 "Unable to open state file for reading: $!";
168 my $state_file = <$state_fh>;
169 %state = %{thaw($state_file)} or die "Unable to thaw state file";
172 ADVISE("Starting new run");
173 %state = (keywords => [],
174 databases => [map {lc($_)} @{$options{databases}}],
184 if (@{$options{keywords}}) {
187 @old_keywords{@{$state{keywords}}} = (1) x @{$state{keywords}};
188 for my $keyword_file (@{$options{keywords}}) {
189 my $keyword_fh = IO::File->new($keyword_file,'r') or die
190 "Unable to open $keyword_file for reading: $!";
191 while (<$keyword_fh>) {
196 my ($keyword,$weight) = split /\t/, $_;
197 $weight = 1 if not defined $weight;
198 $state{keyword_weight}{$keyword} = $weight;
199 if (not $old_keywords{$_}) {
200 DEBUG("Adding new keyword '$_'");
201 push @new_keywords, $_;
204 DEBUG("Not adding duplicate keyword '$_'");
208 push @{$state{keywords}},@new_keywords;
211 if (exists $options{restart_at} and length $options{restart_at}) {
212 if (lc($options{restart_at}) eq 'get') {
213 delete $state{done_keywords}{get};
214 delete $state{done_keywords}{parse};
215 delete $state{done_keywords}{combine};
217 elsif (lc($options{restart_at}) eq 'parse') {
218 delete $state{done_keywords}{parse};
219 delete $state{done_keywords}{combine};
221 elsif (lc($options{restart_at}) eq 'combine') {
222 delete $state{done_keywords}{combine};
226 if (exists $options{invalidate_state}) {
227 for my $invalidate_state (@{$options{invalidate_state}}) {
228 my ($method,$database,$keyword) = split /,/, $invalidate_state;
229 if (grep {not defined $_ } ($method,$database,$keyword) ) {
230 print STDERR "The invalidate state option '$invalidate_state' is invalid.\n";
233 if (not exists $state{done_keywords}{$method}) {
234 print STDERR "Method '$method' does not exist, and cannot be invalidated\n";
237 if (not exists $state{done_keywords}{$method}{$database}) {
238 print STDERR "Database '$database' does not exist for method '$method', and cannot be invalidated\n";
241 if (not length $keyword) {
242 delete $state{done_keywords}{$method}{$database};
243 if ($method eq 'get') {
244 delete $state{done_keywords}{parse}{$database};
245 delete $state{done_keywords}{combine}{$database};
247 if ($method eq 'parse') {
248 delete $state{done_keywords}{combine}{$database};
252 if (not exists $state{done_keywords}{$method}{$database}{$keyword}) {
253 print STDERR "Keyword '$keyword' does not exist for database '$database' and method '$method', and cannot be invalidated\n";
256 delete $state{done_keywords}{$method}{$database}{$keyword};
257 if ($method eq 'get') {
258 delete $state{done_keywords}{parse}{$database}{$keyword};
259 delete $state{done_keywords}{combine}{$database}{$keyword};
261 if ($method eq 'parse') {
262 delete $state{done_keywords}{combine}{$database}{$keyword};
267 # now we need to figure out what has to happen
268 # for each keyword, we check to see if we've got results, parsed
269 # results, and combined it. If not, we queue up those actions.
271 my %actions = (combine => 0,
276 if (not @{$state{keywords}}) {
277 ADVISE("There are no keywords specified");
280 for my $keyword (@{$state{keywords}}) {
281 for my $database (@{$state{databases}}) {
282 if (not exists $state{done_keywords}{get}{$database}{$keyword}) {
283 push @{$actions{get}{$database}}, $keyword;
284 delete $state{done_keywords}{parse}{$database}{$keyword} if
285 exists $state{done_keywords}{parse}{$database}{$keyword};
286 delete $state{done_keywords}{combine}{$database}{$keyword} if
287 exists $state{done_keywords}{combine}{$database}{$keyword};
289 if (not exists $state{done_keywords}{parse}{$database}{$keyword}) {
290 push @{$actions{parse}{$database}},$keyword;
291 delete $state{done_keywords}{combine}{$database}{$keyword} if
292 exists $state{done_keywords}{combine}{$database}{$keyword};
294 if (not exists $state{done_keywords}{combine}{$database}{$keyword}) {
295 $actions{combine} = 1;
301 for my $state (qw(get parse)) {
303 for my $database (keys %{$actions{$state}}) {
304 next unless @{$actions{$state}{$database}};
305 $databases{$database}{queue} = Thread::Queue->new
306 or die "Unable to create new thread queue";
307 $databases{$database}{thread} = threads->create(\&handle_action,$state,$database,$databases{$database}{queue})
308 or die "Unable to create new thread";
309 $databases{$database}{queue}->enqueue(@{$actions{$state}{$database}});
310 $databases{$database}{queue}->enqueue(undef);
313 for my $database (keys %databases) {
314 my ($actioned_keywords,$failed_keywords) = @{$databases{$database}{thread}->join||[]};
315 if (not defined $failed_keywords) {
316 ADVISE("Something bad happened during '$state' of '$database'");
319 elsif (@{$failed_keywords}) {
320 ADVISE("These keywords failed during '$state' of '$database':",@{$failed_keywords});
323 @{$state{done_keywords}{$state}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords};
324 delete @{$state{done_keywords}{$state}{$database}}{@{$failed_keywords}};
328 WARN("Stoping, asthere are errors");
333 if ($actions{combine}) {
335 # deal with combining results
336 my @parsed_results = map { my $db = $_;
338 "parsed_results_${db}_${_}.txt"
339 } keys %{$state{done_keywords}{parse}{$db}}
340 } keys %{$state{done_keywords}{parse}};
342 # create temporary file to store keyword weights
343 my $file = IO::File->new('combined_keywords.txt','w') or
344 die "Unable to open combined_keywords.txt for writing: $!";
345 for my $keyword (keys %{$state{keyword_weight}}) {
346 print {$file} "$keyword\t$state{keyword_weight}{$keyword}\n";
348 system("$base_dir/combine_results",
349 '--keywords','combined_keywords.txt',
350 '--results','combined_results.txt',
351 '--results-table','combined_results_table.txt',
354 or die "combine_results failed with ".($?>>8);
355 for my $result (@parsed_results) {
356 $result =~ s/^parsed_results_//;
357 $result =~ s/\.txt$//;
358 my ($db,$keyword) = split /_/, $result, 2;
359 $state{done_keywords}{combined}{$db}{$keyword} = 1;
362 ADVISE("Finished; results in $options{results}/combined_results.txt");
365 ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
369 my ($state,$database,$queue) = @_;
371 my $actioned_keywords = [];
372 my $failed_keywords = [];
373 DEBUG("Beginning to handle actions for state '$state' database '$database'");
374 while ($keyword = $queue->dequeue) {
375 DEBUG("Handling state '$state' database '$database' keyword '$keyword'");
376 # handle the action, baybee
377 if ($state eq 'get') {
380 open($command_fh,'|-',
381 "$base_dir/get_${database}_results",
382 ) or die "unable to execute '$base_dir/get_${database}_results'";
383 print {$command_fh} "$keyword\n" or die "unable to print $keyword to 'get_${database}_results'";
384 close($command_fh) or die "Unable to close filehandle";
386 die "get_${database}_results with keyword $keyword failed with error code ".($?>>8);
391 push @{$failed_keywords}, $keyword;
395 elsif ($state eq 'parse') {
397 write_command_to_file("parsed_results_${database}_${keyword}.txt",
398 "$base_dir/parse_${database}_results",
404 WARN("parse_${database}_results failed with $@");
405 push @{$failed_keywords}, $keyword;
410 die "I don't know how to handle state $state";
412 ADVISE("$state results from '$database' for '$keyword'");
413 push @{$actioned_keywords},$keyword;
415 return [$actioned_keywords,$failed_keywords];
420 my $state_fh = IO::File->new("function2gene_state",'w') or die
421 "Unable to open state file for writing: $!";
422 print {$state_fh} freeze($state) or die "Unable to freeze state file";
423 close $state_fh or die "Unable to close state file: $!";
426 sub write_command_to_file{
427 my ($file,@command) = @_;
428 my $fh = IO::File->new($file,'w') or
429 die "Unable to open $file for writing: $!";
431 open($command_fh,'-|',
433 ) or die "Unable to execute $command[0] $!";
434 print {$fh} <$command_fh>;
436 close $command_fh or die "$command[0] failed with ".($?>>8);
441 print STDOUT map {($_,qq(\n))} @_;
445 print STDERR map {($_,qq(\n))} @_;
450 print STDERR map {($_,qq(\n))} @_;