From: Don Armstrong Date: Fri, 26 Oct 2007 20:26:45 +0000 (+0000) Subject: * Rename do_it_all; document function2gene more clearly X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=commitdiff_plain;h=3557eb364c40602a2f2d3f4c2a68edd6ee00b632 * Rename do_it_all; document function2gene more clearly git-svn-id: file:///srv/svn/function2gene/trunk@13 a0738b58-4706-0410-8799-fb830574a030 --- diff --git a/bin/do_it_all b/bin/do_it_all deleted file mode 100755 index 9a681ec..0000000 --- a/bin/do_it_all +++ /dev/null @@ -1,378 +0,0 @@ -#! /usr/bin/perl -# do_it_all, is part of the gene search suite and is released -# under the terms of the GPL version 2, or any later version, at your -# option. See the file README and COPYING for more information. -# Copyright 2007 by Don Armstrong . - - -use threads; -use warnings; -use strict; - -use Getopt::Long; -use Pod::Usage; - -use Storable; - -=head1 NAME - - do_it_all - Call out to each of the search modules to search for - each of the terms - -=head1 SYNOPSIS - - do_it_all --keywords keywords.txt --results gene_search_results - - Options: - --keywords newline delineated list of keywords to search for - --results directory to store results in - --database databases to search - --restart-at mode to start searching at - --debug, -d debugging level (Default 0) - --help, -h display this help - --man, -m display manual - -=head1 OPTIONS - -=over - -=item B<--keywords> - -A file which contains a newline delinated list of keywords to search -for. Can be specified multiple times. Lines starting with # or ; are -ignored. - -=item B<--results> - -Directory in which to store results; also stores the current state of -the system - -=item B<--database> - -Databases to search, can be specified multiple times. [Defaults to -NCBI, GeneCards and Harvester, the only currently supported -databases.] 
- -=item B<--restart-at> - -If you need to restart the process at a particular state (which has -already been completed) specify this option. - -=item B<--debug, -d> - -Debug verbosity. (Default 0) - -=item B<--help, -h> - -Display brief useage information. - -=item B<--man, -m> - -Display this manual. - -=back - -=head1 EXAMPLES - - -=cut - - -use vars qw($DEBUG); -use Cwd qw(abs_path); -use IO::File; -use Storable qw(thaw freeze); -use File::Basename qw(basename dirname); -use Thread::Queue; - -my %options = (databases => [], - keywords => [], - debug => 0, - help => 0, - man => 0, - results => '', - ); - -GetOptions(\%options,'keywords=s@','databases=s@', - 'restart_at|restart-at=s','results=s', - 'debug|d+','help|h|?','man|m'); - -pod2usage() if $options{help}; -pod2usage({verbose=>2}) if $options{man}; - -my $base_dir = dirname(abs_path($0)); - -my $ERRORS=''; - -$ERRORS.="restart-at must be one of get, parse or combine\n" if - exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/; - -$ERRORS.="unknown database(s)" if - @{$options{databases}} and - grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}}; - -if (not length $options{results}) { - $ERRORS.="results directory not specified"; -} -elsif (not -d $options{results} or not -w $options{results}) { - $ERRORS.="results directory $options{results} does not exist or is not writeable"; -} - -pod2usage($ERRORS) if length $ERRORS; - -if (not @{$options{databases}}) { - $options{databases} = [qw(ncbi genecard harvester)] -} - -$DEBUG = $options{debug}; - -# There are three states for our engine -# Getting results -# Parsing them -# Combining results - -# first, check to see if the state in the result directory exists - -my %state; - -$options{keywords} = [map {abs_path($_)} @{$options{keywords}}]; - -chdir $options{results} or die "Unable to chdir to $options{results}"; - -if (-e "do_it_all_state") { - ADVISE("Using existing state information"); - my $state_fh = 
IO::File->new("do_it_all_state",'r') or die - "Unable to open state file for reading: $!"; - local $/; - my $state_file = <$state_fh>; - %state = %{thaw($state_file)} or die "Unable to thaw state file"; -} -else { - ADVISE("Starting new run"); - %state = (keywords => [], - databases => [map {lc($_)} @{$options{databases}}], - done_keywords => { - get => {}, - parse => {}, - combine => {}, - }, - ); -} - -my @new_keywords; -if (@{$options{keywords}}) { - # uniqify keywords - my %old_keywords; - @old_keywords{@{$state{keywords}}} = (1) x @{$state{keywords}}; - for my $keyword_file (@{$options{keywords}}) { - my $keyword_fh = IO::File->new($keyword_file,'r') or die - "Unable to open $keyword_file for reading: $!"; - while (<$keyword_fh>) { - next if /^\s*[#;]/; - next unless /\w+/; - chomp; - if (not $old_keywords{$_}) { - DEBUG("Adding new keyword '$_'"); - push @new_keywords, $_; - } - else { - DEBUG("Not adding duplicate keyword '$_'"); - } - } - } - push @{$state{keywords}},@new_keywords; -} - -if (exists $options{restart_at} and length $options{restart_at}) { - if (lc($options{restart_at}) eq 'get') { - delete $state{done_keywords}{get}; - delete $state{done_keywords}{parse}; - delete $state{done_keywords}{combine}; - } - elsif (lc($options{restart_at}) eq 'parse') { - delete $state{done_keywords}{parse}; - delete $state{done_keywords}{combine}; - } - elsif (lc($options{restart_at}) eq 'combine') { - delete $state{done_keywords}{combine}; - } -} - -# now we need to figure out what has to happen -# for each keyword, we check to see if we've got results, parsed -# results, and combined it. If not, we queue up those actions. 
- -my %actions = (combine => 0, - get => {}, - parse => {}, - ); - -if (not @{$state{keywords}}) { - ADVISE("There are no keywords specified"); -} - -for my $keyword (@{$state{keywords}}) { - for my $database (@{$state{databases}}) { - if (not exists $state{done_keywords}{get}{$database}{$keyword}) { - push @{$actions{get}{$database}}, $keyword; - delete $state{done_keywords}{parse}{$database}{$keyword} if - exists $state{done_keywords}{parse}{$database}{$keyword}; - delete $state{done_keywords}{combine}{$database}{$keyword} if - exists $state{done_keywords}{combine}{$database}{$keyword}; - } - if (not exists $state{done_keywords}{parse}{$database}{$keyword}) { - push @{$actions{parse}{$database}},$keyword; - delete $state{done_keywords}{combine}{$database}{$keyword} if - exists $state{done_keywords}{combine}{$database}{$keyword}; - } - if (not exists $state{done_keywords}{combine}{$database}{$keyword}) { - $actions{combine} = 1; - } - } -} - - -for my $state (qw(get parse)) { - my %databases; - for my $database (keys %{$actions{$state}}) { - next unless @{$actions{$state}{$database}}; - $databases{$database}{queue} = Thread::Queue->new - or die "Unable to create new thread queue"; - $databases{$database}{thread} = threads->create(\&handle_action,$state,$database,$databases{$database}{queue}) - or die "Unable to create new thread"; - $databases{$database}{queue}->enqueue(@{$actions{$state}{$database}}); - $databases{$database}{queue}->enqueue(undef); - } - my $ERRORS=0; - for my $database (keys %databases) { - my ($actioned_keywords,$failed_keywords) = @{$databases{$database}{thread}->join||[]}; - if (not defined $failed_keywords) { - ADVISE("Something bad happened during '$state' of '$database'"); - $ERRORS = 1; - } - elsif (@{$failed_keywords}) { - ADVISE("These keywords failed during '$state' of '$database':",@{$failed_keywords}); - $ERRORS=1; - } - @{$state{done_keywords}{$state}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords}; - delete 
@{$state{done_keywords}{$state}{$database}}{@{$failed_keywords}}; - } - save_state(\%state); - if ($ERRORS) { - WARN("Stoping, as there are errors"); - exit 1; - } -} - -if ($actions{combine}) { - save_state(\%state); - # deal with combining results - my @parsed_results = map { my $db = $_; - map { - "parsed_results_${db}_${_}.txt" - } keys %{$state{done_keywords}{parse}{$db}} - } keys %{$state{done_keywords}{parse}}; - - write_command_to_file('combined_results.txt', - "$base_dir/combine_results", - @parsed_results, - ); - for my $result (@parsed_results) { - s/^parsed_results_//; - s/\.txt$//; - my ($db,$keyword) = split /_/, $_, 2; - $state{done_keywords}{combined}{$db}{$keyword} = 1; - } - save_state(\%state); - ADVISE("Finished; results in $options{results}/combined_results"); -} -else { - ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]'); -} - -sub handle_action{ - my ($state,$database,$queue) = @_; - my $keyword; - my $actioned_keywords = []; - my $failed_keywords = []; - DEBUG("Beginning to handle actions for state '$state' database '$database'"); - while ($keyword = $queue->dequeue) { - DEBUG("Handling state '$state' database '$database' keyword '$keyword'"); - # handle the action, baybee - if ($state eq 'get') { - my $command_fh; - eval { - open($command_fh,'|-', - "$base_dir/get_${database}_results", - ) or die "unable to execute '$base_dir/get_${database}_results'"; - print {$command_fh} "$keyword\n" or die "unable to print $keyword to 'get_${database}_results'"; - close($command_fh) or die "Unable to close filehandle"; - if ($? 
!= 0) { - die "get_${database}_results with keyword $keyword failed with error code ".($?>>8); - } - }; - if ($@) { - WARN($@); - push @{$failed_keywords}, $keyword; - next; - } - } - elsif ($state eq 'parse') { - eval { - write_command_to_file("parsed_results_${database}_${keyword}.txt", - "$base_dir/parse_${database}_results", - '--keywords', - $keyword, - ); - }; - if ($@) { - WARN("parse_${database}_results failed with $@"); - push @{$failed_keywords}, $keyword; - next; - } - } - else { - die "I don't know how to handle state $state"; - } - ADVISE("$state results from '$database' for '$keyword'"); - push @{$actioned_keywords},$keyword; - } - return [$actioned_keywords,$failed_keywords]; -} - -sub save_state{ - my ($state) = @_; - my $state_fh = IO::File->new("do_it_all_state",'w') or die - "Unable to open state file for writing: $!"; - print {$state_fh} freeze($state) or die "Unable to freeze state file"; - close $state_fh or die "Unable to close state file: $!"; -} - -sub write_command_to_file{ - my ($file,@command) = @_; - my $fh = IO::File->new($file,'w') or - die "Unable to open $file for writing: $!"; - my $command_fh; - open($command_fh,'-|', - @command, - ) or die "Unable to execute $command[0] $!"; - print {$fh} <$command_fh>; - close $fh; - close $command_fh or die "$command[0] failed with ".($?>>8); -} - - -sub ADVISE{ - print STDOUT map {($_,qq(\n))} @_; -} - -sub DEBUG{ - print STDERR map {($_,qq(\n))} @_; -} - - -sub WARN { - print STDERR map {($_,qq(\n))} @_; -} - -__END__ diff --git a/bin/function2gene b/bin/function2gene new file mode 100755 index 0000000..922deac --- /dev/null +++ b/bin/function2gene @@ -0,0 +1,381 @@ +#! /usr/bin/perl +# function2gene, is part of the function2gene suite and is released +# under the terms of the GPL version 2, or any later version, at your +# option. See the file README and COPYING for more information. +# Copyright 2007 by Don Armstrong . 
use threads;
use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

use Storable;

=head1 NAME

function2gene - Call out to each of the search modules to search for
each of the terms

=head1 SYNOPSIS

  function2gene --keywords keywords.txt --results gene_search_results

  Options:
    --keywords     newline-delimited list of keywords to search for
    --results      directory to store results in
    --databases    databases to search
    --restart-at   mode to start searching at
    --debug, -d    debugging level (Default 0)
    --help, -h     display this help
    --man, -m      display manual

=head1 DESCRIPTION

function2gene works through three stages: getting results, parsing
them, and combining the parsed results.

The state of a run is kept in the file F<do_it_all_state> inside the
results directory. When that file already exists it is loaded and the
run continues from it; in that case the list of databases stored in
the state file is used. Keywords given on the command line which are
not yet part of the state are added to it.

=head1 OPTIONS

=over

=item B<--keywords>

A file which contains a newline-delimited list of keywords to search
for. Can be specified multiple times. Lines starting with # or ; are
ignored, as are lines without any word characters. Whitespace around
a keyword is removed.

=item B<--results>

Directory in which to store results; also stores the current state of
the system. The directory must already exist and be writeable.

=item B<--databases>

Databases to search, can be specified multiple times. [Defaults to
NCBI, GeneCards and Harvester, the only currently supported
databases.] The accepted names are C<ncbi>, C<genecard> and
C<harvester> (case insensitive). The option may be abbreviated to
B<--database>.

=item B<--restart-at>

If you need to restart the process at a particular state (which has
already been completed) specify this option. Must be one of C<get>,
C<parse> or C<combine>; the given stage and every stage after it are
run again.

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES

  # Search all databases for transferrin
  echo 'transferrin' > keywords.txt
  function2gene --keywords keywords.txt --results keyword_results

=cut


# $DEBUG is a package global so that the output helpers and the worker
# threads can all see the requested verbosity.
our $DEBUG = 0;
use Cwd qw(abs_path);
use IO::File;
use Storable qw(thaw freeze);
use File::Basename qw(basename dirname);
use Thread::Queue;

my %options = (databases => [],
               keywords  => [],
               debug     => 0,
               help      => 0,
               man       => 0,
               results   => '',
              );

GetOptions(\%options,'keywords=s@','databases=s@',
           'restart_at|restart-at=s','results=s',
           'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

# the helper programs are looked up next to this script
my $base_dir = dirname(abs_path($0));

# Collect every problem with the options before complaining, one
# message per line so that several problems stay readable.
my $ERRORS='';

$ERRORS.="restart-at must be one of get, parse or combine\n" if
    exists $options{restart_at} and $options{restart_at} !~ /^(?:get|parse|combine)$/;

my @unknown_databases =
    grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};
$ERRORS.="unknown database(s): ".join(', ',@unknown_databases)."\n" if
    @unknown_databases;

if (not length $options{results}) {
    $ERRORS.="results directory not specified\n";
}
elsif (not -d $options{results} or not -w $options{results}) {
    $ERRORS.="results directory $options{results} does not exist or is not writeable\n";
}

pod2usage($ERRORS) if length $ERRORS;

if (not @{$options{databases}}) {
    $options{databases} = [qw(ncbi genecard harvester)]
}

$DEBUG = $options{debug};

# There are three states for our engine
# Getting results
# Parsing them
# Combining results

# first, check to see if the state in the result directory exists

my %state;

# Keyword files are resolved before the chdir below so that relative
# paths keep working. abs_path may return undef for a path which does
# not exist; keep the original name then, so that the later open
# reports the file the user actually gave.
$options{keywords} = [map {
                          my $absolute = abs_path($_);
                          defined $absolute ? $absolute : $_;
                      } @{$options{keywords}}];

chdir $options{results} or die "Unable to chdir to $options{results}: $!";

if (-e "do_it_all_state") {
    ADVISE("Using existing state information");
    my $state_fh = IO::File->new("do_it_all_state",'r') or die
        "Unable to open state file for reading: $!";
    binmode $state_fh;
    my $frozen_state = do { local $/; <$state_fh> };
    close $state_fh;
    # thaw may return undef or die on damaged data; report both the
    # same way instead of tripping over an undefined hash reference
    my $thawed_state = eval { thaw($frozen_state) };
    if (ref($thawed_state) ne 'HASH') {
        die "Unable to thaw state file".(length $@ ? ": $@" : '');
    }
    %state = %{$thawed_state};
    # make sure the parts the rest of the script relies on are there
    $state{keywords}      ||= [];
    $state{databases}     ||= [map {lc($_)} @{$options{databases}}];
    $state{done_keywords} ||= {};
}
else {
    ADVISE("Starting new run");
    %state = (keywords => [],
              databases => [map {lc($_)} @{$options{databases}}],
              done_keywords => {
                                get => {},
                                parse => {},
                                combine => {},
                               },
             );
}

my @new_keywords;
if (@{$options{keywords}}) {
    # uniqify keywords, both against the ones already in the state and
    # against repeats inside the keyword files themselves
    my %seen_keywords = map {($_ => 1)} @{$state{keywords}};
    for my $keyword_file (@{$options{keywords}}) {
        my $keyword_fh = IO::File->new($keyword_file,'r') or die
            "Unable to open $keyword_file for reading: $!";
        while (my $keyword = <$keyword_fh>) {
            next if $keyword =~ /^\s*[#;]/;
            next unless $keyword =~ /\w+/;
            # strip the line ending (including \r) and surrounding
            # blanks; keywords end up in result file names
            $keyword =~ s/^\s+//;
            $keyword =~ s/\s+$//;
            if ($seen_keywords{$keyword}) {
                DEBUG("Not adding duplicate keyword '$keyword'");
                next;
            }
            DEBUG("Adding new keyword '$keyword'");
            $seen_keywords{$keyword} = 1;
            push @new_keywords, $keyword;
        }
        close $keyword_fh;
    }
    push @{$state{keywords}},@new_keywords;
}

if (exists $options{restart_at} and length $options{restart_at}) {
    # restarting at a stage forgets that stage and all later ones
    my %stages_from = (get     => [qw(get parse combine)],
                       parse   => [qw(parse combine)],
                       combine => [qw(combine)],
                      );
    my $stages = $stages_from{lc($options{restart_at})} || [];
    delete @{$state{done_keywords}}{@{$stages}};
}

# now we need to figure out what has to happen
# for each keyword, we check to see if we've got results, parsed
# results, and combined it. If not, we queue up those actions.
# Work which still has to be done: for get and parse a list of
# keywords per database, for combine a simple flag.
my %actions = (combine => 0,
               get => {},
               parse => {},
              );

if (not @{$state{keywords}}) {
    ADVISE("There are no keywords specified");
}

for my $keyword (@{$state{keywords}}) {
    for my $database (@{$state{databases}}) {
        my $done = $state{done_keywords};
        # redoing an earlier stage invalidates the later ones
        if (not exists $done->{get}{$database}{$keyword}) {
            push @{$actions{get}{$database}}, $keyword;
            delete $done->{parse}{$database}{$keyword};
            delete $done->{combine}{$database}{$keyword};
        }
        if (not exists $done->{parse}{$database}{$keyword}) {
            push @{$actions{parse}{$database}},$keyword;
            delete $done->{combine}{$database}{$keyword};
        }
        if (not exists $done->{combine}{$database}{$keyword}) {
            $actions{combine} = 1;
        }
    }
}


# Run the get stage and then the parse stage; within a stage every
# database gets its own worker thread which is fed through a queue.
for my $stage (qw(get parse)) {
    my %workers;
    for my $database (keys %{$actions{$stage}}) {
        next unless @{$actions{$stage}{$database}};
        my $queue = Thread::Queue->new
            or die "Unable to create new thread queue";
        my $thread = threads->create(\&handle_action,$stage,$database,$queue)
            or die "Unable to create new thread";
        $queue->enqueue(@{$actions{$stage}{$database}});
        # an undefined entry tells the worker that it is finished
        $queue->enqueue(undef);
        $workers{$database} = {queue => $queue, thread => $thread};
    }
    my $had_errors = 0;
    for my $database (keys %workers) {
        my $result = $workers{$database}{thread}->join;
        # a worker which died returns nothing usable; note the problem
        # and carry on so that the other workers' progress is saved
        if (ref($result) ne 'ARRAY' or not defined $result->[1]) {
            ADVISE("Something bad happened during '$stage' of '$database'");
            $had_errors = 1;
            next;
        }
        my ($actioned_keywords,$failed_keywords) = @{$result};
        if (@{$failed_keywords}) {
            ADVISE("These keywords failed during '$stage' of '$database':",@{$failed_keywords});
            $had_errors = 1;
        }
        @{$state{done_keywords}{$stage}{$database}}{@{$actioned_keywords}} = (1) x @{$actioned_keywords};
        delete @{$state{done_keywords}{$stage}{$database}}{@{$failed_keywords}};
    }
    save_state(\%state);
    if ($had_errors) {
        WARN("Stoping, as there are errors");
        exit 1;
    }
}

if ($actions{combine}) {
    save_state(\%state);
    # deal with combining results: hand every parsed result file to
    # combine_results (sorted, so that the order is reproducible)
    my $parsed = $state{done_keywords}{parse} || {};
    my @parsed_results;
    for my $db (sort keys %{$parsed}) {
        for my $keyword (sort keys %{$parsed->{$db}}) {
            push @parsed_results, "parsed_results_${db}_${keyword}.txt";
        }
    }

    write_command_to_file('combined_results.txt',
                          "$base_dir/combine_results",
                          @parsed_results,
                         );
    # Record the combination under the same 'combine' key which the
    # planning above checks, so that a later run knows it is done.
    for my $db (keys %{$parsed}) {
        for my $keyword (keys %{$parsed->{$db}}) {
            $state{done_keywords}{combine}{$db}{$keyword} = 1;
        }
    }
    save_state(\%state);
    ADVISE("Finished; results in $options{results}/combined_results.txt");
}
else {
    ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
}

# handle_action($state,$database,$queue)
#
# Worker run in its own thread: takes keywords from $queue until an
# undefined entry is dequeued and performs the stage $state ('get' or
# 'parse') for $database on each of them.
#
# get   - runs $base_dir/get_<database>_results and writes the keyword
#         to its standard input
# parse - runs $base_dir/parse_<database>_results --keywords <keyword>
#         and stores its output in
#         parsed_results_<database>_<keyword>.txt
#
# Returns an array reference [\@actioned_keywords,\@failed_keywords].
# Dies on an unknown stage.
sub handle_action{
    my ($state,$database,$queue) = @_;
    my $actioned_keywords = [];
    my $failed_keywords = [];
    DEBUG("Beginning to handle actions for state '$state' database '$database'");
    # test for definedness, so that a keyword like '0' does not end
    # the loop early
    while (defined(my $keyword = $queue->dequeue)) {
        DEBUG("Handling state '$state' database '$database' keyword '$keyword'");
        # handle the action, baybee
        if ($state eq 'get') {
            my $succeeded = eval {
                open(my $command_fh,'|-',
                     "$base_dir/get_${database}_results",
                    ) or die "unable to execute '$base_dir/get_${database}_results'";
                print {$command_fh} "$keyword\n" or die "unable to print $keyword to 'get_${database}_results'";
                # close on a pipe is false both for a failed close and
                # for a command which exited non-zero; $! is only set
                # in the first case
                if (not close($command_fh)) {
                    die "Unable to close filehandle" if $!;
                    die "get_${database}_results with keyword $keyword failed with error code ".($?>>8);
                }
                1;
            };
            if (not $succeeded) {
                WARN($@);
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        elsif ($state eq 'parse') {
            my $succeeded = eval {
                write_command_to_file("parsed_results_${database}_${keyword}.txt",
                                      "$base_dir/parse_${database}_results",
                                      '--keywords',
                                      $keyword,
                                     );
                1;
            };
            if (not $succeeded) {
                WARN("parse_${database}_results failed with $@");
                push @{$failed_keywords}, $keyword;
                next;
            }
        }
        else {
            die "I don't know how to handle state $state";
        }
        ADVISE("$state results from '$database' for '$keyword'");
        push @{$actioned_keywords},$keyword;
    }
    return [$actioned_keywords,$failed_keywords];
}

# save_state($state)
#
# Freezes the hash referenced by $state into the file do_it_all_state
# in the current directory. The data is written to a temporary file
# first and then renamed, so that an interrupted write does not damage
# an existing state file. Dies if any step fails.
sub save_state{
    my ($state) = @_;
    my $temporary_file = "do_it_all_state.new";
    my $state_fh = IO::File->new($temporary_file,'w') or die
        "Unable to open state file for writing: $!";
    binmode $state_fh;
    print {$state_fh} freeze($state) or die "Unable to freeze state file";
    close $state_fh or die "Unable to close state file: $!";
    rename($temporary_file,"do_it_all_state") or die
        "Unable to replace state file: $!";
    return;
}

# write_command_to_file($file,@command)
#
# Runs @command and writes everything it prints on standard output to
# $file, replacing any previous content. Dies if the file cannot be
# written, or if the command cannot be run or fails.
sub write_command_to_file{
    my ($file,@command) = @_;
    my $fh = IO::File->new($file,'w') or
        die "Unable to open $file for writing: $!";
    open(my $command_fh,'-|',
         @command,
        ) or die "Unable to execute $command[0] $!";
    # copy line by line instead of holding the whole output in memory
    while (my $line = <$command_fh>) {
        print {$fh} $line or die "Unable to write to $file: $!";
    }
    close $fh or die "Unable to close $file: $!";
    close $command_fh or die "$command[0] failed with ".($?>>8);
    return;
}


# ADVISE(@messages): print each message on its own line to STDOUT
sub ADVISE{
    print STDOUT map {($_,qq(\n))} @_;
}

# DEBUG(@messages): print each message on its own line to STDERR, but
# only when a debug level greater than 0 was asked for
sub DEBUG{
    return unless $DEBUG;
    print STDERR map {($_,qq(\n))} @_;
}


# WARN(@messages): print each message on its own line to STDERR
sub WARN {
    print STDERR map {($_,qq(\n))} @_;
}

__END__