#! /usr/bin/perl
# do_it_all, is part of the gene search suite and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2007 by Don Armstrong .
# $Id: perl_script 495 2006-08-10 08:02:01Z don $

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;
use IO::File;
# thaw is not exported by Storable by default, so request it explicitly.
use Storable qw(thaw);

=head1 NAME

do_it_all - Call out to each of the search modules to search for each of the terms

=head1 SYNOPSIS

 do_it_all --keywords keywords.txt --results gene_search_results

 Options:
  --keywords newline delimited list of keywords to search for
  --results directory to store results in
  --database databases to search
  --restart-at mode to start searching at
  --debug, -d debugging level (Default 0)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--keywords>

A file which contains a newline delimited list of keywords to search
for. Can be specified multiple times. Lines starting with # or ; are
ignored, as are blank lines.

=item B<--results>

Directory in which to store results; also stores the current state of
the system. May also be given as B<--directory>.

=item B<--database>

Databases to search, can be specified multiple times. May also be
given as B<--databases>. [Defaults to NCBI, GeneCards and Harvester,
the only currently supported databases.]

=item B<--restart-at>

If you need to restart the process at a particular state (which has
already been completed) specify this option. Must be one of get, parse
or combine.

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES

=cut

# Debug verbosity; consulted by DEBUG() below.
our $DEBUG = 0;

my %options = (databases => [],
               keywords  => [],
               debug     => 0,
               help      => 0,
               man       => 0,
               directory => '',
              );

# The first name of each spec is the key used in %options, so --results
# is stored in $options{directory}.
GetOptions(\%options,
           'keywords=s@',
           'databases|database=s@',
           'directory|results=s',
           'restart_at|restart-at=s',
           'debug|d+', 'help|h|?', 'man|m',
          ) or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose => 2}) if $options{man};

# Collect every problem with the command line before bailing out, so the
# user sees all of them at once.
my @errors;
if (exists $options{restart_at}
    and $options{restart_at} !~ /\A(?:get|parse|combine)\z/) {
    push @errors, "restart-at must be one of get, parse or combine";
}
my @unknown_databases =
    grep { $_ !~ /\A(?:ncbi|genecards|harvester)\z/i } @{$options{databases}};
if (@unknown_databases) {
    push @errors, "unknown database(s): @unknown_databases";
}
if (not length $options{directory}) {
    push @errors, "directory not specified";
}
elsif (not -d $options{directory} or not -w $options{directory}) {
    push @errors,
        "directory $options{directory} does not exist or is not writeable";
}
pod2usage(join("\n", @errors)) if @errors;

if (not @{$options{databases}}) {
    $options{databases} = [qw(ncbi genecards harvester)];
}

$DEBUG = $options{debug};

# There are three states for our engine
#   Getting results
#   Parsing them
#   Combining results

# first, check to see if the state in the result directory exists
my $state_path = "$options{directory}/do_it_all_state";
my %state;
if (-e $state_path) {
    ADVISE("Using existing state information");
    my $state_fh = IO::File->new($state_path, 'r')
        or die "Unable to open state file for reading: $!";
    binmode $state_fh;
    # Slurp the whole file; the change to $/ is confined to this do block.
    my $frozen = do { local $/; <$state_fh> };
    defined $frozen or die "Unable to read state file: $!";
    length $frozen  or die "Unable to read state file: file is empty";
    $state_fh->close;
    # thaw may either return undef or die on bad input; handle both.
    my $thawed = eval { thaw($frozen) };
    if (ref($thawed) ne 'HASH') {
        die "Unable to thaw state file" . ($@ ? ": $@" : "\n");
    }
    %state = %{$thawed};
}
else {
    ADVISE("Starting new run");
}

# Fill in whatever the (new or loaded) state is missing so that the code
# below can dereference these entries safely under strict.
$state{keywords}  ||= [];
$state{databases} ||= [map { lc($_) } @{$options{databases}}];
$state{$_} ||= {} for qw(gotten_keywords parsed_keywords combined_keywords);

my @new_keywords;
if (@{$options{keywords}}) {
    # uniqify keywords, both against the saved state and within the new input
    my %seen_keyword = map { ($_ => 1) } @{$state{keywords}};
    for my $keyword_file (@{$options{keywords}}) {
        my $keyword_fh = IO::File->new($keyword_file, 'r')
            or die "Unable to open $keyword_file for reading: $!";
        # Read line by line (default $/); one keyword per line.
        while (my $keyword = <$keyword_fh>) {
            next if $keyword =~ /^\s*[#;]/;
            # Strips the line ending (including CR) and stray padding.
            $keyword =~ s/^\s+//;
            $keyword =~ s/\s+$//;
            next if not length $keyword;
            if ($seen_keyword{$keyword}++) {
                DEBUG("Not adding duplicate keyword '$keyword'");
            }
            else {
                DEBUG("Adding new keyword '$keyword'");
                push @new_keywords, $keyword;
            }
        }
        $keyword_fh->close;
    }
    # New keywords have to be part of the state to be scheduled below.
    push @{$state{keywords}}, @new_keywords;
}

# Restarting at a stage forgets that stage and every stage after it.
my %stages_reset_by = (
    get     => [qw(gotten_keywords parsed_keywords combined_keywords)],
    parse   => [qw(parsed_keywords combined_keywords)],
    combine => [qw(combined_keywords)],
);
if (exists $options{restart_at} and length $options{restart_at}) {
    my $stages = $stages_reset_by{lc($options{restart_at})} || [];
    $state{$_} = {} for @{$stages};
}

# now we need to figure out what has to happen

# for each keyword, we check to see if we've got results, parsed
# results, and combined it. If not, we queue up those actions.
my @get_needed     = ();
my @parse_needed   = ();
my $combine_needed = 0;
for my $keyword (@{$state{keywords}}) {
    for my $database (@{$state{databases}}) {
        if (not exists $state{gotten_keywords}{$database}{$keyword}) {
            push @get_needed, [$database, $keyword];
            # Anything derived from results we do not have is stale.
            delete $state{parsed_keywords}{$database}{$keyword};
            delete $state{combined_keywords}{$database}{$keyword};
        }
        if (not exists $state{parsed_keywords}{$database}{$keyword}) {
            push @parse_needed, [$database, $keyword];
            delete $state{combined_keywords}{$database}{$keyword};
        }
        if (not exists $state{combined_keywords}{$database}{$keyword}) {
            $combine_needed = 1;
        }
    }
}

# handle getting needed results
# TODO: not implemented yet; each action is a [database, keyword] pair.
for my $action (@get_needed) {
}

# handle parsing needed results
# TODO: not implemented yet; each action is a [database, keyword] pair.
for my $action (@parse_needed) {
}

# handle combining results
# TODO: not implemented yet ($combine_needed says whether it is required).
# TODO: the state is never written back to do_it_all_state.

# Print each argument on its own line to STDOUT.
sub ADVISE {
    print STDOUT map { ($_, qq(\n)) } @_;
    return;
}

# Print each argument on its own line to STDERR, but only when a
# non-zero debug level was requested.
sub DEBUG {
    return if not $DEBUG;
    print STDERR map { ($_, qq(\n)) } @_;
    return;
}

# Print each argument on its own line to STDERR.
sub WARN {
    print STDERR map { ($_, qq(\n)) } @_;
    return;
}

__END__