From: Don Armstrong Date: Mon, 27 Aug 2007 22:29:21 +0000 (+0000) Subject: update search program with options for do_it_all; implement calls to subsidiary scripts X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=commitdiff_plain;h=79c0f28d89ba108bcfca68a8b5d4f0e3855455dc update search program with options for do_it_all; implement calls to subsidiary scripts git-svn-id: file:///srv/svn/function2gene/trunk@5 a0738b58-4706-0410-8799-fb830574a030 --- diff --git a/bin/combine_results b/bin/combine_results index 8125c11..76a4e0c 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -1,13 +1,10 @@ #! /usr/bin/perl -# parse_ncbi_results retreives files of search results from ncbi, -# and is released under the terms of the GPL version 2, or any later -# version, at your option. See the file README and COPYING for more -# information. +# combine_results is part of the gene search suite, and is released +# under the terms of the GPL version 2, or any later version, at your +# option. See the file README and COPYING for more information. -# Copyright 2004 by Don Armstrong . - -# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $ +# Copyright 2006,2007 by Don Armstrong . use warnings; @@ -19,15 +16,13 @@ use Pod::Usage; =head1 NAME - parse_ncbi_results [options] + combine_results -- combines parsed result files; outputs to stdout. =head1 SYNOPSIS + combine_results parsed_results_1.txt [parsedresultfiles ...] Options: - --dir, -D directory to stick results into [default .] - --name, -n file naming scheme [default ${search}_results.$format] - --terms, -t file of search terms [default -] --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual @@ -52,7 +47,7 @@ Display this manual. 
=head1 EXAMPLES - parse_ncbi_results -D ./ncbi_results/ -n '${search}_name.html' < search_parameters + combine_results foo_1.txt Will pretty much do what you want @@ -60,10 +55,9 @@ Will pretty much do what you want -use vars qw($DEBUG $REVISION); +use vars qw($DEBUG); BEGIN{ - ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/; $DEBUG = 0 unless defined $DEBUG; } @@ -75,8 +69,6 @@ use IO::File; my %options = (debug => 0, help => 0, man => 0, - dir => '.', - keyword => undef, ); GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m'); diff --git a/bin/do_it_all b/bin/do_it_all index 3d3cfa3..788319a 100755 --- a/bin/do_it_all +++ b/bin/do_it_all @@ -3,7 +3,6 @@ # under the terms of the GPL version 2, or any later version, at your # option. See the file README and COPYING for more information. # Copyright 2007 by Don Armstrong . -# $Id: perl_script 495 2006-08-10 08:02:01Z don $ use warnings; @@ -16,8 +15,8 @@ use Storable; =head1 NAME -do_it_all - Call out to each of the search modules to search for each -of the terms + do_it_all - Call out to each of the search modules to search for + each of the terms =head1 SYNOPSIS @@ -82,6 +81,7 @@ use vars qw($DEBUG); use Cwd qw(abs_path); use IO::File; use Storable qw(thaw freeze); +use File::Basename qw(basename dirname); my %options = (databases => [], keywords => [], @@ -98,6 +98,8 @@ GetOptions(\%options,'keywords=s@','databases=s@', pod2usage() if $options{help}; pod2usage({verbose=>2}) if $options{man}; +my $base_dir = dirname(abs_path($0)); # absolute directory holding this script and its sibling tools + my $ERRORS=''; $ERRORS.="restart-at must be one of get, parse or combine\n" if @@ -180,16 +182,16 @@ if (@{$options{keywords}}) { if (exists $options{restart_at} and length $options{restart_at}) { if (lc($options{restart_at}) eq 'get') { - delete $state{gotten_keywords}; - delete $state{parsed_keywords}; - delete $state{combined_keywords}; + delete $state{done_keywords}{get}; + delete $state{done_keywords}{parse}; + delete $state{done_keywords}{combine}; } 
elsif (lc($options{restart_at}) eq 'parse') { - delete $state{parsed_keywords}; - delete $state{combined_keywords}; + delete $state{done_keywords}{parse}; + delete $state{done_keywords}{combine}; } elsif (lc($options{restart_at}) eq 'combine') { - delete $state{combined_keywords}; + delete $state{done_keywords}{combine}; } } @@ -217,7 +219,7 @@ for my $keyword (@{$state{keywords}}) { } if (not exists $state{done_keywords}{parse}{$database}{$keyword}) { push @{$actions{parse}{$database}},$keyword; - delete $state{done_keywords}{combine}{$database}{$keyword} if + delete $state{done_keywords}{combine}{$database}{$keyword} if exists $state{done_keywords}{combine}{$database}{$keyword}; } if (not exists $state{done_keywords}{combine}{$database}{$keyword}) { @@ -255,6 +257,32 @@ for my $state (qw(get parse)) { } } +if ($actions{combine}) { + save_state(\%state); + # deal with combining results + my @parsed_results = map { my $db = $_; + map { + "parsed_results_${db}_${_}.txt" + } keys %{$state{done_keywords}{parse}{$db}} + } keys %{$state{done_keywords}{parse}}; + + write_command_to_file('combined_results.txt', + "$base_dir/combine_results", + @parsed_results, + ); + for my $result (@parsed_results) { + (my $name = $result) =~ s/^parsed_results_//; # strip the affixes from a copy of the file name + $name =~ s/\.txt$//; + my ($db,$keyword) = split /_/, $name, 2; + $state{done_keywords}{combine}{$db}{$keyword} = 1; # same 'combine' key that the checks above test + } + save_state(\%state); + ADVISE("Finished; results in $options{results}/combined_results.txt"); +} +else { + ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]'); +} + sub handle_action{ my ($state,$database,$queue) = @_; my $keyword; @@ -262,6 +290,34 @@ sub handle_action{ my $failed_keywords = (); while ($keyword = $queue->dequeue) { # handle the action, baybee + if ($state eq 'get') { + my $command_fh; + open($command_fh,'|-', + "get_${database}_results", + ); + print {$command_fh} "$keyword\n"; + close($command_fh); + if ($? 
!= 0) { + WARN("get_${database}_results with keyword $keyword failed with error code ".($?>>8)); + next; + } + } + elsif ($state eq 'parse') { + eval { + write_command_to_file("parsed_results_${database}_${keyword}.txt", + "parse_${database}_results", + '--keywords', + $keyword, + ); + }; + if ($@) { + WARN("parse_${database}_results failed with $@"); + next; + } + } + else { + die "I don't know how to handle state $state"; + } ADVISE("$state results from '$database' for '$keyword'"); push @{$actioned_keywords},$keyword; } @@ -276,6 +332,19 @@ sub save_state{ close $state_fh or die "Unable to close state file: $!"; } +sub write_command_to_file{ + my ($file,@command) = @_; # output file name, then the command and its arguments + my $fh = IO::File->new($file,'w') or + die "Unable to open $file for writing: $!"; + my $command_fh; + open($command_fh,'-|', + @command, + ) or die "Unable to execute $command[0] $!"; + print {$fh} <$command_fh>; + close $fh or die "Unable to close $file: $!"; # buffered write errors only surface here + close $command_fh or die "$command[0] failed with ".($?>>8); +} + sub ADVISE{ print STDOUT map {($_,qq(\n))} @_; diff --git a/bin/get_ncbi_xml_results b/bin/get_ncbi_xml_results index 3ee4778..c104356 100755 --- a/bin/get_ncbi_xml_results +++ b/bin/get_ncbi_xml_results @@ -84,7 +84,7 @@ my %options = (debug => 0, format => 'xml', database => 'gene', dir => '.', - name => '${search}_results.$format', + name => 'ncbi_${search}_results.$format', terms => '-', pubmed_site => 'http://www.ncbi.nlm.nih.gov', pubmed_search_url => '/entrez/query.fcgi?db=gene&cmd=search&term=12q24*+AND+homo[Orgn]&doptcmdl=Brief&dispmax=1000', diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results index 4c00d9b..6024d9b 100755 --- a/bin/parse_genecard_results +++ b/bin/parse_genecard_results @@ -70,16 +70,17 @@ BEGIN{ use IO::File; use IO::Dir; -# XXX parse config file - my %options = (debug => 0, help => 0, man => 0, dir => '.', keyword => undef, + keywords => 0, ); -GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m'); 
+GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m', + 'keywords', + ); pod2usage() if $options{help}; @@ -99,6 +100,13 @@ use constant {NAME => 0, FILENAME => 8, }; +if ($options{keywords}) { + if (@ARGV != 1) { + pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed"); + } + $options{dir} = "$ARGV[0]_results_genecard"; # the per-keyword results directory replaces --dir +} + if (not -d $options{dir}) { die "$options{dir} does not exist or is not a directory"; } diff --git a/bin/parse_harvester_results b/bin/parse_harvester_results index 29d0719..a35c9b9 100755 --- a/bin/parse_harvester_results +++ b/bin/parse_harvester_results @@ -77,6 +77,7 @@ my %options = (debug => 0, man => 0, dir => '.', keyword => undef, + keywords => 0, ); -GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m'); +GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m','keywords'); @@ -99,6 +100,15 @@ use constant {NAME => 0, FILENAME => 8, }; +if ($options{keywords}) { + if (@ARGV != 1) { + pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed"); + } + $options{dir} = "$ARGV[0]_results_harvester"; # the per-keyword results directory replaces --dir +} + + + if (not -d $options{dir}) { die "$options{dir} does not exist or is not a directory"; } diff --git a/bin/parse_ncbi_results b/bin/parse_ncbi_results index 51d339d..1e2b8d4 100755 --- a/bin/parse_ncbi_results +++ b/bin/parse_ncbi_results @@ -77,9 +77,12 @@ my %options = (debug => 0, man => 0, dir => '.', keyword => undef, + keywords => 0, ); -GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m'); +GetOptions(\%options,'keyword|k=s','debug|d+','help|h|?','man|m', + 'keywords' + ); pod2usage() if $options{help}; @@ -179,10 +182,16 @@ $parser->setHandlers('Start' => \&tag_start, 'Char' => \&tag_content ); +print STDOUT join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); for (@ARGV) { $file_name = $_; - ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results.xml$#; - print STDOUT 
join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); + if ($options{keywords}) { + $keyword = $_; + $file_name = "ncbi_${keyword}_results.xml"; + } + else { + ($keyword) = $options{keyword} || $file_name =~ m#(?:^|/)([^\/]+?)[\s-]+AND[\s\-].+_results.xml$#; + } my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; $parser->parse($file);