From: Don Armstrong Date: Tue, 22 Jan 2008 23:04:20 +0000 (+0000) Subject: * Stop requiring wget X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=commitdiff_plain;h=af4fd770f221db1cec02393df378e079c0b9a8fc * Stop requiring wget * Fix issue with file naming after not requiring wget * Fix undefined warning in combine_results git-svn-id: file:///srv/svn/function2gene/trunk@26 a0738b58-4706-0410-8799-fb830574a030 --- diff --git a/bin/combine_results b/bin/combine_results index e49de8f..666fe7c 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -23,6 +23,9 @@ use Pod::Usage; combine_results parsed_results_1.txt [parsedresultfiles ...] Options: + --keywords newline-delimited list of keywords to search for + --results file to store results in + --results-table optional file to write result summary table to --debug, -d debugging level [default 0] --help, -h display this help --man, -m display manual @@ -31,6 +34,23 @@ use Pod::Usage; =over +=item B<--keywords> + +A file which contains a newline-delimited list of keywords to search +for. Can be specified multiple times. Lines starting with # or ; are +ignored. An optional weight can be specified after the keyword, which +is separated from the keyword by a tab. (If not specified, 1 is +assumed.) + +=item B<--results> + +A file in which to store the combined results; defaults to STDOUT + +=item B<--results-table> + +A file in which to store a summary table of results. If not provided, +no summary table is written. + =item B<--debug, -d> Debug verbosity. 
(Default 0) @@ -63,6 +83,7 @@ BEGIN{ use XML::Parser::Expat; use IO::File; +use List::Util qw(sum max); # XXX parse config file @@ -70,9 +91,13 @@ my %options = (debug => 0, help => 0, man => 0, keywords => [], + results => undef, + results_table => undef, ); -GetOptions(\%options,'keywords|k=s@','debug|d+','help|h|?','man|m'); +GetOptions(\%options,'keywords|k=s@', + 'results=s','results_table|results-table=s', + 'debug|d+','help|h|?','man|m'); pod2usage() if $options{help}; @@ -80,6 +105,18 @@ pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; +my $results_fh = \*STDOUT; +my $results_table_fh = undef; + +if ($options{results}) { + $results_fh = IO::File->new($options{results},'w') or + die "Unable to open results file $options{results} for writing"; +} +if ($options{results_table}) { + $results_table_fh = IO::File->new($options{results_table},'w') or + die "Unable to open results table file $options{results_table} for writing"; +} + # CSV columns use constant {NAME => 0, REFSEQ => 1, @@ -92,10 +129,29 @@ use constant {NAME => 0, FILENAME => 8, }; -my @csv_fields = qw(name hits rzscore refseq location alias database terms description function); +my @csv_fields = qw(name hits rzscore weightedscore autoscore refseq location alias database terms description function); my %genes; +my %keyword_weight; + +if (@{$options{keywords}}) { + for my $keyword_file (@{$options{keywords}}) { + my $keyword_fh = IO::File->new($keyword_file,'r') or + die "Unable to open $keyword_file for reading: $!"; + while (<$keyword_fh>) { + next if /^\s*[#;]/; + next unless /\w+/; + chomp; + my ($keyword,$weight) = split /\t/, $_; + $weight = 1 if not defined $weight; + $keyword_weight{$keyword} = $weight; + } + } +} + + + for my $file_name (@ARGV) { my $file = new IO::File $file_name, 'r' or die "Unable to open file $file_name $!"; while (<$file>) { @@ -114,10 +170,93 @@ for my $file_name (@ARGV) { } } -print join(',',map {qq("$_")} @csv_fields),qq(\n); +my %databases; +my 
%terms; +my %auto_weight; +my %keyword_keyword; +for my $gene (keys %genes) { + my %term_temp; + my %db_temp; + my %gene_temp; + my %gene_temp2; + for my $term (keys %{$genes{$gene}{terms}}) { + if ($term =~ /\[/) { + my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]/; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $gene_temp{$keyword}{$database} = 1; + $gene_temp2{$database}{$keyword} = 1; + $databases{$database}{$keyword}{count}++; + $db_temp{$database}++; + $terms{$keyword}{$database}{count}++; + } + else { + my $keyword = $term; + my $hits = $genes{$gene}{terms}{$term}; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $terms{$keyword}{total}{count}++; + } + } + if (keys %gene_temp == 1) { + $terms{[keys %gene_temp]->[0]}{total}{unique}++; + if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) { + $databases{total}{total}{unique}++ + } + } + if (keys %gene_temp2 == 1) { + $databases{[keys %gene_temp2]->[0]}{total}{unique}++; + } + for my $keyword (keys %gene_temp) { + if (keys %{$gene_temp{$keyword}} == 1) { + $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++; + } + for my $keyword2 (keys %gene_temp) { + $keyword_keyword{$keyword}{$keyword2}++ + } + } + for my $database (keys %db_temp) { + $databases{$database}{total}{count}++; + } + $databases{total}{total}{count}++; + +} + +for my $keyword (keys %keyword_keyword) { + # the autoweight table is the diagonal over the sum of the column of the keyword/keyword table + # we use max here to avoid 0/0 problems. 
+ my $results_by_this_keyword = max(1,$keyword_keyword{$keyword}{$keyword}); + my $results_combined = max(1,grep {defined $_} + sum(map {$keyword_keyword{$keyword}{$_}} + grep {$_ ne $keyword} + keys %{$keyword_keyword{$keyword}} + ) + ); + $auto_weight{$keyword} = $results_by_this_keyword/$results_combined; +} + +print {$results_fh} join(',',map {qq("$_")} @csv_fields),qq(\n); for my $gene (keys %genes) { $genes{$gene}{rzscore} = scalar grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}}; - print STDOUT join (',', + $genes{$gene}{weightedscore}= sum(0, + map {defined $keyword_weight{$_}?$keyword_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); + $genes{$gene}{autoscore}= sum(0, + map {defined $auto_weight{$_}?$auto_weight{$_}:1} + grep {$_ !~ /\[/} keys %{$genes{$gene}{terms}} + ); +} + +my $sort = 'autoscore'; +if (scalar grep {$_ != 1 } values %keyword_weight) { + $sort='weightedscore'; +} +for my $gene (sort {$genes{$b}{$sort} <=> $genes{$a}{$sort}} keys %genes) { + print {$results_fh} join (',', map {s/"//g; qq("$_")} map { my $value = $_; @@ -159,6 +298,47 @@ sub add_if_better{ } +if (defined $results_table_fh) { + our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total'); + format RESULTS_TABLE = +@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ +$keyword, $weight, $autoweight, $gct, $hvt, $nct, $t +. 
+ $results_table_fh->format_name('RESULTS_TABLE'); + write $results_table_fh; + + for $keyword (sort keys %terms) { + ($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } @{$terms{$keyword}}{qw(genecard harvester ncbi total)}; + $weight = $keyword_weight{$keyword} || 1; + $autoweight = $auto_weight{$keyword}; + write $results_table_fh; + + } + $keyword = 'Total'; + ($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; + #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; + $weight = ''; + $autoweight = ''; + write $results_table_fh; +} __END__ diff --git a/bin/function2gene b/bin/function2gene index f3bd3dd..de4915b 100755 --- a/bin/function2gene +++ b/bin/function2gene @@ -339,13 +339,18 @@ if ($actions{combine}) { } keys %{$state{done_keywords}{parse}}; # create temporary file to store keyword weights - - write_command_to_file('combined_results.txt', - "$base_dir/combine_results", - '--keywords', - - @parsed_results, - ); + my $file = IO::File->new('combined_keywords.txt','w') or + die "Unable to open combined_keywords.txt for writing: $!"; + for my $keyword (keys %{$state{keyword_weight}}) { + print {$file} "$keyword\t$state{keyword_weight}{$keyword}\n"; + } + system("$base_dir/combine_results", + '--keywords','combined_keywords.txt', + '--results','combined_results.txt', + '--results-table','combined_results_table.txt', + @parsed_results, + ) == 0 + or die "combine_results failed with ".($?>>8); for my $result (@parsed_results) { $result =~ s/^parsed_results_//; $result =~ s/\.txt$//; @@ -353,10 +358,6 @@ if ($actions{combine}) { $state{done_keywords}{combined}{$db}{$keyword} = 1; } save_state(\%state); - write_command_to_file('combined_results_table.txt', - "$base_dir/results_to_table", - 
'combined_results.txt', - ); ADVISE("Finished; results in $options{results}/combined_results.txt"); } else { diff --git a/bin/get_genecard_results b/bin/get_genecard_results index 017f1e1..ac38e29 100755 --- a/bin/get_genecard_results +++ b/bin/get_genecard_results @@ -70,6 +70,7 @@ BEGIN{ use IO::File; use URI; use WWW::Mechanize; +use Time::HiRes qw(usleep); # XXX parse config file @@ -127,7 +128,22 @@ while (<$terms>) { # Get XML file my @current_urls; while (@current_urls = map{$options{genecard_site}.$_} splice(@result_urls,0,30)) { - system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!"; + for my $url (@current_urls) { + # sleep for around 2 seconds + usleep((0.5+rand)*2*1000000); + $mech->get($url); + my $cleaned_url = $url; + $cleaned_url =~ s{http://}{}g; + $cleaned_url =~ s/[^\w]//g; + eval { + $mech->save_content($options{dir}.'/'.$dir_name.'/'.$cleaned_url); + print "retreived $url\n"; + }; + if ($@) { + warn $@; + } + } + #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',qq($options{dir}/$dir_name),@current_urls) == 0 or warn "$!"; } } diff --git a/bin/get_harvester_results b/bin/get_harvester_results index 7446e08..4d3f593 100755 --- a/bin/get_harvester_results +++ b/bin/get_harvester_results @@ -72,6 +72,7 @@ use IO::File; use URI; use WWW::Mechanize; use Thread::Queue; +use Time::HiRes qw(usleep); # XXX parse config file @@ -172,7 +173,24 @@ sub get_url{ sub wget_urls{ my ($dir,@urls) = @_; return unless @urls; - system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!"; + # replacing wget with WWW::Mechanize + my $mech = WWW::Mechanize->new(agent => "DA_get_harvester_results/$REVISION"); + for my $url (@urls) { + # sleep for around 2 seconds + usleep((0.5+rand)*2*1000000); + $mech->get($url); + my $cleaned_url = $url; + $cleaned_url =~ s{http://}{}g; + $cleaned_url =~ s/[^\w]//g; + eval { + $mech->save_content($dir.'/'.$cleaned_url); + print 
"retreived $url\n"; + }; + if ($@) { + warn $@; + } + } + #system(q(wget),'-nd','-nH','-w','2','--random-wait','-P',$dir,@urls) == 0 or warn "$!"; } __END__ diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results index b423719..fbd9885 100755 --- a/bin/parse_genecard_results +++ b/bin/parse_genecard_results @@ -118,6 +118,8 @@ my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $! print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); +my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#; + while ($_ = $dir->read) { my $file_name = $_; next if $file_name =~ /^\./; @@ -167,9 +169,9 @@ while ($_ = $dir->read) { $results[FUNCTION] ||= 'NO FUNCTION'; # Figure out the keyword used - ($results[KEYWORD]) = $file_name =~ /search=([^&]+)/; + ($results[KEYWORD]) = $file_name =~ /search=?([^&]+)$/; - $results[KEYWORD] ||= 'NO KEYWORD'; + $results[KEYWORD] ||= $keyword || 'NO KEYWORD'; # Swiss prot functions my @description = (map {s/<[^>]+>/ /g;