From a4b9214b7939fbe990f2373684dfd7aa1f9e77e2 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Sat, 27 Oct 2007 00:14:42 +0000 Subject: [PATCH] * Fix genecard and harvester parsers * Add headers for results_to_table * Call results_to_table in function2gene git-svn-id: file:///srv/svn/function2gene/trunk@15 a0738b58-4706-0410-8799-fb830574a030 --- bin/function2gene | 13 +++++++++++-- bin/parse_genecard_results | 37 ++++++++++++++++++++++++------------- bin/parse_harvester_results | 7 +++++++ bin/results_to_table | 6 ++++-- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/bin/function2gene b/bin/function2gene index 07dd2ae..ed9a38c 100755 --- a/bin/function2gene +++ b/bin/function2gene @@ -56,7 +56,8 @@ databases.] =item B<--restart-at> If you need to restart the process at a particular state (which has -already been completed) specify this option. +already been completed) specify this option. Valid values are get, +parse, or combine. =item B<--debug, -d> @@ -78,6 +79,10 @@ Display this manual. echo 'transferrin' > keywords.txt function2gene --keywords keywords.txt --results keyword_results + # reparse the results + function2gene --keywords keywords.txt --results keyword_results \ + --restart-at parse + =cut @@ -287,7 +292,11 @@ if ($actions{combine}) { $state{done_keywords}{combined}{$db}{$keyword} = 1; } save_state(\%state); - ADVISE("Finished; results in $options{results}/combined_results"); + write_command_to_file('combined_results_table.txt', + "$base_dir/results_to_table", + 'combined_results.txt', + ); + ADVISE("Finished; results in $options{results}/combined_results.txt"); } else { ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]'); diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results index 4b7d514..b423719 100755 --- a/bin/parse_genecard_results +++ b/bin/parse_genecard_results @@ -70,6 +70,9 @@ BEGIN{ use IO::File; use IO::Dir; +use HTML::TreeBuilder; +use HTML::ElementTable; + my %options = (debug => 0, help => 0, man => 0, @@ -139,29 +142,26 @@ while ($_ = $dir->read) { $results[REFSEQ] ||= 'NO REFSEQ'; # Find Gene Location - ($results[LOCATION]) = $result =~ m&LocusLink\s+cytogenetic\s+band:\s+ - \s*([^\<]+?)\s*&xis; + ($results[LOCATION]) = $result =~ m{cytogenetic\s+band:\s+ + ]*>\s*([^\<]+?)\s*}xis; $results[LOCATION] ||= 'NO LOCATION'; # Find gene aliases - my ($alias_table) = $result =~ m|Aliases and Descriptions(.+?)|is; - $alias_table ||=''; - - my @gene_aliases = $alias_table =~ m|
  • \s*([^\(]{0,20}?)\s*\(]+>]+>Aliases.+?)}is; + $alias_table ||= ''; + my @gene_aliases = map {s/\s*$//; $_;} $alias_table =~ m{\s*([^<]+)<}gis; $results[ALIAS] = join('; ', @gene_aliases); $results[ALIAS] ||= 'NO ALIASES'; # Find gene function(s) - # Swiss prot functions - my @functions = $result =~ m&
  • Function:\s+(.+?)(?:
  • )|(?:)&gis; - + my @functions; # GO Functions - push @functions, (map {s/\n//g} - map {s#\s*\s*(?:\s*)?\s*# #g; $_;} - $result =~ m{(GO:\d+\s*(?:\s*)?.+?)(?:
    |

    )}gis + push @functions, (map {s/\n//g; $_;} + map {s#\s*(?:\s*)?\s*# #g; $_;} + $result =~ m{(GO:\d+\s*(?:\s*)?.+?)(?:||

    |

    )}gis ); $results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions); $results[FUNCTION] ||= 'NO FUNCTION'; @@ -171,8 +171,19 @@ while ($_ = $dir->read) { $results[KEYWORD] ||= 'NO KEYWORD'; + # Swiss prot functions + my @description = (map {s/<[^>]+>/ /g; + s/\s+/ /g; + $_; + } + $result =~ m{<(?:dd|li)>Function(?::|:)\s+ + (.+?)<(?:/dd|li)>}xgis + ); + # Figure out what the description is - $results[DESCRIPTION] = ''; + $results[DESCRIPTION] = join('; ', + map {(defined $_)?($_):()} + @description); # Database searched $results[DBNAME] = 'genecard'; diff --git a/bin/parse_harvester_results b/bin/parse_harvester_results index 30fc011..e520661 100755 --- a/bin/parse_harvester_results +++ b/bin/parse_harvester_results @@ -142,6 +142,13 @@ while ($_ = $dir->read) { ($results[NAME]) = $result =~ m&\s*Entry\s*name\s* \s*\s*([^<]+?)\s*\s*&xis; } + if (not defined $results[NAME]) { + ($results[NAME]) = $result =~ m{[^:]+:\s*[^\*]+\*[^\*]+\*\s*([^-]+)}xis; + $results[NAME] =~ s/\s*$// if defined $results[NAME]; + $results[NAME] =~ s/^\s*$// if defined $results[NAME]; + $results[NAME] =~ s/\d+\s*kDa\s*protein// if defined $results[NAME]; + $results[NAME] =~ s/\s*similar to .+// if defined $results[NAME]; + } $results[NAME] ||= 'NO NAME'; $results[NAME] =~ s/_HUMAN//; diff --git a/bin/results_to_table b/bin/results_to_table index 04f2f6d..24c9400 100755 --- a/bin/results_to_table +++ b/bin/results_to_table @@ -75,12 +75,14 @@ while (<$search_results_fh>) { $databases{total}{total}{count}++; } -our ($keyword,$gct,$hvt,$nct,$t); +our ($keyword,$gct,$hvt,$nct,$t) = ('Keyword','GeneCards','Harvester','NCBI','Total'); format STDOUT = @<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ $keyword, $gct, $hvt, $nct, $t . -for$keyword (sort keys %terms) { +write; + +for $keyword (sort keys %terms) { ($gct,$hvt,$nct,$t) = map { if (not defined $_) { -- 2.39.2