=item B<--restart-at>
If you need to restart the process at a particular state (which has
-already been completed) specify this option.
+already been completed), specify this option. Valid values are C<get>,
+C<parse>, or C<combine>.
=item B<--debug, -d>
echo 'transferrin' > keywords.txt
function2gene --keywords keywords.txt --results keyword_results
+ # reparse the results
+ function2gene --keywords keywords.txt --results keyword_results \
+ --restart-at parse
+
=cut
$state{done_keywords}{combined}{$db}{$keyword} = 1;
}
save_state(\%state);
- ADVISE("Finished; results in $options{results}/combined_results");
+ write_command_to_file('combined_results_table.txt',
+ "$base_dir/results_to_table",
+ 'combined_results.txt',
+ );
+ ADVISE("Finished; results in $options{results}/combined_results.txt");
}
else {
ADVISE('Nothing to do. [Perhaps you wanted --restart-at?]');
use IO::File;
use IO::Dir;
+use HTML::TreeBuilder;
+use HTML::ElementTable;
+
my %options = (debug => 0,
help => 0,
man => 0,
$results[REFSEQ] ||= 'NO REFSEQ';
# Find Gene Location
- ($results[LOCATION]) = $result =~ m&<I>LocusLink\s+cytogenetic\s+band:</I><b>\s+
- <a\s+href="[^\"]+"\s+target\s+=\s+"aaa">\s*([^\<]+?)\s*</a>&xis;
+ ($results[LOCATION]) = $result =~ m{cytogenetic\s+band:</I><b>\s+
+ <a\s+href="[^\"]+"\s+target\s*=\s*"aaa"[^>]*>\s*([^\<]+?)\s*</a>}xis;
$results[LOCATION] ||= 'NO LOCATION';
# Find gene aliases
- my ($alias_table) = $result =~ m|<b>Aliases and Descriptions</b>(.+?)</TR>|is;
- $alias_table ||='';
-
- my @gene_aliases = $alias_table =~ m|<li>\s*([^\(]{0,20}?)\s*\(<FONT|gis;
+ my ($alias_table) = $result =~ m{(<table[^>]+><tr><th[^>]+>Aliases.+?</table>)}is;
+ $alias_table ||= '';
+ my @gene_aliases = map {s/\s*$//; $_;} $alias_table =~ m{<td(?: nowrap)?>\s*([^<]+)<}gis;
$results[ALIAS] = join('; ', @gene_aliases);
$results[ALIAS] ||= 'NO ALIASES';
# Find gene function(s)
- # Swiss prot functions
- my @functions = $result =~ m&<li><b>Function:</b>\s+(.+?)(?:<li>)|(?:</ul>)&gis;
-
+ my @functions;
# GO Functions
- push @functions, (map {s/\n//g}
- map {s#\s*\s*</a>(?:</td><td>\s*)?\s*# #g; $_;}
- $result =~ m{(GO:\d+\s*</a>(?:</td><td>\s*)?.+?)(?:</td><dd>|<p>)}gis
+ push @functions, (map {s/\n//g; $_;}
+ map {s#\s*</a>(?:</td><td>\s*)?\s*# #g; $_;}
+ $result =~ m{(GO:\d+\s*</a>(?:</td><td>\s*)?.+?)(?:</font>|</td>|<dd>|<p>)}gis
);
$results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions);
$results[FUNCTION] ||= 'NO FUNCTION';
$results[KEYWORD] ||= 'NO KEYWORD';
+ # Swiss prot functions
+ my @description = (map {s/<[^>]+>/ /g;
+ s/\s+/ /g;
+ $_;
+ }
+ $result =~ m{<(?:dd|li)><b>Function(?::</b>|</b>:)\s+
+ (.+?)<(?:/dd|li)>}xgis
+ );
+
# Figure out what the description is
- $results[DESCRIPTION] = '';
+ $results[DESCRIPTION] = join('; ',
+ map {(defined $_)?($_):()}
+ @description);
# Database searched
$results[DBNAME] = 'genecard';
($results[NAME]) = $result =~ m&<TR>\s*<TD\s*BGCOLOR="\#FEFE99"\s*VALIGN="top"\s*NOWRAP>Entry\s*name</TD>\s*
<TD\s*VALIGN="top"\s*COLSPAN="5">\s*<b>\s*([^<]+?)\s*</b></TD>\s*</TR>&xis;
}
+ # Fallback: no "Entry name" table row was found, so take the name from
+ # the text that follows the second '*' in the page <TITLE>, up to the
+ # first '-'.
+ if (not defined $results[NAME]) {
+     ($results[NAME]) = $result =~ m{<TITLE>[^:]+:\s*[^\*]+\*[^\*]+\*\s*([^-]+)}xis;
+     if (defined $results[NAME]) {
+         # Drop boilerplate that is not part of the gene name.
+         $results[NAME] =~ s/\d+\s*kDa\s*protein//;
+         $results[NAME] =~ s/\s*similar to .+//;
+         # Trim only after the removals above, since they can expose new
+         # leading or trailing whitespace. The former s/^\s*$// was
+         # anchored at both ends and never stripped leading whitespace.
+         $results[NAME] =~ s/^\s+//;
+         $results[NAME] =~ s/\s+$//;
+     }
+ }
$results[NAME] ||= 'NO NAME';
$results[NAME] =~ s/_HUMAN//;
$databases{total}{total}{count}++;
}
-our ($keyword,$gct,$hvt,$nct,$t);
+our ($keyword,$gct,$hvt,$nct,$t) = ('Keyword','GeneCards','Harvester','NCBI','Total');
format STDOUT =
@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
$keyword, $gct, $hvt, $nct, $t
.
-for$keyword (sort keys %terms) {
+write;
+
+for $keyword (sort keys %terms) {
($gct,$hvt,$nct,$t) =
map {
if (not defined $_) {