use IO::File;
use IO::Dir;
+use HTML::TreeBuilder;
+use HTML::ElementTable;
+
my %options = (debug => 0,
help => 0,
man => 0,
print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n);
+my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#;
+
while ($_ = $dir->read) {
my $file_name = $_;
next if $file_name =~ /^\./;
my @results;
# Find gene name
- ($results[NAME]) = $result =~ m&(?:Lean|Gene)Card\s+for\s+(?:(?:disorder\s+locus|uncategorized|
- hugo\s*reserved\s*symbol|cluster|
- potentially\s*expressed\s*sequence)|(?:predicted\s+|pseudo|rna\s+|)gene)
- \s*(?:with\s*support\s*|)<FONT\s+COLOR=\"[^\"]+\">\s*<FONT\s+SIZE=\+2>\s*([^\s]+)\s*&xis;
+ ($results[NAME]) = $result =~ m{(?:Lean|Gene)Card\s+for\s+[^<]+<FONT[^>]+>\s*([^<]+)}xis;
$results[NAME] ||= 'NO NAME';
# Find REF SEQ number
- ($results[REFSEQ]) = $result =~ m|http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
- cmd=Search\&db=nucleotide\&doptcmdl=GenBank\&term=([^\"]+)\"|xis;
+ ($results[REFSEQ]) = $result =~ m{http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
+ (?:cmd=Search\&db=nucleotide|db=nucleotide\&cmd=search)
+ \&doptcmdl=GenBank\&term=([^\"]+)\"}xis;
$results[REFSEQ] ||= 'NO REFSEQ';
# Find Gene Location
- ($results[LOCATION]) = $result =~ m&<I>LocusLink\s+cytogenetic\s+band:</I><b>\s+
- <a\s+href="[^\"]+"\s+target\s+=\s+"aaa">\s*([^\<]+?)\s*</a>&xis;
+ ($results[LOCATION]) = $result =~ m{cytogenetic\s+band:</I><b>\s+
+ <a\s+href="[^\"]+"\s+target\s*=\s*"aaa"[^>]*>\s*([^\<]+?)\s*</a>}xis;
$results[LOCATION] ||= 'NO LOCATION';
# Find gene aliases
- my ($alias_table) = $result =~ m|<b>Aliases and Descriptions</b>(.+?)</TR>|is;
- $alias_table ||='';
-
- my @gene_aliases = $alias_table =~ m|<li>\s*([^\(]{0,20}?)\s*\(<FONT|gis;
+ my ($alias_table) = $result =~ m{(<table[^>]+><tr><th[^>]+>Aliases.+?</table>)}is;
+ $alias_table ||= '';
+ my @gene_aliases = map {s/\s*$//; $_;} $alias_table =~ m{<td(?: nowrap)?>\s*([^<]+)<}gis;
$results[ALIAS] = join('; ', @gene_aliases);
$results[ALIAS] ||= 'NO ALIASES';
# Find gene function(s)
- # Swiss prot functions
- my @functions = $result =~ m&<li><b>Function:</b>\s+(.+?)(?:<li>)|(?:</ul>)&gis;
-
+ my @functions;
# GO Functions
- push @functions, (map {s#\s*</a>\s*# #g; $_;} $result =~ m&(GO:\d+\s*</a>.+?)(?:<dd>|<p>)&gis);
+ push @functions, (map {s/\n//g; $_;}
+ map {s#\s*</a>(?:</td><td>\s*)?\s*# #g; $_;}
+ $result =~ m{(GO:\d+\s*</a>(?:</td><td>\s*)?.+?)(?:</font>|</td>|<dd>|<p>)}gis
+ );
$results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions);
$results[FUNCTION] ||= 'NO FUNCTION';
# Figure out the keyword used
- ($results[KEYWORD]) = $file_name =~ /search=([^&]+)/;
+ ($results[KEYWORD]) = $file_name =~ /search=?([^&]+)$/;
+
+ $results[KEYWORD] ||= $keyword || 'NO KEYWORD';
- $results[KEYWORD] ||= 'NO KEYWORD';
+ # Swiss-Prot "Function" text, collected for the DESCRIPTION column
+ my @description = (map {s/<[^>]+>/ /g;
+ s/\s+/ /g;
+ $_;
+ }
+ $result =~ m{<(?:dd|li)><b>Function(?::</b>|</b>:)\s+
+ (.+?)<(?:/dd|li)>}xgis
+ );
# Figure out what the description is
- $results[DESCRIPTION] = '';
+ $results[DESCRIPTION] = join('; ',
+ map {(defined $_)?($_):()}
+ @description);
# Database searched
$results[DBNAME] = 'genecard';