my @results;
# Find gene name
- ($results[NAME]) = $result =~ m&(?:Lean|Gene)Card\s+for\s+(?:(?:disorder\s+locus|uncategorized|
- hugo\s*reserved\s*symbol|cluster|
- potentially\s*expressed\s*sequence)|(?:predicted\s+|pseudo|rna\s+|)gene)
- \s*(?:with\s*support\s*|)<FONT\s+COLOR=\"[^\"]+\">\s*<FONT\s+SIZE=\+2>\s*([^\s]+)\s*&xis;
+ ($results[NAME]) = $result =~ m{(?:Lean|Gene)Card\s+for\s+[^<]+<FONT[^>]+>\s*([^<]+)}xis;
$results[NAME] ||= 'NO NAME';
# Find REF SEQ number
- ($results[REFSEQ]) = $result =~ m|http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
- cmd=Search\&db=nucleotide\&doptcmdl=GenBank\&term=([^\"]+)\"|xis;
+ ($results[REFSEQ]) = $result =~ m{http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
+ (?:cmd=Search\&db=nucleotide|db=nucleotide\&cmd=search)
+ \&doptcmdl=GenBank\&term=([^\"]+)\"}xis;
$results[REFSEQ] ||= 'NO REFSEQ';
my @functions = $result =~ m&<li><b>Function:</b>\s+(.+?)(?:<li>)|(?:</ul>)&gis;
# GO Functions
- push @functions, (map {s#\s*</a>\s*# #g; $_;} $result =~ m&(GO:\d+\s*</a>.+?)(?:<dd>|<p>)&gis);
+ # Clean up each GO entry. Both map blocks must end with $_ so that the
+ # modified text is returned; s/// on its own yields the substitution count.
+ push @functions, (map {s/\n//g; $_;}
+ map {s#\s*\s*</a>(?:</td><td>\s*)?\s*# #g; $_;}
+ $result =~ m{(GO:\d+\s*</a>(?:</td><td>\s*)?.+?)(?:</td><dd>|<p>)}gis
+ );
$results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions);
$results[FUNCTION] ||= 'NO FUNCTION';
--- /dev/null
+#! /usr/bin/perl
+
+use warnings;
+use strict;
+
+use List::Util qw(sum);
+use IO::File;
+
+# Exactly one argument is expected: the path of the search-results CSV file.
+die "search_resultsfile" unless @ARGV == 1;
+my ($search_results_fn) = @ARGV;
+
+# Open the results file read-only. The explicit Class->method call avoids the
+# ambiguous indirect-object form ("new IO::File ...").
+my $search_results_fh = IO::File->new($search_results_fn, 'r')
+    or die "Unable to read $search_results_fn: $!";
+
+# Read the search-results CSV and build the per-keyword / per-database tallies
+# that the report section prints.
+#
+# Structures filled in:
+#   @results    - one hash ref per data row, keyed by column name
+#   %name_idx   - lower-cased gene name or alias => list of @results indices
+#   %refseq_idx - lower-cased refseq => @results index (duplicate detection)
+#   %databases  - $databases{$db}{$keyword}{count}: terms seen for that pair;
+#                 $databases{$db}{total}{count}: records mentioning $db;
+#                 $databases{$db}{total}{unique}: records mentioning only $db;
+#                 $databases{total}{total}{count}: all records;
+#                 $databases{total}{total}{unique}: records with exactly one
+#                 keyword found in exactly one database
+#   %terms      - $terms{$keyword}{$db}{count}: terms seen for that pair;
+#                 $terms{$keyword}{$db}{unique}: records where $keyword was
+#                 found in $db only;
+#                 $terms{$keyword}{total}{count}: terms without a [database];
+#                 $terms{$keyword}{total}{unique}: records whose only keyword
+#                 is $keyword
+my @results;
+my %name_idx;
+my %refseq_idx;
+my %databases;
+my %terms;
+
+# Canonicalise a keyword: '-', '+' and '_' become spaces, trailing whitespace
+# is dropped, and the first '*' is removed.
+sub _normalize_keyword {
+    my ($keyword) = @_;
+    $keyword =~ s/[-+_]/ /g;
+    $keyword =~ s/\s*$//;
+    $keyword =~ s/[*]//;
+    return $keyword;
+}
+
+while (<$search_results_fh>) {
+    # "name","hits","rzscore","refseq","location","alias","database","terms","description","function"
+    next if $_ =~ /^\"name\"\,/;    # header row
+    chomp;
+    my %record;
+    @record{qw(name hits rzscore refseq location alias database terms description function)} = map {s/^"//; s/"$//; $_;} split /\"\,\"/;
+    push @results,{%record};
+
+    # Index the record under its name and under every alias.
+    push @{$name_idx{lc($record{name})}}, $#results;
+    foreach (map {lc($_)} split /\s*;\s*/, $record{alias}) {
+        push @{$name_idx{$_}}, $#results;
+    }
+
+    # Refuse duplicate refseqs. The index must be keyed by the refseq itself,
+    # otherwise the existence test above the store can never succeed.
+    # NOTE(review): empty values and the 'NO REFSEQ' placeholder are exempted
+    # on the assumption that several records may legitimately lack a refseq;
+    # confirm against the program that writes this file.
+    my $refseq_key = lc $record{refseq};
+    if ($refseq_key ne '' and $refseq_key ne 'no refseq') {
+        die "Duplicate refseq at record $." if exists $refseq_idx{$refseq_key};
+        $refseq_idx{$refseq_key} = $#results;
+    }
+
+    # Per-record view of which keyword hit in which database.
+    my %dbs_of;         # keyword  => { database => 1 }
+    my %keywords_of;    # database => { keyword  => 1 }
+    for my $term (split '; ', $record{terms}) {
+        if ($term =~ /\[/) {
+            # keyword[database]:hits
+            my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]:(\d+)/;
+            if (not defined $keyword) {
+                warn "Skipping malformed term '$term' at record $.\n";
+                next;
+            }
+            $keyword = _normalize_keyword($keyword);
+            $dbs_of{$keyword}{$database} = 1;
+            $keywords_of{$database}{$keyword} = 1;
+            $databases{$database}{$keyword}{count}++;
+            $terms{$keyword}{$database}{count}++;
+        }
+        else {
+            # keyword:hits (no database given)
+            my ($keyword) = $term =~ /([^:]+):(\d+)/;
+            if (not defined $keyword) {
+                warn "Skipping malformed term '$term' at record $.\n";
+                next;
+            }
+            $keyword = _normalize_keyword($keyword);
+            $terms{$keyword}{total}{count}++;
+        }
+    }
+
+    # "unique" tallies: the record was reached through one keyword and/or one
+    # database only.
+    my @record_keywords  = keys %dbs_of;
+    my @record_databases = keys %keywords_of;
+    if (@record_keywords == 1) {
+        my $only_keyword = $record_keywords[0];
+        $terms{$only_keyword}{total}{unique}++;
+        if (keys %{$dbs_of{$only_keyword}} == 1) {
+            $databases{total}{total}{unique}++;
+        }
+    }
+    if (@record_databases == 1) {
+        $databases{$record_databases[0]}{total}{unique}++;
+    }
+    for my $keyword (@record_keywords) {
+        my @keyword_databases = keys %{$dbs_of{$keyword}};
+        if (@keyword_databases == 1) {
+            $terms{$keyword}{$keyword_databases[0]}{unique}++;
+        }
+    }
+
+    # One count per record for every database it mentions, plus the grand total.
+    for my $database (@record_databases) {
+        $databases{$database}{total}{count}++;
+    }
+    $databases{total}{total}{count}++;
+}
+
+# Render one table cell from a { count => N, unique => M } hash ref.
+# Returns '$-$' when there is no entry at all, otherwise "count (unique)".
+# Either number defaults to 0 when missing, since an entry can carry a
+# unique tally without ever having received a count.
+sub _format_cell {
+    my ($stats) = @_;
+    return '$-$' if not defined $stats;
+    $stats->{count}  ||= 0;
+    $stats->{unique} ||= 0;
+    return "$stats->{count} ($stats->{unique})";
+}
+
+# The format below reads package variables, hence "our" rather than "my".
+# Each row is written with '&' column separators and a trailing '\\'
+# (presumably for pasting into a LaTeX table - confirm).
+our ($keyword,$gct,$hvt,$nct,$t);
+format STDOUT =
+@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
+$keyword, $gct, $hvt, $nct, $t
+.
+
+# One row per keyword, in sorted order.
+for $keyword (sort keys %terms) {
+    ($gct,$hvt,$nct,$t) =
+        map { _format_cell($_) }
+        @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
+    write;
+}
+
+# Final row: per-database totals.
+$keyword = 'Total';
+($gct,$hvt,$nct,$t) =
+    map { _format_cell($_) }
+    map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
+write;