Add results to table; modify the search parsers to work better. Fix error in get_ncb...
author Don Armstrong <don@donarmstrong.com>
Fri, 26 Oct 2007 21:40:13 +0000 (21:40 +0000)
committer Don Armstrong <don@donarmstrong.com>
Fri, 26 Oct 2007 21:40:13 +0000 (21:40 +0000)
git-svn-id: file:///srv/svn/function2gene/trunk@14 a0738b58-4706-0410-8799-fb830574a030

bin/function2gene
bin/get_ncbi_results
bin/parse_genecard_results
bin/parse_harvester_results
bin/results_to_table [new file with mode: 0755]

index 922deac..07dd2ae 100755 (executable)
@@ -281,9 +281,9 @@ if ($actions{combine}) {
                           @parsed_results,
                          );
      for my $result (@parsed_results) {
-         s/^parsed_results_//;
-         s/\.txt$//;
-         my ($db,$keyword) = split /_/, $_, 2;
+         $result =~ s/^parsed_results_//;
+         $result =~ s/\.txt$//;
+         my ($db,$keyword) = split /_/, $result, 2;
          $state{done_keywords}{combined}{$db}{$keyword} = 1;
      }
      save_state(\%state);
index 7a86901..3e1a625 100755 (executable)
@@ -143,6 +143,7 @@ while (<$terms>) {
 
      # Get XML file
      my @current_ids;
+     print {$xml_file} "<opt>\n";
      while (@current_ids = splice(@gene_ids,0,5)) {
          $uri = URI->new($options{pubmed_site}.$options{pubmed_get_url});
          $uri->query_form($uri->query_form(),
@@ -167,10 +168,10 @@ while (<$terms>) {
          $response =~ s/^\s*<pre>//gso;
          $response =~ s#</pre>\s*$##gso;
 
-
          print {$xml_file} $response;
          sleep 10;
      }
+     print {$xml_file} "</opt>\n";
      undef $xml_file;
 }
 
index a6718c5..4b7d514 100755 (executable)
@@ -128,15 +128,13 @@ while ($_ = $dir->read) {
      my @results;
 
      # Find gene name
-     ($results[NAME]) = $result =~ m&(?:Lean|Gene)Card\s+for\s+(?:(?:disorder\s+locus|uncategorized|
-                                    hugo\s*reserved\s*symbol|cluster|
-                                    potentially\s*expressed\s*sequence)|(?:predicted\s+|pseudo|rna\s+|)gene)
-                                    \s*(?:with\s*support\s*|)<FONT\s+COLOR=\"[^\"]+\">\s*<FONT\s+SIZE=\+2>\s*([^\s]+)\s*&xis;
+     ($results[NAME]) = $result =~ m{(?:Lean|Gene)Card\s+for\s+[^<]+<FONT[^>]+>\s*([^<]+)}xis;
 
      $results[NAME] ||= 'NO NAME';
      # Find REF SEQ number
-     ($results[REFSEQ]) = $result =~ m|http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
-                              cmd=Search\&db=nucleotide\&doptcmdl=GenBank\&term=([^\"]+)\"|xis;
+     ($results[REFSEQ]) = $result =~ m{http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\?
+                                      (?:cmd=Search\&db=nucleotide|db=nucleotide\&cmd=search)
+                                      \&doptcmdl=GenBank\&term=([^\"]+)\"}xis;
 
      $results[REFSEQ] ||= 'NO REFSEQ';
 
@@ -161,7 +159,10 @@ while ($_ = $dir->read) {
      my @functions = $result =~ m&<li><b>Function:</b>\s+(.+?)(?:<li>)|(?:</ul>)&gis;
 
      # GO Functions
-     push @functions, (map {s#\s*</a>\s*# #g; $_;} $result =~ m&(GO:\d+\s*</a>.+?)(?:<dd>|<p>)&gis);
+     push @functions, (map {s/\n//g; $_;}  # s///g returns a count, so hand back the edited string
+                      map {s#\s*\s*</a>(?:</td><td>\s*)?\s*# #g; $_;}
+                      $result =~ m{(GO:\d+\s*</a>(?:</td><td>\s*)?.+?)(?:</td><dd>|<p>)}gis
+                     );
      $results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions);
      $results[FUNCTION] ||= 'NO FUNCTION';
 
index 7a8cfd4..30fc011 100755 (executable)
@@ -144,6 +144,7 @@ while ($_ = $dir->read) {
      }
 
      $results[NAME] ||= 'NO NAME';
+     $results[NAME] =~ s/_HUMAN//;
 
      # Find REF SEQ number
      ($results[REFSEQ]) = $result =~ m&<a\s+href="http://www.ncbi.nlm.nih.gov/entrez/
diff --git a/bin/results_to_table b/bin/results_to_table
new file mode 100755 (executable)
index 0000000..04f2f6d
--- /dev/null
@@ -0,0 +1,161 @@
+#! /usr/bin/perl
+
+# results_to_table - print per-keyword and per-database hit counts from a
+# combined search-results table.
+#
+# Usage: results_to_table search_results_file
+#
+# The input is a quoted, comma-separated table whose header row starts with
+# "name" and whose columns are listed in @FIELDS below.  The "terms" column
+# holds '; '-separated entries of the form keyword[database]:hits or
+# keyword:hits.
+#
+# One row per keyword is written to STDOUT, followed by a 'Total' row.  Each
+# cell reads "count (unique)", or '$-$' when there is no counter for it.  The
+# '&' separators and trailing '\\' look like LaTeX tabular rows.
+
+use warnings;
+use strict;
+
+use List::Util qw(sum);
+use IO::File;
+
+die "Usage: $0 search_results_file\n" unless @ARGV == 1;
+my ($search_results_fn) = @ARGV;
+
+my $search_results_fh = IO::File->new($search_results_fn, 'r')
+    or die "Unable to read $search_results_fn: $!";
+
+# Column names of the input table, in file order.
+my @FIELDS = qw(name hits rzscore refseq location alias database terms description function);
+
+# Counter groups that get their own output column, in column order.
+my @COLUMNS = qw(genecard harvester ncbi total);
+
+# Turn the keyword part of a terms entry into its table form: '-', '+' and
+# '_' become spaces, trailing whitespace is dropped and the first '*' is
+# removed.
+sub normalise_keyword {
+    my ($keyword) = @_;
+    $keyword =~ s/[-+_]/ /g;
+    $keyword =~ s/\s*$//;
+    $keyword =~ s/[*]//;
+    return $keyword;
+}
+
+# Render one {count, unique} counter as "count (unique)".  A missing counter
+# renders as '$-$'; a missing count or unique value renders as 0.
+sub format_cell {
+    my ($cell) = @_;
+    return '$-$' if not defined $cell;
+    my $count  = $cell->{count}  || 0;
+    my $unique = $cell->{unique} || 0;
+    return "$count ($unique)";
+}
+
+# read in the search results
+my @results;       # every record, in file order
+my %name_idx;      # lower-cased name or alias => indices into @results
+my %refseq_idx;    # lower-cased refseq => index into @results
+my %databases;     # database => keyword or 'total' => {count, unique}
+my %terms;         # keyword => database or 'total' => {count, unique}
+while (<$search_results_fh>) {
+    # Skip the header row and blank lines.
+    next if $_ =~ /^\"name\"\,/;
+    chomp;
+    next unless /\S/;
+
+    # Naive CSV split: fields are separated by "," and the outermost quotes
+    # are stripped afterwards.  Fields missing from a short row become ''.
+    my %record;
+    @record{@FIELDS} = map {s/^"//; s/"$//; $_;} split /\"\,\"/;
+    for my $field (@FIELDS) {
+        $record{$field} = '' unless defined $record{$field};
+    }
+    push @results,{%record};
+    push @{$name_idx{lc($record{name})}}, $#results;
+    for my $alias (map {lc($_)} split /\s*;\s*/, $record{alias}) {
+        push @{$name_idx{$alias}}, $#results;
+    }
+
+    # Index by refseq and refuse duplicates; the check and the store use the
+    # same key.  The result parsers substitute 'NO REFSEQ' when they find
+    # none, so that placeholder (presumably passed through to this table --
+    # TODO confirm) and an empty refseq are exempt.
+    my $refseq = lc $record{refseq};
+    if ($refseq ne '' and $refseq ne 'no refseq') {
+        die "Duplicate refseq at record $." if exists $refseq_idx{$refseq};
+        $refseq_idx{$refseq} = $#results;
+    }
+
+    # Per-record view of which keyword was found in which database.
+    my %dbs_for_keyword;    # keyword => { database => 1 }
+    my %keywords_for_db;    # database => { keyword => 1 }
+    for my $term (split '; ', $record{terms}) {
+        if ($term =~ /\[/) {
+            my ($keyword,$database) = $term =~ /([^[]+)\[([^\]]+)\]:\d+/;
+            if (not defined $keyword) {
+                warn "Skipping unparsable term '$term' at record $.\n";
+                next;
+            }
+            $keyword = normalise_keyword($keyword);
+            $dbs_for_keyword{$keyword}{$database} = 1;
+            $keywords_for_db{$database}{$keyword} = 1;
+            $databases{$database}{$keyword}{count}++;
+            $terms{$keyword}{$database}{count}++;
+        }
+        else {
+            my ($keyword) = $term =~ /([^:]+):\d+/;
+            if (not defined $keyword) {
+                warn "Skipping unparsable term '$term' at record $.\n";
+                next;
+            }
+            $terms{normalise_keyword($keyword)}{total}{count}++;
+        }
+    }
+
+    # A record with a single keyword is unique to that keyword; if that
+    # keyword also came from a single database the record is unique overall.
+    if (keys %dbs_for_keyword == 1) {
+        my ($only_keyword) = keys %dbs_for_keyword;
+        $terms{$only_keyword}{total}{unique}++;
+        if (keys %{$dbs_for_keyword{$only_keyword}} == 1) {
+            $databases{total}{total}{unique}++;
+        }
+    }
+    # A record found in a single database is unique to that database.
+    if (keys %keywords_for_db == 1) {
+        my ($only_database) = keys %keywords_for_db;
+        $databases{$only_database}{total}{unique}++;
+    }
+    # A keyword found in a single database is unique to that pair.
+    for my $keyword (keys %dbs_for_keyword) {
+        my @found_in = keys %{$dbs_for_keyword{$keyword}};
+        $terms{$keyword}{$found_in[0]}{unique}++ if @found_in == 1;
+    }
+    # Count the record once per database it appears in, and once overall.
+    for my $database (keys %keywords_for_db) {
+        $databases{$database}{total}{count}++;
+    }
+    $databases{total}{total}{count}++;
+}
+
+# The format fields are filled from these package variables.
+our ($keyword,$gct,$hvt,$nct,$t);
+     format STDOUT =
+@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
+$keyword,                 $gct,     $hvt,     $nct,     $t
+.
+
+# One row per keyword, in sorted order.
+for $keyword (sort keys %terms) {
+    ($gct,$hvt,$nct,$t) = map {format_cell($terms{$keyword}{$_})} @COLUMNS;
+    write;
+}
+
+# Final row: the per-database totals.
+$keyword = 'Total';
+($gct,$hvt,$nct,$t) = map {
+    format_cell(exists $databases{$_} ? $databases{$_}{total} : undef)
+} @COLUMNS;
+write;