From: Don Armstrong Date: Fri, 26 Oct 2007 21:40:13 +0000 (+0000) Subject: Add results to table; modify the search parsers to work better. Fix error in get_ncb... X-Git-Url: https://git.donarmstrong.com/?p=function2gene.git;a=commitdiff_plain;h=0486e6434cd182e8b30ba008d98ad3d3f008d2ab;hp=3557eb364c40602a2f2d3f4c2a68edd6ee00b632 Add results to table; modify the search parsers to work better. Fix error in get_ncbi_results git-svn-id: file:///srv/svn/function2gene/trunk@14 a0738b58-4706-0410-8799-fb830574a030 --- diff --git a/bin/function2gene b/bin/function2gene index 922deac..07dd2ae 100755 --- a/bin/function2gene +++ b/bin/function2gene @@ -281,9 +281,9 @@ if ($actions{combine}) { @parsed_results, ); for my $result (@parsed_results) { - s/^parsed_results_//; - s/\.txt$//; - my ($db,$keyword) = split /_/, $_, 2; + $result =~ s/^parsed_results_//; + $result =~ s/\.txt$//; + my ($db,$keyword) = split /_/, $result, 2; $state{done_keywords}{combined}{$db}{$keyword} = 1; } save_state(\%state); diff --git a/bin/get_ncbi_results b/bin/get_ncbi_results index 7a86901..3e1a625 100755 --- a/bin/get_ncbi_results +++ b/bin/get_ncbi_results @@ -143,6 +143,7 @@ while (<$terms>) { # Get XML file my @current_ids; + print {$xml_file} "\n"; while (@current_ids = splice(@gene_ids,0,5)) { $uri = URI->new($options{pubmed_site}.$options{pubmed_get_url}); $uri->query_form($uri->query_form(), @@ -167,10 +168,10 @@ while (<$terms>) { $response =~ s/^\s*
//gso;
 	  $response =~ s#
\s*$##gso; - print {$xml_file} $response; sleep 10; } + print {$xml_file} "
\n"; undef $xml_file; } diff --git a/bin/parse_genecard_results b/bin/parse_genecard_results index a6718c5..4b7d514 100755 --- a/bin/parse_genecard_results +++ b/bin/parse_genecard_results @@ -128,15 +128,13 @@ while ($_ = $dir->read) { my @results; # Find gene name - ($results[NAME]) = $result =~ m&(?:Lean|Gene)Card\s+for\s+(?:(?:disorder\s+locus|uncategorized| - hugo\s*reserved\s*symbol|cluster| - potentially\s*expressed\s*sequence)|(?:predicted\s+|pseudo|rna\s+|)gene) - \s*(?:with\s*support\s*|)\s*\s*([^\s]+)\s*&xis; + ($results[NAME]) = $result =~ m{(?:Lean|Gene)Card\s+for\s+[^<]+]+>\s*([^<]+)}xis; $results[NAME] ||= 'NO NAME'; # Find REF SEQ number - ($results[REFSEQ]) = $result =~ m|http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\? - cmd=Search\&db=nucleotide\&doptcmdl=GenBank\&term=([^\"]+)\"|xis; + ($results[REFSEQ]) = $result =~ m{http://www.ncbi.nlm.nih.gov/entrez/query.fcgi\? + (?:cmd=Search\&db=nucleotide|db=nucleotide\&cmd=search) + \&doptcmdl=GenBank\&term=([^\"]+)\"}xis; $results[REFSEQ] ||= 'NO REFSEQ'; @@ -161,7 +159,10 @@ while ($_ = $dir->read) { my @functions = $result =~ m&
  • Function:\s+(.+?)(?:
  • )|(?:)&gis; # GO Functions - push @functions, (map {s#\s*\s*# #g; $_;} $result =~ m&(GO:\d+\s*.+?)(?:
    |

    )&gis); + push @functions, (map {s/\n//g} + map {s#\s*\s*(?:\s*)?\s*# #g; $_;} + $result =~ m{(GO:\d+\s*(?:\s*)?.+?)(?:

    |

    )}gis + ); $results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions); $results[FUNCTION] ||= 'NO FUNCTION'; diff --git a/bin/parse_harvester_results b/bin/parse_harvester_results index 7a8cfd4..30fc011 100755 --- a/bin/parse_harvester_results +++ b/bin/parse_harvester_results @@ -144,6 +144,7 @@ while ($_ = $dir->read) { } $results[NAME] ||= 'NO NAME'; + $results[NAME] =~ s/_HUMAN//; # Find REF SEQ number ($results[REFSEQ]) = $result =~ m&) { + # "name","hits","rzscore","refseq","location","alias","database","terms","description","function" + next if $_ =~ /^\"name\"\,/; + chomp; + my %record; + @record{qw(name hits rzscore refseq location alias database terms description function)} = map {s/^"//; s/"$//; $_;} split /\"\,\"/; + push @results,{%record}; + push @{$name_idx{lc($record{name})}}, $#results; + foreach (map {lc($_)} split /\s*;\s*/, $record{alias}) { + push @{$name_idx{$_}}, $#results; + } + die "Duplicate refseq at record $." if exists $refseq_idx{lc $record{refseq}}; + $refseq_idx{lc $record{name}} = $#results; + my @terms = split '; ', $record{terms}; + my %term_temp; + my %db_temp; + my %gene_temp; + my %gene_temp2; + for my $term (@terms) { + if ($term =~ /\[/) { + my ($keyword,$database,$hits) = $term =~ /([^[]+)\[([^\]]+)\]:(\d+)/; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $gene_temp{$keyword}{$database} = 1; + $gene_temp2{$database}{$keyword} = 1; + $databases{$database}{$keyword}{count}++; + $db_temp{$database}++; + $terms{$keyword}{$database}{count}++; + } + else { + my ($keyword,$hits) = $term =~ /([^:]+):(\d+)/; + $keyword =~ s/[-+_]/ /g; + $keyword =~ s/\s*$//; + $keyword =~ s/[*]//; + $terms{$keyword}{total}{count}++; + } + } + if (keys %gene_temp == 1) { + $terms{[keys %gene_temp]->[0]}{total}{unique}++; + if (keys %{$gene_temp{[keys %gene_temp]->[0]}} == 1) { + $databases{total}{total}{unique}++ + } + } + if (keys %gene_temp2 == 1) { + $databases{[keys %gene_temp2]->[0]}{total}{unique}++; 
+ } + for my $keyword (keys %gene_temp) { + if (keys %{$gene_temp{$keyword}} == 1) { + $terms{$keyword}{[keys %{$gene_temp{$keyword}}]->[0]}{unique}++; + } + } + for my $database (keys %db_temp) { + $databases{$database}{total}{count}++; + } + $databases{total}{total}{count}++; +} + +our ($keyword,$gct,$hvt,$nct,$t); + format STDOUT = +@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ +$keyword, $gct, $hvt, $nct, $t +. +for$keyword (sort keys %terms) { + ($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } @{$terms{$keyword}}{qw(genecard harvester ncbi total)}; + write; +} + +$keyword = 'Total'; +($gct,$hvt,$nct,$t) = + map { + if (not defined $_) { + '$-$'; + } + else { + $_->{unique} ||= 0; + "$_->{count} ($_->{unique})"; + } + } map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; +#($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; +write;