From b09fb9f692a51f9f0e6451a32cf4b869566dbec8 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Thu, 14 Feb 2008 02:08:05 +0000 Subject: [PATCH] add parse ensembl results git-svn-id: file:///srv/svn/function2gene/trunk@30 a0738b58-4706-0410-8799-fb830574a030 --- bin/parse_ensembl_results | 199 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100755 bin/parse_ensembl_results diff --git a/bin/parse_ensembl_results b/bin/parse_ensembl_results new file mode 100755 index 0000000..cdd1fba --- /dev/null +++ b/bin/parse_ensembl_results @@ -0,0 +1,199 @@ +#! /usr/bin/perl + +# parse_ensembl_results parses saved ensembl search result pages into CSV, +# and is released under the terms of the GPL version 2, or any later +# version, at your option. See the file README and COPYING for more +# information. + +# Copyright 2004 by Don Armstrong . + +# $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $ + + +use warnings; +use strict; + + +use Getopt::Long; +use Pod::Usage; + +=head1 NAME + + parse_ensembl_results [options] + +=head1 SYNOPSIS + + + Options: + --dir, -D directory to stick results into [default .] + --name, -n file naming scheme [default ${search}_results.$format] + --terms, -t file of search terms [default -] + --debug, -d debugging level [default 0] + --help, -h display this help + --man, -m display manual + +=head1 OPTIONS + +=over + +=item B<--debug, -d> + +Debug verbosity. (Default 0) + +=item B<--help, -h> + +Display brief usage information. + +=item B<--man, -m> + +Display this manual. 
=back

=head1 EXAMPLES

  parse_ensembl_results -D ./foo_results_genecard > foo_ensembl.csv

Parses each readable, non-hidden file in F<./foo_results_genecard> and
writes one CSV row per file to standard output.  Files containing the text
"is not present in the current release" are skipped.  Unless B<--keyword>
is given, the keyword column is taken from the directory name (C<foo>).

=cut

# Package globals: $DEBUG mirrors the --debug option; $REVISION is parsed
# from the version-control keyword string below.
our ($DEBUG, $REVISION);

BEGIN {
    ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
    $DEBUG = 0 unless defined $DEBUG;
}

use IO::File;
use IO::Dir;

use HTML::TreeBuilder;
use HTML::ElementTable;

my %options = (
    debug    => 0,
    help     => 0,
    man      => 0,
    dir      => '.',
    keyword  => undef,
    keywords => 0,
);

# Reject unknown or malformed options instead of silently carrying on.
GetOptions(
    \%options,
    'keyword|k=s', 'dir|D=s', 'debug|d+', 'help|h|?', 'man|m',
    'keywords',
) or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

# CSV columns (indexes into the per-file result row)
use constant {
    NAME        => 0,
    REFSEQ      => 1,
    LOCATION    => 2,
    ALIAS       => 3,
    FUNCTION    => 4,
    DESCRIPTION => 5,
    KEYWORD     => 6,
    DBNAME      => 7,
    FILENAME    => 8,
};

# With --keywords the single positional argument names the keyword and the
# results directory is derived from it.
# NOTE(review): the "_results_genecard" suffix looks inherited from the
# genecard parser; confirm it is the intended directory name for ensembl.
if ($options{keywords}) {
    if (@ARGV != 1) {
        pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed");
    }
    $options{dir} = "$ARGV[0]_results_genecard";
}

if (not -d $options{dir}) {
    die "$options{dir} does not exist or is not a directory";
}

my $dir = IO::Dir->new($options{dir})
    or die "Unable to open dir $options{dir}: $!";

# Header row
print STDOUT _csv_line(qw(Name RefSeq Location Alias Function Description Keyword DBName Filename));

# An explicit --keyword wins; otherwise fall back to the "<keyword>" part of
# a ".../<keyword>_results_genecard" directory name (undef if neither applies).
my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#;

# Test with defined() so that an entry literally named "0" does not end the
# scan early, and use a lexical instead of clobbering the global $_.
while (defined(my $file_name = $dir->read)) {
    next if $file_name =~ /^\./;

    my $path = "$options{dir}/$file_name";
    next unless -f $path and -r $path;

    my $file = IO::File->new($path, 'r')
        or die "Unable to open file $file_name: $!";

    # Slurp the whole page; the $/ override is confined to this expression.
    my $result = do { local $/; <$file> };
    $file->close;

    next if $result =~ m/is not present in the current release/;

    print STDOUT _csv_line(_parse_result_page($result, $file_name, $keyword));
}

$dir->close;


# _csv_line(@fields)
#
# Returns one CSV record (terminated by a newline) with every field wrapped
# in double quotes.  Embedded double quotes are doubled so that a field such
# as a description containing '"' cannot break the record.
sub _csv_line {
    my @fields = @_;
    return join(',', map { (my $field = $_) =~ s/"/""/g; qq("$field") } @fields) . qq(\n);
}

# _parse_result_page($result, $file_name, $keyword)
#
# Extracts the CSV columns from the text of one saved result page.
#   $result    - full contents of the page
#   $file_name - name of the file the page was read from (FILENAME column)
#   $keyword   - search keyword, may be undef (KEYWORD column)
# Returns the row as a list indexed by the column constants; columns that
# cannot be found carry a "NO ..." placeholder, except DESCRIPTION, which is
# left empty.
sub _parse_result_page {
    my ($result, $file_name, $keyword) = @_;
    my @results;

    # Find gene name; anything up to and including the first ':' is dropped.
    # The lazy capture is anchored on the '<' that opens the following tag;
    # without that anchor it can match as little as a single character.
    ($results[NAME]) = map {s/^[^:]+://; $_;}
        $result =~ m{a\s+href=\"[^"]+genenames.org[^"]+">\s*([^<]+?)\s*<}xis;
    $results[NAME] ||= 'NO NAME';

    # Find REF SEQ number (the ENSG identifier)
    ($results[REFSEQ]) = $result =~ m{for\s*(ENSG\d+)}xis;
    $results[REFSEQ] ||= 'NO REFSEQ';

    # Find Gene Location: chromosome followed by the start coordinate
    my @location = $result =~ m{on\s+Chromosome\s+([\dX]+)\s+at\s+location\s+<[^>]+>([\d,]+)}xis;
    if (@location) {
        $results[LOCATION] = "$location[0] $location[1]";
    }
    $results[LOCATION] ||= 'NO LOCATION';

    # Find gene aliases; each "Synonyms:" hit is a comma separated list
    my @gene_aliases = map {split/,\s+/;} $result =~ m{Synonyms:\s+\s*([^<]+)\s*}gxis;
    $results[ALIAS] = join('; ', @gene_aliases);
    $results[ALIAS] ||= 'NO ALIASES';

    # Find gene function(s).  The GO-term extraction below is disabled, so
    # the FUNCTION column always carries the placeholder.
    # NOTE(review): the markup literals in these patterns look incomplete;
    # check them against the page markup before re-enabling.
    #
    # my @functions;
    # # GO Functions
    # push @functions, (map {s/\n//g; $_;}
    #                   map {s#\s*(?:\s*)?\s*# #g; $_;}
    #                   $result =~ m{(GO:\d+\s*(?:\s*)?.+?)(?:|| | )}gis
    #                  );
    # $results[FUNCTION] = join('; ', map {(defined $_)?($_):()} @functions);
    $results[FUNCTION] ||= 'NO FUNCTION';

    # Figure out the keyword used
    $results[KEYWORD] ||= $keyword || 'NO KEYWORD';

    # Figure out what the description is; whitespace runs (including
    # newlines) are collapsed to single spaces.
    # NOTE(review): as written the capture must start right after
    # "Description" plus optional whitespace; confirm that no tag literals
    # are missing between them.
    my @description = map {s/\n/ /g; s/\s+/ /g; $_;}
        $result =~ m{\s*
                     Description\s*
                     \s*
                     \s*
                     \s*([^<]+)}xgis;
    $results[DESCRIPTION] = join('; ', grep { defined } @description);

    # Database searched
    $results[DBNAME]   = 'ensembl';
    $results[FILENAME] = $file_name;

    return @results;
}

__END__
-- 
2.39.2