From 029f32c46252899b050634e52c46a0e15505bc47 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Thu, 14 Feb 2008 05:55:09 +0000 Subject: [PATCH] * Finish addition of ensembl support git-svn-id: file:///srv/svn/function2gene/trunk@31 a0738b58-4706-0410-8799-fb830574a030 --- bin/combine_results | 69 ++++++++++++++++++++++++++++----------- bin/function2gene | 4 +-- bin/parse_ensembl_results | 8 +++-- 3 files changed, 57 insertions(+), 24 deletions(-) diff --git a/bin/combine_results b/bin/combine_results index 666fe7c..a365620 100755 --- a/bin/combine_results +++ b/bin/combine_results @@ -297,18 +297,48 @@ sub add_if_better{ } } +sub space_fill{ + my ($value,$length,$right) = @_; + $right ||= 0; + if (length($value) > $length) { + $value =~ m/(.{$length})/; + return $1; + } + if (length($value) == $length) { + return $value + } + if ($right) { + return join('', + ' ' x ($length - length($value)), + $value, + ); + } + else { + return join('', + $value, + ' ' x ($length - length($value)), + ); + } +} + +sub results_table_line { + my ($keyword,@fields) = @_; + return join( ' & ', + space_fill($keyword,23), + map {space_fill($_,11,1)} @fields + )."\n"; +} +my @database_order = grep {lc($_) ne 'total'} keys %databases; if (defined $results_table_fh) { - our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total'); - format RESULTS_TABLE = -@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\ -$keyword, $weight, $autoweight, $gct, $hvt, $nct, $t -. - $results_table_fh->format_name('RESULTS_TABLE'); - write $results_table_fh; + my $keyword; + print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight', + map {ucfirst($_)} @database_order, + 'Total', + ); for $keyword (sort keys %terms) { - ($gct,$hvt,$nct,$t) = + my @fields = map { if (not defined $_) { '$-$'; @@ -317,15 +347,17 @@ $keyword, $weight, $autoweight, $gct, $hvt, $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } - } @{$terms{$keyword}}{qw(genecard harvester ncbi total)}; - $weight = $keyword_weight{$keyword} || 1; - $autoweight = $auto_weight{$keyword}; - write $results_table_fh; + } @{$terms{$keyword}}{@database_order,'total'}; + unshift @fields, $auto_weight{$keyword}; + unshift @fields, $keyword_weight{$keyword} || 1; + print {$results_table_fh} results_table_line($keyword, + @fields + ); } - $keyword = 'Total'; - ($gct,$hvt,$nct,$t) = + my @fields = ('',''); + push @fields, map { if (not defined $_) { '$-$'; @@ -334,11 +366,10 @@ $keyword, $weight, $autoweight, $gct, $hvt, $_->{unique} ||= 0; "$_->{count} ($_->{unique})"; } - } map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; - #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)}; - $weight = ''; - $autoweight = ''; - write $results_table_fh; + } map {$_->{total}} @databases{@database_order,'total'}; + print {$results_table_fh} results_table_line($keyword, + @fields + ); } __END__ diff --git a/bin/function2gene b/bin/function2gene index b6810e2..a892072 100755 --- a/bin/function2gene +++ b/bin/function2gene @@ -130,7 +130,7 @@ $ERRORS.="restart-at must be one of get, parse or combine\n" if $ERRORS.="unknown database(s)" if @{$options{databases}} and - grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}}; + grep {$_ !~ /^(?:ncbi|genecard|harvester|ensembl)$/i} @{$options{databases}}; if (not length $options{results}) { $ERRORS.="results directory not specified"; @@ -142,7 +142,7 @@ elsif (not -d $options{results} or not -w $options{results}) { pod2usage($ERRORS) if length $ERRORS; if (not @{$options{databases}}) { - $options{databases} = [qw(ncbi genecard harvester)] + $options{databases} = [qw(ncbi genecard harvester ensembl)] } $DEBUG = $options{debug}; diff --git a/bin/parse_ensembl_results b/bin/parse_ensembl_results index cdd1fba..df20b6c 100755 --- a/bin/parse_ensembl_results +++ b/bin/parse_ensembl_results @@ -107,7 +107,7 @@ if ($options{keywords}) { if (@ARGV != 1) { pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed"); } - $options{dir} = "$ARGV[0]_results_genecard"; + $options{dir} = "$ARGV[0]_results_ensembl"; } if (not -d $options{dir}) { @@ -118,7 +118,7 @@ my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $! print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n); -my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#; +my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_ensembl#; while ($_ = $dir->read) { my $file_name = $_; @@ -134,9 +134,11 @@ while ($_ = $dir->read) { my @results; # Find gene name - ($results[NAME]) = map {s/^[^:]+://; $_;}$result =~ m{a\s+href=\"[^"]+genenames.org[^"]+">\s*([^<]+?)\s*}xis; + ($results[NAME]) = $result =~ m{a\s+href=\"[^"]+genenames.org[^"]+">\s*([^<]+?)\s*}xis; $results[NAME] ||= 'NO NAME'; + # strip of leading : bits + $results[NAME] =~ s/^[^\:]+\://; # Find REF SEQ number ($results[REFSEQ]) = $result =~ m{for\s*(ENSG\d+)}xis; -- 2.39.2