* Finish addition of ensembl support
author Don Armstrong <don@donarmstrong.com>
Thu, 14 Feb 2008 05:55:09 +0000 (05:55 +0000)
committer Don Armstrong <don@donarmstrong.com>
Thu, 14 Feb 2008 05:55:09 +0000 (05:55 +0000)
git-svn-id: file:///srv/svn/function2gene/trunk@31 a0738b58-4706-0410-8799-fb830574a030

bin/combine_results
bin/function2gene
bin/parse_ensembl_results

index 666fe7c..a365620 100755 (executable)
@@ -297,18 +297,48 @@ sub add_if_better{
      }
 }
 
+sub space_fill{
+     my ($value,$length,$right) = @_;
+     $right ||= 0;
+     if (length($value) > $length) {
+         $value =~ m/(.{$length})/;
+         return $1;
+     }
+     if (length($value) == $length) {
+         return $value
+     }
+     if ($right) {
+         return join('',
+                     ' ' x ($length - length($value)),
+                     $value,
+                    );
+     }
+     else {
+         return join('',
+                     $value,
+                     ' ' x ($length - length($value)),
+                    );
+     }
+}
+
+sub results_table_line {
+     my ($keyword,@fields) = @_;
+     return join( ' & ',
+                 space_fill($keyword,23),
+                 map {space_fill($_,11,1)} @fields
+               )."\n";
+}
 
+my @database_order = grep {lc($_) ne 'total'} keys %databases;
 if (defined $results_table_fh) {
-     our ($keyword,$weight,$autoweight,$gct,$hvt,$nct,$t) = ('Keyword','Weight','Autoweight','GeneCards','Harvester','NCBI','Total');
-     format RESULTS_TABLE =
-@<<<<<<<<<<<<<<<<<<<<<< & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> & @>>>>>>>>>> \\
-$keyword,                 $weight,      $autoweight,  $gct,         $hvt,         $nct,         $t
-.
-     $results_table_fh->format_name('RESULTS_TABLE');
-     write $results_table_fh;
+     my $keyword;
+     print {$results_table_fh} results_table_line('Keyword','Weight','Autoweight',
+                                                 map {ucfirst($_)} @database_order,
+                                                 'Total',
+                                                );
 
      for $keyword (sort keys %terms) {
-         ($gct,$hvt,$nct,$t) =
+         my @fields =
               map {
                    if (not defined $_) {
                         '$-$';
@@ -317,15 +347,17 @@ $keyword,                 $weight,      $autoweight,  $gct,         $hvt,
                         $_->{unique} ||= 0;
                         "$_->{count} ($_->{unique})";
                    }
-              } @{$terms{$keyword}}{qw(genecard harvester ncbi total)};
-         $weight = $keyword_weight{$keyword} || 1;
-         $autoweight = $auto_weight{$keyword};
-         write $results_table_fh;
+              } @{$terms{$keyword}}{@database_order,'total'};
+         unshift @fields, $auto_weight{$keyword};
+         unshift @fields, $keyword_weight{$keyword} || 1;
+         print {$results_table_fh} results_table_line($keyword,
+                                                      @fields
+                                                     );
 
      }
-
      $keyword = 'Total';
-     ($gct,$hvt,$nct,$t) =
+     my @fields = ('','');
+     push @fields,
          map {
               if (not defined $_) {
                    '$-$';
@@ -334,11 +366,10 @@ $keyword,                 $weight,      $autoweight,  $gct,         $hvt,
                    $_->{unique} ||= 0;
                    "$_->{count} ($_->{unique})";
               }
-         } map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
-     #($gct,$hvt,$nct,$t) = map {$_->{total}} @databases{qw(genecard harvester ncbi total)};
-     $weight = '';
-     $autoweight = '';
-     write $results_table_fh;
+         } map {$_->{total}} @databases{@database_order,'total'};
+     print {$results_table_fh} results_table_line($keyword,
+                                                 @fields
+                                                );
 }
 
 __END__
index b6810e2..a892072 100755 (executable)
@@ -130,7 +130,7 @@ $ERRORS.="restart-at must be one of get, parse or combine\n" if
 
 $ERRORS.="unknown database(s)" if
      @{$options{databases}} and
-     grep {$_ !~ /^(?:ncbi|genecard|harvester)$/i} @{$options{databases}};
+     grep {$_ !~ /^(?:ncbi|genecard|harvester|ensembl)$/i} @{$options{databases}};
 
 if (not length $options{results}) {
      $ERRORS.="results directory not specified";
@@ -142,7 +142,7 @@ elsif (not -d $options{results} or not -w $options{results}) {
 pod2usage($ERRORS) if length $ERRORS;
 
 if (not @{$options{databases}}) {
-     $options{databases} = [qw(ncbi genecard harvester)]
+     $options{databases} = [qw(ncbi genecard harvester ensembl)]
 }
 
 $DEBUG = $options{debug};
index cdd1fba..df20b6c 100755 (executable)
@@ -107,7 +107,7 @@ if ($options{keywords}) {
      if (@ARGV != 1) {
          pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed");
      }
-     $options{dir} = "$ARGV[0]_results_genecard";
+     $options{dir} = "$ARGV[0]_results_ensembl";
 }
 
 if (not -d $options{dir}) {
@@ -118,7 +118,7 @@ my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $!
 
 print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n);
 
-my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_genecard#;
+my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_ensembl#;
 
 while ($_ = $dir->read) {
      my $file_name = $_;
@@ -134,9 +134,11 @@ while ($_ = $dir->read) {
      my @results;
 
      # Find gene name
-     ($results[NAME]) = map {s/^[^:]+://; $_;}$result =~ m{a\s+href=\"[^"]+genenames.org[^"]+">\s*([^<]+?)\s*</a>}xis;
+     ($results[NAME]) = $result =~ m{a\s+href=\"[^"]+genenames.org[^"]+">\s*([^<]+?)\s*</a>}xis;
 
      $results[NAME] ||= 'NO NAME';
+     # strip off the leading "prefix:" part of the name
+     $results[NAME] =~ s/^[^\:]+\://;
      # Find REF SEQ number
      ($results[REFSEQ]) = $result =~ m{for\s*(ENSG\d+)}xis;