rsem v1.1.12, add a script to extract transcript-to-gene-map info from Trinity output

author Bo Li <bli@cs.wisc.edu>

Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)

committer Bo Li <bli@cs.wisc.edu>

Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)
author Bo Li <bli@cs.wisc.edu>
Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)
committer Bo Li <bli@cs.wisc.edu>
Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)
diff --git a/README.md b/README.md

index f8c4e2709436f97e446ac3dbbda2306818df70f4..7ef198ce38fbccaa85149267deb89d74ab69f5d3 100644 (file)
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Table of Contents
  * [Usage](#usage)
  * [Example](#example)
  * [Simulation](#simulation)
+* [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity)
  * [Acknowledgements](#acknowledgements)
  * [License](#license)
  
@@ -187,9 +188,9 @@ The commands for this scenario are as follows:
  
      rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]
  
-estimated_model_file:  File containing model parameters.  Generated by
+estimated_model_file:  file containing model parameters.  Generated by
  rsem-calculate-expression.   
-estimated_isoform_results: File containing isoform expression levels.
+estimated_isoform_results: file containing isoform expression levels.
  Generated by rsem-calculate-expression.   
  theta0: fraction of reads that are "noise" (not derived from a transcript).   
  N: number of reads to simulate.   
@@ -206,6 +207,17 @@ output_name_1.fq & output_name_2.fq if paired-end with quality score.
  
  output_name.sim.isoforms.results, output_name.sim.genes.results : Results estimated based on sample values.
  
+## <a name="gen_trinity"></a> Generate Transcript-to-Gene-Map from Trinity Output
+
+For Trinity users, RSEM provides a perl script to generate transcript-to-gene-map file from the fasta file produced by Trinity.
+
+### Usage:
+
+    extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file
+
+trinity_fasta_file: the fasta file produced by trinity, which contains all transcripts assembled.    
+map_file: transcript-to-gene-map file's name.    
+ 
  ## <a name="acknowledgements"></a> Acknowledgements
  
  RSEM uses the [Boost C++](http://www.boost.org) and
diff --git a/Refs.h b/Refs.h

index 63b67e16e0d705d8c66d3946085f76b79496724c..67c5d57a349ab589a479b6ac0608058e86cb973e 100644 (file)
--- a/Refs.h
+++ b/Refs.h
@@ -97,7 +97,10 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
      while((pt = getline(fin, line)) && line[0] != '>') {
        rawseq += line;
      }
-    assert(rawseq.size() > 0);
+    if (rawseq.size() <= 0) {
+      printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); 
+      continue;
+    }
      ++M;
      seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
    }
diff --git a/extract-transcript-to-gene-map-from-trinity b/extract-transcript-to-gene-map-from-trinity

new file mode 100755 (executable)

index 0000000..e04e993
--- /dev/null
+++ b/extract-transcript-to-gene-map-from-trinity
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+use strict;
+
+if (scalar(@ARGV) != 2) {
+    print "Usage: extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file\n";
+    exit(-1);
+}
+
+open(INPUT, $ARGV[0]);
+open(OUTPUT, ">$ARGV[1]");
+
+my ($tag, $line);
+$tag = <INPUT>; chomp($tag);
+while (substr($tag, 0, 1) eq ">") {
+    $tag = substr($tag, 1);
+    my $cnt = 0;
+    while (($line = <INPUT>) && substr($line, 0, 1) ne ">") {
+       $cnt++;
+    }
+    if ($cnt == 0) { print "Warning: Fasta entry $tag has an empty sequence, it is omitted.\n"; }
+    else {
+       my ($tid, $gid) = split(/ /, $tag);
+       my ($comp, $c, @tmp) = split(/_/, $gid);
+       $gid = join("_", $comp, $c);
+       print OUTPUT "$gid\t$tid\n";
+    }
+    $tag = $line; chomp($tag);
+} 
+
+close(INPUT);
+close(OUTPUT);
+
+ 
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 94764f4bd280215ee4ad38d14f3ea4abc6497136..9b4e69ea0d47ce19ee292b9e4b6f073e6891c1b7 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -484,7 +484,7 @@ Calculate 95% credibility intervals and posterior mean estimates.  (Default: off
  
  =item B<--seed-length> <int>
  
-Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter.  (Default: 25)
+Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. (Default: 25)
  
  =item B<--tag> <string>
author	Bo Li <bli@cs.wisc.edu>
	Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Wed, 19 Oct 2011 00:18:18 +0000 (19:18 -0500)
README.md		patch \| blob \| history
Refs.h		patch \| blob \| history
extract-transcript-to-gene-map-from-trinity	[new file with mode: 0755]	patch \| blob
rsem-calculate-expression		patch \| blob \| history