From: Bo Li <bli@cs.wisc.edu>
Date: Wed, 19 Oct 2011 00:18:18 +0000 (-0500)
Subject: rsem v1.1.12, add a script to extract transcript-to-gene-map info from Trinity output
X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=bf13409ebd15750dc9be1f92a622188d81ae634d

rsem v1.1.12, add a script to extract transcript-to-gene-map info from Trinity output
---

diff --git a/README.md b/README.md
index f8c4e27..7ef198c 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Table of Contents
 * [Usage](#usage)
 * [Example](#example)
 * [Simulation](#simulation)
+* [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity)
 * [Acknowledgements](#acknowledgements)
 * [License](#license)
 
@@ -187,9 +188,9 @@ The commands for this scenario are as follows:
 
     rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]
 
-estimated_model_file:  File containing model parameters.  Generated by
+estimated_model_file:  file containing model parameters.  Generated by
 rsem-calculate-expression.   
-estimated_isoform_results: File containing isoform expression levels.
+estimated_isoform_results: file containing isoform expression levels.
 Generated by rsem-calculate-expression.   
 theta0: fraction of reads that are "noise" (not derived from a transcript).   
 N: number of reads to simulate.   
@@ -206,6 +207,17 @@ output_name_1.fq & output_name_2.fq if paired-end with quality score.
 
 output_name.sim.isoforms.results, output_name.sim.genes.results : Results estimated based on sample values.
 
+## <a name="gen_trinity"></a> Generate Transcript-to-Gene-Map from Trinity Output
+
+For Trinity users, RSEM provides a perl script to generate transcript-to-gene-map file from the fasta file produced by Trinity.
+
+### Usage:
+
+    extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file
+
+trinity_fasta_file: the fasta file produced by trinity, which contains all transcripts assembled.    
+map_file: transcript-to-gene-map file's name.    
+ 
 ## <a name="acknowledgements"></a> Acknowledgements
 
 RSEM uses the [Boost C++](http://www.boost.org) and
diff --git a/Refs.h b/Refs.h
index 63b67e1..67c5d57 100644
--- a/Refs.h
+++ b/Refs.h
@@ -97,7 +97,10 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
     while((pt = getline(fin, line)) && line[0] != '>') {
       rawseq += line;
     }
-    assert(rawseq.size() > 0);
+    if (rawseq.size() <= 0) {
+      printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); 
+      continue;
+    }
     ++M;
     seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
   }
diff --git a/extract-transcript-to-gene-map-from-trinity b/extract-transcript-to-gene-map-from-trinity
new file mode 100755
index 0000000..e04e993
--- /dev/null
+++ b/extract-transcript-to-gene-map-from-trinity
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+use strict;
+
+if (scalar(@ARGV) != 2) {
+    print "Usage: extract-transcript-to-gene-map-from-trinity trinity_fasta_file map_file\n";
+    exit(-1);
+}
+
+open(INPUT, $ARGV[0]);
+open(OUTPUT, ">$ARGV[1]");
+
+my ($tag, $line);
+$tag = <INPUT>; chomp($tag);
+while (substr($tag, 0, 1) eq ">") {
+    $tag = substr($tag, 1);
+    my $cnt = 0;
+    while (($line = <INPUT>) && substr($line, 0, 1) ne ">") {
+	$cnt++;
+    }
+    if ($cnt == 0) { print "Warning: Fasta entry $tag has an empty sequence, it is omitted.\n"; }
+    else {
+	my ($tid, $gid) = split(/ /, $tag);
+	my ($comp, $c, @tmp) = split(/_/, $gid);
+	$gid = join("_", $comp, $c);
+	print OUTPUT "$gid\t$tid\n";
+    }
+    $tag = $line; chomp($tag);
+} 
+
+close(INPUT);
+close(OUTPUT);
+
+ 
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 94764f4..9b4e69e 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -484,7 +484,7 @@ Calculate 95% credibility intervals and posterior mean estimates.  (Default: off
 
 =item B<--seed-length> <int>
 
-Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter.  (Default: 25)
+Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. (Default: 25)
 
 =item B<--tag> <string>