From 58d504aaf36ae486b1dba6d03e0e9f1c25855037 Mon Sep 17 00:00:00 2001
From: Bo Li <bli@cs.wisc.edu>
Date: Mon, 2 Dec 2013 01:10:41 -0600
Subject: [PATCH] Added a file describing the format of
 sample_name.stat/sample_name.model

---
 model_file_description.txt | 74 ++++++++++++++++++++++++++++++++++++++
 rsem-calculate-expression  |  6 +++-
 2 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 model_file_description.txt

diff --git a/model_file_description.txt b/model_file_description.txt
new file mode 100644
index 0000000..47356bc
--- /dev/null
+++ b/model_file_description.txt
@@ -0,0 +1,74 @@
+model_type # 0, single-end, no quality score; 1, single-end, quality score; 2, paired-end, no quality score; 3, paired-end, quality score
+
+forward_prob # The probability of generating a read from forward strand
+
+# Fragment length distribution or read length distribution (if reads are single-end reads and no fragment length distribution parameters are provided)
+lower_bound upper_bound span # Fragment/read length should be in the range of (lower_bound, upper_bound] and span = upper_bound - lower_bound
+prob_1 prob_2 ... prob_span # This line contains span probabilities separated by a single space. prob_i is the probability of generating a length of (lower_bound + i)
+
+[
+# Read length distribution, this section is optional for single-end reads model, the "[" and "]" suggest this section is optional.
+has_optional_length_dist # This line only appear if model_type < 2. 0 stands no optional length distribution for that the previous section already defines the read length distribution, 1 stands for having this optional read length distribution. If has_optional_length_dist = 0, the next two lines will not appear
+lower_bound upper_bound span # Fragment/read length should be in the range of (lower_bound, upper_bound] and span = upper_bound - lower_bound
+prob_1 prob_2 ... prob_span # This line contains span probabilities separated by a single space. prob_i is the probability of generating a length of (lower_bound + i) 
+]
+
+# Read start position distribution
+estimate_RSPD # 0 means that RSEM assumes a uniform read generating distribution across a transcript, 1 means that RSEM estimated a RSPD from data
+number_of_bins # number of bins used for estimating RSPD, please refer to Li, B., Ruotti, V., Stewart, R. M., Thomson, J. A., Dewey, C. N. (2010). RNA-Seq gene expression estimation with read mapping uncertainty. Bioinformatics, 26(4), 493-500. for more details
+bin_prob_1 bin_prob_2 ... bin_prob_number_of_bins # number_of_bins single space separated numbers representing the probabilities of starting a read from each bin
+
+[
+# Quality score distribution, only presented if model_type == 1 or 3. Quality scores are generated according to a first order Markov model
+# Quality scores are numbered from 0 internally, thus if prhed+33 quality score is used in the data, then i corresponds to phred score (i+33)
+size # total number of quality scores modeled
+p_init_0 p_init_1 ... p_init_(size-1) # size real-valued, single space separated numbers representing the initial probabilities of each state (quality score)
+p_trans_0,0 ... p_trans_0,(size-1)
+...
+p_trans_(size-1),0 ... p_trans_(size-1),(size-1) # size times size matrix representing the transition matrix for the Morkov chain
+]
+
+[
+# Sequencing error model for quality scores, only presented if model_type == 1 or 3
+size ncodes # size is the total number of quality scores and ncodes is the number of different bases (ncodes = 5, 0: A; 1: C; 2: G; 3: T; 4: N)
+# In the following, RSEM gives size blocks. The i th block represents the sequencing error model for quality score i. Blocks are separated by single blank lines
+...
+
+p_i,0,0 p_i,0,1 p_i,0,2 p_i,0,3 p_i,0,4
+...
+p_i,4,0 p_i,4,1 p_i,4,2 p_i,4,3 p_i,4,4 # The i th block contains a ncodes times ncodes matrix. Each cell (j,k) gives the probability of generating a base k given the quality score is i and the reference base is j
+
+...
+]
+
+[
+# Model for generating a noise read based on quality scores. Only presented if model_type == 1 or 3
+size ncodes # the same as sequencing error model for quality score
+p_0,0 p_0,1 p_0,2 p_0,3 p_0,4
+...
+p_(size-1),0 p_(size-1),1 p_(size-1),2 p_(size-1),3 p_(size-1),4 # A size times ncode matrix giving the probability of generating a base in a noise read given a quality score
+]
+
+[
+# Sequencing error model without quality score, only presented if model_type == 0 or 2. It records the probability of generating a base k at position i given the corresponding reference base is j. Positions are numbered from 0
+profile_length ncodes # profile_length should be equal to the maximum length of any read in the data set
+# There will be profile_length blocks separated by single blank lines
+...
+
+p_i,0,0 p_i,0,1 p_i,0,2 p_i,0,3 p_i,0,4
+...
+p_i,4,0 p_i,4,1 p_i,4,2 p_i,4,3 p_i,4,4 # The i th block contains a ncodes times ncodes matrix. Each cell (j,k) gives the probability of generating a base k at position i given the reference base is j
+
+...
+]
+
+[
+# Model for generating a noise read without quality scores. Only presented if model_type == 0 or 2
+ncodes # ncodes = 5
+p_0 p_1 p_2 p_3 p_4 # probabilities of generating a particular base for the noise read, all positions use the same base generating probabilities
+]
+
+[
+# Some other numbers, which has nothing to do with simulation
+]
+ 
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 62339cb..3f2817d 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -524,6 +524,10 @@ Suppress the output of logging information. (Default: off)
 
 Show help information.
 
+=item B<--version>
+
+Show version information.
+
 =back
 
 =head1 DESCRIPTION
@@ -714,7 +718,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                            /ref/mouse_125 \
                            mmliver_single_without_quals
 
-4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
+4) Data are the same as 1). This time we assume the bowtie executables are under '/sw/bowtie'. We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
 
  rsem-calculate-expression --bowtie-path /sw/bowtie \
                            --phred64-quals \
-- 
2.39.2