Added '--seed' option for 'rsem-simulate-reads'

author Bo Li <bli@cs.wisc.edu>

Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)

committer Bo Li <bli@cs.wisc.edu>

Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)
author Bo Li <bli@cs.wisc.edu>
Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)
committer Bo Li <bli@cs.wisc.edu>
Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)
diff --git a/README.md b/README.md

index f6b5ec857f44861c1cd4d5d62b483738e511833d..9fe0297fd769b705ba0bf1969046c73d47f869af 100644 (file)
--- a/README.md
+++ b/README.md
@@ -304,11 +304,14 @@ __N:__ The total number of reads to be simulated. If 'rsem-calculate-expression'
  
  __output_name:__ Prefix for all output files.   
  
+__--seed seed:__ Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.
+
  __-q:__ Set it will stop outputting intermediate information.   
  
  ### Outputs:
  
  output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.
+output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.
  
  output_name.fa if single-end without quality score;   
  output_name.fq if single-end with quality score;   
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW

index b942a635a25fb74ed249f98d1e50f838949b91a4..8a2a166f33f98d9d0f452c586a577e648ca38447 100644 (file)
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,7 +1,9 @@
  RSEM v1.2.12
  
  - Enabled allele-specific expression estimation
-- Added option '--calc-pme' option to calculate posterior mean estimates only (no credibility intervals)
+- Added '--calc-pme' option for 'rsem-calculate-expression' to calculate posterior mean estimates only (no credibility intervals)
+- Modified the shebang line of RSEM perl scripts to make them more portable
+- Added '--seed' option for 'rsem-simulate-reads' to enable users set the seed of random number generator used by the simulation
  
  --------------------------------------------------------------------------------------------
  
diff --git a/simul.h b/simul.h

index ba105dbde965dfadb1deeaeb7def93e3357cbb9b..07c1c65ec8d243b147bc70150b3e50fc381824c4 100644 (file)
--- a/simul.h
+++ b/simul.h
@@ -1,7 +1,6 @@
  #ifndef SIMUL_H_
  #define SIMUL_H_
  
-#include<ctime>
  #include<cassert>
  
  #include "boost/random.hpp"
@@ -9,7 +8,7 @@
  class simul {
  public:
  
-       simul() : rg(boost::mt19937(time(NULL))) {
+       simul(unsigned int seed) : rg(boost::mt19937(seed)) {
         }
  
         // interval : [,)
diff --git a/simulation.cpp b/simulation.cpp

index 3eb42422c2bf8f2d28a5baf250981cd8d0a0332d..6dfcc0a0a1ff7bc30fe05f2dec435fd3190ef492 100644 (file)
--- a/simulation.cpp
+++ b/simulation.cpp
@@ -1,4 +1,5 @@
  #include<cmath>
+#include<ctime>
  #include<cstdio>
  #include<cstring>
  #include<cstdlib>
@@ -6,6 +7,7 @@
  #include<string>
  #include<iostream>
  #include<fstream>
+#include<sstream>
  #include<vector>
  
  #include "utils.h"
@@ -51,7 +53,7 @@ char outReadF[2][STRLEN];
  char refName[STRLEN];
  char refF[STRLEN], tiF[STRLEN];
  
-simul sampler;
+simul *sampler;
  
  void genOutReadStreams(int type, char *outFN) {
         switch(type) {
@@ -113,7 +115,7 @@ void simulate(char* modelF, char* resultsF) {
         READ_INT_TYPE resimulation_count = 0;
  
         //simulating...
-       model.startSimulation(&sampler, theta);
+       model.startSimulation(sampler, theta);
         for (READ_INT_TYPE i = 0; i < N; i++) {
                 while (!model.simulate(i, read, sid)) { ++resimulation_count; }
                 read.write(n_os, os);
@@ -136,8 +138,8 @@ int main(int argc, char* argv[]) {
         bool quiet = false;
         FILE *fi = NULL;
  
-       if (argc != 7 && argc != 8) {
-               printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n\n");
+       if (argc < 7 || argc > 10) {
+               printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [--seed seed] [-q]\n\n");
                 printf("Parameters:\n\n");
                 printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
                 printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
@@ -145,9 +147,11 @@ int main(int argc, char* argv[]) {
                 printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
                 printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
                 printf("output_name: Prefix for all output files.\n");
+               printf("--seed seed: Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.\n");
                 printf("-q: Set it will stop outputting intermediate information.\n\n");
                 printf("Outputs:\n\n");
-               printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n\n");
+               printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n");
+               printf("output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.\n\n");
                 printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
                 printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
                 printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
@@ -163,8 +167,21 @@ int main(int argc, char* argv[]) {
                 exit(-1);
         }
  
-       if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
+       quiet = false;
+       sampler = NULL;
+       for (int i = 7; i < argc; i++) {
+         if (!strcmp(argv[i], "-q")) quiet = true;
+         if (!strcmp(argv[i], "--seed")) {
+           assert(i + 1 < argc);
+           istringstream reader(argv[i + 1]);
+           unsigned int seed;
+           assert(reader>> seed);
+           sampler = new simul(seed);
+         }
+       }
+
         verbose = !quiet;
+       if (sampler == NULL) sampler = new simul(time(NULL));
  
         strcpy(refName, argv[1]);
         alleleS = isAlleleSpecific(refName);
@@ -200,6 +217,7 @@ int main(int argc, char* argv[]) {
  
         writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
         releaseOutReadStreams();
+       delete sampler;
  
         return 0;
  }
author	Bo Li <bli@cs.wisc.edu>
	Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Sat, 22 Mar 2014 04:53:57 +0000 (23:53 -0500)
README.md		patch \| blob \| history
WHAT_IS_NEW		patch \| blob \| history
simul.h		patch \| blob \| history
simulation.cpp		patch \| blob \| history