From c729fa33c6e2f9e3e1b374c66a55265525e7d974 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Fri, 21 Mar 2014 23:53:57 -0500 Subject: [PATCH] Added '--seed' option for 'rsem-simulate-reads' --- README.md | 3 +++ WHAT_IS_NEW | 4 +++- simul.h | 3 +-- simulation.cpp | 30 ++++++++++++++++++++++++------ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index f6b5ec8..9fe0297 100644 --- a/README.md +++ b/README.md @@ -304,11 +304,14 @@ __N:__ The total number of reads to be simulated. If 'rsem-calculate-expression' __output_name:__ Prefix for all output files. +__--seed seed:__ Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer. + __-q:__ Set it will stop outputting intermediate information. ### Outputs: output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from. +output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from. output_name.fa if single-end without quality score; output_name.fq if single-end with quality score; diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index b942a63..8a2a166 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -1,7 +1,9 @@ RSEM v1.2.12 - Enabled allele-specific expression estimation -- Added option '--calc-pme' option to calculate posterior mean estimates only (no credibility intervals) +- Added '--calc-pme' option for 'rsem-calculate-expression' to calculate posterior mean estimates only (no credibility intervals) +- Modified the shebang line of RSEM perl scripts to make them more portable +- Added '--seed' option for 'rsem-simulate-reads' to enable users set the seed of random number generator used by the simulation -------------------------------------------------------------------------------------------- diff --git a/simul.h b/simul.h index ba105db..07c1c65 100644 --- a/simul.h +++ b/simul.h @@ -1,7 +1,6 @@ #ifndef SIMUL_H_ #define SIMUL_H_ -#include #include #include "boost/random.hpp" @@ -9,7 +8,7 @@ class simul { public: - simul() : rg(boost::mt19937(time(NULL))) { + simul(unsigned int seed) : rg(boost::mt19937(seed)) { } // interval : [,) diff --git a/simulation.cpp b/simulation.cpp index 3eb4242..6dfcc0a 100644 --- a/simulation.cpp +++ b/simulation.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -6,6 +7,7 @@ #include #include #include +#include #include #include "utils.h" @@ -51,7 +53,7 @@ char outReadF[2][STRLEN]; char refName[STRLEN]; char refF[STRLEN], tiF[STRLEN]; -simul sampler; +simul *sampler; void genOutReadStreams(int type, char *outFN) { switch(type) { @@ -113,7 +115,7 @@ void simulate(char* modelF, char* resultsF) { READ_INT_TYPE resimulation_count = 0; //simulating... - model.startSimulation(&sampler, theta); + model.startSimulation(sampler, theta); for (READ_INT_TYPE i = 0; i < N; i++) { while (!model.simulate(i, read, sid)) { ++resimulation_count; } read.write(n_os, os); @@ -136,8 +138,8 @@ int main(int argc, char* argv[]) { bool quiet = false; FILE *fi = NULL; - if (argc != 7 && argc != 8) { - printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n\n"); + if (argc < 7 || argc > 10) { + printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [--seed seed] [-q]\n\n"); printf("Parameters:\n\n"); printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n"); printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n"); @@ -145,9 +147,11 @@ int main(int argc, char* argv[]) { printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n"); printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n"); printf("output_name: Prefix for all output files.\n"); + printf("--seed seed: Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.\n"); printf("-q: Set it will stop outputting intermediate information.\n\n"); printf("Outputs:\n\n"); - printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n\n"); + printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n"); + printf("output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.\n\n"); printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n"); printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n"); printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n"); @@ -163,8 +167,21 @@ int main(int argc, char* argv[]) { exit(-1); } - if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true; + quiet = false; + sampler = NULL; + for (int i = 7; i < argc; i++) { + if (!strcmp(argv[i], "-q")) quiet = true; + if (!strcmp(argv[i], "--seed")) { + assert(i + 1 < argc); + istringstream reader(argv[i + 1]); + unsigned int seed; + assert(reader>> seed); + sampler = new simul(seed); + } + } + verbose = !quiet; + if (sampler == NULL) sampler = new simul(time(NULL)); strcpy(refName, argv[1]); alleleS = isAlleleSpecific(refName); @@ -200,6 +217,7 @@ int main(int argc, char* argv[]) { writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts); releaseOutReadStreams(); + delete sampler; return 0; } -- 2.39.2