#include<cmath>
+#include<ctime>
#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<string>
#include<iostream>
#include<fstream>
+#include<sstream>
#include<vector>
#include "utils.h"
char refName[STRLEN];
char refF[STRLEN], tiF[STRLEN];
-simul sampler;
+simul *sampler;
void genOutReadStreams(int type, char *outFN) {
switch(type) {
READ_INT_TYPE resimulation_count = 0;
//simulating...
- model.startSimulation(&sampler, theta);
+ model.startSimulation(sampler, theta);
for (READ_INT_TYPE i = 0; i < N; i++) {
while (!model.simulate(i, read, sid)) { ++resimulation_count; }
read.write(n_os, os);
bool quiet = false;
FILE *fi = NULL;
- if (argc != 7 && argc != 8) {
- printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n\n");
+ if (argc < 7 || argc > 10) {
+ printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [--seed seed] [-q]\n\n");
printf("Parameters:\n\n");
printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
printf("output_name: Prefix for all output files.\n");
+ printf("--seed seed: Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.\n");
printf("-q: Set it will stop outputting intermediate information.\n\n");
printf("Outputs:\n\n");
- printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n\n");
+ printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n");
+ printf("output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.\n\n");
printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
exit(-1);
}
- if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
+ quiet = false;
+ sampler = NULL;
+ for (int i = 7; i < argc; i++) {
+ if (!strcmp(argv[i], "-q")) quiet = true;
+ if (!strcmp(argv[i], "--seed")) {
+ assert(i + 1 < argc);
+ istringstream reader(argv[i + 1]);
+ unsigned int seed;
+ assert(reader>> seed);
+ sampler = new simul(seed);
+ }
+ }
+
verbose = !quiet;
+ if (sampler == NULL) sampler = new simul(time(NULL));
strcpy(refName, argv[1]);
alleleS = isAlleleSpecific(refName);
writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
releaseOutReadStreams();
+ delete sampler;
return 0;
}