14 #include "my_assert.h"
16 #include "SingleRead.h"
17 #include "SingleReadQ.h"
18 #include "PairedEndRead.h"
19 #include "PairedEndReadQ.h"
22 #include "SingleModel.h"
23 #include "SingleQModel.h"
24 #include "PairedEndModel.h"
25 #include "PairedEndQModel.h"
28 #include "Transcript.h"
29 #include "Transcripts.h"
31 #include "WriteResults.h"
44 Transcripts transcripts;
47 vector<double> theta, counts;
51 char outReadF[2][STRLEN];
54 char refF[STRLEN], tiF[STRLEN];
58 void genOutReadStreams(int type, char *outFN) {
62 sprintf(outReadF[0], "%s.fa", outFN);
66 sprintf(outReadF[0], "%s.fq", outFN);
70 for (int i = 0; i < n_os; i++)
71 sprintf(outReadF[i], "%s_%d.fa", outFN, i + 1);
75 for (int i = 0; i < n_os; i++)
76 sprintf(outReadF[i], "%s_%d.fq", outFN, i + 1);
80 for (int i = 0; i < n_os; i++)
81 os[i] = new ofstream(outReadF[i]);
84 template<class ReadType, class ModelType>
85 void simulate(char* modelF, char* resultsF) {
86 ModelType model(&refs);
93 calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
95 //generate theta vector
96 ifstream fin(resultsF);
100 getline(fin, line); // read the first line, which is just column names
101 for (int i = 1; i <= M; i++) {
104 for (int j = 0; j < OFFSITE; j++) pos = line.find_first_of('\t', pos) + 1;
105 size_t pos2 = line.find_first_of('\t', pos);
106 if (pos2 == string::npos) pos2 = line.length();
107 tpm = atof(line.substr(pos, pos2 - pos).c_str());
108 theta[i] = tpm * eel[i]; // during simulation, there is no check for effL < 0. The reason is for that case, eel[i] here = 0 and therefore no chance to sample from it
111 assert(denom > EPSILON);
113 for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
115 READ_INT_TYPE resimulation_count = 0;
118 model.startSimulation(sampler, theta);
119 for (READ_INT_TYPE i = 0; i < N; i++) {
120 while (!model.simulate(i, read, sid)) { ++resimulation_count; }
121 read.write(n_os, os);
123 if ((i + 1) % 1000000 == 0 && verbose) cout<<"GEN "<< i + 1<< endl;
125 model.finishSimulation();
127 cout<< "Total number of resimulation is "<< resimulation_count<< endl;
130 void releaseOutReadStreams() {
131 for (int i = 0; i < n_os; i++) {
132 ((ofstream*)os[i])->close();
137 int main(int argc, char* argv[]) {
141 if (argc < 7 || argc > 10) {
142 printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [--seed seed] [-q]\n\n");
143 printf("Parameters:\n\n");
144 printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
145 printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
146 printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.\n");
147 printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
148 printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
149 printf("output_name: Prefix for all output files.\n");
150 printf("--seed seed: Set seed for the random number generator used in simulation. The seed should be a 32-bit unsigned integer.\n");
151 printf("-q: Set it will stop outputting intermediate information.\n\n");
152 printf("Outputs:\n\n");
153 printf("output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from.\n");
154 printf("output_name.sim.alleles.results: Allele-specific expression levels estimated by counting where each simulated read comes from.\n\n");
155 printf("output_name.fa if single-end without quality score;\noutput_name.fq if single-end with quality score;\noutput_name_1.fa & output_name_2.fa if paired-end without quality score;\noutput_name_1.fq & output_name_2.fq if paired-end with quality score.\n\n");
156 printf("Format of the header line: Each simulated read's header line encodes where it comes from. The header line has the format:\n\n");
157 printf("\t{>/@}_rid_dir_sid_pos[_insertL]\n\n");
158 printf("{>/@}: Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated\n");
159 printf("rid: Simulated read's index, numbered from 0\n");
160 printf("dir: The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-')\n");
161 printf("sid: Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names)\n");
162 printf("pos: The start position of the simulated read in strand dir of transcript sid. It is numbered from 0\n");
163 printf("insertL: Only appear for paired-end reads. It gives the insert length of the simulated read.\n\n");
164 printf("Example:\n\n");
165 printf("Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is:\n\n");
166 printf("\trsem-simulate-reads /ref/mouse_125 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads\n");
172 for (int i = 7; i < argc; i++) {
173 if (!strcmp(argv[i], "-q")) quiet = true;
174 if (!strcmp(argv[i], "--seed")) {
175 assert(i + 1 < argc);
176 istringstream reader(argv[i + 1]);
178 assert(reader>> seed);
179 sampler = new simul(seed);
184 if (sampler == NULL) sampler = new simul(time(NULL));
186 strcpy(refName, argv[1]);
187 alleleS = isAlleleSpecific(refName);
188 OFFSITE = (alleleS ? 6: 5);
191 sprintf(refF, "%s.seq", argv[1]);
194 sprintf(tiF, "%s.ti", argv[1]);
195 transcripts.readFrom(tiF);
197 //read model type from modelF
198 fi = fopen(argv[2], "r");
199 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
200 assert(fscanf(fi, "%d", &model_type) == 1);
203 theta.assign(M + 1, 0.0);
204 theta[0] = atof(argv[4]);
207 genOutReadStreams(model_type, argv[6]);
209 counts.assign(M + 1, 0.0);
212 case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
213 case 1: simulate<SingleReadQ, SingleQModel>(argv[2], argv[3]); break;
214 case 2: simulate<PairedEndRead, PairedEndModel>(argv[2], argv[3]); break;
215 case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
218 writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
219 releaseOutReadStreams();