14 #include "SingleRead.h"
15 #include "SingleReadQ.h"
16 #include "PairedEndRead.h"
17 #include "PairedEndReadQ.h"
20 #include "SingleModel.h"
21 #include "SingleQModel.h"
22 #include "PairedEndModel.h"
23 #include "PairedEndQModel.h"
26 #include "GroupInfo.h"
27 #include "Transcript.h"
28 #include "Transcripts.h"
30 #include "simul_mersenne.h"
39 Transcripts transcripts;
41 double *theta, *counts;
46 char outReadF[2][STRLEN];
48 char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
49 char geneResF[STRLEN], isoResF[STRLEN];
51 simul_mersenne sampler;
53 void genOutReadStreams(int type, char *outFN) {
57 sprintf(outReadF[0], "%s.fa", outFN);
61 sprintf(outReadF[0], "%s.fq", outFN);
65 for (int i = 0; i < n_os; i++)
66 sprintf(outReadF[i], "%s_%d.fa", outFN, i + 1);
70 for (int i = 0; i < n_os; i++)
71 sprintf(outReadF[i], "%s_%d.fq", outFN, i + 1);
75 for (int i = 0; i < n_os; i++)
76 os[i] = new ofstream(outReadF[i]);
79 template<class ModelType>
80 void calcExpectedEffectiveLengths(ModelType& model) {
82 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
84 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
85 clen = new double[span + 1];
87 for (int i = 1; i <= span; i++) {
88 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
92 eel.resize(M + 1, 0.0);
93 for (int i = 1; i <= M; i++) {
94 int totLen = refs.getRef(i).getTotLen();
95 int fullLen = refs.getRef(i).getFullLen();
96 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
97 int pos2 = max(min(totLen, ub) - lb, 0);
99 if (pos2 == 0) { eel[i] = 0.0; continue; }
101 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
103 if (eel[i] < MINEEL) { eel[i] = 0.0; }
111 template<class ReadType, class ModelType>
112 void simulate(char* modelF, char* resultsF) {
113 ModelType model(&refs);
120 calcExpectedEffectiveLengths<ModelType>(model);
122 //generate theta vector
123 ifstream fin(resultsF);
127 for (int i = 1; i <= M; i++) {
130 for (int j = 0; j < 3; j++) pos = line.find_first_of('\t', pos) + 1;
131 size_t pos2 = line.find_first_of('\t', pos);
132 if (pos2 == string::npos) pos2 = line.length();
133 tau = atof(line.substr(pos, pos2 - pos).c_str());
134 theta[i] = tau * eel[i];
137 assert(denom > EPSILON);
139 for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
144 model.startSimulation(&sampler, theta);
145 for (int i = 0; i < N; i++) {
146 while (!model.simulate(i, read, sid)) { ++tmp2140; }//;
147 read.write(n_os, os);
149 if ((i + 1) % 1000000 == 0 && verbose) printf("GEN %d\n", i + 1);
151 model.finishSimulation();
153 printf("Total number of resimulation is %d\n", tmp2140);
156 void writeResFiles(char* outFN) {
160 //calculate normalized read fraction
161 double *nrf = new double[M + 1];
162 memset(nrf, 0, sizeof(double) * (M + 1));
164 for (int i = 1; i <= M; i++)
165 if (eel[i] > EPSILON) {
170 if (counts[i] > EPSILON) { printf("Warning: An isoform which EEL is less than %.6g gets sampled!\n", MINEEL); }
173 for (int i = 1; i <= M; i++) nrf[i] /= denom;
175 //calculate tau values
176 double *tau = new double[M + 1];
177 memset(tau, 0, sizeof(double) * (M + 1));
179 for (int i = 1; i <= M; i++)
180 if (eel[i] > EPSILON) {
181 tau[i] = nrf[i] / eel[i];
185 for (int i = 1; i <= M; i++) tau[i] /= denom;
188 sprintf(isoResF, "%s.sim.isoforms.results", outFN);
189 fo = fopen(isoResF, "w");
190 for (int i = 1; i <= M; i++) {
191 const Transcript& transcript = transcripts.getTranscriptAt(i);
192 fprintf(fo, "%s\t%.2f\t%.15g\t%.15g", transcript.getTranscriptID().c_str(), counts[i], nrf[i], tau[i]);
194 if (transcript.getLeft() != "") { fprintf(fo, "\t%s", transcript.getLeft().c_str()); }
200 sprintf(geneResF, "%s.sim.genes.results", outFN);
201 fo = fopen(geneResF, "w");
202 for (int i = 0; i < m; i++) {
203 double sum_c = 0.0, sum_nrf = 0.0, sum_tau = 0.0;
204 int b = gi.spAt(i), e = gi.spAt(i + 1);
205 for (int j = b; j < e; j++) {
210 const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
211 fprintf(fo, "%s\t%.2f\t%.15g\t%.15g\t", gene_id.c_str(), sum_c, sum_nrf, sum_tau);
212 for (int j = b; j < e; j++) {
213 fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\n'));
222 void releaseOutReadStreams() {
223 for (int i = 0; i < n_os; i++) {
224 ((ofstream*)os[i])->close();
229 int main(int argc, char* argv[]) {
233 if (argc != 7 && argc != 8) {
234 printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n");
238 if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
242 sprintf(refF, "%s.seq", argv[1]);
245 sprintf(groupF, "%s.grp", argv[1]);
248 sprintf(tiF, "%s.ti", argv[1]);
249 transcripts.readFrom(tiF);
251 //read model type from modelF
252 fi = fopen(argv[2], "r");
253 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
254 fscanf(fi, "%d", &model_type);
257 theta = new double[M + 1];
258 theta[0] = atof(argv[4]);
261 genOutReadStreams(model_type, argv[6]);
263 counts = new double[M + 1];
264 memset(counts, 0, sizeof(double) * (M + 1));
267 case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
268 case 1: simulate<SingleReadQ, SingleQModel>(argv[2], argv[3]); break;
269 case 2: simulate<PairedEndRead, PairedEndModel>(argv[2], argv[3]); break;
270 case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
273 writeResFiles(argv[6]);
274 releaseOutReadStreams();