14 #include "SingleRead.h"
15 #include "SingleReadQ.h"
16 #include "PairedEndRead.h"
17 #include "PairedEndReadQ.h"
20 #include "SingleModel.h"
21 #include "SingleQModel.h"
22 #include "PairedEndModel.h"
23 #include "PairedEndQModel.h"
26 #include "GroupInfo.h"
27 #include "Transcript.h"
28 #include "Transcripts.h"
39 Transcripts transcripts;
41 double *theta, *counts;
46 char outReadF[2][STRLEN];
48 char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
49 char geneResF[STRLEN], isoResF[STRLEN];
53 void genOutReadStreams(int type, char *outFN) {
57 sprintf(outReadF[0], "%s.fa", outFN);
61 sprintf(outReadF[0], "%s.fq", outFN);
65 for (int i = 0; i < n_os; i++)
66 sprintf(outReadF[i], "%s_%d.fa", outFN, i + 1);
70 for (int i = 0; i < n_os; i++)
71 sprintf(outReadF[i], "%s_%d.fq", outFN, i + 1);
75 for (int i = 0; i < n_os; i++)
76 os[i] = new ofstream(outReadF[i]);
79 template<class ModelType>
80 void calcExpectedEffectiveLengths(ModelType& model) {
82 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
84 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
85 clen = new double[span + 1];
87 for (int i = 1; i <= span; i++) {
88 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
92 eel.resize(M + 1, 0.0);
93 for (int i = 1; i <= M; i++) {
94 int totLen = refs.getRef(i).getTotLen();
95 int fullLen = refs.getRef(i).getFullLen();
96 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
97 int pos2 = max(min(totLen, ub) - lb, 0);
99 if (pos2 == 0) { eel[i] = 0.0; continue; }
101 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
103 if (eel[i] < MINEEL) { eel[i] = 0.0; }
111 template<class ReadType, class ModelType>
112 void simulate(char* modelF, char* resultsF) {
113 ModelType model(&refs);
120 calcExpectedEffectiveLengths<ModelType>(model);
122 //generate theta vector
123 ifstream fin(resultsF);
127 for (int i = 1; i <= M; i++) {
130 for (int j = 0; j < 2; j++) pos = line.find_first_of('\t', pos) + 1;
131 size_t pos2 = line.find_first_of('\t', pos);
132 if (pos2 == string::npos) pos2 = line.length();
133 tau = atof(line.substr(pos, pos2 - pos).c_str());
134 theta[i] = tau * eel[i];
137 assert(denom > EPSILON);
139 for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
141 int resimulation_count = 0;
144 model.startSimulation(&sampler, theta);
145 for (int i = 0; i < N; i++) {
146 while (!model.simulate(i, read, sid)) { ++resimulation_count; }
147 read.write(n_os, os);
149 if ((i + 1) % 1000000 == 0 && verbose) printf("GEN %d\n", i + 1);
151 model.finishSimulation();
153 printf("Total number of resimulation is %d\n", resimulation_count);
156 void writeResFiles(char* outFN) {
160 //calculate tau values
161 double *tau = new double[M + 1];
162 memset(tau, 0, sizeof(double) * (M + 1));
164 for (int i = 1; i <= M; i++)
165 if (eel[i] > EPSILON) {
166 tau[i] = counts[i] / eel[i];
170 if (counts[i] > EPSILON) { printf("Warning: An isoform which EEL is less than %.6g gets sampled!\n", MINEEL); }
173 for (int i = 1; i <= M; i++) tau[i] /= denom;
176 sprintf(isoResF, "%s.sim.isoforms.results", outFN);
177 fo = fopen(isoResF, "w");
178 for (int i = 1; i <= M; i++) {
179 const Transcript& transcript = transcripts.getTranscriptAt(i);
180 fprintf(fo, "%s\t%.2f\t%.15g", transcript.getTranscriptID().c_str(), counts[i], tau[i]);
182 if (transcript.getLeft() != "") { fprintf(fo, "\t%s", transcript.getLeft().c_str()); }
188 sprintf(geneResF, "%s.sim.genes.results", outFN);
189 fo = fopen(geneResF, "w");
190 for (int i = 0; i < m; i++) {
191 double sum_c = 0.0, sum_tau = 0.0;
192 int b = gi.spAt(i), e = gi.spAt(i + 1);
193 for (int j = b; j < e; j++) {
197 const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
198 fprintf(fo, "%s\t%.2f\t%.15g\t", gene_id.c_str(), sum_c, sum_tau);
199 for (int j = b; j < e; j++) {
200 fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\n'));
208 void releaseOutReadStreams() {
209 for (int i = 0; i < n_os; i++) {
210 ((ofstream*)os[i])->close();
215 int main(int argc, char* argv[]) {
219 if (argc != 7 && argc != 8) {
220 printf("Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q]\n");
224 if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
228 sprintf(refF, "%s.seq", argv[1]);
231 sprintf(groupF, "%s.grp", argv[1]);
234 sprintf(tiF, "%s.ti", argv[1]);
235 transcripts.readFrom(tiF);
237 //read model type from modelF
238 fi = fopen(argv[2], "r");
239 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
240 fscanf(fi, "%d", &model_type);
243 theta = new double[M + 1];
244 theta[0] = atof(argv[4]);
247 genOutReadStreams(model_type, argv[6]);
249 counts = new double[M + 1];
250 memset(counts, 0, sizeof(double) * (M + 1));
253 case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
254 case 1: simulate<SingleReadQ, SingleQModel>(argv[2], argv[3]); break;
255 case 2: simulate<PairedEndRead, PairedEndModel>(argv[2], argv[3]); break;
256 case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
259 writeResFiles(argv[6]);
260 releaseOutReadStreams();