1 #ifndef PAIREDENDQMODEL_H_
2 #define PAIREDENDQMODEL_H_
13 #include "Orientation.h"
18 #include "NoiseQProfile.h"
20 #include "ModelParams.h"
23 #include "SingleReadQ.h"
24 #include "PairedEndReadQ.h"
25 #include "PairedEndHit.h"
26 #include "ReadReader.h"
30 class PairedEndQModel {
32 PairedEndQModel(Refs* refs = NULL) {
34 M = (refs != NULL ? refs->getM() : 0);
35 memset(N, 0, sizeof(N));
37 needCalcConPrb = true;
39 ori = new Orientation();
41 rspd = new RSPD(estRSPD);
43 qpro = new QProfile();
44 nqpro = new NoiseQProfile();
51 //If it is not a master node, only init & update can be used!
52 PairedEndQModel(ModelParams& params, bool isMaster = true) {
54 memcpy(N, params.N, sizeof(params.N));
56 estRSPD = params.estRSPD;
57 seedLen = params.seedLen;
58 needCalcConPrb = true;
60 ori = NULL; gld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL; mld = NULL;
64 if (!estRSPD) rspd = new RSPD(estRSPD);
66 mld = new LenDist(params.mate_minL, params.mate_maxL);
69 ori = new Orientation(params.probF);
70 gld = new LenDist(params.minL, params.maxL);
71 if (estRSPD) rspd = new RSPD(estRSPD, params.B);
72 qpro = new QProfile();
73 nqpro = new NoiseQProfile();
78 if (ori != NULL) delete ori;
79 if (gld != NULL) delete gld;
80 if (rspd != NULL) delete rspd;
81 if (qd != NULL) delete qd;
82 if (qpro != NULL) delete qpro;
83 if (nqpro != NULL) delete nqpro;
84 if (mld != NULL) delete mld;
85 if (mw != NULL) delete mw;
88 void estimateFromReads(const char*);
90 //if prob is too small, just make it 0
91 double getConPrb(const PairedEndReadQ& read, const PairedEndHit& hit) {
92 if (read.isLowQuality()) return 0.0;
95 int sid = hit.getSid();
96 RefSeq &ref = refs->getRef(sid);
97 int dir = hit.getDir();
98 int pos = hit.getPos();
99 int fullLen = ref.getFullLen();
100 int totLen = ref.getTotLen();
101 int insertLen = hit.getInsertL();
103 int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
104 int effL = std::min(fullLen, totLen - insertLen + 1);
106 assert(fpos >= 0 && fpos + insertLen <= totLen && insertLen <= totLen);
107 if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
109 prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
110 rspd->getAdjustedProb(fpos, effL, fullLen);
112 const SingleReadQ& mate1 = read.getMate1();
113 prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
114 qpro->getProb(mate1.getReadSeq(), mate1.getQScore(), ref, pos, dir);
116 const SingleReadQ& mate2 = read.getMate2();
117 int m2pos = totLen - pos - insertLen;
119 prob *= mld->getAdjustedProb(mate2.getReadLength(), hit.getInsertL()) *
120 qpro->getProb(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir);
122 if (prob < EPSILON) { prob = 0.0; }
124 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
129 double getNoiseConPrb(const PairedEndReadQ& read) {
130 if (read.isLowQuality()) return 0.0;
133 const SingleReadQ& mate1 = read.getMate1();
134 const SingleReadQ& mate2 = read.getMate2();
136 prob = mld->getProb(mate1.getReadLength()) * nqpro->getProb(mate1.getReadSeq(), mate1.getQScore());
137 prob *= mld->getProb(mate2.getReadLength()) * nqpro->getProb(mate2.getReadSeq(), mate2.getQScore());
139 if (prob < EPSILON) { prob = 0.0; }
141 prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
146 double getLogP() { return nqpro->getLogP(); }
150 void update(const PairedEndReadQ& read, const PairedEndHit& hit, double frac) {
151 if (read.isLowQuality() || frac < EPSILON) return;
153 RefSeq& ref = refs->getRef(hit.getSid());
154 const SingleReadQ& mate1 = read.getMate1();
155 const SingleReadQ& mate2 = read.getMate2();
157 gld->update(hit.getInsertL(), frac);
159 int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
160 rspd->update(fpos, ref.getFullLen(), frac);
162 qpro->update(mate1.getReadSeq(), mate1.getQScore(), ref, hit.getPos(), hit.getDir(), frac);
164 int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
165 int m2dir = !hit.getDir();
166 qpro->update(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir, frac);
169 void updateNoise(const PairedEndReadQ& read, double frac) {
170 if (read.isLowQuality() || frac < EPSILON) return;
172 const SingleReadQ& mate1 = read.getMate1();
173 const SingleReadQ& mate2 = read.getMate2();
175 nqpro->update(mate1.getReadSeq(), mate1.getQScore(), frac);
176 nqpro->update(mate2.getReadSeq(), mate2.getQScore(), frac);
181 void collect(const PairedEndQModel&);
183 bool getNeedCalcConPrb() { return needCalcConPrb; }
184 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
186 void read(const char*);
187 void write(const char*);
189 const LenDist& getGLD() { return *gld; }
191 void startSimulation(simul*, double*);
192 bool simulate(int, PairedEndReadQ&, int&);
193 void finishSimulation();
195 //Use it after function 'read' or 'estimateFromReads'
201 int getModelType() const { return model_type; }
203 bool simulate(int, PairedEndReadQ&, SingleReadQ&, int&);
206 static const int model_type = 3;
207 static const int read_type = 3;
215 bool needCalcConPrb; //true need, false does not need
218 LenDist *gld, *mld; //mld1 mate_length_dist
222 NoiseQProfile *nqpro;
224 simul *sampler; // for simulation
225 double *theta_cdf; // for simulation
227 double *mw; // for masking
232 void PairedEndQModel::estimateFromReads(const char* readFN) {
234 char readFs[2][STRLEN];
238 for (int i = 0; i < 3; i++)
240 genReadFileNames(readFN, i, read_type, s, readFs);
241 ReadReader<PairedEndReadQ> reader(s, readFs);
244 while (reader.next(read)) {
245 SingleReadQ mate1 = read.getMate1();
246 SingleReadQ mate2 = read.getMate2();
248 mld->update(mate1.getReadLength(), 1.0);
249 mld->update(mate2.getReadLength(), 1.0);
251 qd->update(mate1.getQScore());
252 qd->update(mate2.getQScore());
255 nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
256 nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
260 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
263 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
268 nqpro->calcInitParams();
270 mw = new double[M + 1];
274 void PairedEndQModel::init() {
276 if (estRSPD) rspd->init();
281 void PairedEndQModel::finish() {
283 if (estRSPD) rspd->finish();
286 needCalcConPrb = true;
290 void PairedEndQModel::collect(const PairedEndQModel& o) {
291 gld->collect(*(o.gld));
292 if (estRSPD) rspd->collect(*(o.rspd));
293 qpro->collect(*(o.qpro));
294 nqpro->collect(*(o.nqpro));
297 //Only master node can call
298 void PairedEndQModel::read(const char* inpF) {
300 FILE *fi = fopen(inpF, "r");
301 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
303 fscanf(fi, "%d", &val);
304 assert(val == model_type);
314 if (fscanf(fi, "%d", &val) == 1) {
317 mw = new double[M + 1];
318 for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
326 //Only master node can call. Only be called at EM.cpp
327 void PairedEndQModel::write(const char* outF) {
328 FILE *fo = fopen(outF, "w");
330 fprintf(fo, "%d\n", model_type);
333 ori->write(fo); fprintf(fo, "\n");
334 gld->write(fo); fprintf(fo, "\n");
335 mld->write(fo); fprintf(fo, "\n");
336 rspd->write(fo); fprintf(fo, "\n");
337 qd->write(fo); fprintf(fo, "\n");
338 qpro->write(fo); fprintf(fo, "\n");
342 fprintf(fo, "\n%d\n", M);
343 for (int i = 0; i < M; i++) {
344 fprintf(fo, "%.15g ", mw[i]);
346 fprintf(fo, "%.15g\n", mw[M]);
352 void PairedEndQModel::startSimulation(simul* sampler, double* theta) {
353 this->sampler = sampler;
355 theta_cdf = new double[M + 1];
356 for (int i = 0; i <= M; i++) {
357 theta_cdf[i] = theta[i];
358 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
361 rspd->startSimulation(M, refs);
362 qd->startSimulation();
363 qpro->startSimulation();
364 nqpro->startSimulation();
367 bool PairedEndQModel::simulate(int rid, PairedEndReadQ& read, int& sid) {
369 int insertL, mateL1, mateL2;
371 std::string qual1, qual2, readseq1, readseq2;
372 std::ostringstream strout;
374 sid = sampler->sample(theta_cdf, M + 1);
377 dir = pos = insertL = 0;
378 mateL1 = mld->simulate(sampler, -1);
379 qual1 = qd->simulate(sampler, mateL1);
380 readseq1 = nqpro->simulate(sampler, mateL1, qual1);
382 mateL2 = mld->simulate(sampler, -1);
383 qual2 = qd->simulate(sampler, mateL2);
384 readseq2 = nqpro->simulate(sampler, mateL2, qual2);
387 RefSeq &ref = refs->getRef(sid);
388 dir = ori->simulate(sampler);
389 insertL = gld->simulate(sampler, ref.getTotLen());
390 if (insertL < 0) return false;
391 int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
392 pos = rspd->simulate(sampler, sid, effL);
393 if (pos < 0) return false;
394 if (dir > 0) pos = ref.getTotLen() - pos - insertL;
396 mateL1 = mld->simulate(sampler, insertL);
397 qual1 = qd->simulate(sampler, mateL1);
398 readseq1 = qpro->simulate(sampler, mateL1, pos, dir, qual1, ref);
400 int m2pos = ref.getTotLen() - pos - insertL;
403 mateL2 = mld->simulate(sampler, insertL);
404 qual2 = qd->simulate(sampler, mateL2);
405 readseq2 = qpro->simulate(sampler, mateL2, m2pos, m2dir, qual2, ref);
408 std::ostringstream stdout;
409 stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
412 read = PairedEndReadQ(SingleReadQ(name + "/1", readseq1, qual1), SingleReadQ(name + "/2", readseq2, qual2));
417 void PairedEndQModel::finishSimulation() {
420 rspd->finishSimulation();
421 qd->finishSimulation();
422 qpro->finishSimulation();
423 nqpro->finishSimulation();
427 void PairedEndQModel::calcMW() {
428 assert(seedLen >= OLEN && mld->getMinL() >= seedLen);
430 memset(mw, 0, sizeof(double) * (M + 1));
433 for (int i = 1; i <= M; i++) {
434 RefSeq& ref = refs->getRef(i);
435 int totLen = ref.getTotLen();
436 int fullLen = ref.getFullLen();
437 int end = std::min(fullLen, totLen - gld->getMinL() + 1);
442 //seedPos is fpos here
443 for (int seedPos = 0; seedPos < end; seedPos++)
444 if (ref.getMask(seedPos)) {
445 minL = gld->getMinL();
446 maxL = std::min(gld->getMaxL(), totLen - seedPos);
448 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
449 effL = std::min(fullLen, totLen - fragLen + 1);
450 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
457 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
463 #endif /* PAIREDENDQMODEL_H_ */