1 #ifndef PAIREDENDQMODEL_H_
2 #define PAIREDENDQMODEL_H_
15 #include "my_assert.h"
16 #include "Orientation.h"
21 #include "NoiseQProfile.h"
23 #include "ModelParams.h"
26 #include "SingleReadQ.h"
27 #include "PairedEndReadQ.h"
28 #include "PairedEndHit.h"
29 #include "ReadReader.h"
33 class PairedEndQModel {
35 PairedEndQModel(Refs* refs = NULL) {
37 M = (refs != NULL ? refs->getM() : 0);
38 memset(N, 0, sizeof(N));
40 needCalcConPrb = true;
42 ori = new Orientation();
44 rspd = new RSPD(estRSPD);
46 qpro = new QProfile();
47 nqpro = new NoiseQProfile();
54 //If it is not a master node, only init & update can be used!
55 PairedEndQModel(ModelParams& params, bool isMaster = true) {
57 memcpy(N, params.N, sizeof(params.N));
59 estRSPD = params.estRSPD;
60 seedLen = params.seedLen;
61 needCalcConPrb = true;
63 ori = NULL; gld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL; mld = NULL;
67 if (!estRSPD) rspd = new RSPD(estRSPD);
69 mld = new LenDist(params.mate_minL, params.mate_maxL);
72 ori = new Orientation(params.probF);
73 gld = new LenDist(params.minL, params.maxL);
74 if (estRSPD) rspd = new RSPD(estRSPD, params.B);
75 qpro = new QProfile();
76 nqpro = new NoiseQProfile();
81 if (ori != NULL) delete ori;
82 if (gld != NULL) delete gld;
83 if (rspd != NULL) delete rspd;
84 if (qd != NULL) delete qd;
85 if (qpro != NULL) delete qpro;
86 if (nqpro != NULL) delete nqpro;
87 if (mld != NULL) delete mld;
88 if (mw != NULL) delete mw;
91 void estimateFromReads(const char*);
93 //if prob is too small, just make it 0
94 double getConPrb(const PairedEndReadQ& read, const PairedEndHit& hit) {
95 if (read.isLowQuality()) return 0.0;
98 int sid = hit.getSid();
99 RefSeq &ref = refs->getRef(sid);
100 int dir = hit.getDir();
101 int pos = hit.getPos();
102 int fullLen = ref.getFullLen();
103 int totLen = ref.getTotLen();
104 int insertLen = hit.getInsertL();
106 int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
107 int effL = std::min(fullLen, totLen - insertLen + 1);
109 general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
110 " from the forward direction, which should be a non-negative number! " + \
111 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
112 general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
113 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
114 general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript " \
115 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!");
117 if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
119 prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
120 rspd->getAdjustedProb(fpos, effL, fullLen);
122 const SingleReadQ& mate1 = read.getMate1();
123 prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
124 qpro->getProb(mate1.getReadSeq(), mate1.getQScore(), ref, pos, dir);
126 const SingleReadQ& mate2 = read.getMate2();
127 int m2pos = totLen - pos - insertLen;
129 prob *= mld->getAdjustedProb(mate2.getReadLength(), hit.getInsertL()) *
130 qpro->getProb(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir);
132 if (prob < EPSILON) { prob = 0.0; }
134 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
139 double getNoiseConPrb(const PairedEndReadQ& read) {
140 if (read.isLowQuality()) return 0.0;
143 const SingleReadQ& mate1 = read.getMate1();
144 const SingleReadQ& mate2 = read.getMate2();
146 prob = mld->getProb(mate1.getReadLength()) * nqpro->getProb(mate1.getReadSeq(), mate1.getQScore());
147 prob *= mld->getProb(mate2.getReadLength()) * nqpro->getProb(mate2.getReadSeq(), mate2.getQScore());
149 if (prob < EPSILON) { prob = 0.0; }
151 prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
156 double getLogP() { return nqpro->getLogP(); }
160 void update(const PairedEndReadQ& read, const PairedEndHit& hit, double frac) {
161 if (read.isLowQuality() || frac < EPSILON) return;
163 RefSeq& ref = refs->getRef(hit.getSid());
164 const SingleReadQ& mate1 = read.getMate1();
165 const SingleReadQ& mate2 = read.getMate2();
167 gld->update(hit.getInsertL(), frac);
169 int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
170 rspd->update(fpos, ref.getFullLen(), frac);
172 qpro->update(mate1.getReadSeq(), mate1.getQScore(), ref, hit.getPos(), hit.getDir(), frac);
174 int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
175 int m2dir = !hit.getDir();
176 qpro->update(mate2.getReadSeq(), mate2.getQScore(), ref, m2pos, m2dir, frac);
179 void updateNoise(const PairedEndReadQ& read, double frac) {
180 if (read.isLowQuality() || frac < EPSILON) return;
182 const SingleReadQ& mate1 = read.getMate1();
183 const SingleReadQ& mate2 = read.getMate2();
185 nqpro->update(mate1.getReadSeq(), mate1.getQScore(), frac);
186 nqpro->update(mate2.getReadSeq(), mate2.getQScore(), frac);
191 void collect(const PairedEndQModel&);
193 bool getNeedCalcConPrb() { return needCalcConPrb; }
194 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
196 void read(const char*);
197 void write(const char*);
199 const LenDist& getGLD() { return *gld; }
201 void startSimulation(simul*, const std::vector<double>&);
202 bool simulate(READ_INT_TYPE, PairedEndReadQ&, int&);
203 void finishSimulation();
205 //Use it after function 'read' or 'estimateFromReads'
206 const double* getMW() {
211 int getModelType() const { return model_type; }
214 static const int model_type = 3;
215 static const int read_type = 3;
223 bool needCalcConPrb; //true need, false does not need
226 LenDist *gld, *mld; //mld1 mate_length_dist
230 NoiseQProfile *nqpro;
232 simul *sampler; // for simulation
233 double *theta_cdf; // for simulation
235 double *mw; // for masking
240 void PairedEndQModel::estimateFromReads(const char* readFN) {
242 char readFs[2][STRLEN];
246 for (int i = 0; i < 3; i++)
248 genReadFileNames(readFN, i, read_type, s, readFs);
249 ReadReader<PairedEndReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
251 READ_INT_TYPE cnt = 0;
252 while (reader.next(read)) {
253 SingleReadQ mate1 = read.getMate1();
254 SingleReadQ mate2 = read.getMate2();
256 if (!read.isLowQuality()) {
257 mld->update(mate1.getReadLength(), 1.0);
258 mld->update(mate2.getReadLength(), 1.0);
260 qd->update(mate1.getQScore());
261 qd->update(mate2.getQScore());
264 nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
265 nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
268 else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) {
269 std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl;
273 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
276 if (verbose) { std::cout<<"estimateFromReads, N"<< i<<" finished."<< std::endl; }
281 nqpro->calcInitParams();
283 mw = new double[M + 1];
287 void PairedEndQModel::init() {
289 if (estRSPD) rspd->init();
294 void PairedEndQModel::finish() {
296 if (estRSPD) rspd->finish();
299 needCalcConPrb = true;
303 void PairedEndQModel::collect(const PairedEndQModel& o) {
304 gld->collect(*(o.gld));
305 if (estRSPD) rspd->collect(*(o.rspd));
306 qpro->collect(*(o.qpro));
307 nqpro->collect(*(o.nqpro));
310 //Only master node can call
311 void PairedEndQModel::read(const char* inpF) {
313 FILE *fi = fopen(inpF, "r");
314 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
316 assert(fscanf(fi, "%d", &val) == 1);
317 assert(val == model_type);
327 if (fscanf(fi, "%d", &val) == 1) {
330 mw = new double[M + 1];
331 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
339 //Only master node can call. Only be called at EM.cpp
340 void PairedEndQModel::write(const char* outF) {
341 FILE *fo = fopen(outF, "w");
343 fprintf(fo, "%d\n", model_type);
346 ori->write(fo); fprintf(fo, "\n");
347 gld->write(fo); fprintf(fo, "\n");
348 mld->write(fo); fprintf(fo, "\n");
349 rspd->write(fo); fprintf(fo, "\n");
350 qd->write(fo); fprintf(fo, "\n");
351 qpro->write(fo); fprintf(fo, "\n");
355 fprintf(fo, "\n%d\n", M);
356 for (int i = 0; i < M; i++) {
357 fprintf(fo, "%.15g ", mw[i]);
359 fprintf(fo, "%.15g\n", mw[M]);
365 void PairedEndQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
366 this->sampler = sampler;
368 theta_cdf = new double[M + 1];
369 for (int i = 0; i <= M; i++) {
370 theta_cdf[i] = theta[i];
371 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
374 rspd->startSimulation(M, refs);
375 qd->startSimulation();
376 qpro->startSimulation();
377 nqpro->startSimulation();
380 bool PairedEndQModel::simulate(READ_INT_TYPE rid, PairedEndReadQ& read, int& sid) {
382 int insertL, mateL1, mateL2;
384 std::string qual1, qual2, readseq1, readseq2;
385 std::ostringstream strout;
387 sid = sampler->sample(theta_cdf, M + 1);
390 dir = pos = insertL = 0;
391 mateL1 = mld->simulate(sampler, -1);
392 qual1 = qd->simulate(sampler, mateL1);
393 readseq1 = nqpro->simulate(sampler, mateL1, qual1);
395 mateL2 = mld->simulate(sampler, -1);
396 qual2 = qd->simulate(sampler, mateL2);
397 readseq2 = nqpro->simulate(sampler, mateL2, qual2);
400 RefSeq &ref = refs->getRef(sid);
401 dir = ori->simulate(sampler);
402 insertL = gld->simulate(sampler, ref.getTotLen());
403 if (insertL < 0) return false;
404 int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
405 pos = rspd->simulate(sampler, sid, effL);
406 if (pos < 0) return false;
407 if (dir > 0) pos = ref.getTotLen() - pos - insertL;
409 mateL1 = mld->simulate(sampler, insertL);
410 qual1 = qd->simulate(sampler, mateL1);
411 readseq1 = qpro->simulate(sampler, mateL1, pos, dir, qual1, ref);
413 int m2pos = ref.getTotLen() - pos - insertL;
416 mateL2 = mld->simulate(sampler, insertL);
417 qual2 = qd->simulate(sampler, mateL2);
418 readseq2 = qpro->simulate(sampler, mateL2, m2pos, m2dir, qual2, ref);
421 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
424 read = PairedEndReadQ(SingleReadQ(name + "/1", readseq1, qual1), SingleReadQ(name + "/2", readseq2, qual2));
429 void PairedEndQModel::finishSimulation() {
432 rspd->finishSimulation();
433 qd->finishSimulation();
434 qpro->finishSimulation();
435 nqpro->finishSimulation();
439 void PairedEndQModel::calcMW() {
440 assert(mld->getMinL() >= seedLen);
442 memset(mw, 0, sizeof(double) * (M + 1));
445 for (int i = 1; i <= M; i++) {
446 RefSeq& ref = refs->getRef(i);
447 int totLen = ref.getTotLen();
448 int fullLen = ref.getFullLen();
449 int end = std::min(fullLen, totLen - gld->getMinL() + 1);
454 //seedPos is fpos here
455 for (int seedPos = 0; seedPos < end; seedPos++)
456 if (ref.getMask(seedPos)) {
457 minL = gld->getMinL();
458 maxL = std::min(gld->getMaxL(), totLen - seedPos);
460 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
461 effL = std::min(fullLen, totLen - fragLen + 1);
462 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
469 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
475 #endif /* PAIREDENDQMODEL_H_ */