1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
13 #include "Orientation.h"
18 #include "NoiseQProfile.h"
20 #include "ModelParams.h"
23 #include "SingleReadQ.h"
24 #include "SingleHit.h"
25 #include "ReadReader.h"
31 SingleQModel(Refs* refs = NULL) {
33 M = (refs != NULL ? refs->getM() : 0);
34 memset(N, 0, sizeof(N));
36 needCalcConPrb = true;
38 ori = new Orientation();
41 rspd = new RSPD(estRSPD);
43 qpro = new QProfile();
44 nqpro = new NoiseQProfile();
46 mean = -1.0; sd = 0.0;
52 //If it is not a master node, only init & update can be used!
53 SingleQModel(ModelParams& params, bool isMaster = true) {
55 memcpy(N, params.N, sizeof(params.N));
57 estRSPD = params.estRSPD;
58 mean = params.mean; sd = params.sd;
59 seedLen = params.seedLen;
60 needCalcConPrb = true;
62 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
66 gld = new LenDist(params.minL, params.maxL);
67 if (mean >= EPSILON) {
68 mld = new LenDist(params.mate_minL, params.mate_maxL);
70 if (!estRSPD) { rspd = new RSPD(estRSPD); }
74 ori = new Orientation(params.probF);
75 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
76 qpro = new QProfile();
77 nqpro = new NoiseQProfile();
82 if (ori != NULL) delete ori;
83 if (gld != NULL) delete gld;
84 if (mld != NULL) delete mld;
85 if (rspd != NULL) delete rspd;
86 if (qd != NULL) delete qd;
87 if (qpro != NULL) delete qpro;
88 if (nqpro != NULL) delete nqpro;
89 if (mw != NULL) delete[] mw;
93 //SingleQModel& operator=(const SingleQModel&);
95 void estimateFromReads(const char*);
97 //if prob is too small, just make it 0
98 double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
99 if (read.isLowQuality()) return 0.0;
102 int sid = hit.getSid();
103 RefSeq &ref = refs->getRef(sid);
104 int fullLen = ref.getFullLen();
105 int totLen = ref.getTotLen();
106 int dir = hit.getDir();
107 int pos = hit.getPos();
108 int readLen = read.getReadLength();
109 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
111 assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
112 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
113 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
119 int minL = std::max(readLen, gld->getMinL());
120 int maxL = std::min(totLen - pos, gld->getMaxL());
121 int pfpos; // possible fpos for fragment
123 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
124 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
125 effL = std::min(fullLen, totLen - fragLen + 1);
126 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
130 effL = std::min(fullLen, totLen - readLen + 1);
131 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
134 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
136 if (prob < EPSILON) { prob = 0.0; }
138 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
143 double getNoiseConPrb(const SingleReadQ& read) {
144 if (read.isLowQuality()) return 0.0;
145 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
146 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
147 if (prob < EPSILON) { prob = 0.0; }
149 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
154 double getLogP() { return nqpro->getLogP(); }
158 void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
159 if (read.isLowQuality() || frac < EPSILON) return;
161 const RefSeq& ref = refs->getRef(hit.getSid());
163 int dir = hit.getDir();
164 int pos = hit.getPos();
167 int fullLen = ref.getFullLen();
169 // Only use one strand to estimate RSPD
170 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
171 rspd->update(pos, fullLen, frac);
174 if (ori->getProb(0) < ORIVALVE && dir == 1) {
175 int totLen = ref.getTotLen();
176 int readLen = read.getReadLength();
181 int minL = std::max(readLen, gld->getMinL());
182 int maxL = std::min(totLen - pos, gld->getMaxL());
184 assert(maxL >= minL);
185 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
187 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
188 pfpos = totLen - pos - fragLen;
189 effL = std::min(fullLen, totLen - fragLen + 1);
190 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
191 sum += frag_vec[fragLen - minL];
193 assert(sum >= EPSILON);
194 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
195 pfpos = totLen - pos - fragLen;
196 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
200 rspd->update(totLen - pos - readLen, fullLen, frac);
204 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
207 void updateNoise(const SingleReadQ& read, double frac) {
208 if (read.isLowQuality() || frac < EPSILON) return;
210 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
215 void collect(const SingleQModel&);
217 //void copy(const SingleQModel&);
219 bool getNeedCalcConPrb() { return needCalcConPrb; }
220 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
224 //double* getP1() { return p1; }
225 //double* getP2() { return p2; }
227 void read(const char*);
228 void write(const char*);
230 const LenDist& getGLD() { return *gld; }
232 void startSimulation(simul*, double*);
233 bool simulate(int, SingleReadQ&, int&);
234 void finishSimulation();
236 //Use it after function 'read' or 'estimateFromReads'
242 int getModelType() const { return model_type; }
245 static const int model_type = 1;
246 static const int read_type = 1;
253 //double *p1, *p2; P_i' & P_i'';
255 bool estRSPD; // true if estimate RSPD
256 bool needCalcConPrb; //true need, false does not need
263 NoiseQProfile *nqpro;
265 simul *sampler; // for simulation
266 double *theta_cdf; // for simulation
268 double *mw; // for masking
273 void SingleQModel::estimateFromReads(const char* readFN) {
275 char readFs[2][STRLEN];
278 mld != NULL ? mld->init() : gld->init();
279 for (int i = 0; i < 3; i++)
281 genReadFileNames(readFN, i, read_type, s, readFs);
282 ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
285 while (reader.next(read)) {
286 if (!read.isLowQuality()) {
287 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
288 qd->update(read.getQScore());
289 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
291 else if (verbose && read.getReadLength() < seedLen) {
292 printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
296 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
299 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
302 mld != NULL ? mld->finish() : gld->finish();
305 if (mean >= EPSILON) {
306 assert(mld->getMaxL() <= gld->getMaxL());
307 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
310 nqpro->calcInitParams();
312 mw = new double[M + 1];
316 void SingleQModel::init() {
317 if (estRSPD) rspd->init();
322 void SingleQModel::finish() {
323 if (estRSPD) rspd->finish();
326 needCalcConPrb = true;
327 if (estRSPD) calcMW();
330 void SingleQModel::collect(const SingleQModel& o) {
331 if (estRSPD) rspd->collect(*(o.rspd));
332 qpro->collect(*(o.qpro));
333 nqpro->collect(*(o.nqpro));
336 //Only master node can call
337 void SingleQModel::read(const char* inpF) {
339 FILE *fi = fopen(inpF, "r");
340 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
342 assert(fscanf(fi, "%d", &val) == 1);
343 assert(val == model_type);
347 assert(fscanf(fi, "%d", &val) == 1);
349 if (mld == NULL) mld = new LenDist();
357 if (fscanf(fi, "%d", &val) == 1) {
360 mw = new double[M + 1];
361 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
368 //Only master node can call. Only be called at EM.cpp
369 void SingleQModel::write(const char* outF) {
370 FILE *fo = fopen(outF, "w");
372 fprintf(fo, "%d\n", model_type);
375 ori->write(fo); fprintf(fo, "\n");
376 gld->write(fo); fprintf(fo, "\n");
381 else { fprintf(fo, "0\n"); }
383 rspd->write(fo); fprintf(fo, "\n");
384 qd->write(fo); fprintf(fo, "\n");
385 qpro->write(fo); fprintf(fo, "\n");
389 fprintf(fo, "\n%d\n", M);
390 for (int i = 0; i < M; i++) {
391 fprintf(fo, "%.15g ", mw[i]);
393 fprintf(fo, "%.15g\n", mw[M]);
399 void SingleQModel::startSimulation(simul* sampler, double* theta) {
400 this->sampler = sampler;
402 theta_cdf = new double[M + 1];
403 for (int i = 0; i <= M; i++) {
404 theta_cdf[i] = theta[i];
405 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
408 rspd->startSimulation(M, refs);
409 qd->startSimulation();
410 qpro->startSimulation();
411 nqpro->startSimulation();
414 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
415 int dir, pos, readLen, fragLen;
417 std::string qual, readseq;
418 std::ostringstream strout;
420 sid = sampler->sample(theta_cdf, M + 1);
424 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
425 qual = qd->simulate(sampler, readLen);
426 readseq = nqpro->simulate(sampler, readLen, qual);
429 RefSeq &ref = refs->getRef(sid);
430 dir = ori->simulate(sampler);
431 fragLen = gld->simulate(sampler, ref.getTotLen());
432 if (fragLen < 0) return false;
434 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
435 pos = rspd->simulate(sampler, sid, effL);
436 if (pos < 0) return false;
437 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
440 readLen = mld->simulate(sampler, fragLen);
441 if (readLen < 0) return false;
442 qual = qd->simulate(sampler, readLen);
443 readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
446 qual = qd->simulate(sampler, fragLen);
447 readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
451 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
454 read = SingleReadQ(name, readseq, qual);
459 void SingleQModel::finishSimulation() {
462 rspd->finishSimulation();
463 qd->finishSimulation();
464 qpro->finishSimulation();
465 nqpro->finishSimulation();
468 void SingleQModel::calcMW() {
471 assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
473 memset(mw, 0, sizeof(double) * (M + 1));
476 probF = ori->getProb(0);
477 probR = ori->getProb(1);
479 for (int i = 1; i <= M; i++) {
480 RefSeq& ref = refs->getRef(i);
481 int totLen = ref.getTotLen();
482 int fullLen = ref.getFullLen();
486 int end = std::min(fullLen, totLen - seedLen + 1);
489 for (int seedPos = 0; seedPos < end; seedPos++)
490 if (ref.getMask(seedPos)) {
492 minL = gld->getMinL();
493 maxL = std::min(gld->getMaxL(), totLen - seedPos);
495 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
496 effL = std::min(fullLen, totLen - fragLen + 1);
497 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
498 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
501 minL = gld->getMinL();
502 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
503 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
504 pfpos = seedPos - (fragLen - seedLen);
505 effL = std::min(fullLen, totLen - fragLen + 1);
506 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
507 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
511 //for reverse strand masking
512 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
513 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
514 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
515 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
516 pfpos = seedPos - (fragLen - seedLen);
517 effL = std::min(fullLen, totLen - fragLen + 1);
518 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
519 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
526 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
532 #endif /* SINGLEQMODEL_H_ */