1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
13 #include "Orientation.h"
18 #include "NoiseQProfile.h"
20 #include "ModelParams.h"
23 #include "SingleReadQ.h"
24 #include "SingleHit.h"
25 #include "ReadReader.h"
31 SingleQModel(Refs* refs = NULL) {
33 M = (refs != NULL ? refs->getM() : 0);
34 memset(N, 0, sizeof(N));
36 needCalcConPrb = true;
38 ori = new Orientation();
41 rspd = new RSPD(estRSPD);
43 qpro = new QProfile();
44 nqpro = new NoiseQProfile();
46 mean = -1.0; sd = 0.0;
52 //If it is not a master node, only init & update can be used!
53 SingleQModel(ModelParams& params, bool isMaster = true) {
55 memcpy(N, params.N, sizeof(params.N));
57 estRSPD = params.estRSPD;
58 mean = params.mean; sd = params.sd;
59 seedLen = params.seedLen;
60 needCalcConPrb = true;
62 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
66 gld = new LenDist(params.minL, params.maxL);
67 if (mean >= EPSILON) {
68 mld = new LenDist(params.mate_minL, params.mate_maxL);
70 if (!estRSPD) { rspd = new RSPD(estRSPD); }
74 ori = new Orientation(params.probF);
75 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
76 qpro = new QProfile();
77 nqpro = new NoiseQProfile();
82 if (ori != NULL) delete ori;
83 if (gld != NULL) delete gld;
84 if (mld != NULL) delete mld;
85 if (rspd != NULL) delete rspd;
86 if (qd != NULL) delete qd;
87 if (qpro != NULL) delete qpro;
88 if (nqpro != NULL) delete nqpro;
89 if (mw != NULL) delete[] mw;
93 //SingleQModel& operator=(const SingleQModel&);
95 void estimateFromReads(const char*);
97 //if prob is too small, just make it 0
98 double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
99 if (read.isLowQuality()) return 0.0;
102 int sid = hit.getSid();
103 RefSeq &ref = refs->getRef(sid);
104 int fullLen = ref.getFullLen();
105 int totLen = ref.getTotLen();
106 int dir = hit.getDir();
107 int pos = hit.getPos();
108 int readLen = read.getReadLength();
109 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
111 assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
112 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
113 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
119 int minL = std::max(readLen, gld->getMinL());
120 int maxL = std::min(totLen - pos, gld->getMaxL());
121 int pfpos; // possible fpos for fragment
123 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
124 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
125 effL = std::min(fullLen, totLen - fragLen + 1);
126 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
130 effL = std::min(fullLen, totLen - readLen + 1);
131 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
134 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
136 if (prob < EPSILON) { prob = 0.0; }
138 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
143 double getNoiseConPrb(const SingleReadQ& read) {
144 if (read.isLowQuality()) return 0.0;
145 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
146 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
147 if (prob < EPSILON) { prob = 0.0; }
149 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
154 double getLogP() { return nqpro->getLogP(); }
158 void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
159 if (read.isLowQuality() || frac < EPSILON) return;
161 const RefSeq& ref = refs->getRef(hit.getSid());
163 int dir = hit.getDir();
164 int pos = hit.getPos();
167 int fullLen = ref.getFullLen();
169 // Only use one strand to estimate RSPD
170 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
171 rspd->update(pos, fullLen, frac);
174 if (ori->getProb(0) < ORIVALVE && dir == 1) {
175 int totLen = ref.getTotLen();
176 int readLen = read.getReadLength();
181 int minL = std::max(readLen, gld->getMinL());
182 int maxL = std::min(totLen - pos, gld->getMaxL());
184 assert(maxL >= minL);
185 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
187 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
188 pfpos = totLen - pos - fragLen;
189 effL = std::min(fullLen, totLen - fragLen + 1);
190 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
191 sum += frag_vec[fragLen - minL];
193 assert(sum >= EPSILON);
194 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
195 pfpos = totLen - pos - fragLen;
196 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
200 rspd->update(totLen - pos - readLen, fullLen, frac);
204 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
207 void updateNoise(const SingleReadQ& read, double frac) {
208 if (read.isLowQuality() || frac < EPSILON) return;
210 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
215 void collect(const SingleQModel&);
217 //void copy(const SingleQModel&);
219 bool getNeedCalcConPrb() { return needCalcConPrb; }
220 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
224 //double* getP1() { return p1; }
225 //double* getP2() { return p2; }
227 void read(const char*);
228 void write(const char*);
230 const LenDist& getGLD() { return *gld; }
232 void startSimulation(simul*, double*);
233 bool simulate(int, SingleReadQ&, int&);
234 void finishSimulation();
236 //Use it after function 'read' or 'estimateFromReads'
242 int getModelType() const { return model_type; }
245 static const int model_type = 1;
246 static const int read_type = 1;
253 //double *p1, *p2; P_i' & P_i'';
255 bool estRSPD; // true if estimate RSPD
256 bool needCalcConPrb; //true need, false does not need
263 NoiseQProfile *nqpro;
265 simul *sampler; // for simulation
266 double *theta_cdf; // for simulation
268 double *mw; // for masking
273 void SingleQModel::estimateFromReads(const char* readFN) {
275 char readFs[2][STRLEN];
278 mld != NULL ? mld->init() : gld->init();
279 for (int i = 0; i < 3; i++)
281 genReadFileNames(readFN, i, read_type, s, readFs);
282 ReadReader<SingleReadQ> reader(s, readFs);
285 while (reader.next(read)) {
286 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
287 qd->update(read.getQScore());
288 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
291 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
294 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
297 mld != NULL ? mld->finish() : gld->finish();
300 if (mean >= EPSILON) {
301 assert(mld->getMaxL() <= gld->getMaxL());
302 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
305 nqpro->calcInitParams();
307 mw = new double[M + 1];
311 void SingleQModel::init() {
312 if (estRSPD) rspd->init();
317 void SingleQModel::finish() {
318 if (estRSPD) rspd->finish();
321 needCalcConPrb = true;
322 if (estRSPD) calcMW();
325 void SingleQModel::collect(const SingleQModel& o) {
326 if (estRSPD) rspd->collect(*(o.rspd));
327 qpro->collect(*(o.qpro));
328 nqpro->collect(*(o.nqpro));
331 //Only master node can call
332 void SingleQModel::read(const char* inpF) {
334 FILE *fi = fopen(inpF, "r");
335 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
337 fscanf(fi, "%d", &val);
338 assert(val == model_type);
342 fscanf(fi, "%d", &val);
344 if (mld == NULL) mld = new LenDist();
352 if (fscanf(fi, "%d", &val) == 1) {
355 mw = new double[M + 1];
356 for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
363 //Only master node can call. Only be called at EM.cpp
364 void SingleQModel::write(const char* outF) {
365 FILE *fo = fopen(outF, "w");
367 fprintf(fo, "%d\n", model_type);
370 ori->write(fo); fprintf(fo, "\n");
371 gld->write(fo); fprintf(fo, "\n");
376 else { fprintf(fo, "0\n"); }
378 rspd->write(fo); fprintf(fo, "\n");
379 qd->write(fo); fprintf(fo, "\n");
380 qpro->write(fo); fprintf(fo, "\n");
384 fprintf(fo, "\n%d\n", M);
385 for (int i = 0; i < M; i++) {
386 fprintf(fo, "%.15g ", mw[i]);
388 fprintf(fo, "%.15g\n", mw[M]);
394 void SingleQModel::startSimulation(simul* sampler, double* theta) {
395 this->sampler = sampler;
397 theta_cdf = new double[M + 1];
398 for (int i = 0; i <= M; i++) {
399 theta_cdf[i] = theta[i];
400 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
403 rspd->startSimulation(M, refs);
404 qd->startSimulation();
405 qpro->startSimulation();
406 nqpro->startSimulation();
409 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
410 int dir, pos, readLen, fragLen;
412 std::string qual, readseq;
413 std::ostringstream strout;
415 sid = sampler->sample(theta_cdf, M + 1);
419 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
420 qual = qd->simulate(sampler, readLen);
421 readseq = nqpro->simulate(sampler, readLen, qual);
424 RefSeq &ref = refs->getRef(sid);
425 dir = ori->simulate(sampler);
426 fragLen = gld->simulate(sampler, ref.getTotLen());
427 if (fragLen < 0) return false;
429 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
430 pos = rspd->simulate(sampler, sid, effL);
431 if (pos < 0) return false;
432 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
435 readLen = mld->simulate(sampler, fragLen);
436 if (readLen < 0) return false;
437 qual = qd->simulate(sampler, readLen);
438 readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
441 qual = qd->simulate(sampler, fragLen);
442 readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
446 std::ostringstream stdout;
447 stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
450 read = SingleReadQ(name, readseq, qual);
455 void SingleQModel::finishSimulation() {
458 rspd->finishSimulation();
459 qd->finishSimulation();
460 qpro->finishSimulation();
461 nqpro->finishSimulation();
464 void SingleQModel::calcMW() {
467 assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
469 memset(mw, 0, sizeof(double) * (M + 1));
472 probF = ori->getProb(0);
473 probR = ori->getProb(1);
475 for (int i = 1; i <= M; i++) {
476 RefSeq& ref = refs->getRef(i);
477 int totLen = ref.getTotLen();
478 int fullLen = ref.getFullLen();
482 int end = std::min(fullLen, totLen - seedLen + 1);
485 for (int seedPos = 0; seedPos < end; seedPos++)
486 if (ref.getMask(seedPos)) {
488 minL = gld->getMinL();
489 maxL = std::min(gld->getMaxL(), totLen - seedPos);
491 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
492 effL = std::min(fullLen, totLen - fragLen + 1);
493 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
494 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
497 minL = gld->getMinL();
498 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
499 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
500 pfpos = seedPos - (fragLen - seedLen);
501 effL = std::min(fullLen, totLen - fragLen + 1);
502 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
503 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
507 //for reverse strand masking
508 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
509 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
510 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
511 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
512 pfpos = seedPos - (fragLen - seedLen);
513 effL = std::min(fullLen, totLen - fragLen + 1);
514 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
515 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
522 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
528 #endif /* SINGLEQMODEL_H_ */