1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
13 #include "Orientation.h"
18 #include "NoiseQProfile.h"
20 #include "ModelParams.h"
23 #include "SingleReadQ.h"
24 #include "SingleHit.h"
25 #include "ReadReader.h"
31 SingleQModel(Refs* refs = NULL) {
33 M = (refs != NULL ? refs->getM() : 0);
34 memset(N, 0, sizeof(N));
36 needCalcConPrb = true;
38 ori = new Orientation();
41 rspd = new RSPD(estRSPD);
43 qpro = new QProfile();
44 nqpro = new NoiseQProfile();
46 mean = -1.0; sd = 0.0;
52 //If it is not a master node, only init & update can be used!
53 SingleQModel(ModelParams& params, bool isMaster = true) {
55 memcpy(N, params.N, sizeof(params.N));
57 estRSPD = params.estRSPD;
58 mean = params.mean; sd = params.sd;
59 seedLen = params.seedLen;
60 needCalcConPrb = true;
62 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
66 ori = new Orientation(params.probF);
67 gld = new LenDist(params.minL, params.maxL);
68 if (mean >= EPSILON) {
69 mld = new LenDist(params.mate_minL, params.mate_maxL);
71 if (!estRSPD) { rspd = new RSPD(estRSPD); }
75 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
76 qpro = new QProfile();
77 nqpro = new NoiseQProfile();
82 if (ori != NULL) delete ori;
83 if (gld != NULL) delete gld;
84 if (mld != NULL) delete mld;
85 if (rspd != NULL) delete rspd;
86 if (qd != NULL) delete qd;
87 if (qpro != NULL) delete qpro;
88 if (nqpro != NULL) delete nqpro;
89 if (mw != NULL) delete[] mw;
93 //SingleQModel& operator=(const SingleQModel&);
95 void estimateFromReads(const char*);
97 //if prob is too small, just make it 0
98 double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
99 if (read.isLowQuality()) return 0.0;
102 int sid = hit.getSid();
103 RefSeq &ref = refs->getRef(sid);
104 int fullLen = ref.getFullLen();
105 int totLen = ref.getTotLen();
106 int dir = hit.getDir();
107 int pos = hit.getPos();
108 int readLen = read.getReadLength();
109 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
111 assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
112 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
113 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
119 int minL = std::max(readLen, gld->getMinL());
120 int maxL = std::min(totLen - pos, gld->getMaxL());
121 int pfpos; // possible fpos for fragment
123 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
124 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
125 effL = std::min(fullLen, totLen - fragLen + 1);
126 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
130 effL = std::min(fullLen, totLen - readLen + 1);
131 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
134 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
136 if (prob < EPSILON) { prob = 0.0; }
138 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
143 double getNoiseConPrb(const SingleReadQ& read) {
144 if (read.isLowQuality()) return 0.0;
145 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
146 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
147 if (prob < EPSILON) { prob = 0.0; }
149 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
154 double getLogP() { return nqpro->getLogP(); }
158 void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
159 if (read.isLowQuality() || frac < EPSILON) return;
161 RefSeq& ref = refs->getRef(hit.getSid());
162 int dir = hit.getDir();
163 int pos = hit.getPos();
166 int fullLen = ref.getFullLen();
168 // Only use one strand to estimate RSPD
169 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
170 rspd->update(pos, fullLen, frac);
173 if (ori->getProb(0) < ORIVALVE && dir == 1) {
174 int totLen = ref.getTotLen();
175 int readLen = read.getReadLength();
180 int minL = std::max(readLen, gld->getMinL());
181 int maxL = std::min(totLen - pos, gld->getMaxL());
183 assert(maxL >= minL);
184 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
186 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
187 pfpos = totLen - pos - fragLen;
188 effL = std::min(fullLen, totLen - fragLen + 1);
189 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
190 sum += frag_vec[fragLen - minL];
192 assert(sum >= EPSILON);
193 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
194 pfpos = totLen - pos - fragLen;
195 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
199 rspd->update(totLen - pos - readLen, fullLen, frac);
203 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
206 void updateNoise(const SingleReadQ& read, double frac) {
207 if (read.isLowQuality() || frac < EPSILON) return;
209 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
214 void collect(const SingleQModel&);
216 //void copy(const SingleQModel&);
218 bool getNeedCalcConPrb() { return needCalcConPrb; }
219 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
223 //double* getP1() { return p1; }
224 //double* getP2() { return p2; }
226 void read(const char*);
227 void write(const char*);
229 const LenDist& getGLD() { return *gld; }
231 void startSimulation(simul*, double*);
232 bool simulate(int, SingleReadQ&, int&);
233 void finishSimulation();
235 //Use it after function 'read' or 'estimateFromReads'
241 int getModelType() const { return model_type; }
244 static const int model_type = 1;
245 static const int read_type = 1;
252 //double *p1, *p2; P_i' & P_i'';
254 bool estRSPD; // true if estimate RSPD
255 bool needCalcConPrb; //true need, false does not need
262 NoiseQProfile *nqpro;
264 simul *sampler; // for simulation
265 double *theta_cdf; // for simulation
267 double *mw; // for masking
272 void SingleQModel::estimateFromReads(const char* readFN) {
274 char readFs[2][STRLEN];
277 mld != NULL ? mld->init() : gld->init();
278 for (int i = 0; i < 3; i++)
280 genReadFileNames(readFN, i, read_type, s, readFs);
281 ReadReader<SingleReadQ> reader(s, readFs);
284 while (reader.next(read)) {
285 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
286 qd->update(read.getQScore());
287 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
290 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
293 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
296 mld != NULL ? mld->finish() : gld->finish();
299 if (mean >= EPSILON) {
300 assert(mld->getMaxL() <= gld->getMaxL());
301 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
304 nqpro->calcInitParams();
306 mw = new double[M + 1];
310 void SingleQModel::init() {
311 if (estRSPD) rspd->init();
316 void SingleQModel::finish() {
317 if (estRSPD) rspd->finish();
320 needCalcConPrb = true;
321 if (estRSPD) calcMW();
324 void SingleQModel::collect(const SingleQModel& o) {
325 if (estRSPD) rspd->collect(*(o.rspd));
326 qpro->collect(*(o.qpro));
327 nqpro->collect(*(o.nqpro));
330 //Only master node can call
331 void SingleQModel::read(const char* inpF) {
333 FILE *fi = fopen(inpF, "r");
334 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
336 fscanf(fi, "%d", &val);
337 assert(val == model_type);
341 fscanf(fi, "%d", &val);
343 if (mld == NULL) mld = new LenDist();
351 if (fscanf(fi, "%d", &M) == 1) {
352 mw = new double[M + 1];
353 for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
359 //Only master node can call
360 void SingleQModel::write(const char* outF) {
361 FILE *fo = fopen(outF, "w");
363 fprintf(fo, "%d\n", model_type);
366 ori->write(fo); fprintf(fo, "\n");
367 gld->write(fo); fprintf(fo, "\n");
372 else { fprintf(fo, "0\n"); }
374 rspd->write(fo); fprintf(fo, "\n");
375 qd->write(fo); fprintf(fo, "\n");
376 qpro->write(fo); fprintf(fo, "\n");
380 fprintf(fo, "\n%d\n", M);
381 for (int i = 0; i < M; i++) {
382 fprintf(fo, "%.15g ", mw[i]);
384 fprintf(fo, "%.15g\n", mw[M]);
390 void SingleQModel::startSimulation(simul* sampler, double* theta) {
391 this->sampler = sampler;
393 theta_cdf = new double[M + 1];
394 for (int i = 0; i <= M; i++) {
395 theta_cdf[i] = theta[i];
396 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
399 rspd->startSimulation(M, refs);
400 qd->startSimulation();
401 qpro->startSimulation();
402 nqpro->startSimulation();
405 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
406 int dir, pos, readLen, fragLen;
408 std::string qual, readseq;
409 std::ostringstream strout;
411 sid = sampler->sample(theta_cdf, M + 1);
415 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
416 qual = qd->simulate(sampler, readLen);
417 readseq = nqpro->simulate(sampler, readLen, qual);
420 RefSeq &ref = refs->getRef(sid);
421 dir = ori->simulate(sampler);
422 fragLen = gld->simulate(sampler, ref.getTotLen());
423 if (fragLen < 0) return false;
425 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
426 pos = rspd->simulate(sampler, sid, effL);
427 if (pos < 0) return false;
428 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
431 readLen = mld->simulate(sampler, fragLen);
432 if (readLen < 0) return false;
433 qual = qd->simulate(sampler, readLen);
434 readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
437 qual = qd->simulate(sampler, fragLen);
438 readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
442 std::ostringstream stdout;
443 stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
446 read = SingleReadQ(name, readseq, qual);
451 void SingleQModel::finishSimulation() {
454 rspd->finishSimulation();
455 qd->finishSimulation();
456 qpro->finishSimulation();
457 nqpro->finishSimulation();
460 void SingleQModel::calcMW() {
463 assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
465 memset(mw, 0, sizeof(double) * (M + 1));
469 probF = ori->getProb(0);
470 probR = ori->getProb(1);
472 for (int i = 1; i <= M; i++) {
473 RefSeq& ref = refs->getRef(i);
474 int totLen = ref.getTotLen();
475 int fullLen = ref.getFullLen();
479 int end = std::min(fullLen, totLen - seedLen + 1);
482 for (int seedPos = 0; seedPos < end; seedPos++)
483 if (ref.getMask(seedPos)) {
485 minL = gld->getMinL();
486 maxL = std::min(gld->getMaxL(), totLen - seedPos);
488 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
489 effL = std::min(fullLen, totLen - fragLen + 1);
490 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
491 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
494 minL = gld->getMinL();
495 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
496 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
497 pfpos = seedPos - (fragLen - seedLen);
498 effL = std::min(fullLen, totLen - fragLen + 1);
499 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
500 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
504 //for reverse strand masking
505 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
506 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
507 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
508 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
509 pfpos = seedPos - (fragLen - seedLen);
510 effL = std::min(fullLen, totLen - fragLen + 1);
511 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
512 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
519 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
525 #endif /* SINGLEQMODEL_H_ */