1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
15 #include "my_assert.h"
16 #include "Orientation.h"
21 #include "NoiseQProfile.h"
23 #include "ModelParams.h"
26 #include "SingleReadQ.h"
27 #include "SingleHit.h"
28 #include "ReadReader.h"
34 SingleQModel(Refs* refs = NULL) {
36 M = (refs != NULL ? refs->getM() : 0);
37 memset(N, 0, sizeof(N));
39 needCalcConPrb = true;
41 ori = new Orientation();
44 rspd = new RSPD(estRSPD);
46 qpro = new QProfile();
47 nqpro = new NoiseQProfile();
49 mean = -1.0; sd = 0.0;
55 //If it is not a master node, only init & update can be used!
56 SingleQModel(ModelParams& params, bool isMaster = true) {
58 memcpy(N, params.N, sizeof(params.N));
60 estRSPD = params.estRSPD;
61 mean = params.mean; sd = params.sd;
62 seedLen = params.seedLen;
63 needCalcConPrb = true;
65 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
69 gld = new LenDist(params.minL, params.maxL);
70 if (mean >= EPSILON) {
71 mld = new LenDist(params.mate_minL, params.mate_maxL);
73 if (!estRSPD) { rspd = new RSPD(estRSPD); }
77 ori = new Orientation(params.probF);
78 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
79 qpro = new QProfile();
80 nqpro = new NoiseQProfile();
85 if (ori != NULL) delete ori;
86 if (gld != NULL) delete gld;
87 if (mld != NULL) delete mld;
88 if (rspd != NULL) delete rspd;
89 if (qd != NULL) delete qd;
90 if (qpro != NULL) delete qpro;
91 if (nqpro != NULL) delete nqpro;
92 if (mw != NULL) delete[] mw;
96 //SingleQModel& operator=(const SingleQModel&);
98 void estimateFromReads(const char*);
100 //if prob is too small, just make it 0
101 double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
102 if (read.isLowQuality()) return 0.0;
105 int sid = hit.getSid();
106 RefSeq &ref = refs->getRef(sid);
107 int fullLen = ref.getFullLen();
108 int totLen = ref.getTotLen();
109 int dir = hit.getDir();
110 int pos = hit.getPos();
111 int readLen = read.getReadLength();
112 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
114 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
115 " from the forward direction, which should be a non-negative number! " + \
116 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
117 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
118 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
119 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
120 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
122 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
123 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
129 int minL = std::max(readLen, gld->getMinL());
130 int maxL = std::min(totLen - pos, gld->getMaxL());
131 int pfpos; // possible fpos for fragment
133 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
134 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
135 effL = std::min(fullLen, totLen - fragLen + 1);
136 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
140 effL = std::min(fullLen, totLen - readLen + 1);
141 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
144 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
146 if (prob < EPSILON) { prob = 0.0; }
148 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
153 double getNoiseConPrb(const SingleReadQ& read) {
154 if (read.isLowQuality()) return 0.0;
155 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
156 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
157 if (prob < EPSILON) { prob = 0.0; }
159 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
164 double getLogP() { return nqpro->getLogP(); }
168 void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
169 if (read.isLowQuality() || frac < EPSILON) return;
171 const RefSeq& ref = refs->getRef(hit.getSid());
173 int dir = hit.getDir();
174 int pos = hit.getPos();
177 int fullLen = ref.getFullLen();
179 // Only use one strand to estimate RSPD
180 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
181 rspd->update(pos, fullLen, frac);
184 if (ori->getProb(0) < ORIVALVE && dir == 1) {
185 int totLen = ref.getTotLen();
186 int readLen = read.getReadLength();
191 int minL = std::max(readLen, gld->getMinL());
192 int maxL = std::min(totLen - pos, gld->getMaxL());
194 assert(maxL >= minL);
195 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
197 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
198 pfpos = totLen - pos - fragLen;
199 effL = std::min(fullLen, totLen - fragLen + 1);
200 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
201 sum += frag_vec[fragLen - minL];
203 assert(sum >= EPSILON);
204 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
205 pfpos = totLen - pos - fragLen;
206 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
210 rspd->update(totLen - pos - readLen, fullLen, frac);
214 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
217 void updateNoise(const SingleReadQ& read, double frac) {
218 if (read.isLowQuality() || frac < EPSILON) return;
220 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
225 void collect(const SingleQModel&);
227 //void copy(const SingleQModel&);
229 bool getNeedCalcConPrb() { return needCalcConPrb; }
230 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
234 //double* getP1() { return p1; }
235 //double* getP2() { return p2; }
237 void read(const char*);
238 void write(const char*);
240 const LenDist& getGLD() { return *gld; }
242 void startSimulation(simul*, const std::vector<double>&);
243 bool simulate(READ_INT_TYPE, SingleReadQ&, int&);
244 void finishSimulation();
246 //Use it after function 'read' or 'estimateFromReads'
247 const double* getMW() {
252 int getModelType() const { return model_type; }
255 static const int model_type = 1;
256 static const int read_type = 1;
263 //double *p1, *p2; P_i' & P_i'';
265 bool estRSPD; // true if estimate RSPD
266 bool needCalcConPrb; //true need, false does not need
273 NoiseQProfile *nqpro;
275 simul *sampler; // for simulation
276 double *theta_cdf; // for simulation
278 double *mw; // for masking
283 void SingleQModel::estimateFromReads(const char* readFN) {
285 char readFs[2][STRLEN];
288 mld != NULL ? mld->init() : gld->init();
289 for (int i = 0; i < 3; i++)
291 genReadFileNames(readFN, i, read_type, s, readFs);
292 ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
294 READ_INT_TYPE cnt = 0;
295 while (reader.next(read)) {
296 if (!read.isLowQuality()) {
297 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
298 qd->update(read.getQScore());
299 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
301 else if (verbose && read.getReadLength() < seedLen) {
302 std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl;
306 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
309 if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
312 mld != NULL ? mld->finish() : gld->finish();
315 if (mean >= EPSILON) {
316 assert(mld->getMaxL() <= gld->getMaxL());
317 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
320 nqpro->calcInitParams();
322 mw = new double[M + 1];
326 void SingleQModel::init() {
327 if (estRSPD) rspd->init();
332 void SingleQModel::finish() {
333 if (estRSPD) rspd->finish();
336 needCalcConPrb = true;
337 if (estRSPD) calcMW();
340 void SingleQModel::collect(const SingleQModel& o) {
341 if (estRSPD) rspd->collect(*(o.rspd));
342 qpro->collect(*(o.qpro));
343 nqpro->collect(*(o.nqpro));
346 //Only master node can call
347 void SingleQModel::read(const char* inpF) {
349 FILE *fi = fopen(inpF, "r");
350 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
352 assert(fscanf(fi, "%d", &val) == 1);
353 assert(val == model_type);
357 assert(fscanf(fi, "%d", &val) == 1);
359 if (mld == NULL) mld = new LenDist();
367 if (fscanf(fi, "%d", &val) == 1) {
370 mw = new double[M + 1];
371 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
378 //Only master node can call. Only be called at EM.cpp
379 void SingleQModel::write(const char* outF) {
380 FILE *fo = fopen(outF, "w");
382 fprintf(fo, "%d\n", model_type);
385 ori->write(fo); fprintf(fo, "\n");
386 gld->write(fo); fprintf(fo, "\n");
391 else { fprintf(fo, "0\n"); }
393 rspd->write(fo); fprintf(fo, "\n");
394 qd->write(fo); fprintf(fo, "\n");
395 qpro->write(fo); fprintf(fo, "\n");
399 fprintf(fo, "\n%d\n", M);
400 for (int i = 0; i < M; i++) {
401 fprintf(fo, "%.15g ", mw[i]);
403 fprintf(fo, "%.15g\n", mw[M]);
409 void SingleQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
410 this->sampler = sampler;
412 theta_cdf = new double[M + 1];
413 for (int i = 0; i <= M; i++) {
414 theta_cdf[i] = theta[i];
415 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
418 rspd->startSimulation(M, refs);
419 qd->startSimulation();
420 qpro->startSimulation();
421 nqpro->startSimulation();
424 bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) {
425 int dir, pos, readLen, fragLen;
427 std::string qual, readseq;
428 std::ostringstream strout;
430 sid = sampler->sample(theta_cdf, M + 1);
434 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
435 qual = qd->simulate(sampler, readLen);
436 readseq = nqpro->simulate(sampler, readLen, qual);
439 RefSeq &ref = refs->getRef(sid);
440 dir = ori->simulate(sampler);
441 fragLen = gld->simulate(sampler, ref.getTotLen());
442 if (fragLen < 0) return false;
444 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
445 pos = rspd->simulate(sampler, sid, effL);
446 if (pos < 0) return false;
447 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
450 readLen = mld->simulate(sampler, fragLen);
451 if (readLen < 0) return false;
452 qual = qd->simulate(sampler, readLen);
453 readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
456 qual = qd->simulate(sampler, fragLen);
457 readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
461 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
464 read = SingleReadQ(name, readseq, qual);
469 void SingleQModel::finishSimulation() {
472 rspd->finishSimulation();
473 qd->finishSimulation();
474 qpro->finishSimulation();
475 nqpro->finishSimulation();
478 void SingleQModel::calcMW() {
481 assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
483 memset(mw, 0, sizeof(double) * (M + 1));
486 probF = ori->getProb(0);
487 probR = ori->getProb(1);
489 for (int i = 1; i <= M; i++) {
490 RefSeq& ref = refs->getRef(i);
491 int totLen = ref.getTotLen();
492 int fullLen = ref.getFullLen();
496 int end = std::min(fullLen, totLen - seedLen + 1);
499 for (int seedPos = 0; seedPos < end; seedPos++)
500 if (ref.getMask(seedPos)) {
502 minL = gld->getMinL();
503 maxL = std::min(gld->getMaxL(), totLen - seedPos);
505 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
506 effL = std::min(fullLen, totLen - fragLen + 1);
507 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
508 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
511 minL = gld->getMinL();
512 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
513 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
514 pfpos = seedPos - (fragLen - seedLen);
515 effL = std::min(fullLen, totLen - fragLen + 1);
516 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
517 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
521 //for reverse strand masking
522 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
523 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
524 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
525 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
526 pfpos = seedPos - (fragLen - seedLen);
527 effL = std::min(fullLen, totLen - fragLen + 1);
528 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
529 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
536 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
542 #endif /* SINGLEQMODEL_H_ */