13 #include "my_assert.h"
14 #include "Orientation.h"
18 #include "NoiseProfile.h"
20 #include "ModelParams.h"
23 #include "SingleRead.h"
24 #include "SingleHit.h"
25 #include "ReadReader.h"
31 SingleModel(Refs* refs = NULL) {
33 M = (refs != NULL ? refs->getM() : 0);
34 memset(N, 0, sizeof(N));
36 needCalcConPrb = true;
38 ori = new Orientation();
41 rspd = new RSPD(estRSPD);
43 npro = new NoiseProfile();
45 mean = -1.0; sd = 0.0;
51 //If it is not a master node, only init & update can be used!
52 SingleModel(ModelParams& params, bool isMaster = true) {
54 memcpy(N, params.N, sizeof(params.N));
56 estRSPD = params.estRSPD;
57 mean = params.mean; sd = params.sd;
58 seedLen = params.seedLen;
59 needCalcConPrb = true;
61 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; pro = NULL; npro = NULL;
65 gld = new LenDist(params.minL, params.maxL);
66 if (mean >= EPSILON) {
67 mld = new LenDist(params.mate_minL, params.mate_maxL);
69 if (!estRSPD) { rspd = new RSPD(estRSPD); }
72 ori = new Orientation(params.probF);
73 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
74 pro = new Profile(params.maxL);
75 npro = new NoiseProfile();
80 if (ori != NULL) delete ori;
81 if (gld != NULL) delete gld;
82 if (mld != NULL) delete mld;
83 if (rspd != NULL) delete rspd;
84 if (pro != NULL) delete pro;
85 if (npro != NULL) delete npro;
86 if (mw != NULL) delete[] mw;
90 void estimateFromReads(const char*);
92 //if prob is too small, just make it 0
93 double getConPrb(const SingleRead& read, const SingleHit& hit) {
94 if (read.isLowQuality()) return 0.0;
97 int sid = hit.getSid();
98 RefSeq &ref = refs->getRef(sid);
99 int fullLen = ref.getFullLen();
100 int totLen = ref.getTotLen();
101 int dir = hit.getDir();
102 int pos = hit.getPos();
103 int readLen = read.getReadLength();
104 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
106 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
107 " from the forward direction, which should be a non-negative number! " + \
108 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
109 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
110 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
111 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
112 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
114 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
115 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
121 int minL = std::max(readLen, gld->getMinL());
122 int maxL = std::min(totLen - pos, gld->getMaxL());
123 int pfpos; // possible fpos for fragment
125 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
126 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
127 effL = std::min(fullLen, totLen - fragLen + 1);
128 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
132 effL = std::min(fullLen, totLen - readLen + 1);
133 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
136 prob = ori->getProb(dir) * value * pro->getProb(read.getReadSeq(), ref, pos, dir);
138 if (prob < EPSILON) { prob = 0.0; }
141 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
146 double getNoiseConPrb(const SingleRead& read) {
147 if (read.isLowQuality()) return 0.0;
148 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
149 prob *= npro->getProb(read.getReadSeq());
150 if (prob < EPSILON) { prob = 0.0; }
152 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
157 double getLogP() { return npro->getLogP(); }
161 void update(const SingleRead& read, const SingleHit& hit, double frac) {
162 if (read.isLowQuality() || frac < EPSILON) return;
164 RefSeq& ref = refs->getRef(hit.getSid());
165 int dir = hit.getDir();
166 int pos = hit.getPos();
169 int fullLen = ref.getFullLen();
171 // Only use one strand to estimate RSPD
172 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
173 rspd->update(pos, fullLen, frac);
176 if (ori->getProb(0) < ORIVALVE && dir == 1) {
177 int totLen = ref.getTotLen();
178 int readLen = read.getReadLength();
183 int minL = std::max(readLen, gld->getMinL());
184 int maxL = std::min(totLen - pos, gld->getMaxL());
186 assert(maxL >= minL);
187 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
189 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
190 pfpos = totLen - pos - fragLen;
191 effL = std::min(fullLen, totLen - fragLen + 1);
192 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
193 sum += frag_vec[fragLen - minL];
195 assert(sum >= EPSILON);
196 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
197 pfpos = totLen - pos - fragLen;
198 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
202 rspd->update(totLen - pos - readLen, fullLen, frac);
206 pro->update(read.getReadSeq(), ref, pos, dir, frac);
209 void updateNoise(const SingleRead& read, double frac) {
210 if (read.isLowQuality() || frac < EPSILON) return;
212 npro->update(read.getReadSeq(), frac);
217 void collect(const SingleModel&);
219 bool getNeedCalcConPrb() { return needCalcConPrb; }
220 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
224 //double* getP1() { return p1; }
225 //double* getP2() { return p2; }
227 void read(const char*);
228 void write(const char*);
230 const LenDist& getGLD() { return *gld; }
232 void startSimulation(simul*, double*);
233 bool simulate(int, SingleRead&, int&);
234 void finishSimulation();
241 int getModelType() const { return model_type; }
244 static const int model_type = 0;
245 static const int read_type = 0;
252 //double *p1, *p2; P_i' & P_i''
254 bool estRSPD; // true if estimate RSPD
255 bool needCalcConPrb; // true need, false does not need
263 simul *sampler; // for simulation
264 double *theta_cdf; // for simulation
266 double *mw; // for masking
271 void SingleModel::estimateFromReads(const char* readFN) {
273 char readFs[2][STRLEN];
276 mld != NULL ? mld->init() : gld->init();
277 for (int i = 0; i < 3; i++)
279 genReadFileNames(readFN, i, read_type, s, readFs);
280 ReadReader<SingleRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
283 while (reader.next(read)) {
284 if (!read.isLowQuality()) {
285 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
286 if (i == 0) { npro->updateC(read.getReadSeq()); }
288 else if (verbose && read.getReadLength() < seedLen) {
289 printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
293 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
296 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
299 mld != NULL ? mld->finish() : gld->finish();
301 if (mean >= EPSILON) {
302 assert(mld->getMaxL() <= gld->getMaxL());
303 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
305 npro->calcInitParams();
307 mw = new double[M + 1];
311 void SingleModel::init() {
312 if (estRSPD) rspd->init();
317 void SingleModel::finish() {
318 if (estRSPD) rspd->finish();
321 needCalcConPrb = true;
322 if (estRSPD) calcMW();
325 void SingleModel::collect(const SingleModel& o) {
326 if (estRSPD) rspd->collect(*(o.rspd));
327 pro->collect(*(o.pro));
328 npro->collect(*(o.npro));
331 //Only master node can call
332 void SingleModel::read(const char* inpF) {
334 FILE *fi = fopen(inpF, "r");
335 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
337 assert(fscanf(fi, "%d", &val) == 1);
338 assert(val == model_type);
342 assert(fscanf(fi, "%d", &val) == 1);
344 if (mld == NULL) mld = new LenDist();
351 if (fscanf(fi, "%d", &val) == 1) {
354 mw = new double[M + 1];
355 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
362 //Only master node can call. Only be called at EM.cpp
363 void SingleModel::write(const char* outF) {
364 FILE *fo = fopen(outF, "w");
366 fprintf(fo, "%d\n", model_type);
369 ori->write(fo); fprintf(fo, "\n");
370 gld->write(fo); fprintf(fo, "\n");
375 else { fprintf(fo, "0\n"); }
377 rspd->write(fo); fprintf(fo, "\n");
378 pro->write(fo); fprintf(fo, "\n");
382 fprintf(fo, "\n%d\n", M);
383 for (int i = 0; i < M; i++) {
384 fprintf(fo, "%.15g ", mw[i]);
386 fprintf(fo, "%.15g\n", mw[M]);
392 void SingleModel::startSimulation(simul* sampler, double* theta) {
393 this->sampler = sampler;
395 theta_cdf = new double[M + 1];
396 for (int i = 0; i <= M; i++) {
397 theta_cdf[i] = theta[i];
398 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
401 rspd->startSimulation(M, refs);
402 pro->startSimulation();
403 npro->startSimulation();
406 bool SingleModel::simulate(int rid, SingleRead& read, int& sid) {
407 int dir, pos, readLen, fragLen;
410 std::ostringstream strout;
412 sid = sampler->sample(theta_cdf, M + 1);
416 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
417 readseq = npro->simulate(sampler, readLen);
420 RefSeq &ref = refs->getRef(sid);
421 dir = ori->simulate(sampler);
422 fragLen = gld->simulate(sampler, ref.getTotLen());
423 if (fragLen < 0) return false;
424 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
425 pos = rspd->simulate(sampler, sid, effL);
426 if (pos < 0) return false;
427 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
430 readLen = mld->simulate(sampler, fragLen);
431 if (readLen < 0) return false;
432 readseq = pro->simulate(sampler, readLen, pos, dir, ref);
435 readseq = pro->simulate(sampler, fragLen, pos, dir, ref);
439 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
442 read = SingleRead(name, readseq);
447 void SingleModel::finishSimulation() {
450 rspd->finishSimulation();
451 pro->finishSimulation();
452 npro->finishSimulation();
455 void SingleModel::calcMW() {
458 assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
460 memset(mw, 0, sizeof(double) * (M + 1));
463 probF = ori->getProb(0);
464 probR = ori->getProb(1);
466 for (int i = 1; i <= M; i++) {
467 RefSeq& ref = refs->getRef(i);
468 int totLen = ref.getTotLen();
469 int fullLen = ref.getFullLen();
473 int end = std::min(fullLen, totLen - seedLen + 1);
476 for (int seedPos = 0; seedPos < end; seedPos++)
477 if (ref.getMask(seedPos)) {
479 minL = gld->getMinL();
480 maxL = std::min(gld->getMaxL(), totLen - seedPos);
482 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
483 effL = std::min(fullLen, totLen - fragLen + 1);
484 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
485 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
488 minL = gld->getMinL();
489 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
490 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
491 pfpos = seedPos - (fragLen - seedLen);
492 effL = std::min(fullLen, totLen - fragLen + 1);
493 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
494 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
498 //for reverse strand masking
499 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
500 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
501 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
502 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
503 pfpos = seedPos - (fragLen - seedLen);
504 effL = std::min(fullLen, totLen - fragLen + 1);
505 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
506 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
513 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
519 #endif /* SINGLEMODEL_H_ */