15 #include "my_assert.h"
16 #include "Orientation.h"
20 #include "NoiseProfile.h"
22 #include "ModelParams.h"
25 #include "SingleRead.h"
26 #include "SingleHit.h"
27 #include "ReadReader.h"
33 SingleModel(Refs* refs = NULL) {
35 M = (refs != NULL ? refs->getM() : 0);
36 memset(N, 0, sizeof(N));
38 needCalcConPrb = true;
40 ori = new Orientation();
43 rspd = new RSPD(estRSPD);
45 npro = new NoiseProfile();
47 mean = -1.0; sd = 0.0;
53 //If it is not a master node, only init & update can be used!
54 SingleModel(ModelParams& params, bool isMaster = true) {
56 memcpy(N, params.N, sizeof(params.N));
58 estRSPD = params.estRSPD;
59 mean = params.mean; sd = params.sd;
60 seedLen = params.seedLen;
61 needCalcConPrb = true;
63 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; pro = NULL; npro = NULL;
67 gld = new LenDist(params.minL, params.maxL);
68 if (mean >= EPSILON) {
69 mld = new LenDist(params.mate_minL, params.mate_maxL);
71 if (!estRSPD) { rspd = new RSPD(estRSPD); }
74 ori = new Orientation(params.probF);
75 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
76 pro = new Profile(params.maxL);
77 npro = new NoiseProfile();
82 if (ori != NULL) delete ori;
83 if (gld != NULL) delete gld;
84 if (mld != NULL) delete mld;
85 if (rspd != NULL) delete rspd;
86 if (pro != NULL) delete pro;
87 if (npro != NULL) delete npro;
88 if (mw != NULL) delete[] mw;
92 void estimateFromReads(const char*);
94 //if prob is too small, just make it 0
95 double getConPrb(const SingleRead& read, const SingleHit& hit) {
96 if (read.isLowQuality()) return 0.0;
99 int sid = hit.getSid();
100 RefSeq &ref = refs->getRef(sid);
101 int fullLen = ref.getFullLen();
102 int totLen = ref.getTotLen();
103 int dir = hit.getDir();
104 int pos = hit.getPos();
105 int readLen = read.getReadLength();
106 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
108 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
109 " from the forward direction, which should be a non-negative number! " + \
110 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
111 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
112 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
113 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
114 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
116 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
117 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
123 int minL = std::max(readLen, gld->getMinL());
124 int maxL = std::min(totLen - pos, gld->getMaxL());
125 int pfpos; // possible fpos for fragment
127 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
128 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
129 effL = std::min(fullLen, totLen - fragLen + 1);
130 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
134 effL = std::min(fullLen, totLen - readLen + 1);
135 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
138 prob = ori->getProb(dir) * value * pro->getProb(read.getReadSeq(), ref, pos, dir);
140 if (prob < EPSILON) { prob = 0.0; }
143 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
148 double getNoiseConPrb(const SingleRead& read) {
149 if (read.isLowQuality()) return 0.0;
150 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
151 prob *= npro->getProb(read.getReadSeq());
152 if (prob < EPSILON) { prob = 0.0; }
154 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
159 double getLogP() { return npro->getLogP(); }
163 void update(const SingleRead& read, const SingleHit& hit, double frac) {
164 if (read.isLowQuality() || frac < EPSILON) return;
166 RefSeq& ref = refs->getRef(hit.getSid());
167 int dir = hit.getDir();
168 int pos = hit.getPos();
171 int fullLen = ref.getFullLen();
173 // Only use one strand to estimate RSPD
174 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
175 rspd->update(pos, fullLen, frac);
178 if (ori->getProb(0) < ORIVALVE && dir == 1) {
179 int totLen = ref.getTotLen();
180 int readLen = read.getReadLength();
185 int minL = std::max(readLen, gld->getMinL());
186 int maxL = std::min(totLen - pos, gld->getMaxL());
188 assert(maxL >= minL);
189 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
191 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
192 pfpos = totLen - pos - fragLen;
193 effL = std::min(fullLen, totLen - fragLen + 1);
194 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
195 sum += frag_vec[fragLen - minL];
197 assert(sum >= EPSILON);
198 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
199 pfpos = totLen - pos - fragLen;
200 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
204 rspd->update(totLen - pos - readLen, fullLen, frac);
208 pro->update(read.getReadSeq(), ref, pos, dir, frac);
211 void updateNoise(const SingleRead& read, double frac) {
212 if (read.isLowQuality() || frac < EPSILON) return;
214 npro->update(read.getReadSeq(), frac);
219 void collect(const SingleModel&);
221 bool getNeedCalcConPrb() { return needCalcConPrb; }
222 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
226 //double* getP1() { return p1; }
227 //double* getP2() { return p2; }
229 void read(const char*);
230 void write(const char*);
232 const LenDist& getGLD() { return *gld; }
234 void startSimulation(simul*, const std::vector<double>&);
235 bool simulate(READ_INT_TYPE, SingleRead&, int&);
236 void finishSimulation();
238 const double* getMW() {
243 int getModelType() const { return model_type; }
246 static const int model_type = 0;
247 static const int read_type = 0;
254 //double *p1, *p2; P_i' & P_i''
256 bool estRSPD; // true if estimate RSPD
257 bool needCalcConPrb; // true need, false does not need
265 simul *sampler; // for simulation
266 double *theta_cdf; // for simulation
268 double *mw; // for masking
273 void SingleModel::estimateFromReads(const char* readFN) {
275 char readFs[2][STRLEN];
278 mld != NULL ? mld->init() : gld->init();
279 for (int i = 0; i < 3; i++)
281 genReadFileNames(readFN, i, read_type, s, readFs);
282 ReadReader<SingleRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
284 READ_INT_TYPE cnt = 0;
285 while (reader.next(read)) {
286 if (!read.isLowQuality()) {
287 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
288 if (i == 0) { npro->updateC(read.getReadSeq()); }
290 else if (verbose && read.getReadLength() < seedLen) {
291 std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl;
295 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
298 if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
301 mld != NULL ? mld->finish() : gld->finish();
303 if (mean >= EPSILON) {
304 assert(mld->getMaxL() <= gld->getMaxL());
305 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
307 npro->calcInitParams();
309 mw = new double[M + 1];
313 void SingleModel::init() {
314 if (estRSPD) rspd->init();
319 void SingleModel::finish() {
320 if (estRSPD) rspd->finish();
323 needCalcConPrb = true;
324 if (estRSPD) calcMW();
327 void SingleModel::collect(const SingleModel& o) {
328 if (estRSPD) rspd->collect(*(o.rspd));
329 pro->collect(*(o.pro));
330 npro->collect(*(o.npro));
333 //Only master node can call
334 void SingleModel::read(const char* inpF) {
336 FILE *fi = fopen(inpF, "r");
337 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
339 assert(fscanf(fi, "%d", &val) == 1);
340 assert(val == model_type);
344 assert(fscanf(fi, "%d", &val) == 1);
346 if (mld == NULL) mld = new LenDist();
353 if (fscanf(fi, "%d", &val) == 1) {
356 mw = new double[M + 1];
357 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
364 //Only master node can call. Only be called at EM.cpp
365 void SingleModel::write(const char* outF) {
366 FILE *fo = fopen(outF, "w");
368 fprintf(fo, "%d\n", model_type);
371 ori->write(fo); fprintf(fo, "\n");
372 gld->write(fo); fprintf(fo, "\n");
377 else { fprintf(fo, "0\n"); }
379 rspd->write(fo); fprintf(fo, "\n");
380 pro->write(fo); fprintf(fo, "\n");
384 fprintf(fo, "\n%d\n", M);
385 for (int i = 0; i < M; i++) {
386 fprintf(fo, "%.15g ", mw[i]);
388 fprintf(fo, "%.15g\n", mw[M]);
394 void SingleModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
395 this->sampler = sampler;
397 theta_cdf = new double[M + 1];
398 for (int i = 0; i <= M; i++) {
399 theta_cdf[i] = theta[i];
400 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
403 rspd->startSimulation(M, refs);
404 pro->startSimulation();
405 npro->startSimulation();
408 bool SingleModel::simulate(READ_INT_TYPE rid, SingleRead& read, int& sid) {
409 int dir, pos, readLen, fragLen;
412 std::ostringstream strout;
414 sid = sampler->sample(theta_cdf, M + 1);
418 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
419 readseq = npro->simulate(sampler, readLen);
422 RefSeq &ref = refs->getRef(sid);
423 dir = ori->simulate(sampler);
424 fragLen = gld->simulate(sampler, ref.getTotLen());
425 if (fragLen < 0) return false;
426 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
427 pos = rspd->simulate(sampler, sid, effL);
428 if (pos < 0) return false;
429 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
432 readLen = mld->simulate(sampler, fragLen);
433 if (readLen < 0) return false;
434 readseq = pro->simulate(sampler, readLen, pos, dir, ref);
437 readseq = pro->simulate(sampler, fragLen, pos, dir, ref);
441 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
444 read = SingleRead(name, readseq);
449 void SingleModel::finishSimulation() {
452 rspd->finishSimulation();
453 pro->finishSimulation();
454 npro->finishSimulation();
457 void SingleModel::calcMW() {
460 assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
462 memset(mw, 0, sizeof(double) * (M + 1));
465 probF = ori->getProb(0);
466 probR = ori->getProb(1);
468 for (int i = 1; i <= M; i++) {
469 RefSeq& ref = refs->getRef(i);
470 int totLen = ref.getTotLen();
471 int fullLen = ref.getFullLen();
475 int end = std::min(fullLen, totLen - seedLen + 1);
478 for (int seedPos = 0; seedPos < end; seedPos++)
479 if (ref.getMask(seedPos)) {
481 minL = gld->getMinL();
482 maxL = std::min(gld->getMaxL(), totLen - seedPos);
484 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
485 effL = std::min(fullLen, totLen - fragLen + 1);
486 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
487 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
490 minL = gld->getMinL();
491 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
492 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
493 pfpos = seedPos - (fragLen - seedLen);
494 effL = std::min(fullLen, totLen - fragLen + 1);
495 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
496 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
500 //for reverse strand masking
501 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
502 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
503 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
504 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
505 pfpos = seedPos - (fragLen - seedLen);
506 effL = std::min(fullLen, totLen - fragLen + 1);
507 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
508 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
515 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
521 #endif /* SINGLEMODEL_H_ */