14 #include "my_assert.h"
15 #include "Orientation.h"
19 #include "NoiseProfile.h"
21 #include "ModelParams.h"
24 #include "SingleRead.h"
25 #include "SingleHit.h"
26 #include "ReadReader.h"
32 SingleModel(Refs* refs = NULL) {
34 M = (refs != NULL ? refs->getM() : 0);
35 memset(N, 0, sizeof(N));
37 needCalcConPrb = true;
39 ori = new Orientation();
42 rspd = new RSPD(estRSPD);
44 npro = new NoiseProfile();
46 mean = -1.0; sd = 0.0;
52 //If it is not a master node, only init & update can be used!
53 SingleModel(ModelParams& params, bool isMaster = true) {
55 memcpy(N, params.N, sizeof(params.N));
57 estRSPD = params.estRSPD;
58 mean = params.mean; sd = params.sd;
59 seedLen = params.seedLen;
60 needCalcConPrb = true;
62 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; pro = NULL; npro = NULL;
66 gld = new LenDist(params.minL, params.maxL);
67 if (mean >= EPSILON) {
68 mld = new LenDist(params.mate_minL, params.mate_maxL);
70 if (!estRSPD) { rspd = new RSPD(estRSPD); }
73 ori = new Orientation(params.probF);
74 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
75 pro = new Profile(params.maxL);
76 npro = new NoiseProfile();
81 if (ori != NULL) delete ori;
82 if (gld != NULL) delete gld;
83 if (mld != NULL) delete mld;
84 if (rspd != NULL) delete rspd;
85 if (pro != NULL) delete pro;
86 if (npro != NULL) delete npro;
87 if (mw != NULL) delete[] mw;
91 void estimateFromReads(const char*);
93 //if prob is too small, just make it 0
94 double getConPrb(const SingleRead& read, const SingleHit& hit) {
95 if (read.isLowQuality()) return 0.0;
98 int sid = hit.getSid();
99 RefSeq &ref = refs->getRef(sid);
100 int fullLen = ref.getFullLen();
101 int totLen = ref.getTotLen();
102 int dir = hit.getDir();
103 int pos = hit.getPos();
104 int readLen = read.getReadLength();
105 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
107 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
108 " from the forward direction, which should be a non-negative number! " + \
109 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
110 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
111 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
112 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
113 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
115 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
116 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
122 int minL = std::max(readLen, gld->getMinL());
123 int maxL = std::min(totLen - pos, gld->getMaxL());
124 int pfpos; // possible fpos for fragment
126 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
127 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
128 effL = std::min(fullLen, totLen - fragLen + 1);
129 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
133 effL = std::min(fullLen, totLen - readLen + 1);
134 value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
137 prob = ori->getProb(dir) * value * pro->getProb(read.getReadSeq(), ref, pos, dir);
139 if (prob < EPSILON) { prob = 0.0; }
142 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
147 double getNoiseConPrb(const SingleRead& read) {
148 if (read.isLowQuality()) return 0.0;
149 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
150 prob *= npro->getProb(read.getReadSeq());
151 if (prob < EPSILON) { prob = 0.0; }
153 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
158 double getLogP() { return npro->getLogP(); }
162 void update(const SingleRead& read, const SingleHit& hit, double frac) {
163 if (read.isLowQuality() || frac < EPSILON) return;
165 RefSeq& ref = refs->getRef(hit.getSid());
166 int dir = hit.getDir();
167 int pos = hit.getPos();
170 int fullLen = ref.getFullLen();
172 // Only use one strand to estimate RSPD
173 if (ori->getProb(0) >= ORIVALVE && dir == 0) {
174 rspd->update(pos, fullLen, frac);
177 if (ori->getProb(0) < ORIVALVE && dir == 1) {
178 int totLen = ref.getTotLen();
179 int readLen = read.getReadLength();
184 int minL = std::max(readLen, gld->getMinL());
185 int maxL = std::min(totLen - pos, gld->getMaxL());
187 assert(maxL >= minL);
188 std::vector<double> frag_vec(maxL - minL + 1, 0.0);
190 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
191 pfpos = totLen - pos - fragLen;
192 effL = std::min(fullLen, totLen - fragLen + 1);
193 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
194 sum += frag_vec[fragLen - minL];
196 assert(sum >= EPSILON);
197 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
198 pfpos = totLen - pos - fragLen;
199 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
203 rspd->update(totLen - pos - readLen, fullLen, frac);
207 pro->update(read.getReadSeq(), ref, pos, dir, frac);
210 void updateNoise(const SingleRead& read, double frac) {
211 if (read.isLowQuality() || frac < EPSILON) return;
213 npro->update(read.getReadSeq(), frac);
218 void collect(const SingleModel&);
220 bool getNeedCalcConPrb() { return needCalcConPrb; }
221 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
225 //double* getP1() { return p1; }
226 //double* getP2() { return p2; }
228 void read(const char*);
229 void write(const char*);
231 const LenDist& getGLD() { return *gld; }
233 void startSimulation(simul*, double*);
234 bool simulate(READ_INT_TYPE, SingleRead&, int&);
235 void finishSimulation();
242 int getModelType() const { return model_type; }
245 static const int model_type = 0;
246 static const int read_type = 0;
253 //double *p1, *p2; P_i' & P_i''
255 bool estRSPD; // true if estimate RSPD
256 bool needCalcConPrb; // true need, false does not need
264 simul *sampler; // for simulation
265 double *theta_cdf; // for simulation
267 double *mw; // for masking
272 void SingleModel::estimateFromReads(const char* readFN) {
274 char readFs[2][STRLEN];
277 mld != NULL ? mld->init() : gld->init();
278 for (int i = 0; i < 3; i++)
280 genReadFileNames(readFN, i, read_type, s, readFs);
281 ReadReader<SingleRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
283 READ_INT_TYPE cnt = 0;
284 while (reader.next(read)) {
285 if (!read.isLowQuality()) {
286 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
287 if (i == 0) { npro->updateC(read.getReadSeq()); }
289 else if (verbose && read.getReadLength() < seedLen) {
290 std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl;
294 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
297 if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
300 mld != NULL ? mld->finish() : gld->finish();
302 if (mean >= EPSILON) {
303 assert(mld->getMaxL() <= gld->getMaxL());
304 gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
306 npro->calcInitParams();
308 mw = new double[M + 1];
312 void SingleModel::init() {
313 if (estRSPD) rspd->init();
318 void SingleModel::finish() {
319 if (estRSPD) rspd->finish();
322 needCalcConPrb = true;
323 if (estRSPD) calcMW();
326 void SingleModel::collect(const SingleModel& o) {
327 if (estRSPD) rspd->collect(*(o.rspd));
328 pro->collect(*(o.pro));
329 npro->collect(*(o.npro));
332 //Only master node can call
333 void SingleModel::read(const char* inpF) {
335 FILE *fi = fopen(inpF, "r");
336 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
338 assert(fscanf(fi, "%d", &val) == 1);
339 assert(val == model_type);
343 assert(fscanf(fi, "%d", &val) == 1);
345 if (mld == NULL) mld = new LenDist();
352 if (fscanf(fi, "%d", &val) == 1) {
355 mw = new double[M + 1];
356 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
363 //Only master node can call. Only be called at EM.cpp
364 void SingleModel::write(const char* outF) {
365 FILE *fo = fopen(outF, "w");
367 fprintf(fo, "%d\n", model_type);
370 ori->write(fo); fprintf(fo, "\n");
371 gld->write(fo); fprintf(fo, "\n");
376 else { fprintf(fo, "0\n"); }
378 rspd->write(fo); fprintf(fo, "\n");
379 pro->write(fo); fprintf(fo, "\n");
383 fprintf(fo, "\n%d\n", M);
384 for (int i = 0; i < M; i++) {
385 fprintf(fo, "%.15g ", mw[i]);
387 fprintf(fo, "%.15g\n", mw[M]);
393 void SingleModel::startSimulation(simul* sampler, double* theta) {
394 this->sampler = sampler;
396 theta_cdf = new double[M + 1];
397 for (int i = 0; i <= M; i++) {
398 theta_cdf[i] = theta[i];
399 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
402 rspd->startSimulation(M, refs);
403 pro->startSimulation();
404 npro->startSimulation();
407 bool SingleModel::simulate(READ_INT_TYPE rid, SingleRead& read, int& sid) {
408 int dir, pos, readLen, fragLen;
411 std::ostringstream strout;
413 sid = sampler->sample(theta_cdf, M + 1);
417 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
418 readseq = npro->simulate(sampler, readLen);
421 RefSeq &ref = refs->getRef(sid);
422 dir = ori->simulate(sampler);
423 fragLen = gld->simulate(sampler, ref.getTotLen());
424 if (fragLen < 0) return false;
425 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
426 pos = rspd->simulate(sampler, sid, effL);
427 if (pos < 0) return false;
428 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
431 readLen = mld->simulate(sampler, fragLen);
432 if (readLen < 0) return false;
433 readseq = pro->simulate(sampler, readLen, pos, dir, ref);
436 readseq = pro->simulate(sampler, fragLen, pos, dir, ref);
440 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
443 read = SingleRead(name, readseq);
448 void SingleModel::finishSimulation() {
451 rspd->finishSimulation();
452 pro->finishSimulation();
453 npro->finishSimulation();
456 void SingleModel::calcMW() {
459 assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
461 memset(mw, 0, sizeof(double) * (M + 1));
464 probF = ori->getProb(0);
465 probR = ori->getProb(1);
467 for (int i = 1; i <= M; i++) {
468 RefSeq& ref = refs->getRef(i);
469 int totLen = ref.getTotLen();
470 int fullLen = ref.getFullLen();
474 int end = std::min(fullLen, totLen - seedLen + 1);
477 for (int seedPos = 0; seedPos < end; seedPos++)
478 if (ref.getMask(seedPos)) {
480 minL = gld->getMinL();
481 maxL = std::min(gld->getMaxL(), totLen - seedPos);
483 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
484 effL = std::min(fullLen, totLen - fragLen + 1);
485 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
486 value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
489 minL = gld->getMinL();
490 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
491 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
492 pfpos = seedPos - (fragLen - seedLen);
493 effL = std::min(fullLen, totLen - fragLen + 1);
494 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
495 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
499 //for reverse strand masking
500 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
501 minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
502 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
503 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
504 pfpos = seedPos - (fragLen - seedLen);
505 effL = std::min(fullLen, totLen - fragLen + 1);
506 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
507 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
514 // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
520 #endif /* SINGLEMODEL_H_ */