1 #ifndef PAIREDENDMODEL_H_
2 #define PAIREDENDMODEL_H_
13 #include "Orientation.h"
17 #include "NoiseProfile.h"
19 #include "ModelParams.h"
22 #include "SingleRead.h"
23 #include "PairedEndRead.h"
24 #include "PairedEndHit.h"
25 #include "ReadReader.h"
29 class PairedEndModel {
31 PairedEndModel(Refs* refs = NULL) {
33 M = (refs != NULL ? refs->getM() : 0);
34 memset(N, 0, sizeof(N));
36 needCalcConPrb = true;
38 ori = new Orientation();
40 rspd = new RSPD(estRSPD);
42 npro = new NoiseProfile();
49 //If it is not a master node, only init & update can be used!
50 PairedEndModel(ModelParams& params, bool isMaster = true) {
52 memcpy(N, params.N, sizeof(params.N));
54 estRSPD = params.estRSPD;
55 seedLen = params.seedLen;
56 needCalcConPrb = true;
58 ori = NULL; gld = NULL; rspd = NULL; pro = NULL; npro = NULL; mld = NULL;
62 if (!estRSPD) rspd = new RSPD(estRSPD);
63 mld = new LenDist(params.mate_minL, params.mate_maxL);
66 ori = new Orientation(params.probF);
67 gld = new LenDist(params.minL, params.maxL);
68 if (estRSPD) rspd = new RSPD(estRSPD, params.B);
69 pro = new Profile(params.maxL);
70 npro = new NoiseProfile();
75 if (ori != NULL) delete ori;
76 if (gld != NULL) delete gld;
77 if (rspd != NULL) delete rspd;
78 if (pro != NULL) delete pro;
79 if (npro != NULL) delete npro;
80 if (mld != NULL) delete mld;
81 if (mw != NULL) delete mw;
84 void estimateFromReads(const char*);
86 //if prob is too small, just make it 0
87 double getConPrb(const PairedEndRead& read, const PairedEndHit& hit) {
88 if (read.isLowQuality()) return 0.0;
91 int sid = hit.getSid();
92 RefSeq &ref = refs->getRef(sid);
93 int dir = hit.getDir();
94 int pos = hit.getPos();
95 int fullLen = ref.getFullLen();
96 int totLen = ref.getTotLen();
97 int insertLen = hit.getInsertL();
99 int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
100 int effL = std::min(fullLen, totLen - insertLen + 1);
102 assert(fpos >= 0 && fpos + insertLen <= totLen && insertLen <= totLen);
103 if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
105 prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
106 rspd->getAdjustedProb(fpos, effL, fullLen);
108 const SingleRead& mate1 = read.getMate1();
109 prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
110 pro->getProb(mate1.getReadSeq(), ref, pos, dir);
112 const SingleRead& mate2 = read.getMate2();
113 int m2pos = totLen - pos - insertLen;
115 prob *= mld->getAdjustedProb(mate2.getReadLength(), insertLen) *
116 pro->getProb(mate2.getReadSeq(), ref, m2pos, m2dir);
118 if (prob < EPSILON) { prob = 0.0; }
120 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
125 double getNoiseConPrb(const PairedEndRead& read) {
126 if (read.isLowQuality()) return 0.0;
128 const SingleRead& mate1 = read.getMate1();
129 const SingleRead& mate2 = read.getMate2();
131 prob = mld->getProb(mate1.getReadLength()) * npro->getProb(mate1.getReadSeq());
132 prob *= mld->getProb(mate2.getReadLength()) * npro->getProb(mate2.getReadSeq());
134 if (prob < EPSILON) { prob = 0.0; }
136 prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
141 double getLogP() { return npro->getLogP(); }
145 void update(const PairedEndRead& read, const PairedEndHit& hit, double frac) {
146 if (read.isLowQuality() || frac < EPSILON) return;
148 RefSeq& ref = refs->getRef(hit.getSid());
149 const SingleRead& mate1 = read.getMate1();
150 const SingleRead& mate2 = read.getMate2();
152 gld->update(hit.getInsertL(), frac);
154 int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
155 rspd->update(fpos, ref.getFullLen(), frac);
157 pro->update(mate1.getReadSeq(), ref, hit.getPos(), hit.getDir(), frac);
159 int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
160 int m2dir = !hit.getDir();
161 pro->update(mate2.getReadSeq(), ref, m2pos, m2dir, frac);
164 void updateNoise(const PairedEndRead& read, double frac) {
165 if (read.isLowQuality() || frac < EPSILON) return;
167 const SingleRead& mate1 = read.getMate1();
168 const SingleRead& mate2 = read.getMate2();
170 npro->update(mate1.getReadSeq(), frac);
171 npro->update(mate2.getReadSeq(), frac);
176 void collect(const PairedEndModel&);
178 bool getNeedCalcConPrb() { return needCalcConPrb; }
179 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
181 void read(const char*);
182 void write(const char*);
184 const LenDist& getGLD() { return *gld; }
186 void startSimulation(simul*, double*);
187 bool simulate(int, PairedEndRead&, int&);
188 void finishSimulation();
190 //Use it after function 'read' or 'estimateFromReads'
196 int getModelType() const { return model_type; }
199 static const int model_type = 2;
200 static const int read_type = 2;
208 bool needCalcConPrb; //true need, false does not need
211 LenDist *gld, *mld; //mld1 mate_length_dist
216 simul *sampler; // for simulation
217 double *theta_cdf; // for simulation
219 double *mw; // for masking
224 void PairedEndModel::estimateFromReads(const char* readFN) {
226 char readFs[2][STRLEN];
230 for (int i = 0; i < 3; i++)
232 genReadFileNames(readFN, i, read_type, s, readFs);
233 ReadReader<PairedEndRead> reader(s, readFs);
236 while (reader.next(read)) {
237 SingleRead mate1 = read.getMate1();
238 SingleRead mate2 = read.getMate2();
240 mld->update(mate1.getReadLength(), 1.0);
241 mld->update(mate2.getReadLength(), 1.0);
244 npro->updateC(mate1.getReadSeq());
245 npro->updateC(mate2.getReadSeq());
249 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
252 if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
256 npro->calcInitParams();
258 mw = new double[M + 1];
262 void PairedEndModel::init() {
264 if (estRSPD) rspd->init();
269 void PairedEndModel::finish() {
271 if (estRSPD) rspd->finish();
274 needCalcConPrb = true;
278 void PairedEndModel::collect(const PairedEndModel& o) {
279 gld->collect(*(o.gld));
280 if (estRSPD) rspd->collect(*(o.rspd));
281 pro->collect(*(o.pro));
282 npro->collect(*(o.npro));
285 //Only master node can call
286 void PairedEndModel::read(const char* inpF) {
288 FILE *fi = fopen(inpF, "r");
289 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
291 fscanf(fi, "%d", &val);
292 assert(val == model_type);
301 if (fscanf(fi, "%d", &M) == 1) {
302 mw = new double[M + 1];
303 for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
309 //Only master node can call
310 void PairedEndModel::write(const char* outF) {
311 FILE *fo = fopen(outF, "w");
313 fprintf(fo, "%d\n", model_type);
316 ori->write(fo); fprintf(fo, "\n");
317 gld->write(fo); fprintf(fo, "\n");
318 mld->write(fo); fprintf(fo, "\n");
319 rspd->write(fo); fprintf(fo, "\n");
320 pro->write(fo); fprintf(fo, "\n");
324 fprintf(fo, "\n%d\n", M);
325 for (int i = 0; i < M; i++) {
326 fprintf(fo, "%.15g ", mw[i]);
328 fprintf(fo, "%.15g\n", mw[M]);
334 void PairedEndModel::startSimulation(simul* sampler, double* theta) {
335 this->sampler = sampler;
337 theta_cdf = new double[M + 1];
338 for (int i = 0; i <= M; i++) {
339 theta_cdf[i] = theta[i];
340 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
343 rspd->startSimulation(M, refs);
344 pro->startSimulation();
345 npro->startSimulation();
348 bool PairedEndModel::simulate(int rid, PairedEndRead& read, int& sid) {
350 int insertL, mateL1, mateL2;
352 std::string readseq1, readseq2;
353 std::ostringstream strout;
355 sid = sampler->sample(theta_cdf, M + 1);
358 dir = pos = insertL = 0;
359 mateL1 = mld->simulate(sampler, -1);
360 readseq1 = npro->simulate(sampler, mateL1);
362 mateL2 = mld->simulate(sampler, -1);
363 readseq2 = npro->simulate(sampler, mateL2);
366 RefSeq &ref = refs->getRef(sid);
367 dir = ori->simulate(sampler);
368 insertL = gld->simulate(sampler, ref.getTotLen());
369 if (insertL < 0) return false;
370 int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
371 pos = rspd->simulate(sampler, sid, effL);
372 if (pos < 0) return false;
373 if (dir > 0) pos = ref.getTotLen() - pos - insertL;
375 mateL1 = mld->simulate(sampler, insertL);
376 readseq1 = pro->simulate(sampler, mateL1, pos, dir, ref);
378 int m2pos = ref.getTotLen() - pos - insertL;
381 mateL2 = mld->simulate(sampler, insertL);
382 readseq2 = pro->simulate(sampler, mateL2, m2pos, m2dir, ref);
385 std::ostringstream stdout;
386 stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
389 read = PairedEndRead(SingleRead(name + "/1", readseq1), SingleRead(name + "/2", readseq2));
394 void PairedEndModel::finishSimulation() {
397 rspd->finishSimulation();
398 pro->finishSimulation();
399 npro->finishSimulation();
402 void PairedEndModel::calcMW() {
403 assert(seedLen >= OLEN && mld->getMinL() >= seedLen);
405 memset(mw, 0, sizeof(double) * (M + 1));
408 for (int i = 1; i <= M; i++) {
409 RefSeq& ref = refs->getRef(i);
410 int totLen = ref.getTotLen();
411 int fullLen = ref.getFullLen();
412 int end = std::min(fullLen, totLen - gld->getMinL() + 1);
417 //seedPos is fpos here
418 for (int seedPos = 0; seedPos < end; seedPos++)
419 if (ref.getMask(seedPos)) {
420 minL = gld->getMinL();
421 maxL = std::min(gld->getMaxL(), totLen - seedPos);
423 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
424 effL = std::min(fullLen, totLen - fragLen + 1);
425 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
432 //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
438 #endif /* PAIREDENDMODEL_H_ */