1 #ifndef PAIREDENDMODEL_H_
2 #define PAIREDENDMODEL_H_
15 #include "my_assert.h"
16 #include "Orientation.h"
20 #include "NoiseProfile.h"
22 #include "ModelParams.h"
25 #include "SingleRead.h"
26 #include "PairedEndRead.h"
27 #include "PairedEndHit.h"
28 #include "ReadReader.h"
32 class PairedEndModel {
34 PairedEndModel(Refs* refs = NULL) {
36 M = (refs != NULL ? refs->getM() : 0);
37 memset(N, 0, sizeof(N));
39 needCalcConPrb = true;
41 ori = new Orientation();
43 rspd = new RSPD(estRSPD);
45 npro = new NoiseProfile();
52 //If it is not a master node, only init & update can be used!
53 PairedEndModel(ModelParams& params, bool isMaster = true) {
55 memcpy(N, params.N, sizeof(params.N));
57 estRSPD = params.estRSPD;
58 seedLen = params.seedLen;
59 needCalcConPrb = true;
61 ori = NULL; gld = NULL; rspd = NULL; pro = NULL; npro = NULL; mld = NULL;
65 if (!estRSPD) rspd = new RSPD(estRSPD);
66 mld = new LenDist(params.mate_minL, params.mate_maxL);
69 ori = new Orientation(params.probF);
70 gld = new LenDist(params.minL, params.maxL);
71 if (estRSPD) rspd = new RSPD(estRSPD, params.B);
72 pro = new Profile(params.maxL);
73 npro = new NoiseProfile();
78 if (ori != NULL) delete ori;
79 if (gld != NULL) delete gld;
80 if (rspd != NULL) delete rspd;
81 if (pro != NULL) delete pro;
82 if (npro != NULL) delete npro;
83 if (mld != NULL) delete mld;
84 if (mw != NULL) delete mw;
87 void estimateFromReads(const char*);
89 //if prob is too small, just make it 0
90 double getConPrb(const PairedEndRead& read, const PairedEndHit& hit) {
91 if (read.isLowQuality()) return 0.0;
94 int sid = hit.getSid();
95 RefSeq &ref = refs->getRef(sid);
96 int dir = hit.getDir();
97 int pos = hit.getPos();
98 int fullLen = ref.getFullLen();
99 int totLen = ref.getTotLen();
100 int insertLen = hit.getInsertL();
102 int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
103 int effL = std::min(fullLen, totLen - insertLen + 1);
105 general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
106 " from the forward direction, which should be a non-negative number! " + \
107 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
108 general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
109 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
110 general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript " \
111 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!");
114 if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos
116 prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) *
117 rspd->getAdjustedProb(fpos, effL, fullLen);
119 const SingleRead& mate1 = read.getMate1();
120 prob *= mld->getAdjustedProb(mate1.getReadLength(), insertLen) *
121 pro->getProb(mate1.getReadSeq(), ref, pos, dir);
123 const SingleRead& mate2 = read.getMate2();
124 int m2pos = totLen - pos - insertLen;
126 prob *= mld->getAdjustedProb(mate2.getReadLength(), insertLen) *
127 pro->getProb(mate2.getReadSeq(), ref, m2pos, m2dir);
129 if (prob < EPSILON) { prob = 0.0; }
131 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
136 double getNoiseConPrb(const PairedEndRead& read) {
137 if (read.isLowQuality()) return 0.0;
139 const SingleRead& mate1 = read.getMate1();
140 const SingleRead& mate2 = read.getMate2();
142 prob = mld->getProb(mate1.getReadLength()) * npro->getProb(mate1.getReadSeq());
143 prob *= mld->getProb(mate2.getReadLength()) * npro->getProb(mate2.getReadSeq());
145 if (prob < EPSILON) { prob = 0.0; }
147 prob = (mw[0] < EPSILON ? 0.0: prob / mw[0]);
152 double getLogP() { return npro->getLogP(); }
156 void update(const PairedEndRead& read, const PairedEndHit& hit, double frac) {
157 if (read.isLowQuality() || frac < EPSILON) return;
159 RefSeq& ref = refs->getRef(hit.getSid());
160 const SingleRead& mate1 = read.getMate1();
161 const SingleRead& mate2 = read.getMate2();
163 gld->update(hit.getInsertL(), frac);
165 int fpos = (hit.getDir() == 0 ? hit.getPos() : ref.getTotLen() - hit.getPos() - hit.getInsertL());
166 rspd->update(fpos, ref.getFullLen(), frac);
168 pro->update(mate1.getReadSeq(), ref, hit.getPos(), hit.getDir(), frac);
170 int m2pos = ref.getTotLen() - hit.getPos() - hit.getInsertL();
171 int m2dir = !hit.getDir();
172 pro->update(mate2.getReadSeq(), ref, m2pos, m2dir, frac);
175 void updateNoise(const PairedEndRead& read, double frac) {
176 if (read.isLowQuality() || frac < EPSILON) return;
178 const SingleRead& mate1 = read.getMate1();
179 const SingleRead& mate2 = read.getMate2();
181 npro->update(mate1.getReadSeq(), frac);
182 npro->update(mate2.getReadSeq(), frac);
187 void collect(const PairedEndModel&);
189 bool getNeedCalcConPrb() { return needCalcConPrb; }
190 void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
192 void read(const char*);
193 void write(const char*);
195 const LenDist& getGLD() { return *gld; }
197 void startSimulation(simul*, const std::vector<double>&);
198 bool simulate(READ_INT_TYPE, PairedEndRead&, int&);
199 void finishSimulation();
201 //Use it after function 'read' or 'estimateFromReads'
202 const double* getMW() {
207 int getModelType() const { return model_type; }
210 static const int model_type = 2;
211 static const int read_type = 2;
219 bool needCalcConPrb; //true need, false does not need
222 LenDist *gld, *mld; //mld1 mate_length_dist
227 simul *sampler; // for simulation
228 double *theta_cdf; // for simulation
230 double *mw; // for masking
235 void PairedEndModel::estimateFromReads(const char* readFN) {
237 char readFs[2][STRLEN];
241 for (int i = 0; i < 3; i++)
243 genReadFileNames(readFN, i, read_type, s, readFs);
244 ReadReader<PairedEndRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
246 READ_INT_TYPE cnt = 0;
247 while (reader.next(read)) {
248 SingleRead mate1 = read.getMate1();
249 SingleRead mate2 = read.getMate2();
251 if (!read.isLowQuality()) {
252 mld->update(mate1.getReadLength(), 1.0);
253 mld->update(mate2.getReadLength(), 1.0);
256 npro->updateC(mate1.getReadSeq());
257 npro->updateC(mate2.getReadSeq());
260 else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) {
261 std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl;
265 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
268 if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
272 npro->calcInitParams();
274 mw = new double[M + 1];
278 void PairedEndModel::init() {
280 if (estRSPD) rspd->init();
285 void PairedEndModel::finish() {
287 if (estRSPD) rspd->finish();
290 needCalcConPrb = true;
294 void PairedEndModel::collect(const PairedEndModel& o) {
295 gld->collect(*(o.gld));
296 if (estRSPD) rspd->collect(*(o.rspd));
297 pro->collect(*(o.pro));
298 npro->collect(*(o.npro));
301 //Only master node can call
302 void PairedEndModel::read(const char* inpF) {
304 FILE *fi = fopen(inpF, "r");
305 if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
307 assert(fscanf(fi, "%d", &val) == 1);
308 assert(val == model_type);
317 if (fscanf(fi, "%d", &val) == 1) {
320 mw = new double[M + 1];
321 for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
328 //Only master node can call. Only be called at EM.cpp
329 void PairedEndModel::write(const char* outF) {
330 FILE *fo = fopen(outF, "w");
332 fprintf(fo, "%d\n", model_type);
335 ori->write(fo); fprintf(fo, "\n");
336 gld->write(fo); fprintf(fo, "\n");
337 mld->write(fo); fprintf(fo, "\n");
338 rspd->write(fo); fprintf(fo, "\n");
339 pro->write(fo); fprintf(fo, "\n");
343 fprintf(fo, "\n%d\n", M);
344 for (int i = 0; i < M; i++) {
345 fprintf(fo, "%.15g ", mw[i]);
347 fprintf(fo, "%.15g\n", mw[M]);
353 void PairedEndModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
354 this->sampler = sampler;
356 theta_cdf = new double[M + 1];
357 for (int i = 0; i <= M; i++) {
358 theta_cdf[i] = theta[i];
359 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
362 rspd->startSimulation(M, refs);
363 pro->startSimulation();
364 npro->startSimulation();
367 bool PairedEndModel::simulate(READ_INT_TYPE rid, PairedEndRead& read, int& sid) {
369 int insertL, mateL1, mateL2;
371 std::string readseq1, readseq2;
372 std::ostringstream strout;
374 sid = sampler->sample(theta_cdf, M + 1);
377 dir = pos = insertL = 0;
378 mateL1 = mld->simulate(sampler, -1);
379 readseq1 = npro->simulate(sampler, mateL1);
381 mateL2 = mld->simulate(sampler, -1);
382 readseq2 = npro->simulate(sampler, mateL2);
385 RefSeq &ref = refs->getRef(sid);
386 dir = ori->simulate(sampler);
387 insertL = gld->simulate(sampler, ref.getTotLen());
388 if (insertL < 0) return false;
389 int effL = std::min(ref.getFullLen(), ref.getTotLen() - insertL + 1);
390 pos = rspd->simulate(sampler, sid, effL);
391 if (pos < 0) return false;
392 if (dir > 0) pos = ref.getTotLen() - pos - insertL;
394 mateL1 = mld->simulate(sampler, insertL);
395 readseq1 = pro->simulate(sampler, mateL1, pos, dir, ref);
397 int m2pos = ref.getTotLen() - pos - insertL;
400 mateL2 = mld->simulate(sampler, insertL);
401 readseq2 = pro->simulate(sampler, mateL2, m2pos, m2dir, ref);
404 strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos<<"_"<<insertL;
407 read = PairedEndRead(SingleRead(name + "/1", readseq1), SingleRead(name + "/2", readseq2));
412 void PairedEndModel::finishSimulation() {
415 rspd->finishSimulation();
416 pro->finishSimulation();
417 npro->finishSimulation();
420 void PairedEndModel::calcMW() {
421 assert(mld->getMinL() >= seedLen);
423 memset(mw, 0, sizeof(double) * (M + 1));
426 for (int i = 1; i <= M; i++) {
427 RefSeq& ref = refs->getRef(i);
428 int totLen = ref.getTotLen();
429 int fullLen = ref.getFullLen();
430 int end = std::min(fullLen, totLen - gld->getMinL() + 1);
435 //seedPos is fpos here
436 for (int seedPos = 0; seedPos < end; seedPos++)
437 if (ref.getMask(seedPos)) {
438 minL = gld->getMinL();
439 maxL = std::min(gld->getMaxL(), totLen - seedPos);
441 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
442 effL = std::min(fullLen, totLen - fragLen + 1);
443 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
450 //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
456 #endif /* PAIREDENDMODEL_H_ */