]> git.donarmstrong.com Git - rsem.git/blob - SingleQModel.h
Modified build rules for sam/libbam.a to enable parallel build
[rsem.git] / SingleQModel.h
1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cassert>
7 #include<cstring>
8 #include<string>
9 #include<algorithm>
10 #include<sstream>
11
12 #include "utils.h"
13 #include "Orientation.h"
14 #include "LenDist.h"
15 #include "RSPD.h"
16 #include "QualDist.h"
17 #include "QProfile.h"
18 #include "NoiseQProfile.h"
19
20 #include "ModelParams.h"
21 #include "RefSeq.h"
22 #include "Refs.h"
23 #include "SingleReadQ.h"
24 #include "SingleHit.h"
25 #include "ReadReader.h"
26
27 #include "simul.h"
28
29 class SingleQModel {
30 public:
31         SingleQModel(Refs* refs = NULL) {
32                 this->refs = refs;
33                 M = (refs != NULL ? refs->getM() : 0);
34                 memset(N, 0, sizeof(N));
35                 estRSPD = false;
36                 needCalcConPrb = true;
37                 
38                 ori = new Orientation();
39                 gld = new LenDist();
40                 mld = NULL;
41                 rspd = new RSPD(estRSPD);
42                 qd = new QualDist();
43                 qpro = new QProfile();
44                 nqpro = new NoiseQProfile();
45
46                 mean = -1.0; sd = 0.0;
47                 mw = NULL;
48
49                 seedLen = 0;
50         }
51
52         //If it is not a master node, only init & update can be used!
53         SingleQModel(ModelParams& params, bool isMaster = true) {
54                 M = params.M;
55                 memcpy(N, params.N, sizeof(params.N));
56                 refs = params.refs;
57                 estRSPD = params.estRSPD;
58                 mean = params.mean; sd = params.sd;
59                 seedLen = params.seedLen;
60                 needCalcConPrb = true;
61
62                 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
63                 mw = NULL;
64
65                 if (isMaster) {
66                         gld = new LenDist(params.minL, params.maxL);                    
67                         if (mean >= EPSILON) {
68                                 mld = new LenDist(params.mate_minL, params.mate_maxL);
69                         }
70                         if (!estRSPD) { rspd = new RSPD(estRSPD); }
71                         qd = new QualDist();
72                 }
73
74                 ori = new Orientation(params.probF);
75                 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
76                 qpro = new QProfile();
77                 nqpro = new NoiseQProfile();
78         }
79
80         ~SingleQModel() {
81                 refs = NULL;
82                 if (ori != NULL) delete ori;
83                 if (gld != NULL) delete gld;
84                 if (mld != NULL) delete mld;
85                 if (rspd != NULL) delete rspd;
86                 if (qd != NULL) delete qd;
87                 if (qpro != NULL) delete qpro;
88                 if (nqpro != NULL) delete nqpro;
89                 if (mw != NULL) delete[] mw;
90                 /*delete[] p1, p2;*/
91         }
92
93         //SingleQModel& operator=(const SingleQModel&);
94
95         void estimateFromReads(const char*);
96
97         //if prob is too small, just make it 0
98         double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
99                 if (read.isLowQuality()) return 0.0;
100
101                 double prob;
102                 int sid = hit.getSid();
103                 RefSeq &ref = refs->getRef(sid);
104                 int fullLen = ref.getFullLen();
105                 int totLen = ref.getTotLen();
106                 int dir = hit.getDir();
107                 int pos = hit.getPos();
108                 int readLen = read.getReadLength();
109                 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
110
111                 assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
112                 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
113                 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
114
115                 int effL;
116                 double value;
117
118                 if (mld != NULL) {
119                         int minL = std::max(readLen, gld->getMinL());
120                         int maxL = std::min(totLen - pos, gld->getMaxL());
121                         int pfpos; // possible fpos for fragment
122                         value = 0.0;
123                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
124                                 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
125                                 effL = std::min(fullLen, totLen - fragLen + 1);
126                                 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
127                         }
128                 }
129                 else {
130                         effL = std::min(fullLen, totLen - readLen + 1);
131                         value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
132                 }
133
134                 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
135
136                 if (prob < EPSILON) { prob = 0.0; }
137
138                 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
139
140                 return prob;
141         }
142
143         double getNoiseConPrb(const SingleReadQ& read) {
144                 if (read.isLowQuality()) return 0.0;
145                 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
146                 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
147                 if (prob < EPSILON) { prob = 0.0; }
148
149                 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
150
151                 return prob;
152         }
153
154         double getLogP() { return nqpro->getLogP(); }
155
156         void init();
157
158         void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
159                 if (read.isLowQuality() || frac < EPSILON) return;
160
161                 const RefSeq& ref = refs->getRef(hit.getSid());
162
163                 int dir = hit.getDir();
164                 int pos = hit.getPos();
165
166                 if (estRSPD) {
167                         int fullLen = ref.getFullLen();
168
169                         // Only use one strand to estimate RSPD
170                         if (ori->getProb(0) >= ORIVALVE && dir == 0) {
171                                 rspd->update(pos, fullLen, frac);
172                         }
173
174                         if (ori->getProb(0) < ORIVALVE && dir == 1) {
175                                 int totLen = ref.getTotLen();                     
176                                 int readLen = read.getReadLength();
177                                 
178                                 int pfpos, effL; 
179
180                                 if (mld != NULL) {
181                                         int minL = std::max(readLen, gld->getMinL());
182                                         int maxL = std::min(totLen - pos, gld->getMaxL());
183                                         double sum = 0.0;
184                                         assert(maxL >= minL);
185                                         std::vector<double> frag_vec(maxL - minL + 1, 0.0);
186
187                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
188                                                 pfpos = totLen - pos - fragLen;
189                                                 effL = std::min(fullLen, totLen - fragLen + 1);
190                                                 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
191                                                 sum += frag_vec[fragLen - minL];
192                                         }
193                                         assert(sum >= EPSILON);
194                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
195                                                 pfpos = totLen - pos - fragLen;
196                                                 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
197                                         }
198                                 }
199                                 else {
200                                         rspd->update(totLen - pos - readLen, fullLen, frac);
201                                 }
202                         }
203                 }
204                 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
205         }
206
207         void updateNoise(const SingleReadQ& read, double frac) {
208                 if (read.isLowQuality() || frac < EPSILON) return;
209
210                 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
211         }
212
213         void finish();
214
215         void collect(const SingleQModel&);
216
217         //void copy(const SingleQModel&);
218
219         bool getNeedCalcConPrb() { return needCalcConPrb; }
220         void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
221
222         //void calcP1();
223         //void calcP2();
224         //double* getP1() { return p1; }
225         //double* getP2() { return p2; }
226
227         void read(const char*);
228         void write(const char*);
229
230         const LenDist& getGLD() { return *gld; }
231
232         void startSimulation(simul*, double*);
233         bool simulate(int, SingleReadQ&, int&);
234         void finishSimulation();
235
236         //Use it after function 'read' or 'estimateFromReads'
237         double* getMW() { 
238           assert(mw != NULL);
239           return mw;
240         }
241
242         int getModelType() const { return model_type; }
243
244 private:
245         static const int model_type = 1;
246         static const int read_type = 1;
247
248         int M;
249         int N[3];
250         Refs *refs;
251         double mean, sd;
252         int seedLen;
253         //double *p1, *p2; P_i' & P_i'';
254
255         bool estRSPD; // true if estimate RSPD
256         bool needCalcConPrb; //true need, false does not need
257
258         Orientation *ori;
259         LenDist *gld, *mld;
260         RSPD *rspd;
261         QualDist *qd;
262         QProfile *qpro;
263         NoiseQProfile *nqpro;
264
265         simul *sampler; // for simulation
266         double *theta_cdf; // for simulation
267
268         double *mw; // for masking
269
270         void calcMW();
271 };
272
273 void SingleQModel::estimateFromReads(const char* readFN) {
274         int s;
275         char readFs[2][STRLEN];
276         SingleReadQ read;
277
278         mld != NULL ? mld->init() : gld->init();
279         for (int i = 0; i < 3; i++)
280                 if (N[i] > 0) {
281                         genReadFileNames(readFN, i, read_type, s, readFs);
282                         ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
283
284                         int cnt = 0;
285                         while (reader.next(read)) {
286                                 if (!read.isLowQuality()) {
287                                         mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
288                                         qd->update(read.getQScore());
289                                         if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
290                                 }
291                                 else if (verbose && read.getReadLength() < seedLen) {
292                                         printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
293                                 }
294
295                                 ++cnt;
296                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
297                         }
298
299                         if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
300                 }
301
302         mld != NULL ? mld->finish() : gld->finish();
303
304         //mean should be > 0
305         if (mean >= EPSILON) { 
306           assert(mld->getMaxL() <= gld->getMaxL());
307           gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
308         }
309         qd->finish();
310         nqpro->calcInitParams();
311
312         mw = new double[M + 1];
313         calcMW();
314 }
315
316 void SingleQModel::init() {
317         if (estRSPD) rspd->init();
318         qpro->init();
319         nqpro->init();
320 }
321
322 void SingleQModel::finish() {
323         if (estRSPD) rspd->finish();
324         qpro->finish();
325         nqpro->finish();
326         needCalcConPrb = true;
327         if (estRSPD) calcMW();
328 }
329
330 void SingleQModel::collect(const SingleQModel& o) {
331         if (estRSPD) rspd->collect(*(o.rspd));
332         qpro->collect(*(o.qpro));
333         nqpro->collect(*(o.nqpro));
334 }
335
336 //Only master node can call
337 void SingleQModel::read(const char* inpF) {
338         int val;
339         FILE *fi = fopen(inpF, "r");
340         if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
341
342         assert(fscanf(fi, "%d", &val) == 1);
343         assert(val == model_type);
344
345         ori->read(fi);
346         gld->read(fi);
347         assert(fscanf(fi, "%d", &val) == 1);
348         if (val > 0) {
349                 if (mld == NULL) mld = new LenDist();
350                 mld->read(fi);
351         }
352         rspd->read(fi);
353         qd->read(fi);
354         qpro->read(fi);
355         nqpro->read(fi);
356
357         if (fscanf(fi, "%d", &val) == 1) {
358                 if (M == 0) M = val;
359                 if (M == val) {
360                         mw = new double[M + 1];
361                         for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
362                 }
363         }
364
365         fclose(fi);
366 }
367
368 //Only master node can call. Only be called at EM.cpp
369 void SingleQModel::write(const char* outF) {
370         FILE *fo = fopen(outF, "w");
371
372         fprintf(fo, "%d\n", model_type);
373         fprintf(fo, "\n");
374
375         ori->write(fo);  fprintf(fo, "\n");
376         gld->write(fo);  fprintf(fo, "\n");
377         if (mld != NULL) {
378                 fprintf(fo, "1\n");
379                 mld->write(fo);
380         }
381         else { fprintf(fo, "0\n"); }
382         fprintf(fo, "\n");
383         rspd->write(fo); fprintf(fo, "\n");
384         qd->write(fo);   fprintf(fo, "\n");
385         qpro->write(fo); fprintf(fo, "\n");
386         nqpro->write(fo);
387
388         if (mw != NULL) {
389           fprintf(fo, "\n%d\n", M);
390           for (int i = 0; i < M; i++) {
391             fprintf(fo, "%.15g ", mw[i]);
392           }
393           fprintf(fo, "%.15g\n", mw[M]);
394         }
395
396         fclose(fo);
397 }
398
399 void SingleQModel::startSimulation(simul* sampler, double* theta) {
400         this->sampler = sampler;
401
402         theta_cdf = new double[M + 1];
403         for (int i = 0; i <= M; i++) {
404                 theta_cdf[i] = theta[i];
405                 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
406         }
407
408         rspd->startSimulation(M, refs);
409         qd->startSimulation();
410         qpro->startSimulation();
411         nqpro->startSimulation();
412 }
413
414 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
415         int dir, pos, readLen, fragLen;
416         std::string name;
417         std::string qual, readseq;
418         std::ostringstream strout;
419
420         sid = sampler->sample(theta_cdf, M + 1);
421
422         if (sid == 0) {
423                 dir = pos = 0;
424                 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
425                 qual = qd->simulate(sampler, readLen);
426                 readseq = nqpro->simulate(sampler, readLen, qual);
427         }
428         else {
429                 RefSeq &ref = refs->getRef(sid);
430                 dir = ori->simulate(sampler);
431                 fragLen = gld->simulate(sampler, ref.getTotLen());
432                 if (fragLen < 0) return false;
433
434                 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
435                 pos = rspd->simulate(sampler, sid, effL);
436                 if (pos < 0) return false;
437                 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
438
439                 if (mld != NULL) {
440                         readLen = mld->simulate(sampler, fragLen);
441                         if (readLen < 0) return false;
442                         qual = qd->simulate(sampler, readLen);
443                         readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
444                 }
445                 else {
446                         qual = qd->simulate(sampler, fragLen);
447                         readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
448                 }
449         }
450
451         strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
452         name = strout.str();
453
454         read = SingleReadQ(name, readseq, qual);
455
456         return true;
457 }
458
459 void SingleQModel::finishSimulation() {
460         delete[] theta_cdf;
461
462         rspd->finishSimulation();
463         qd->finishSimulation();
464         qpro->finishSimulation();
465         nqpro->finishSimulation();
466 }
467
468 void SingleQModel::calcMW() {
469         double probF, probR;
470
471         assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
472
473         memset(mw, 0, sizeof(double) * (M + 1));
474         mw[0] = 1.0;
475
476         probF = ori->getProb(0);
477         probR = ori->getProb(1);
478
479         for (int i = 1; i <= M; i++) {
480                 RefSeq& ref = refs->getRef(i);
481                 int totLen = ref.getTotLen();
482                 int fullLen = ref.getFullLen();
483                 double value = 0.0;
484                 int minL, maxL;
485                 int effL, pfpos;
486                 int end = std::min(fullLen, totLen - seedLen + 1);
487                 double factor;
488
489                 for (int seedPos = 0; seedPos < end; seedPos++)
490                         if (ref.getMask(seedPos)) {
491                                 //forward
492                                 minL = gld->getMinL();
493                                 maxL = std::min(gld->getMaxL(), totLen - seedPos);
494                                 pfpos = seedPos;
495                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
496                                         effL = std::min(fullLen, totLen - fragLen + 1);
497                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
498                                         value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
499                                 }
500                                 //reverse
501                                 minL = gld->getMinL();
502                                 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
503                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
504                                         pfpos = seedPos - (fragLen - seedLen);
505                                         effL = std::min(fullLen, totLen - fragLen + 1);
506                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
507                                         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
508                                 }
509                         }
510
511                 //for reverse strand masking
512                 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
513                         minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
514                         maxL = std::min(gld->getMaxL(), seedPos + seedLen);
515                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
516                                 pfpos = seedPos - (fragLen - seedLen);
517                                 effL = std::min(fullLen, totLen - fragLen + 1);
518                                 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
519                                 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
520                         }
521                 }
522
523                 mw[i] = 1.0 - value;
524
525                 if (mw[i] < 1e-8) {
526                         //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
527                         mw[i] = 0.0;
528                 }
529         }
530 }
531
532 #endif /* SINGLEQMODEL_H_ */