]> git.donarmstrong.com Git - rsem.git/blob - SingleQModel.h
Add warning messages for detected invalid alignments
[rsem.git] / SingleQModel.h
1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cassert>
7 #include<cstring>
8 #include<string>
9 #include<algorithm>
10 #include<sstream>
11
12 #include "utils.h"
13 #include "my_assert.h"
14 #include "Orientation.h"
15 #include "LenDist.h"
16 #include "RSPD.h"
17 #include "QualDist.h"
18 #include "QProfile.h"
19 #include "NoiseQProfile.h"
20
21 #include "ModelParams.h"
22 #include "RefSeq.h"
23 #include "Refs.h"
24 #include "SingleReadQ.h"
25 #include "SingleHit.h"
26 #include "ReadReader.h"
27
28 #include "simul.h"
29
30 class SingleQModel {
31 public:
32         SingleQModel(Refs* refs = NULL) {
33                 this->refs = refs;
34                 M = (refs != NULL ? refs->getM() : 0);
35                 memset(N, 0, sizeof(N));
36                 estRSPD = false;
37                 needCalcConPrb = true;
38                 
39                 ori = new Orientation();
40                 gld = new LenDist();
41                 mld = NULL;
42                 rspd = new RSPD(estRSPD);
43                 qd = new QualDist();
44                 qpro = new QProfile();
45                 nqpro = new NoiseQProfile();
46
47                 mean = -1.0; sd = 0.0;
48                 mw = NULL;
49
50                 seedLen = 0;
51         }
52
53         //If it is not a master node, only init & update can be used!
54         SingleQModel(ModelParams& params, bool isMaster = true) {
55                 M = params.M;
56                 memcpy(N, params.N, sizeof(params.N));
57                 refs = params.refs;
58                 estRSPD = params.estRSPD;
59                 mean = params.mean; sd = params.sd;
60                 seedLen = params.seedLen;
61                 needCalcConPrb = true;
62
63                 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
64                 mw = NULL;
65
66                 if (isMaster) {
67                         gld = new LenDist(params.minL, params.maxL);                    
68                         if (mean >= EPSILON) {
69                                 mld = new LenDist(params.mate_minL, params.mate_maxL);
70                         }
71                         if (!estRSPD) { rspd = new RSPD(estRSPD); }
72                         qd = new QualDist();
73                 }
74
75                 ori = new Orientation(params.probF);
76                 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
77                 qpro = new QProfile();
78                 nqpro = new NoiseQProfile();
79         }
80
81         ~SingleQModel() {
82                 refs = NULL;
83                 if (ori != NULL) delete ori;
84                 if (gld != NULL) delete gld;
85                 if (mld != NULL) delete mld;
86                 if (rspd != NULL) delete rspd;
87                 if (qd != NULL) delete qd;
88                 if (qpro != NULL) delete qpro;
89                 if (nqpro != NULL) delete nqpro;
90                 if (mw != NULL) delete[] mw;
91                 /*delete[] p1, p2;*/
92         }
93
94         //SingleQModel& operator=(const SingleQModel&);
95
96         void estimateFromReads(const char*);
97
98         //if prob is too small, just make it 0
99         double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
100                 if (read.isLowQuality()) return 0.0;
101
102                 double prob;
103                 int sid = hit.getSid();
104                 RefSeq &ref = refs->getRef(sid);
105                 int fullLen = ref.getFullLen();
106                 int totLen = ref.getTotLen();
107                 int dir = hit.getDir();
108                 int pos = hit.getPos();
109                 int readLen = read.getReadLength();
110                 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
111
112                 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
113                                 " from the forward direction, which should be a non-negative number! " + \
114                                 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
115                 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
116                                 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
117                 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
118                                 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
119
120                 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
121                 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
122
123                 int effL;
124                 double value;
125
126                 if (mld != NULL) {
127                         int minL = std::max(readLen, gld->getMinL());
128                         int maxL = std::min(totLen - pos, gld->getMaxL());
129                         int pfpos; // possible fpos for fragment
130                         value = 0.0;
131                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
132                                 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
133                                 effL = std::min(fullLen, totLen - fragLen + 1);
134                                 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
135                         }
136                 }
137                 else {
138                         effL = std::min(fullLen, totLen - readLen + 1);
139                         value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
140                 }
141
142                 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
143
144                 if (prob < EPSILON) { prob = 0.0; }
145
146                 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
147
148                 return prob;
149         }
150
151         double getNoiseConPrb(const SingleReadQ& read) {
152                 if (read.isLowQuality()) return 0.0;
153                 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
154                 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
155                 if (prob < EPSILON) { prob = 0.0; }
156
157                 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
158
159                 return prob;
160         }
161
162         double getLogP() { return nqpro->getLogP(); }
163
164         void init();
165
166         void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
167                 if (read.isLowQuality() || frac < EPSILON) return;
168
169                 const RefSeq& ref = refs->getRef(hit.getSid());
170
171                 int dir = hit.getDir();
172                 int pos = hit.getPos();
173
174                 if (estRSPD) {
175                         int fullLen = ref.getFullLen();
176
177                         // Only use one strand to estimate RSPD
178                         if (ori->getProb(0) >= ORIVALVE && dir == 0) {
179                                 rspd->update(pos, fullLen, frac);
180                         }
181
182                         if (ori->getProb(0) < ORIVALVE && dir == 1) {
183                                 int totLen = ref.getTotLen();                     
184                                 int readLen = read.getReadLength();
185                                 
186                                 int pfpos, effL; 
187
188                                 if (mld != NULL) {
189                                         int minL = std::max(readLen, gld->getMinL());
190                                         int maxL = std::min(totLen - pos, gld->getMaxL());
191                                         double sum = 0.0;
192                                         assert(maxL >= minL);
193                                         std::vector<double> frag_vec(maxL - minL + 1, 0.0);
194
195                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
196                                                 pfpos = totLen - pos - fragLen;
197                                                 effL = std::min(fullLen, totLen - fragLen + 1);
198                                                 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
199                                                 sum += frag_vec[fragLen - minL];
200                                         }
201                                         assert(sum >= EPSILON);
202                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
203                                                 pfpos = totLen - pos - fragLen;
204                                                 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
205                                         }
206                                 }
207                                 else {
208                                         rspd->update(totLen - pos - readLen, fullLen, frac);
209                                 }
210                         }
211                 }
212                 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
213         }
214
215         void updateNoise(const SingleReadQ& read, double frac) {
216                 if (read.isLowQuality() || frac < EPSILON) return;
217
218                 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
219         }
220
221         void finish();
222
223         void collect(const SingleQModel&);
224
225         //void copy(const SingleQModel&);
226
227         bool getNeedCalcConPrb() { return needCalcConPrb; }
228         void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
229
230         //void calcP1();
231         //void calcP2();
232         //double* getP1() { return p1; }
233         //double* getP2() { return p2; }
234
235         void read(const char*);
236         void write(const char*);
237
238         const LenDist& getGLD() { return *gld; }
239
240         void startSimulation(simul*, double*);
241         bool simulate(int, SingleReadQ&, int&);
242         void finishSimulation();
243
244         //Use it after function 'read' or 'estimateFromReads'
245         double* getMW() { 
246           assert(mw != NULL);
247           return mw;
248         }
249
250         int getModelType() const { return model_type; }
251
252 private:
253         static const int model_type = 1;
254         static const int read_type = 1;
255
256         int M;
257         int N[3];
258         Refs *refs;
259         double mean, sd;
260         int seedLen;
261         //double *p1, *p2; P_i' & P_i'';
262
263         bool estRSPD; // true if estimate RSPD
264         bool needCalcConPrb; //true need, false does not need
265
266         Orientation *ori;
267         LenDist *gld, *mld;
268         RSPD *rspd;
269         QualDist *qd;
270         QProfile *qpro;
271         NoiseQProfile *nqpro;
272
273         simul *sampler; // for simulation
274         double *theta_cdf; // for simulation
275
276         double *mw; // for masking
277
278         void calcMW();
279 };
280
281 void SingleQModel::estimateFromReads(const char* readFN) {
282         int s;
283         char readFs[2][STRLEN];
284         SingleReadQ read;
285
286         mld != NULL ? mld->init() : gld->init();
287         for (int i = 0; i < 3; i++)
288                 if (N[i] > 0) {
289                         genReadFileNames(readFN, i, read_type, s, readFs);
290                         ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
291
292                         int cnt = 0;
293                         while (reader.next(read)) {
294                                 if (!read.isLowQuality()) {
295                                         mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
296                                         qd->update(read.getQScore());
297                                         if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
298                                 }
299                                 else if (verbose && read.getReadLength() < seedLen) {
300                                         printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
301                                 }
302
303                                 ++cnt;
304                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
305                         }
306
307                         if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
308                 }
309
310         mld != NULL ? mld->finish() : gld->finish();
311
312         //mean should be > 0
313         if (mean >= EPSILON) { 
314           assert(mld->getMaxL() <= gld->getMaxL());
315           gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
316         }
317         qd->finish();
318         nqpro->calcInitParams();
319
320         mw = new double[M + 1];
321         calcMW();
322 }
323
324 void SingleQModel::init() {
325         if (estRSPD) rspd->init();
326         qpro->init();
327         nqpro->init();
328 }
329
330 void SingleQModel::finish() {
331         if (estRSPD) rspd->finish();
332         qpro->finish();
333         nqpro->finish();
334         needCalcConPrb = true;
335         if (estRSPD) calcMW();
336 }
337
338 void SingleQModel::collect(const SingleQModel& o) {
339         if (estRSPD) rspd->collect(*(o.rspd));
340         qpro->collect(*(o.qpro));
341         nqpro->collect(*(o.nqpro));
342 }
343
344 //Only master node can call
345 void SingleQModel::read(const char* inpF) {
346         int val;
347         FILE *fi = fopen(inpF, "r");
348         if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
349
350         assert(fscanf(fi, "%d", &val) == 1);
351         assert(val == model_type);
352
353         ori->read(fi);
354         gld->read(fi);
355         assert(fscanf(fi, "%d", &val) == 1);
356         if (val > 0) {
357                 if (mld == NULL) mld = new LenDist();
358                 mld->read(fi);
359         }
360         rspd->read(fi);
361         qd->read(fi);
362         qpro->read(fi);
363         nqpro->read(fi);
364
365         if (fscanf(fi, "%d", &val) == 1) {
366                 if (M == 0) M = val;
367                 if (M == val) {
368                         mw = new double[M + 1];
369                         for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
370                 }
371         }
372
373         fclose(fi);
374 }
375
376 //Only master node can call. Only be called at EM.cpp
377 void SingleQModel::write(const char* outF) {
378         FILE *fo = fopen(outF, "w");
379
380         fprintf(fo, "%d\n", model_type);
381         fprintf(fo, "\n");
382
383         ori->write(fo);  fprintf(fo, "\n");
384         gld->write(fo);  fprintf(fo, "\n");
385         if (mld != NULL) {
386                 fprintf(fo, "1\n");
387                 mld->write(fo);
388         }
389         else { fprintf(fo, "0\n"); }
390         fprintf(fo, "\n");
391         rspd->write(fo); fprintf(fo, "\n");
392         qd->write(fo);   fprintf(fo, "\n");
393         qpro->write(fo); fprintf(fo, "\n");
394         nqpro->write(fo);
395
396         if (mw != NULL) {
397           fprintf(fo, "\n%d\n", M);
398           for (int i = 0; i < M; i++) {
399             fprintf(fo, "%.15g ", mw[i]);
400           }
401           fprintf(fo, "%.15g\n", mw[M]);
402         }
403
404         fclose(fo);
405 }
406
407 void SingleQModel::startSimulation(simul* sampler, double* theta) {
408         this->sampler = sampler;
409
410         theta_cdf = new double[M + 1];
411         for (int i = 0; i <= M; i++) {
412                 theta_cdf[i] = theta[i];
413                 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
414         }
415
416         rspd->startSimulation(M, refs);
417         qd->startSimulation();
418         qpro->startSimulation();
419         nqpro->startSimulation();
420 }
421
422 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
423         int dir, pos, readLen, fragLen;
424         std::string name;
425         std::string qual, readseq;
426         std::ostringstream strout;
427
428         sid = sampler->sample(theta_cdf, M + 1);
429
430         if (sid == 0) {
431                 dir = pos = 0;
432                 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
433                 qual = qd->simulate(sampler, readLen);
434                 readseq = nqpro->simulate(sampler, readLen, qual);
435         }
436         else {
437                 RefSeq &ref = refs->getRef(sid);
438                 dir = ori->simulate(sampler);
439                 fragLen = gld->simulate(sampler, ref.getTotLen());
440                 if (fragLen < 0) return false;
441
442                 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
443                 pos = rspd->simulate(sampler, sid, effL);
444                 if (pos < 0) return false;
445                 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
446
447                 if (mld != NULL) {
448                         readLen = mld->simulate(sampler, fragLen);
449                         if (readLen < 0) return false;
450                         qual = qd->simulate(sampler, readLen);
451                         readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
452                 }
453                 else {
454                         qual = qd->simulate(sampler, fragLen);
455                         readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
456                 }
457         }
458
459         strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
460         name = strout.str();
461
462         read = SingleReadQ(name, readseq, qual);
463
464         return true;
465 }
466
467 void SingleQModel::finishSimulation() {
468         delete[] theta_cdf;
469
470         rspd->finishSimulation();
471         qd->finishSimulation();
472         qpro->finishSimulation();
473         nqpro->finishSimulation();
474 }
475
476 void SingleQModel::calcMW() {
477         double probF, probR;
478
479         assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
480
481         memset(mw, 0, sizeof(double) * (M + 1));
482         mw[0] = 1.0;
483
484         probF = ori->getProb(0);
485         probR = ori->getProb(1);
486
487         for (int i = 1; i <= M; i++) {
488                 RefSeq& ref = refs->getRef(i);
489                 int totLen = ref.getTotLen();
490                 int fullLen = ref.getFullLen();
491                 double value = 0.0;
492                 int minL, maxL;
493                 int effL, pfpos;
494                 int end = std::min(fullLen, totLen - seedLen + 1);
495                 double factor;
496
497                 for (int seedPos = 0; seedPos < end; seedPos++)
498                         if (ref.getMask(seedPos)) {
499                                 //forward
500                                 minL = gld->getMinL();
501                                 maxL = std::min(gld->getMaxL(), totLen - seedPos);
502                                 pfpos = seedPos;
503                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
504                                         effL = std::min(fullLen, totLen - fragLen + 1);
505                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
506                                         value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
507                                 }
508                                 //reverse
509                                 minL = gld->getMinL();
510                                 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
511                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
512                                         pfpos = seedPos - (fragLen - seedLen);
513                                         effL = std::min(fullLen, totLen - fragLen + 1);
514                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
515                                         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
516                                 }
517                         }
518
519                 //for reverse strand masking
520                 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
521                         minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
522                         maxL = std::min(gld->getMaxL(), seedPos + seedLen);
523                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
524                                 pfpos = seedPos - (fragLen - seedLen);
525                                 effL = std::min(fullLen, totLen - fragLen + 1);
526                                 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
527                                 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
528                         }
529                 }
530
531                 mw[i] = 1.0 - value;
532
533                 if (mw[i] < 1e-8) {
534                         //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
535                         mw[i] = 0.0;
536                 }
537         }
538 }
539
540 #endif /* SINGLEQMODEL_H_ */