]> git.donarmstrong.com Git - rsem.git/blob - SingleQModel.h
Removed Mac ._* files from repo
[rsem.git] / SingleQModel.h
1 #ifndef SINGLEQMODEL_H_
2 #define SINGLEQMODEL_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cassert>
7 #include<cstring>
8 #include<string>
9 #include<algorithm>
10 #include<sstream>
11 #include<iostream>
12 #include<vector>
13
14 #include "utils.h"
15 #include "my_assert.h"
16 #include "Orientation.h"
17 #include "LenDist.h"
18 #include "RSPD.h"
19 #include "QualDist.h"
20 #include "QProfile.h"
21 #include "NoiseQProfile.h"
22
23 #include "ModelParams.h"
24 #include "RefSeq.h"
25 #include "Refs.h"
26 #include "SingleReadQ.h"
27 #include "SingleHit.h"
28 #include "ReadReader.h"
29
30 #include "simul.h"
31
32 class SingleQModel {
33 public:
34         SingleQModel(Refs* refs = NULL) {
35                 this->refs = refs;
36                 M = (refs != NULL ? refs->getM() : 0);
37                 memset(N, 0, sizeof(N));
38                 estRSPD = false;
39                 needCalcConPrb = true;
40                 
41                 ori = new Orientation();
42                 gld = new LenDist();
43                 mld = NULL;
44                 rspd = new RSPD(estRSPD);
45                 qd = new QualDist();
46                 qpro = new QProfile();
47                 nqpro = new NoiseQProfile();
48
49                 mean = -1.0; sd = 0.0;
50                 mw = NULL;
51
52                 seedLen = 0;
53         }
54
55         //If it is not a master node, only init & update can be used!
56         SingleQModel(ModelParams& params, bool isMaster = true) {
57                 M = params.M;
58                 memcpy(N, params.N, sizeof(params.N));
59                 refs = params.refs;
60                 estRSPD = params.estRSPD;
61                 mean = params.mean; sd = params.sd;
62                 seedLen = params.seedLen;
63                 needCalcConPrb = true;
64
65                 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
66                 mw = NULL;
67
68                 if (isMaster) {
69                         gld = new LenDist(params.minL, params.maxL);                    
70                         if (mean >= EPSILON) {
71                                 mld = new LenDist(params.mate_minL, params.mate_maxL);
72                         }
73                         if (!estRSPD) { rspd = new RSPD(estRSPD); }
74                         qd = new QualDist();
75                 }
76
77                 ori = new Orientation(params.probF);
78                 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
79                 qpro = new QProfile();
80                 nqpro = new NoiseQProfile();
81         }
82
83         ~SingleQModel() {
84                 refs = NULL;
85                 if (ori != NULL) delete ori;
86                 if (gld != NULL) delete gld;
87                 if (mld != NULL) delete mld;
88                 if (rspd != NULL) delete rspd;
89                 if (qd != NULL) delete qd;
90                 if (qpro != NULL) delete qpro;
91                 if (nqpro != NULL) delete nqpro;
92                 if (mw != NULL) delete[] mw;
93                 /*delete[] p1, p2;*/
94         }
95
96         //SingleQModel& operator=(const SingleQModel&);
97
98         void estimateFromReads(const char*);
99
100         //if prob is too small, just make it 0
101         double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
102                 if (read.isLowQuality()) return 0.0;
103
104                 double prob;
105                 int sid = hit.getSid();
106                 RefSeq &ref = refs->getRef(sid);
107                 int fullLen = ref.getFullLen();
108                 int totLen = ref.getTotLen();
109                 int dir = hit.getDir();
110                 int pos = hit.getPos();
111                 int readLen = read.getReadLength();
112                 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
113
114                 general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \
115                                 " from the forward direction, which should be a non-negative number! " + \
116                                 "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
117                 general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \
118                                 + "It is possible that the aligner you use gave different read lengths for a same read in SAM file.");
119                 general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \
120                                 + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!");
121
122                 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
123                 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
124
125                 int effL;
126                 double value;
127
128                 if (mld != NULL) {
129                         int minL = std::max(readLen, gld->getMinL());
130                         int maxL = std::min(totLen - pos, gld->getMaxL());
131                         int pfpos; // possible fpos for fragment
132                         value = 0.0;
133                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
134                                 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
135                                 effL = std::min(fullLen, totLen - fragLen + 1);
136                                 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
137                         }
138                 }
139                 else {
140                         effL = std::min(fullLen, totLen - readLen + 1);
141                         value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
142                 }
143
144                 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
145
146                 if (prob < EPSILON) { prob = 0.0; }
147
148                 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
149
150                 return prob;
151         }
152
153         double getNoiseConPrb(const SingleReadQ& read) {
154                 if (read.isLowQuality()) return 0.0;
155                 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
156                 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
157                 if (prob < EPSILON) { prob = 0.0; }
158
159                 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
160
161                 return prob;
162         }
163
164         double getLogP() { return nqpro->getLogP(); }
165
166         void init();
167
168         void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
169                 if (read.isLowQuality() || frac < EPSILON) return;
170
171                 const RefSeq& ref = refs->getRef(hit.getSid());
172
173                 int dir = hit.getDir();
174                 int pos = hit.getPos();
175
176                 if (estRSPD) {
177                         int fullLen = ref.getFullLen();
178
179                         // Only use one strand to estimate RSPD
180                         if (ori->getProb(0) >= ORIVALVE && dir == 0) {
181                                 rspd->update(pos, fullLen, frac);
182                         }
183
184                         if (ori->getProb(0) < ORIVALVE && dir == 1) {
185                                 int totLen = ref.getTotLen();                     
186                                 int readLen = read.getReadLength();
187                                 
188                                 int pfpos, effL; 
189
190                                 if (mld != NULL) {
191                                         int minL = std::max(readLen, gld->getMinL());
192                                         int maxL = std::min(totLen - pos, gld->getMaxL());
193                                         double sum = 0.0;
194                                         assert(maxL >= minL);
195                                         std::vector<double> frag_vec(maxL - minL + 1, 0.0);
196
197                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
198                                                 pfpos = totLen - pos - fragLen;
199                                                 effL = std::min(fullLen, totLen - fragLen + 1);
200                                                 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
201                                                 sum += frag_vec[fragLen - minL];
202                                         }
203                                         assert(sum >= EPSILON);
204                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
205                                                 pfpos = totLen - pos - fragLen;
206                                                 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
207                                         }
208                                 }
209                                 else {
210                                         rspd->update(totLen - pos - readLen, fullLen, frac);
211                                 }
212                         }
213                 }
214                 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
215         }
216
217         void updateNoise(const SingleReadQ& read, double frac) {
218                 if (read.isLowQuality() || frac < EPSILON) return;
219
220                 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
221         }
222
223         void finish();
224
225         void collect(const SingleQModel&);
226
227         //void copy(const SingleQModel&);
228
229         bool getNeedCalcConPrb() { return needCalcConPrb; }
230         void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
231
232         //void calcP1();
233         //void calcP2();
234         //double* getP1() { return p1; }
235         //double* getP2() { return p2; }
236
237         void read(const char*);
238         void write(const char*);
239
240         const LenDist& getGLD() { return *gld; }
241
242         void startSimulation(simul*, const std::vector<double>&);
243         bool simulate(READ_INT_TYPE, SingleReadQ&, int&);
244         void finishSimulation();
245
246         //Use it after function 'read' or 'estimateFromReads'
247         const double* getMW() { 
248           assert(mw != NULL);
249           return mw;
250         }
251
252         int getModelType() const { return model_type; }
253
254 private:
255         static const int model_type = 1;
256         static const int read_type = 1;
257
258         int M;
259         READ_INT_TYPE N[3];
260         Refs *refs;
261         double mean, sd;
262         int seedLen;
263         //double *p1, *p2; P_i' & P_i'';
264
265         bool estRSPD; // true if estimate RSPD
266         bool needCalcConPrb; //true need, false does not need
267
268         Orientation *ori;
269         LenDist *gld, *mld;
270         RSPD *rspd;
271         QualDist *qd;
272         QProfile *qpro;
273         NoiseQProfile *nqpro;
274
275         simul *sampler; // for simulation
276         double *theta_cdf; // for simulation
277
278         double *mw; // for masking
279
280         void calcMW();
281 };
282
283 void SingleQModel::estimateFromReads(const char* readFN) {
284         int s;
285         char readFs[2][STRLEN];
286         SingleReadQ read;
287
288         mld != NULL ? mld->init() : gld->init();
289         for (int i = 0; i < 3; i++)
290                 if (N[i] > 0) {
291                         genReadFileNames(readFN, i, read_type, s, readFs);
292                         ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
293
294                         READ_INT_TYPE cnt = 0;
295                         while (reader.next(read)) {
296                                 if (!read.isLowQuality()) {
297                                         mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
298                                         qd->update(read.getQScore());
299                                         if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
300                                 }
301                                 else if (verbose && read.getReadLength() < seedLen) {
302                                         std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl;
303                                 }
304
305                                 ++cnt;
306                                 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; }
307                         }
308
309                         if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; }
310                 }
311
312         mld != NULL ? mld->finish() : gld->finish();
313
314         //mean should be > 0
315         if (mean >= EPSILON) { 
316           assert(mld->getMaxL() <= gld->getMaxL());
317           gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
318         }
319         qd->finish();
320         nqpro->calcInitParams();
321
322         mw = new double[M + 1];
323         calcMW();
324 }
325
326 void SingleQModel::init() {
327         if (estRSPD) rspd->init();
328         qpro->init();
329         nqpro->init();
330 }
331
332 void SingleQModel::finish() {
333         if (estRSPD) rspd->finish();
334         qpro->finish();
335         nqpro->finish();
336         needCalcConPrb = true;
337         if (estRSPD) calcMW();
338 }
339
340 void SingleQModel::collect(const SingleQModel& o) {
341         if (estRSPD) rspd->collect(*(o.rspd));
342         qpro->collect(*(o.qpro));
343         nqpro->collect(*(o.nqpro));
344 }
345
346 //Only master node can call
347 void SingleQModel::read(const char* inpF) {
348         int val;
349         FILE *fi = fopen(inpF, "r");
350         if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
351
352         assert(fscanf(fi, "%d", &val) == 1);
353         assert(val == model_type);
354
355         ori->read(fi);
356         gld->read(fi);
357         assert(fscanf(fi, "%d", &val) == 1);
358         if (val > 0) {
359                 if (mld == NULL) mld = new LenDist();
360                 mld->read(fi);
361         }
362         rspd->read(fi);
363         qd->read(fi);
364         qpro->read(fi);
365         nqpro->read(fi);
366
367         if (fscanf(fi, "%d", &val) == 1) {
368                 if (M == 0) M = val;
369                 if (M == val) {
370                         mw = new double[M + 1];
371                         for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1);
372                 }
373         }
374
375         fclose(fi);
376 }
377
378 //Only master node can call. Only be called at EM.cpp
379 void SingleQModel::write(const char* outF) {
380         FILE *fo = fopen(outF, "w");
381
382         fprintf(fo, "%d\n", model_type);
383         fprintf(fo, "\n");
384
385         ori->write(fo);  fprintf(fo, "\n");
386         gld->write(fo);  fprintf(fo, "\n");
387         if (mld != NULL) {
388                 fprintf(fo, "1\n");
389                 mld->write(fo);
390         }
391         else { fprintf(fo, "0\n"); }
392         fprintf(fo, "\n");
393         rspd->write(fo); fprintf(fo, "\n");
394         qd->write(fo);   fprintf(fo, "\n");
395         qpro->write(fo); fprintf(fo, "\n");
396         nqpro->write(fo);
397
398         if (mw != NULL) {
399           fprintf(fo, "\n%d\n", M);
400           for (int i = 0; i < M; i++) {
401             fprintf(fo, "%.15g ", mw[i]);
402           }
403           fprintf(fo, "%.15g\n", mw[M]);
404         }
405
406         fclose(fo);
407 }
408
409 void SingleQModel::startSimulation(simul* sampler, const std::vector<double>& theta) {
410         this->sampler = sampler;
411
412         theta_cdf = new double[M + 1];
413         for (int i = 0; i <= M; i++) {
414                 theta_cdf[i] = theta[i];
415                 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
416         }
417
418         rspd->startSimulation(M, refs);
419         qd->startSimulation();
420         qpro->startSimulation();
421         nqpro->startSimulation();
422 }
423
424 bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) {
425         int dir, pos, readLen, fragLen;
426         std::string name;
427         std::string qual, readseq;
428         std::ostringstream strout;
429
430         sid = sampler->sample(theta_cdf, M + 1);
431
432         if (sid == 0) {
433                 dir = pos = 0;
434                 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
435                 qual = qd->simulate(sampler, readLen);
436                 readseq = nqpro->simulate(sampler, readLen, qual);
437         }
438         else {
439                 RefSeq &ref = refs->getRef(sid);
440                 dir = ori->simulate(sampler);
441                 fragLen = gld->simulate(sampler, ref.getTotLen());
442                 if (fragLen < 0) return false;
443
444                 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
445                 pos = rspd->simulate(sampler, sid, effL);
446                 if (pos < 0) return false;
447                 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
448
449                 if (mld != NULL) {
450                         readLen = mld->simulate(sampler, fragLen);
451                         if (readLen < 0) return false;
452                         qual = qd->simulate(sampler, readLen);
453                         readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
454                 }
455                 else {
456                         qual = qd->simulate(sampler, fragLen);
457                         readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
458                 }
459         }
460
461         strout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
462         name = strout.str();
463
464         read = SingleReadQ(name, readseq, qual);
465
466         return true;
467 }
468
469 void SingleQModel::finishSimulation() {
470         delete[] theta_cdf;
471
472         rspd->finishSimulation();
473         qd->finishSimulation();
474         qpro->finishSimulation();
475         nqpro->finishSimulation();
476 }
477
478 void SingleQModel::calcMW() {
479         double probF, probR;
480
481         assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
482
483         memset(mw, 0, sizeof(double) * (M + 1));
484         mw[0] = 1.0;
485
486         probF = ori->getProb(0);
487         probR = ori->getProb(1);
488
489         for (int i = 1; i <= M; i++) {
490                 RefSeq& ref = refs->getRef(i);
491                 int totLen = ref.getTotLen();
492                 int fullLen = ref.getFullLen();
493                 double value = 0.0;
494                 int minL, maxL;
495                 int effL, pfpos;
496                 int end = std::min(fullLen, totLen - seedLen + 1);
497                 double factor;
498
499                 for (int seedPos = 0; seedPos < end; seedPos++)
500                         if (ref.getMask(seedPos)) {
501                                 //forward
502                                 minL = gld->getMinL();
503                                 maxL = std::min(gld->getMaxL(), totLen - seedPos);
504                                 pfpos = seedPos;
505                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
506                                         effL = std::min(fullLen, totLen - fragLen + 1);
507                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
508                                         value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
509                                 }
510                                 //reverse
511                                 minL = gld->getMinL();
512                                 maxL = std::min(gld->getMaxL(), seedPos + seedLen);
513                                 for (int fragLen = minL; fragLen <= maxL; fragLen++) {
514                                         pfpos = seedPos - (fragLen - seedLen);
515                                         effL = std::min(fullLen, totLen - fragLen + 1);
516                                         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
517                                         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
518                                 }
519                         }
520
521                 //for reverse strand masking
522                 for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
523                         minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
524                         maxL = std::min(gld->getMaxL(), seedPos + seedLen);
525                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
526                                 pfpos = seedPos - (fragLen - seedLen);
527                                 effL = std::min(fullLen, totLen - fragLen + 1);
528                                 factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
529                                 value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
530                         }
531                 }
532
533                 mw[i] = 1.0 - value;
534
535                 if (mw[i] < 1e-8) {
536                         //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
537                         mw[i] = 0.0;
538                 }
539         }
540 }
541
542 #endif /* SINGLEQMODEL_H_ */