SingleQModel.h

   1 #ifndef SINGLEQMODEL_H_
   2 #define SINGLEQMODEL_H_
   3
   4 #include<cmath>
   5 #include<cstdio>
   6 #include<cassert>
   7 #include<cstring>
   8 #include<string>
   9 #include<algorithm>
  10 #include<sstream>
  11
  12 #include "utils.h"
  13 #include "Orientation.h"
  14 #include "LenDist.h"
  15 #include "RSPD.h"
  16 #include "QualDist.h"
  17 #include "QProfile.h"
  18 #include "NoiseQProfile.h"
  19
  20 #include "ModelParams.h"
  21 #include "RefSeq.h"
  22 #include "Refs.h"
  23 #include "SingleReadQ.h"
  24 #include "SingleHit.h"
  25 #include "ReadReader.h"
  26
  27 #include "simul.h"
  28
  29 class SingleQModel {
  30 public:
  31         SingleQModel(Refs* refs = NULL) {
  32                 this->refs = refs;
  33                 M = (refs != NULL ? refs->getM() : 0);
  34                 memset(N, 0, sizeof(N));
  35                 estRSPD = false;
  36                 needCalcConPrb = true;
  37
  38                 ori = new Orientation();
  39                 gld = new LenDist();
  40                 mld = NULL;
  41                 rspd = new RSPD(estRSPD);
  42                 qd = new QualDist();
  43                 qpro = new QProfile();
  44                 nqpro = new NoiseQProfile();
  45
  46                 mean = -1.0; sd = 0.0;
  47                 mw = NULL;
  48
  49                 seedLen = 0;
  50         }
  51
  52         //If it is not a master node, only init & update can be used!
  53         SingleQModel(ModelParams& params, bool isMaster = true) {
  54                 M = params.M;
  55                 memcpy(N, params.N, sizeof(params.N));
  56                 refs = params.refs;
  57                 estRSPD = params.estRSPD;
  58                 mean = params.mean; sd = params.sd;
  59                 seedLen = params.seedLen;
  60                 needCalcConPrb = true;
  61
  62                 ori = NULL; gld = NULL; mld = NULL; rspd = NULL; qd = NULL; qpro = NULL; nqpro = NULL;
  63                 mw = NULL;
  64
  65                 if (isMaster) {
  66                         gld = new LenDist(params.minL, params.maxL);
  67                         if (mean >= EPSILON) {
  68                                 mld = new LenDist(params.mate_minL, params.mate_maxL);
  69                         }
  70                         if (!estRSPD) { rspd = new RSPD(estRSPD); }
  71                         qd = new QualDist();
  72                 }
  73
  74                 ori = new Orientation(params.probF);
  75                 if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
  76                 qpro = new QProfile();
  77                 nqpro = new NoiseQProfile();
  78         }
  79
  80         ~SingleQModel() {
  81                 refs = NULL;
  82                 if (ori != NULL) delete ori;
  83                 if (gld != NULL) delete gld;
  84                 if (mld != NULL) delete mld;
  85                 if (rspd != NULL) delete rspd;
  86                 if (qd != NULL) delete qd;
  87                 if (qpro != NULL) delete qpro;
  88                 if (nqpro != NULL) delete nqpro;
  89                 if (mw != NULL) delete[] mw;
  90                 /*delete[] p1, p2;*/
  91         }
  92
  93         //SingleQModel& operator=(const SingleQModel&);
  94
  95         void estimateFromReads(const char*);
  96
  97         //if prob is too small, just make it 0
  98         double getConPrb(const SingleReadQ& read, const SingleHit& hit) const {
  99                 if (read.isLowQuality()) return 0.0;
 100
 101                 double prob;
 102                 int sid = hit.getSid();
 103                 RefSeq &ref = refs->getRef(sid);
 104                 int fullLen = ref.getFullLen();
 105                 int totLen = ref.getTotLen();
 106                 int dir = hit.getDir();
 107                 int pos = hit.getPos();
 108                 int readLen = read.getReadLength();
 109                 int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand
 110
 111                 assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen);
 112                 int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates
 113                 if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0;
 114
 115                 int effL;
 116                 double value;
 117
 118                 if (mld != NULL) {
 119                         int minL = std::max(readLen, gld->getMinL());
 120                         int maxL = std::min(totLen - pos, gld->getMaxL());
 121                         int pfpos; // possible fpos for fragment
 122                         value = 0.0;
 123                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 124                                 pfpos = (dir == 0 ? pos : totLen - pos - fragLen);
 125                                 effL = std::min(fullLen, totLen - fragLen + 1);
 126                                 value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
 127                         }
 128                 }
 129                 else {
 130                         effL = std::min(fullLen, totLen - readLen + 1);
 131                         value = gld->getAdjustedProb(readLen, totLen) * rspd->getAdjustedProb(fpos, effL, fullLen);
 132                 }
 133
 134                 prob = ori->getProb(dir) * value * qpro->getProb(read.getReadSeq(), read.getQScore(), ref, pos, dir);
 135
 136                 if (prob < EPSILON) { prob = 0.0; }
 137
 138                 prob = (mw[sid] < EPSILON ? 0.0 : prob / mw[sid]);
 139
 140                 return prob;
 141         }
 142
 143         double getNoiseConPrb(const SingleReadQ& read) {
 144                 if (read.isLowQuality()) return 0.0;
 145                 double prob = mld != NULL ? mld->getProb(read.getReadLength()) : gld->getProb(read.getReadLength());
 146                 prob *= nqpro->getProb(read.getReadSeq(), read.getQScore());
 147                 if (prob < EPSILON) { prob = 0.0; }
 148
 149                 prob = (mw[0] < EPSILON ? 0.0 : prob / mw[0]);
 150
 151                 return prob;
 152         }
 153
 154         double getLogP() { return nqpro->getLogP(); }
 155
 156         void init();
 157
 158         void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
 159                 if (read.isLowQuality() || frac < EPSILON) return;
 160
 161                 const RefSeq& ref = refs->getRef(hit.getSid());
 162
 163                 int dir = hit.getDir();
 164                 int pos = hit.getPos();
 165
 166                 if (estRSPD) {
 167                         int fullLen = ref.getFullLen();
 168
 169                         // Only use one strand to estimate RSPD
 170                         if (ori->getProb(0) >= ORIVALVE && dir == 0) {
 171                                 rspd->update(pos, fullLen, frac);
 172                         }
 173
 174                         if (ori->getProb(0) < ORIVALVE && dir == 1) {
 175                                 int totLen = ref.getTotLen();
 176                                 int readLen = read.getReadLength();
 177
 178                                 int pfpos, effL;
 179
 180                                 if (mld != NULL) {
 181                                         int minL = std::max(readLen, gld->getMinL());
 182                                         int maxL = std::min(totLen - pos, gld->getMaxL());
 183                                         double sum = 0.0;
 184                                         assert(maxL >= minL);
 185                                         std::vector<double> frag_vec(maxL - minL + 1, 0.0);
 186
 187                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 188                                                 pfpos = totLen - pos - fragLen;
 189                                                 effL = std::min(fullLen, totLen - fragLen + 1);
 190                                                 frag_vec[fragLen - minL] = gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * mld->getAdjustedProb(readLen, fragLen);
 191                                                 sum += frag_vec[fragLen - minL];
 192                                         }
 193                                         assert(sum >= EPSILON);
 194                                         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 195                                                 pfpos = totLen - pos - fragLen;
 196                                                 rspd->update(pfpos, fullLen, frac * (frag_vec[fragLen - minL] / sum));
 197                                         }
 198                                 }
 199                                 else {
 200                                         rspd->update(totLen - pos - readLen, fullLen, frac);
 201                                 }
 202                         }
 203                 }
 204                 qpro->update(read.getReadSeq(), read.getQScore(), ref, pos, dir, frac);
 205         }
 206
 207         void updateNoise(const SingleReadQ& read, double frac) {
 208                 if (read.isLowQuality() || frac < EPSILON) return;
 209
 210                 nqpro->update(read.getReadSeq(), read.getQScore(), frac);
 211         }
 212
 213         void finish();
 214
 215         void collect(const SingleQModel&);
 216
 217         //void copy(const SingleQModel&);
 218
 219         bool getNeedCalcConPrb() { return needCalcConPrb; }
 220         void setNeedCalcConPrb(bool value) { needCalcConPrb = value; }
 221
 222         //void calcP1();
 223         //void calcP2();
 224         //double* getP1() { return p1; }
 225         //double* getP2() { return p2; }
 226
 227         void read(const char*);
 228         void write(const char*);
 229
 230         const LenDist& getGLD() { return *gld; }
 231
 232         void startSimulation(simul*, double*);
 233         bool simulate(int, SingleReadQ&, int&);
 234         void finishSimulation();
 235
 236         //Use it after function 'read' or 'estimateFromReads'
 237         double* getMW() {
 238           assert(mw != NULL);
 239           return mw;
 240         }
 241
 242         int getModelType() const { return model_type; }
 243
 244 private:
 245         static const int model_type = 1;
 246         static const int read_type = 1;
 247
 248         int M;
 249         int N[3];
 250         Refs *refs;
 251         double mean, sd;
 252         int seedLen;
 253         //double *p1, *p2; P_i' & P_i'';
 254
 255         bool estRSPD; // true if estimate RSPD
 256         bool needCalcConPrb; //true need, false does not need
 257
 258         Orientation *ori;
 259         LenDist *gld, *mld;
 260         RSPD *rspd;
 261         QualDist *qd;
 262         QProfile *qpro;
 263         NoiseQProfile *nqpro;
 264
 265         simul *sampler; // for simulation
 266         double *theta_cdf; // for simulation
 267
 268         double *mw; // for masking
 269
 270         void calcMW();
 271 };
 272
 273 void SingleQModel::estimateFromReads(const char* readFN) {
 274         int s;
 275         char readFs[2][STRLEN];
 276         SingleReadQ read;
 277
 278         mld != NULL ? mld->init() : gld->init();
 279         for (int i = 0; i < 3; i++)
 280                 if (N[i] > 0) {
 281                         genReadFileNames(readFN, i, read_type, s, readFs);
 282                         ReadReader<SingleReadQ> reader(s, readFs);
 283
 284                         int cnt = 0;
 285                         while (reader.next(read)) {
 286                                 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
 287                                 qd->update(read.getQScore());
 288                                 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
 289
 290                                 ++cnt;
 291                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
 292                         }
 293
 294                         if (verbose) { printf("estimateFromReads, N%d finished.\n", i); }
 295                 }
 296
 297         mld != NULL ? mld->finish() : gld->finish();
 298
 299         //mean should be > 0
 300         if (mean >= EPSILON) {
 301           assert(mld->getMaxL() <= gld->getMaxL());
 302           gld->setAsNormal(mean, sd, std::max(mld->getMinL(), gld->getMinL()), gld->getMaxL());
 303         }
 304         qd->finish();
 305         nqpro->calcInitParams();
 306
 307         mw = new double[M + 1];
 308         calcMW();
 309 }
 310
 311 void SingleQModel::init() {
 312         if (estRSPD) rspd->init();
 313         qpro->init();
 314         nqpro->init();
 315 }
 316
 317 void SingleQModel::finish() {
 318         if (estRSPD) rspd->finish();
 319         qpro->finish();
 320         nqpro->finish();
 321         needCalcConPrb = true;
 322         if (estRSPD) calcMW();
 323 }
 324
 325 void SingleQModel::collect(const SingleQModel& o) {
 326         if (estRSPD) rspd->collect(*(o.rspd));
 327         qpro->collect(*(o.qpro));
 328         nqpro->collect(*(o.nqpro));
 329 }
 330
 331 //Only master node can call
 332 void SingleQModel::read(const char* inpF) {
 333         int val;
 334         FILE *fi = fopen(inpF, "r");
 335         if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
 336
 337         fscanf(fi, "%d", &val);
 338         assert(val == model_type);
 339
 340         ori->read(fi);
 341         gld->read(fi);
 342         fscanf(fi, "%d", &val);
 343         if (val > 0) {
 344                 if (mld == NULL) mld = new LenDist();
 345                 mld->read(fi);
 346         }
 347         rspd->read(fi);
 348         qd->read(fi);
 349         qpro->read(fi);
 350         nqpro->read(fi);
 351
 352         if (fscanf(fi, "%d", &val) == 1) {
 353                 if (M == 0) M = val;
 354                 if (M == val) {
 355                         mw = new double[M + 1];
 356                         for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]);
 357                 }
 358         }
 359
 360         fclose(fi);
 361 }
 362
 363 //Only master node can call. Only be called at EM.cpp
 364 void SingleQModel::write(const char* outF) {
 365         FILE *fo = fopen(outF, "w");
 366
 367         fprintf(fo, "%d\n", model_type);
 368         fprintf(fo, "\n");
 369
 370         ori->write(fo);  fprintf(fo, "\n");
 371         gld->write(fo);  fprintf(fo, "\n");
 372         if (mld != NULL) {
 373                 fprintf(fo, "1\n");
 374                 mld->write(fo);
 375         }
 376         else { fprintf(fo, "0\n"); }
 377         fprintf(fo, "\n");
 378         rspd->write(fo); fprintf(fo, "\n");
 379         qd->write(fo);   fprintf(fo, "\n");
 380         qpro->write(fo); fprintf(fo, "\n");
 381         nqpro->write(fo);
 382
 383         if (mw != NULL) {
 384           fprintf(fo, "\n%d\n", M);
 385           for (int i = 0; i < M; i++) {
 386             fprintf(fo, "%.15g ", mw[i]);
 387           }
 388           fprintf(fo, "%.15g\n", mw[M]);
 389         }
 390
 391         fclose(fo);
 392 }
 393
 394 void SingleQModel::startSimulation(simul* sampler, double* theta) {
 395         this->sampler = sampler;
 396
 397         theta_cdf = new double[M + 1];
 398         for (int i = 0; i <= M; i++) {
 399                 theta_cdf[i] = theta[i];
 400                 if (i > 0) theta_cdf[i] += theta_cdf[i - 1];
 401         }
 402
 403         rspd->startSimulation(M, refs);
 404         qd->startSimulation();
 405         qpro->startSimulation();
 406         nqpro->startSimulation();
 407 }
 408
 409 bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) {
 410         int dir, pos, readLen, fragLen;
 411         std::string name;
 412         std::string qual, readseq;
 413         std::ostringstream strout;
 414
 415         sid = sampler->sample(theta_cdf, M + 1);
 416
 417         if (sid == 0) {
 418                 dir = pos = 0;
 419                 readLen = (mld != NULL ? mld->simulate(sampler, -1) : gld->simulate(sampler, -1));
 420                 qual = qd->simulate(sampler, readLen);
 421                 readseq = nqpro->simulate(sampler, readLen, qual);
 422         }
 423         else {
 424                 RefSeq &ref = refs->getRef(sid);
 425                 dir = ori->simulate(sampler);
 426                 fragLen = gld->simulate(sampler, ref.getTotLen());
 427                 if (fragLen < 0) return false;
 428
 429                 int effL = std::min(ref.getFullLen(), ref.getTotLen() - fragLen + 1);
 430                 pos = rspd->simulate(sampler, sid, effL);
 431                 if (pos < 0) return false;
 432                 if (dir > 0) pos = ref.getTotLen() - pos - fragLen;
 433
 434                 if (mld != NULL) {
 435                         readLen = mld->simulate(sampler, fragLen);
 436                         if (readLen < 0) return false;
 437                         qual = qd->simulate(sampler, readLen);
 438                         readseq = qpro->simulate(sampler, readLen, pos, dir, qual, ref);
 439                 }
 440                 else {
 441                         qual = qd->simulate(sampler, fragLen);
 442                         readseq = qpro->simulate(sampler, fragLen, pos, dir, qual, ref);
 443                 }
 444         }
 445
 446         std::ostringstream stdout;
 447         stdout<<rid<<"_"<<dir<<"_"<<sid<<"_"<<pos;
 448         name = stdout.str();
 449
 450         read = SingleReadQ(name, readseq, qual);
 451
 452         return true;
 453 }
 454
 455 void SingleQModel::finishSimulation() {
 456         delete[] theta_cdf;
 457
 458         rspd->finishSimulation();
 459         qd->finishSimulation();
 460         qpro->finishSimulation();
 461         nqpro->finishSimulation();
 462 }
 463
 464 void SingleQModel::calcMW() {
 465   double probF, probR;
 466
 467   assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
 468
 469   memset(mw, 0, sizeof(double) * (M + 1));
 470   mw[0] = 1.0;
 471
 472   probF = ori->getProb(0);
 473   probR = ori->getProb(1);
 474
 475   for (int i = 1; i <= M; i++) {
 476     RefSeq& ref = refs->getRef(i);
 477     int totLen = ref.getTotLen();
 478     int fullLen = ref.getFullLen();
 479     double value = 0.0;
 480     int minL, maxL;
 481     int effL, pfpos;
 482     int end = std::min(fullLen, totLen - seedLen + 1);
 483     double factor;
 484
 485     for (int seedPos = 0; seedPos < end; seedPos++)
 486       if (ref.getMask(seedPos)) {
 487         //forward
 488         minL = gld->getMinL();
 489         maxL = std::min(gld->getMaxL(), totLen - seedPos);
 490         pfpos = seedPos;
 491         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 492           effL = std::min(fullLen, totLen - fragLen + 1);
 493           factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
 494           value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
 495         }
 496         //reverse
 497         minL = gld->getMinL();
 498         maxL = std::min(gld->getMaxL(), seedPos + seedLen);
 499         for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 500           pfpos = seedPos - (fragLen - seedLen);
 501           effL = std::min(fullLen, totLen - fragLen + 1);
 502           factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
 503           value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
 504         }
 505       }
 506
 507     //for reverse strand masking
 508     for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
 509       minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
 510       maxL = std::min(gld->getMaxL(), seedPos + seedLen);
 511       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
 512         pfpos = seedPos - (fragLen - seedLen);
 513         effL = std::min(fullLen, totLen - fragLen + 1);
 514         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
 515         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
 516       }
 517     }
 518
 519     mw[i] = 1.0 - value;
 520
 521     if (mw[i] < 1e-8) {
 522       //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
 523       mw[i] = 0.0;
 524     }
 525   }
 526 }
 527
 528 #endif /* SINGLEQMODEL_H_ */