X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=SingleQModel.h;h=786d6476383ec41fb06606f74c49099786f693e7;hb=cb94fd597b180aa7cb01ae84c9d1025201b98d8e;hp=92d8890c60958f370155b379d018ca80c06fc319;hpb=a95154919f950f86de9104b2b9dcf1f0c7e83387;p=rsem.git diff --git a/SingleQModel.h b/SingleQModel.h index 92d8890..786d647 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -63,7 +63,6 @@ public: mw = NULL; if (isMaster) { - ori = new Orientation(params.probF); gld = new LenDist(params.minL, params.maxL); if (mean >= EPSILON) { mld = new LenDist(params.mate_minL, params.mate_maxL); @@ -72,6 +71,7 @@ public: qd = new QualDist(); } + ori = new Orientation(params.probF); if (estRSPD) { rspd = new RSPD(estRSPD, params.B); } qpro = new QProfile(); nqpro = new NoiseQProfile(); @@ -158,7 +158,8 @@ public: void update(const SingleReadQ& read, const SingleHit& hit, double frac) { if (read.isLowQuality() || frac < EPSILON) return; - RefSeq& ref = refs->getRef(hit.getSid()); + const RefSeq& ref = refs->getRef(hit.getSid()); + int dir = hit.getDir(); int pos = hit.getPos(); @@ -278,13 +279,18 @@ void SingleQModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function int cnt = 0; while (reader.next(read)) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - qd->update(read.getQScore()); - if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + if (!read.isLowQuality()) { + mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + qd->update(read.getQScore()); + if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + } + else if (verbose && read.getReadLength() < seedLen) { + printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen); + } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -333,12 +339,12 @@ void SingleQModel::read(const char* inpF) { FILE *fi = fopen(inpF, "r"); if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } - fscanf(fi, "%d", &val); + assert(fscanf(fi, "%d", &val) == 1); assert(val == model_type); ori->read(fi); gld->read(fi); - fscanf(fi, "%d", &val); + assert(fscanf(fi, "%d", &val) == 1); if (val > 0) { if (mld == NULL) mld = new LenDist(); mld->read(fi); @@ -348,15 +354,18 @@ void SingleQModel::read(const char* inpF) { qpro->read(fi); nqpro->read(fi); - if (fscanf(fi, "%d", &M) == 1) { - mw = new double[M + 1]; - for (int i = 0; i <= M; i++) fscanf(fi, "%lf", &mw[i]); + if (fscanf(fi, "%d", &val) == 1) { + if (M == 0) M = val; + if (M == val) { + mw = new double[M + 1]; + for (int i = 0; i <= M; i++) assert(fscanf(fi, "%lf", &mw[i]) == 1); + } } fclose(fi); } -//Only master node can call +//Only master node can call. Only be called at EM.cpp void SingleQModel::write(const char* outF) { FILE *fo = fopen(outF, "w"); @@ -377,7 +386,7 @@ void SingleQModel::write(const char* outF) { nqpro->write(fo); if (mw != NULL) { - fprintf(fo, "%d\n", M); + fprintf(fo, "\n%d\n", M); for (int i = 0; i < M; i++) { fprintf(fo, "%.15g ", mw[i]); } @@ -439,9 +448,8 @@ bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) { } } - std::ostringstream stdout; - stdout<= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - - probF = ori->getProb(0); - probR = ori->getProb(1); - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - int end = std::min(fullLen, totLen - seedLen + 1); - double factor; - - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - //forward - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - //reverse - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + double probF, probR; + + assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + probF = ori->getProb(0); + probR = ori->getProb(1); + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + int end = std::min(fullLen, totLen - seedLen + 1); + double factor; + + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + //forward + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + //reverse + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + //for reverse strand masking + for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { + minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + mw[i] = 1.0 - value; + + if (mw[i] < 1e-8) { + // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } } - } - - //for reverse strand masking - for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { - minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - } - - mw[i] = 1.0 - value; - - if (mw[i] < 1e-8) { - // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } } #endif /* SINGLEQMODEL_H_ */