X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=SingleQModel.h;h=ba43d9b608c934dd290ff5a9679b7f7d96e9586a;hp=5d4191a809d39c454e1bfea7805b12f5fde990c0;hb=b64d62d49f9b0446f10f87a2aadcde2f36854ab6;hpb=946f9a6adb2a82048c8453d44693cd3838d32939 diff --git a/SingleQModel.h b/SingleQModel.h index 5d4191a..ba43d9b 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -8,8 +8,11 @@ #include #include #include +#include +#include #include "utils.h" +#include "my_assert.h" #include "Orientation.h" #include "LenDist.h" #include "RSPD.h" @@ -108,7 +111,14 @@ public: int readLen = read.getReadLength(); int fpos = (dir == 0 ? pos : totLen - pos - readLen); // the aligned position reported in SAM file, should be a coordinate in forward strand - assert(fpos >= 0 && fpos + readLen <= totLen && readLen <= totLen); + general_assert(fpos >= 0, "The alignment of read " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \ + " from the forward direction, which should be a non-negative number! " + \ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file."); + general_assert(fpos + readLen <= totLen,"Read " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \ + + "It is possible that the aligner you use gave different read lengths for a same read in SAM file."); + general_assert(readLen <= totLen, "Read " + read.getName() + " has length " + itos(readLen) + ", but it is aligned to transcript " \ + + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the read's length!"); + int seedPos = (dir == 0 ? pos : totLen - pos - seedLen); // the aligned position of the seed in forward strand coordinates if (seedPos >= fullLen || ref.getMask(seedPos)) return 0.0; @@ -229,12 +239,12 @@ public: const LenDist& getGLD() { return *gld; } - void startSimulation(simul*, double*); - bool simulate(int, SingleReadQ&, int&); + void startSimulation(simul*, const std::vector&); + bool simulate(READ_INT_TYPE, SingleReadQ&, int&); void finishSimulation(); //Use it after function 'read' or 'estimateFromReads' - double* getMW() { + const double* getMW() { assert(mw != NULL); return mw; } @@ -246,7 +256,7 @@ private: static const int read_type = 1; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; double mean, sd; int seedLen; @@ -279,22 +289,24 @@ void SingleQModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { if (!read.isLowQuality()) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - qd->update(read.getQScore()); - if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + qd->update(read.getQScore()); + if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + } + else if (verbose && read.getReadLength() < seedLen) { + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl; } - else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); } ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; } } mld != NULL ? mld->finish() : gld->finish(); @@ -394,7 +406,7 @@ void SingleQModel::write(const char* outF) { fclose(fo); } -void SingleQModel::startSimulation(simul* sampler, double* theta) { +void SingleQModel::startSimulation(simul* sampler, const std::vector& theta) { this->sampler = sampler; theta_cdf = new double[M + 1]; @@ -409,7 +421,7 @@ void SingleQModel::startSimulation(simul* sampler, double* theta) { nqpro->startSimulation(); } -bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) { +bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) { int dir, pos, readLen, fragLen; std::string name; std::string qual, readseq; @@ -464,67 +476,67 @@ void SingleQModel::finishSimulation() { } void SingleQModel::calcMW() { - double probF, probR; - - assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - probF = ori->getProb(0); - probR = ori->getProb(1); - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - int end = std::min(fullLen, totLen - seedLen + 1); - double factor; - - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - //forward - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - //reverse - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + double probF, probR; + + assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + probF = ori->getProb(0); + probR = ori->getProb(1); + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + int end = std::min(fullLen, totLen - seedLen + 1); + double factor; + + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + //forward + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + //reverse + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + //for reverse strand masking + for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { + minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + mw[i] = 1.0 - value; + + if (mw[i] < 1e-8) { + // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } } - } - - //for reverse strand masking - for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { - minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - } - - mw[i] = 1.0 - value; - - if (mw[i] < 1e-8) { - // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } } #endif /* SINGLEQMODEL_H_ */