X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=PairedEndModel.h;h=0d2c9b3f216db8df91fa47a85c57065123fe2331;hp=c9f7a81668f36ece34f2f672c4da1a803fca9be2;hb=HEAD;hpb=7b8405a9fe481d041d7e50d1d8abb4b589bdc0d3 diff --git a/PairedEndModel.h b/PairedEndModel.h index c9f7a81..0d2c9b3 100644 --- a/PairedEndModel.h +++ b/PairedEndModel.h @@ -8,8 +8,11 @@ #include #include #include +#include +#include #include "utils.h" +#include "my_assert.h" #include "Orientation.h" #include "LenDist.h" #include "RSPD.h" @@ -99,7 +102,15 @@ public: int fpos = (dir == 0 ? pos : totLen - pos - insertLen); // the aligned position reported in SAM file, should be a coordinate in forward strand int effL = std::min(fullLen, totLen - insertLen + 1); - assert(fpos >= 0 && fpos + insertLen <= totLen && insertLen <= totLen); + general_assert(fpos >= 0, "The alignment of fragment " + read.getName() + " to transcript " + itos(sid) + " starts at " + itos(fpos) + \ + " from the forward direction, which should be a non-negative number! " + \ + "It is possible that the aligner you use gave different read lengths for a same read in SAM file."); + general_assert(fpos + insertLen <= totLen,"Fragment " + read.getName() + " is hung over the end of transcript " + itos(sid) + "! " \ + + "It is possible that the aligner you use gave different read lengths for a same read in SAM file."); + general_assert(insertLen <= totLen, "Fragment " + read.getName() + " has length " + itos(insertLen) + ", but it is aligned to transcript " \ + + itos(sid) + ", whose length (" + itos(totLen) + ") is shorter than the fragment's length!"); + + if (fpos >= fullLen || ref.getMask(fpos)) return 0.0; // For paired-end model, fpos is the seedPos prob = ori->getProb(dir) * gld->getAdjustedProb(insertLen, totLen) * @@ -183,12 +194,12 @@ public: const LenDist& getGLD() { return *gld; } - void startSimulation(simul*, double*); - bool simulate(int, PairedEndRead&, int&); + void startSimulation(simul*, const std::vector&); + bool simulate(READ_INT_TYPE, PairedEndRead&, int&); void finishSimulation(); //Use it after function 'read' or 'estimateFromReads' - double* getMW() { + const double* getMW() { assert(mw != NULL); return mw; } @@ -200,7 +211,7 @@ private: static const int read_type = 2; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; int seedLen; @@ -230,26 +241,31 @@ void PairedEndModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { SingleRead mate1 = read.getMate1(); SingleRead mate2 = read.getMate2(); - - mld->update(mate1.getReadLength(), 1.0); - mld->update(mate2.getReadLength(), 1.0); - - if (i == 0) { - npro->updateC(mate1.getReadSeq()); - npro->updateC(mate2.getReadSeq()); + + if (!read.isLowQuality()) { + mld->update(mate1.getReadLength(), 1.0); + mld->update(mate2.getReadLength(), 1.0); + + if (i == 0) { + npro->updateC(mate1.getReadSeq()); + npro->updateC(mate2.getReadSeq()); + } + } + else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) { + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl; } ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; } } mld->finish(); @@ -334,7 +350,7 @@ void PairedEndModel::write(const char* outF) { fclose(fo); } -void PairedEndModel::startSimulation(simul* sampler, double* theta) { +void PairedEndModel::startSimulation(simul* sampler, const std::vector& theta) { this->sampler = sampler; theta_cdf = new double[M + 1]; @@ -348,7 +364,7 @@ void PairedEndModel::startSimulation(simul* sampler, double* theta) { npro->startSimulation(); } -bool PairedEndModel::simulate(int rid, PairedEndRead& read, int& sid) { +bool PairedEndModel::simulate(READ_INT_TYPE rid, PairedEndRead& read, int& sid) { int dir, pos; int insertL, mateL1, mateL2; std::string name; @@ -402,39 +418,39 @@ void PairedEndModel::finishSimulation() { } void PairedEndModel::calcMW() { - assert(seedLen >= OLEN && mld->getMinL() >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - int end = std::min(fullLen, totLen - gld->getMinL() + 1); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - - //seedPos is fpos here - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); + assert(mld->getMinL() >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + int end = std::min(fullLen, totLen - gld->getMinL() + 1); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + + //seedPos is fpos here + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); + } + } + + mw[i] = 1.0 - value; + + if (mw[i] < 1e-8) { + //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } } - } - - mw[i] = 1.0 - value; - - if (mw[i] < 1e-8) { - //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } } #endif /* PAIREDENDMODEL_H_ */