From 86e650e9577999a7ba00ab454d1f6bf674b0ea70 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Sat, 5 Nov 2011 19:02:05 -0500 Subject: [PATCH] rsem v1.1.13, speed up EM by only updating model parameters for first 10 iterations, skip reads with its (or at least one of its mates', for paired-end reads) length < 25bp --- EM.cpp | 6 ++++-- LenDist.h | 3 +++ NoiseProfile.h | 2 +- PairedEndModel.h | 21 +++++++++++++-------- PairedEndQModel.h | 25 +++++++++++++++---------- PairedEndRead.h | 1 + PairedEndReadQ.h | 1 + SingleModel.h | 9 ++++++--- SingleQModel.h | 11 +++++++---- SingleRead.h | 2 ++ SingleReadQ.h | 2 ++ buildReadIndex.cpp | 2 +- rsem-calculate-expression | 4 ++-- 13 files changed, 58 insertions(+), 31 deletions(-) diff --git a/EM.cpp b/EM.cpp index 5f9b8c0..322a306 100644 --- a/EM.cpp +++ b/EM.cpp @@ -394,9 +394,11 @@ void release(ReadReader **readers, HitContainer **hitvs, doub delete[] mhps; } +int tmp_n; + inline bool doesUpdateModel(int ROUND) { - //return false; // never update, for debugging only - return ROUND <= 20 || ROUND % 100 == 0; + // return ROUND <= 20 || ROUND % 100 == 0; + return ROUND <= 10; } //Including initialize, algorithm and results saving diff --git a/LenDist.h b/LenDist.h index f19baba..90d47d7 100644 --- a/LenDist.h +++ b/LenDist.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -189,6 +190,8 @@ void LenDist::finish() { sum += pdf[i]; } + if (sum <= EPSILON) { fprintf(stderr, "No valid read to estimate the length distribution!\n"); exit(-1); } + for (int i = 1; i <= span; i++) { pdf[i] = pdf[i] / sum; cdf[i] = cdf[i - 1] + pdf[i]; diff --git a/NoiseProfile.h b/NoiseProfile.h index c8d2107..5662e0b 100644 --- a/NoiseProfile.h +++ b/NoiseProfile.h @@ -81,7 +81,7 @@ void NoiseProfile::finish() { logp = 0.0; sum = 0.0; for (int i = 0; i < NCODES; i++) sum += (p[i] + c[i]); - if (sum <= 0.0) return; + if (sum <= EPSILON) return; for (int i = 0; i < NCODES; i++) { p[i] = (p[i] + c[i]) / sum; if (c[i] > 0.0) { logp += c[i] * log(p[i]); } diff --git a/PairedEndModel.h b/PairedEndModel.h index c9f7a81..c011cee 100644 --- a/PairedEndModel.h +++ b/PairedEndModel.h @@ -236,14 +236,19 @@ void PairedEndModel::estimateFromReads(const char* readFN) { while (reader.next(read)) { SingleRead mate1 = read.getMate1(); SingleRead mate2 = read.getMate2(); - - mld->update(mate1.getReadLength(), 1.0); - mld->update(mate2.getReadLength(), 1.0); - - if (i == 0) { - npro->updateC(mate1.getReadSeq()); - npro->updateC(mate2.getReadSeq()); - } + + if (!read.isLowQuality()) { + mld->update(mate1.getReadLength(), 1.0); + mld->update(mate2.getReadLength(), 1.0); + + if (i == 0) { + npro->updateC(mate1.getReadSeq()); + npro->updateC(mate2.getReadSeq()); + } + } + else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { + printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN); + } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } diff --git a/PairedEndQModel.h b/PairedEndQModel.h index 64e486b..cbc5475 100644 --- a/PairedEndQModel.h +++ b/PairedEndQModel.h @@ -243,16 +243,21 @@ void PairedEndQModel::estimateFromReads(const char* readFN) { SingleReadQ mate1 = read.getMate1(); SingleReadQ mate2 = read.getMate2(); - mld->update(mate1.getReadLength(), 1.0); - mld->update(mate2.getReadLength(), 1.0); - - qd->update(mate1.getQScore()); - qd->update(mate2.getQScore()); - - if (i == 0) { - nqpro->updateC(mate1.getReadSeq(), mate1.getQScore()); - nqpro->updateC(mate2.getReadSeq(), mate2.getQScore()); - } + if (!read.isLowQuality()) { + mld->update(mate1.getReadLength(), 1.0); + mld->update(mate2.getReadLength(), 1.0); + + qd->update(mate1.getQScore()); + qd->update(mate2.getQScore()); + + if (i == 0) { + nqpro->updateC(mate1.getReadSeq(), mate1.getQScore()); + nqpro->updateC(mate2.getReadSeq(), mate2.getQScore()); + } + } + else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { + printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN); + } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } diff --git a/PairedEndRead.h b/PairedEndRead.h index 9c81f9e..6504e61 100644 --- a/PairedEndRead.h +++ b/PairedEndRead.h @@ -61,6 +61,7 @@ void PairedEndRead::write(int argc, std::ostream *argv[]) { void PairedEndRead::calc_lq() { low_quality = mate1.isLowQuality() && mate2.isLowQuality(); + if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true; } #endif diff --git a/PairedEndReadQ.h b/PairedEndReadQ.h index f04afb7..36b703b 100644 --- a/PairedEndReadQ.h +++ b/PairedEndReadQ.h @@ -61,6 +61,7 @@ void PairedEndReadQ::write(int argc, std::ostream* argv[]) { void PairedEndReadQ::calc_lq() { low_quality = mate1.isLowQuality() && mate2.isLowQuality(); + if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true; } #endif /* PAIREDENDREADQ_H_ */ diff --git a/SingleModel.h b/SingleModel.h index bdf920f..97ffd29 100644 --- a/SingleModel.h +++ b/SingleModel.h @@ -273,8 +273,11 @@ void SingleModel::estimateFromReads(const char* readFN) { int cnt = 0; while (reader.next(read)) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - if (i == 0) { npro->updateC(read.getReadSeq()); } + if (!read.isLowQuality()) { + mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + if (i == 0) { npro->updateC(read.getReadSeq()); } + } + else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -441,7 +444,7 @@ void SingleModel::finishSimulation() { void SingleModel::calcMW() { double probF, probR; - + assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); memset(mw, 0, sizeof(double) * (M + 1)); diff --git a/SingleQModel.h b/SingleQModel.h index f32b747..5d4191a 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -283,9 +283,12 @@ void SingleQModel::estimateFromReads(const char* readFN) { int cnt = 0; while (reader.next(read)) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - qd->update(read.getQScore()); - if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + if (!read.isLowQuality()) { + mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + qd->update(read.getQScore()); + if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + } + else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -339,7 +342,7 @@ void SingleQModel::read(const char* inpF) { ori->read(fi); gld->read(fi); - fscanf(fi, "%d", &val); + assert(fscanf(fi, "%d", &val) == 1); if (val > 0) { if (mld == NULL) mld = new LenDist(); mld->read(fi); diff --git a/SingleRead.h b/SingleRead.h index 2531176..3c61242 100644 --- a/SingleRead.h +++ b/SingleRead.h @@ -61,6 +61,8 @@ void SingleRead::calc_lq() { int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region int threshold_1, threshold_2; + if (len < OLEN) { low_quality = true; return; } + threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5); threshold_2 = (OLEN - 1) / 2 + 1; for (int i = 0; i < len; i++) { diff --git a/SingleReadQ.h b/SingleReadQ.h index a0099d9..976637f 100644 --- a/SingleReadQ.h +++ b/SingleReadQ.h @@ -67,6 +67,8 @@ void SingleReadQ::calc_lq() { int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region int threshold_1, threshold_2; + if (len < OLEN) { low_quality = true; return; } + threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5); threshold_2 = (OLEN - 1) / 2 + 1; for (int i = 0; i < len; i++) { diff --git a/buildReadIndex.cpp b/buildReadIndex.cpp index 1986a4a..c9e1deb 100644 --- a/buildReadIndex.cpp +++ b/buildReadIndex.cpp @@ -53,7 +53,7 @@ void buildIndex(char* readF, int gap, bool hasQ) { } ++nReads; - if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", nReads); } + if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", (long long)nReads); } } while (success); fout.seekp(startPos); diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 9b4e69e..249ca8b 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -110,6 +110,7 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL); pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1); pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1); +pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25); if ($strand_specific) { $probF = 1.0; } @@ -484,7 +485,7 @@ Calculate 95% credibility intervals and posterior mean estimates. (Default: off =item B<--seed-length> -Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. (Default: 25) +Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25) =item B<--tag> @@ -694,4 +695,3 @@ Assume the path to the bowtie executables is in the user's PATH environment vari =cut -# LocalWords: usr -- 2.39.2