rsem v1.1.13, speed up EM by only updating model parameters for first 10 iterations...

author Bo Li <bli@cs.wisc.edu>

Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)

committer Bo Li <bli@cs.wisc.edu>

Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)
author Bo Li <bli@cs.wisc.edu>
Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)
committer Bo Li <bli@cs.wisc.edu>
Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)
diff --git a/EM.cpp b/EM.cpp

index 5f9b8c05a3d11f785eb31de57466e5dd3c4d5e4c..322a3063ff6596c9b1cce15d037065c8c2c1af85 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -394,9 +394,11 @@ void release(ReadReader<ReadType> **readers, HitContainer<HitType> **hitvs, doub
         delete[] mhps;
  }
  
+int tmp_n;
+
  inline bool doesUpdateModel(int ROUND) {
-       //return false; // never update, for debugging only
-       return ROUND <= 20 || ROUND % 100 == 0;
+  //  return ROUND <= 20 || ROUND % 100 == 0;
+  return ROUND <= 10;
  }
  
  //Including initialize, algorithm and results saving
diff --git a/LenDist.h b/LenDist.h

index f19babaa1bf79079f22c2fae4e3ce86926f6ea63..90d47d74522cbc90088ff8267b387046a533a0e3 100644 (file)
--- a/LenDist.h
+++ b/LenDist.h
@@ -3,6 +3,7 @@
  
  #include<cstdio>
  #include<cstring>
+#include<cstdlib>
  #include<cassert>
  #include<algorithm>
  
@@ -189,6 +190,8 @@ void LenDist::finish() {
                 sum += pdf[i];
         }
  
+       if (sum <= EPSILON) { fprintf(stderr, "No valid read to estimate the length distribution!\n"); exit(-1); }
+
         for (int i = 1; i <= span; i++) {
                 pdf[i] = pdf[i] / sum;
                 cdf[i] = cdf[i - 1] + pdf[i];
diff --git a/NoiseProfile.h b/NoiseProfile.h

index c8d21073b3c21e7ded8b50b04f01da458f5b8ab6..5662e0b216b473f3f3cea0440457c5b97d2e4fd2 100644 (file)
--- a/NoiseProfile.h
+++ b/NoiseProfile.h
@@ -81,7 +81,7 @@ void NoiseProfile::finish() {
         logp = 0.0;
         sum = 0.0;
         for (int i = 0; i < NCODES; i++) sum += (p[i] + c[i]);
-       if (sum <= 0.0) return;
+       if (sum <= EPSILON) return;
         for (int i = 0; i < NCODES; i++) {
                 p[i] = (p[i] + c[i]) / sum;
                 if (c[i] > 0.0) { logp += c[i] * log(p[i]); }
diff --git a/PairedEndModel.h b/PairedEndModel.h

index c9f7a81668f36ece34f2f672c4da1a803fca9be2..c011ceea16d53757384e320ae7196251eea7b956 100644 (file)
--- a/PairedEndModel.h
+++ b/PairedEndModel.h
@@ -236,14 +236,19 @@ void PairedEndModel::estimateFromReads(const char* readFN) {
                 while (reader.next(read)) {
                         SingleRead mate1 = read.getMate1();
                         SingleRead mate2 = read.getMate2();
-
-                       mld->update(mate1.getReadLength(), 1.0);
-                       mld->update(mate2.getReadLength(), 1.0);
-
-                       if (i == 0) {
-                               npro->updateC(mate1.getReadSeq());
-                               npro->updateC(mate2.getReadSeq());
-                       }
+                       
+                       if (!read.isLowQuality()) {
+                         mld->update(mate1.getReadLength(), 1.0);
+                         mld->update(mate2.getReadLength(), 1.0);
+                         
+                         if (i == 0) {
+                           npro->updateC(mate1.getReadSeq());
+                           npro->updateC(mate2.getReadSeq());
+                         }
+                       }
+                       else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { 
+                         printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
+                       }
  
                         ++cnt;
                         if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
diff --git a/PairedEndQModel.h b/PairedEndQModel.h

index 64e486b66b98d19fafc98bce67ef0a5bd85131ad..cbc5475588d19996154d892cc99af336b75ddbde 100644 (file)
--- a/PairedEndQModel.h
+++ b/PairedEndQModel.h
@@ -243,16 +243,21 @@ void PairedEndQModel::estimateFromReads(const char* readFN) {
                         SingleReadQ mate1 = read.getMate1();
                         SingleReadQ mate2 = read.getMate2();
  
-                       mld->update(mate1.getReadLength(), 1.0);
-                       mld->update(mate2.getReadLength(), 1.0);
-
-                       qd->update(mate1.getQScore());
-                       qd->update(mate2.getQScore());
-
-                       if (i == 0) {
-                               nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
-                               nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
-                       }
+                       if (!read.isLowQuality()) {
+                         mld->update(mate1.getReadLength(), 1.0);
+                         mld->update(mate2.getReadLength(), 1.0);
+
+                         qd->update(mate1.getQScore());
+                         qd->update(mate2.getQScore());
+                         
+                         if (i == 0) {
+                           nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
+                           nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+                         }
+                       }
+                       else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { 
+                         printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
+                       }
  
                         ++cnt;
                         if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
diff --git a/PairedEndRead.h b/PairedEndRead.h

index 9c81f9e780d8375bda56babd549aea6069d7b84c..6504e610141e9db33d027bdb47cff33fb2af95a9 100644 (file)
--- a/PairedEndRead.h
+++ b/PairedEndRead.h
@@ -61,6 +61,7 @@ void PairedEndRead::write(int argc, std::ostream *argv[]) {
  
  void PairedEndRead::calc_lq() {
         low_quality = mate1.isLowQuality() && mate2.isLowQuality();
+       if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
  }
  
  #endif
diff --git a/PairedEndReadQ.h b/PairedEndReadQ.h

index f04afb7089e66a853aa798afbc5415b678ee8ca5..36b703b059ed4bd4fad7a3eb873c216f49f22e19 100644 (file)
--- a/PairedEndReadQ.h
+++ b/PairedEndReadQ.h
@@ -61,6 +61,7 @@ void PairedEndReadQ::write(int argc, std::ostream* argv[]) {
  
  void PairedEndReadQ::calc_lq() {
         low_quality = mate1.isLowQuality() && mate2.isLowQuality();
+       if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
  }
  
  #endif /* PAIREDENDREADQ_H_ */
diff --git a/SingleModel.h b/SingleModel.h

index bdf920ff271c133c749214244604fa319eac80c4..97ffd29c8ce9b7d499d99979bf0d51546cb17ab7 100644 (file)
--- a/SingleModel.h
+++ b/SingleModel.h
@@ -273,8 +273,11 @@ void SingleModel::estimateFromReads(const char* readFN) {
  
                         int cnt = 0;
                         while (reader.next(read)) {
-                               mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
-                               if (i == 0) { npro->updateC(read.getReadSeq()); }
+                               if (!read.isLowQuality()) {
+                                 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+                                 if (i == 0) { npro->updateC(read.getReadSeq()); }
+                               }
+                               else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
  
                                 ++cnt;
                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -441,7 +444,7 @@ void SingleModel::finishSimulation() {
  
  void SingleModel::calcMW() {
    double probF, probR;
-  
+
    assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
    
    memset(mw, 0, sizeof(double) * (M + 1));
diff --git a/SingleQModel.h b/SingleQModel.h

index f32b747ad298d2bcc94f109083025a3c4277dac0..5d4191a809d39c454e1bfea7805b12f5fde990c0 100644 (file)
--- a/SingleQModel.h
+++ b/SingleQModel.h
@@ -283,9 +283,12 @@ void SingleQModel::estimateFromReads(const char* readFN) {
  
                         int cnt = 0;
                         while (reader.next(read)) {
-                               mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
-                               qd->update(read.getQScore());
-                               if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+                               if (!read.isLowQuality()) {
+                                 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+                                 qd->update(read.getQScore());
+                                 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+                               }
+                               else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
  
                                 ++cnt;
                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -339,7 +342,7 @@ void SingleQModel::read(const char* inpF) {
  
         ori->read(fi);
         gld->read(fi);
-       fscanf(fi, "%d", &val);
+       assert(fscanf(fi, "%d", &val) == 1);
         if (val > 0) {
                 if (mld == NULL) mld = new LenDist();
                 mld->read(fi);
diff --git a/SingleRead.h b/SingleRead.h

index 2531176b6cb470941537dce24fb58e26eb0967d8..3c61242202ab3f4bfaad173787767417b04858f3 100644 (file)
--- a/SingleRead.h
+++ b/SingleRead.h
@@ -61,6 +61,8 @@ void SingleRead::calc_lq() {
    int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
    int threshold_1, threshold_2;
  
+  if (len < OLEN) { low_quality = true; return; }
+
    threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
    threshold_2 = (OLEN - 1) / 2 + 1;
    for (int i = 0; i < len; i++) {
diff --git a/SingleReadQ.h b/SingleReadQ.h

index a0099d9c41054b68da45880f94a8857e8f232f47..976637fc269d79fa484cb0033421e7f7a0769c49 100644 (file)
--- a/SingleReadQ.h
+++ b/SingleReadQ.h
@@ -67,6 +67,8 @@ void SingleReadQ::calc_lq() {
    int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
    int threshold_1, threshold_2;
  
+  if (len < OLEN) { low_quality = true; return; }
+
    threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
    threshold_2 = (OLEN - 1) / 2 + 1;
    for (int i = 0; i < len; i++) {
diff --git a/buildReadIndex.cpp b/buildReadIndex.cpp

index 1986a4a4ae8837cddc67244cc76363e0973eed70..c9e1deb729c1b948a48a7a6c4e10b48806dfb04e 100644 (file)
--- a/buildReadIndex.cpp
+++ b/buildReadIndex.cpp
@@ -53,7 +53,7 @@ void buildIndex(char* readF, int gap, bool hasQ) {
                 }
                 ++nReads;
  
-               if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", nReads); }
+               if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", (long long)nReads); }
         } while (success);
  
         fout.seekp(startPos);
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 9b4e69ea0d47ce19ee292b9e4b6f073e6891c1b7..249ca8baa06d6f7067ffd9f7548c13fb99a7ad51 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -110,6 +110,7 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v
  pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
+pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25);
  
  if ($strand_specific) { $probF = 1.0; }
  
@@ -484,7 +485,7 @@ Calculate 95% credibility intervals and posterior mean estimates.  (Default: off
  
  =item B<--seed-length> <int>
  
-Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. (Default: 25)
+Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25)
  
  =item B<--tag> <string>
  
@@ -694,4 +695,3 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
  
  =cut
  
-#  LocalWords:  usr
author	Bo Li <bli@cs.wisc.edu>
	Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Sun, 6 Nov 2011 00:02:05 +0000 (19:02 -0500)
EM.cpp		patch \| blob \| history
LenDist.h		patch \| blob \| history
NoiseProfile.h		patch \| blob \| history
PairedEndModel.h		patch \| blob \| history
PairedEndQModel.h		patch \| blob \| history
PairedEndRead.h		patch \| blob \| history
PairedEndReadQ.h		patch \| blob \| history
SingleModel.h		patch \| blob \| history
SingleQModel.h		patch \| blob \| history
SingleRead.h		patch \| blob \| history
SingleReadQ.h		patch \| blob \| history
buildReadIndex.cpp		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history