Added user-friendly error messages if users forget to compile the source codes

[rsem.git] / EM.cpp
diff --git a/EM.cpp b/EM.cpp

index 8362f7d47dea60b762801d5877352d196bcf35f7..59783f3089b5a96c7ea67db0113f9b558818d861 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -7,9 +7,12 @@
  #include<string>
  #include<vector>
  #include<algorithm>
+#include<fstream>
+#include<iostream>
  #include<pthread.h>
  
  #include "utils.h"
+#include "my_assert.h"
  #include "sampling.h"
  
  #include "Read.h"
@@ -54,7 +57,7 @@ struct Params {
  
  int read_type;
  int m, M; // m genes, M isoforms
-int N0, N1, N2, N_tot;
+READ_INT_TYPE N0, N1, N2, N_tot;
  int nThreads;
  
  
@@ -66,7 +69,7 @@ bool genGibbsOut; // generate file for Gibbs sampler
  char refName[STRLEN], outName[STRLEN];
  char imdName[STRLEN], statName[STRLEN];
  char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
-char mparamsF[STRLEN], bmparamsF[STRLEN];
+char mparamsF[STRLEN];
  char modelF[STRLEN], thetaF[STRLEN];
  
  char inpSamType;
@@ -87,8 +90,12 @@ ModelParams mparams;
  
  template<class ReadType, class HitType, class ModelType>
  void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, double **&ncpvs, ModelType **&mhps) {
-       int nReads, nHits, rt;
-       int nrLeft, nhT, curnr; // nrLeft : number of reads left, nhT : hit threshold per thread, curnr: current number of reads
+       READ_INT_TYPE nReads;
+       HIT_INT_TYPE nHits;
+       int rt; // read type
+
+       READ_INT_TYPE nrLeft, curnr; // nrLeft : number of reads left, curnr: current number of reads
+       HIT_INT_TYPE nhT; // nhT : hit threshold per thread
         char datF[STRLEN];
  
         int s;
@@ -102,7 +109,7 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
                 indices[i] = new ReadIndex(readFs[i]);
         }
         for (int i = 0; i < nThreads; i++) {
-               readers[i] = new ReadReader<ReadType>(s, readFs);
+               readers[i] = new ReadReader<ReadType>(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function
                 readers[i]->setIndices(indices);
         }
  
@@ -113,12 +120,11 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
  
         sprintf(datF, "%s.dat", imdName);
         fin.open(datF);
-       if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", datF); exit(-1); }
+       general_assert(fin.is_open(), "Cannot open " + cstrtos(datF) + "! It may not exist.");
         fin>>nReads>>nHits>>rt;
-       if (nReads != N1) { fprintf(stderr, "Number of alignable reads does not match!\n"); exit(-1); }
-       //assert(nReads == N1);
-       if (rt != read_type) { fprintf(stderr, "Data file (.dat) does not have the right read type!\n"); exit(-1); }
-       //assert(rt == read_type);
+       general_assert(nReads == N1, "Number of alignable reads does not match!");
+       general_assert(rt == read_type, "Data file (.dat) does not have the right read type!");
+
  
         //A just so so strategy for paralleling
         nhT = nHits / nThreads;
@@ -127,21 +133,21 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
  
         ncpvs = new double*[nThreads];
         for (int i = 0; i < nThreads; i++) {
-               int ntLeft = nThreads - i - 1; // # of threads left
-               if (!readers[i]->locate(curnr)) { fprintf(stderr, "Read indices files do not match!\n"); exit(-1); }
-               //assert(readers[i]->locate(curnr));
+               HIT_INT_TYPE ntLeft = nThreads - i - 1; // # of threads left
+
+               general_assert(readers[i]->locate(curnr), "Read indices files do not match!");
  
                 while (nrLeft > ntLeft && (i == nThreads - 1 || hitvs[i]->getNHits() < nhT)) {
-                       if (!hitvs[i]->read(fin)) { fprintf(stderr, "Cannot read alignments from .dat file!\n"); exit(-1); }
-                       //assert(hitvs[i]->read(fin));
+                       general_assert(hitvs[i]->read(fin), "Cannot read alignments from .dat file!");
+
                         --nrLeft;
-                       if (verbose && nrLeft % 1000000 == 0) { printf("DAT %d reads left!\n", nrLeft); }
+                       if (verbose && nrLeft % 1000000 == 0) { cout<< "DAT "<< nrLeft << " reads left"<< endl; }
                 }
                 ncpvs[i] = new double[hitvs[i]->getN()];
                 memset(ncpvs[i], 0, sizeof(double) * hitvs[i]->getN());
                 curnr += hitvs[i]->getN();
  
-               if (verbose) { printf("Thread %d : N = %d, NHit = %d\n", i, hitvs[i]->getN(), hitvs[i]->getNHits()); }
+               if (verbose) { cout<<"Thread "<< i<< " : N = "<< hitvs[i]->getN()<< ", NHit = "<< hitvs[i]->getNHits()<< endl; }
         }
  
         fin.close();
@@ -175,23 +181,20 @@ void* E_STEP(void* arg) {
  
         ReadType read;
  
-       int N = hitv->getN();
+       READ_INT_TYPE N = hitv->getN();
         double sum;
         vector<double> fracs; //to remove this, do calculation twice
-       int fr, to, id;
+       HIT_INT_TYPE fr, to, id;
  
         if (needCalcConPrb || updateModel) { reader->reset(); }
         if (updateModel) { mhp->init(); }
  
         memset(countv, 0, sizeof(double) * (M + 1));
-       for (int i = 0; i < N; i++) {
+       for (READ_INT_TYPE i = 0; i < N; i++) {
                 if (needCalcConPrb || updateModel) {
-                       if (!reader->next(read)) {
-                               fprintf(stderr, "Can not load a read!\n");
-                               exit(-1);
-                       }
-                       //assert(reader->next(read));
+                       general_assert(reader->next(read), "Can not load a read!");
                 }
+
                 fr = hitv->getSAt(i);
                 to = hitv->getSAt(i + 1);
                 fracs.resize(to - fr + 1);
@@ -202,7 +205,7 @@ void* E_STEP(void* arg) {
                 fracs[0] = probv[0] * ncpv[i];
                 if (fracs[0] < EPSILON) fracs[0] = 0.0;
                 sum += fracs[0];
-               for (int j = fr; j < to; j++) {
+               for (HIT_INT_TYPE j = fr; j < to; j++) {
                         HitType &hit = hitv->getHitAt(j);
                         if (needCalcConPrb) { hit.setConPrb(model->getConPrb(read, hit)); }
                         id = j - fr + 1;
@@ -216,7 +219,7 @@ void* E_STEP(void* arg) {
                         countv[0] += fracs[0];
                         if (updateModel) { mhp->updateNoise(read, fracs[0]); }
                         if (calcExpectedWeights) { ncpv[i] = fracs[0]; }
-                       for (int j = fr; j < to; j++) {
+                       for (HIT_INT_TYPE j = fr; j < to; j++) {
                                 HitType &hit = hitv->getHitAt(j);
                                 id = j - fr + 1;
                                 fracs[id] /= sum;
@@ -227,7 +230,7 @@ void* E_STEP(void* arg) {
                 }
                 else if (calcExpectedWeights) {
                         ncpv[i] = 0.0;
-                       for (int j = fr; j < to; j++) {
+                       for (HIT_INT_TYPE j = fr; j < to; j++) {
                                 HitType &hit = hitv->getHitAt(j);
                                 hit.setConPrb(0.0);
                         }
@@ -246,22 +249,20 @@ void* calcConProbs(void* arg) {
         double *ncpv = (double*)(params->ncpv);
  
         ReadType read;
-       int N = hitv->getN();
-       int fr, to;
+       READ_INT_TYPE N = hitv->getN();
+       HIT_INT_TYPE fr, to;
  
         assert(model->getNeedCalcConPrb());
         reader->reset();
  
-       for (int i = 0; i < N; i++) {
-               if (!reader->next(read)) {
-                       fprintf(stderr, "Can not load a read!\n");
-                       exit(-1);
-               }
+       for (READ_INT_TYPE i = 0; i < N; i++) {
+               general_assert(reader->next(read), "Can not load a read!");
+
                 fr = hitv->getSAt(i);
                 to = hitv->getSAt(i + 1);
  
                 ncpv[i] = model->getNoiseConPrb(read);
-               for (int j = fr; j < to; j++) {
+               for (HIT_INT_TYPE j = fr; j < to; j++) {
                         HitType &hit = hitv->getHitAt(j);
                         hit.setConPrb(model->getConPrb(read, hit));
                 }
@@ -272,59 +273,130 @@ void* calcConProbs(void* arg) {
  
  template<class ModelType>
  void calcExpectedEffectiveLengths(ModelType& model) {
-  int lb, ub, span;
-  double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
+       int lb, ub, span;
+       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
    
-  model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-  clen = new double[span + 1];
-  clen[0] = 0.0;
-  for (int i = 1; i <= span; i++) {
-    clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-  }
-
-  eel.clear();
-  eel.resize(M + 1, 0.0);
-  for (int i = 1; i <= M; i++) {
-    int totLen = refs.getRef(i).getTotLen();
-    int fullLen = refs.getRef(i).getFullLen();
-    int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-    int pos2 = max(min(totLen, ub) - lb, 0);
-
-    if (pos2 == 0) { eel[i] = 0.0; continue; }
+       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
+       clen = new double[span + 1];
+       clen[0] = 0.0;
+       for (int i = 1; i <= span; i++) {
+               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
+       }
+
+       eel.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++) {
+               int totLen = refs.getRef(i).getTotLen();
+               int fullLen = refs.getRef(i).getFullLen();
+               int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
+               int pos2 = max(min(totLen, ub) - lb, 0);
+
+               if (pos2 == 0) { eel[i] = 0.0; continue; }
      
-    eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-    assert(eel[i] >= 0);
-    if (eel[i] < MINEEL) { eel[i] = 0.0; }
-  }
+               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
+               assert(eel[i] >= 0);
+               if (eel[i] < MINEEL) { eel[i] = 0.0; }
+       }
    
-  delete[] pdf;
-  delete[] cdf;
-  delete[] clen;
+       delete[] pdf;
+       delete[] cdf;
+       delete[] clen;
+}
+
+void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
+       double sum = 0.0;
+
+       /* The reason that for noise gene, mw value is 1 is :
+        * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
+        * So the theta0 does not containing reads from any masked position
+        */
+
+       for (int i = 0; i <= M; i++) {
+               // i == 0, mw[i] == 1
+               if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
+                       theta[i] = 0.0;
+                       continue;
+               }
+               theta[i] = theta[i] / mw[i];
+               sum += theta[i];
+       }
+       // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
+       general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
+       for (int i = 0; i <= M; i++) theta[i] /= sum;
+}
+
+void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
+       double denom;
+       vector<double> frac;
+
+       //calculate fraction of count over all mappabile reads
+       denom = 0.0;
+       frac.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++) 
+         if (eel[i] >= EPSILON) {
+           frac[i] = theta[i];
+           denom += frac[i];
+         }
+       general_assert(denom >= EPSILON, "No alignable reads?!");
+       for (int i = 1; i <= M; i++) frac[i] /= denom;
+  
+       //calculate FPKM
+       fpkm.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++)
+               if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
+
+       //calculate TPM
+       tpm.assign(M + 1, 0.0);
+       denom = 0.0;
+       for (int i = 1; i <= M; i++) denom += fpkm[i];
+       for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;  
  }
  
  template<class ModelType>
  void writeResults(ModelType& model, double* counts) {
-       double denom;
         char outF[STRLEN];
         FILE *fo;
  
         sprintf(modelF, "%s.model", statName);
         model.write(modelF);
  
-       //calculate tau values
-       double *tau = new double[M + 1];
-       memset(tau, 0, sizeof(double) * (M + 1));
+       vector<int> tlens;
+       vector<double> fpkm, tpm, isopct;
+       vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
  
-       denom = 0.0;
-       for (int i = 1; i <= M; i++) 
-         if (eel[i] >= EPSILON) {
-           tau[i] = theta[i] / eel[i];
-           denom += tau[i];
-         }   
-       if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
-       //assert(denom > 0);
-       for (int i = 1; i <= M; i++) {
-               tau[i] /= denom;
+       calcExpressionValues(theta, eel, tpm, fpkm);
+
+       //calculate IsoPct, etc.
+       isopct.assign(M + 1, 0.0);
+       tlens.assign(M + 1, 0);
+
+       glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+       for (int i = 0; i < m; i++) {
+               int b = gi.spAt(i), e = gi.spAt(i + 1);
+               for (int j = b; j < e; j++) {
+                       const Transcript& transcript = transcripts.getTranscriptAt(j);
+                       tlens[j] = transcript.getLength();
+
+                       gene_counts[i] += counts[j];
+                       gene_tpm[i] += tpm[j];
+                       gene_fpkm[i] += fpkm[j];
+               }
+
+               if (gene_tpm[i] < EPSILON) {
+                       double frac = 1.0 / (e - b);
+                       for (int j = b; j < e; j++) {
+                               glens[i] += tlens[j] * frac;
+                               gene_eels[i] += eel[j] * frac;
+                       }
+               }
+               else {
+                       for (int j = b; j < e; j++) {
+                               isopct[j] = tpm[j] / gene_tpm[i];
+                               glens[i] += tlens[j] * isopct[j];
+                               gene_eels[i] += eel[j] * isopct[j];
+                       }
+               }
         }
  
         //isoform level results
@@ -334,34 +406,30 @@ void writeResults(ModelType& model, double* counts) {
                 const Transcript& transcript = transcripts.getTranscriptAt(i);
                 fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
         }
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++) {
                 const Transcript& transcript = transcripts.getTranscriptAt(i);
                 fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
         }
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
+       for (int i = 1; i <= M; i++)
+               fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
         fclose(fo);
  
         //gene level results
         sprintf(outF, "%s.gene_res", imdName);
         fo = fopen(outF, "w");
         for (int i = 0; i < m; i++) {
-               const string& gene_id = transcripts.getTranscriptAt(gi.spAt(i)).getGeneID();
-               fprintf(fo, "%s%c", gene_id.c_str(), (i < m - 1 ? '\t' : '\n'));
-       }
-       for (int i = 0; i < m; i++) {
-               double sumC = 0.0; // sum of counts
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) sumC += counts[j];
-               fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
-       }
-       for (int i = 0; i < m; i++) {
-               double sumT = 0.0; // sum of tau values
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) sumT += tau[j];
-               fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
+               const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
+               fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
         }
         for (int i = 0; i < m; i++) {
                 int b = gi.spAt(i), e = gi.spAt(i + 1);
@@ -369,10 +437,18 @@ void writeResults(ModelType& model, double* counts) {
                         fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n')));
                 }
         }
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
         fclose(fo);
  
-       delete[] tau;
-
         if (verbose) { printf("Expression Results are written!\n"); }
  }
  
@@ -421,7 +497,6 @@ void EM() {
         Params fparams[nThreads];
         pthread_t threads[nThreads];
         pthread_attr_t attr;
-       void *status;
         int rc;
  
  
@@ -465,14 +540,12 @@ void EM() {
                 //E step
                 for (int i = 0; i < nThreads; i++) {
                         rc = pthread_create(&threads[i], &attr, E_STEP<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
-                       if (rc != 0) { fprintf(stderr, "Cannot create thread %d at ROUND %d! (numbered from 0)\n", i, ROUND); exit(-1); }
-                       //assert(rc == 0);
+                       pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) at ROUND " + itos(ROUND) + "!");
                 }
  
                 for (int i = 0; i < nThreads; i++) {
-                       rc = pthread_join(threads[i], &status);
-                       if (rc != 0) { fprintf(stderr, "Cannot join thread %d at ROUND %d! (numbered from 0)\n", i, ROUND); exit(-1); }
-                       //assert(rc == 0);
+                       rc = pthread_join(threads[i], NULL);
+                       pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) at ROUND " + itos(ROUND) + "!");
                 }
  
                 model.setNeedCalcConPrb(false);
@@ -507,82 +580,62 @@ void EM() {
                                 if (bChange < change) bChange = change;
                         }
  
-               if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum);
+               if (verbose) { cout<< "ROUND = "<< ROUND<< ", SUM = "<< setprecision(15)<< sum<< ", bChange = " << setprecision(6)<< bChange<< ", totNum = " << totNum<< endl; }
         } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND));
-         //while (ROUND < MAX_ROUND);
+//     } while (ROUND < 1);
  
-       if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND);
+       if (totNum > 0) { cout<< "Warning: RSEM reaches "<< MAX_ROUND<< " iterations before meeting the convergence criteria."<< endl; }
  
         //generate output file used by Gibbs sampler
         if (genGibbsOut) {
                 if (model.getNeedCalcConPrb()) {
                         for (int i = 0; i < nThreads; i++) {
                                 rc = pthread_create(&threads[i], &attr, calcConProbs<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
-                               if (rc != 0) { fprintf(stderr, "Cannot create thread %d when generate files for Gibbs sampler! (numbered from 0)\n", i); exit(-1); }
+                               pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) when generating files for Gibbs sampler!");
                         }
                         for (int i = 0; i < nThreads; i++) {
-                               rc = pthread_join(threads[i], &status);
-                               if (rc != 0) { fprintf(stderr, "Cannot join thread %d when generate files for Gibbs sampler! (numbered from 0)\n", i); exit(-1); }
+                               rc = pthread_join(threads[i], NULL);
+                               pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) when generating files for Gibbs sampler!");
                         }
                 }
                 model.setNeedCalcConPrb(false);
  
                 sprintf(out_for_gibbs_F, "%s.ofg", imdName);
-               fo = fopen(out_for_gibbs_F, "w");
-               fprintf(fo, "%d %d\n", M, N0);
+               ofstream fout(out_for_gibbs_F);
+               fout<< M<< " "<< N0<< endl;
                 for (int i = 0; i < nThreads; i++) {
-                       int numN = hitvs[i]->getN();
-                       for (int j = 0; j < numN; j++) {
-                               int fr = hitvs[i]->getSAt(j);
-                               int to = hitvs[i]->getSAt(j + 1);
-                               int totNum = 0;
-
-                               if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); }
-                               for (int k = fr; k < to; k++) {
+                       READ_INT_TYPE numN = hitvs[i]->getN();
+                       for (READ_INT_TYPE j = 0; j < numN; j++) {
+                               HIT_INT_TYPE fr = hitvs[i]->getSAt(j);
+                               HIT_INT_TYPE to = hitvs[i]->getSAt(j + 1);
+                               HIT_INT_TYPE totNum = 0;
+
+                               if (ncpvs[i][j] >= EPSILON) { ++totNum; fout<< "0 "<< setprecision(15)<< ncpvs[i][j]<< " "; }
+                               for (HIT_INT_TYPE k = fr; k < to; k++) {
                                         HitType &hit = hitvs[i]->getHitAt(k);
                                         if (hit.getConPrb() >= EPSILON) {
                                                 ++totNum;
-                                               fprintf(fo, "%d %.15g ", hit.getSid(), hit.getConPrb());
+                                               fout<< hit.getSid()<< " "<< setprecision(15)<< hit.getConPrb()<< " ";
                                         }
                                 }
  
-                               if (totNum > 0) { fprintf(fo, "\n"); }
+                               if (totNum > 0) { fout<< endl; }
                         }
                 }
-               fclose(fo);
+               fout.close();
         }
  
-       sprintf(thetaF, "%s.theta", statName);
-       fo = fopen(thetaF, "w");
-       fprintf(fo, "%d\n", M + 1);
-
-       // output theta'
-       for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
-       fprintf(fo, "%.15g\n", theta[M]);
-       
-       //calculate expected effective lengths for each isoform
-       calcExpectedEffectiveLengths<ModelType>(model);
-
-       //correct theta vector
-       sum = theta[0];
-       for (int i = 1; i <= M; i++) 
-         if (eel[i] < EPSILON) { theta[i] = 0.0; }
-         else sum += theta[i];
-       if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
-       for (int i = 0; i <= M; i++) theta[i] /= sum;
-
         //calculate expected weights and counts using learned parameters
+       //just use the raw theta learned from the data, do not correct for eel or mw
         updateModel = false; calcExpectedWeights = true;
         for (int i = 0; i <= M; i++) probv[i] = theta[i];
         for (int i = 0; i < nThreads; i++) {
                 rc = pthread_create(&threads[i], &attr, E_STEP<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
-               if (rc != 0) { fprintf(stderr, "Cannot create thread %d when calculate expected weights! (numbered from 0)\n", i); exit(-1); }
-               //assert(rc == 0);
+               pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) when calculating expected weights!");
         }
         for (int i = 0; i < nThreads; i++) {
-               rc = pthread_join(threads[i], &status);
-               if (rc != 0) { fprintf(stderr, "Cannot join thread %d! (numbered from 0) when calculate expected weights!\n", i); exit(-1); }
-               //assert(rc == 0);
+               rc = pthread_join(threads[i], NULL);
+               pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) when calculating expected weights!");
         }
         model.setNeedCalcConPrb(false);
         for (int i = 1; i < nThreads; i++) {
@@ -595,15 +648,18 @@ void EM() {
         /* destroy attribute */
         pthread_attr_destroy(&attr);
  
-       //convert theta' to theta
-               double *mw = model.getMW();
-       sum = 0.0;
-       for (int i = 0; i <= M; i++) {
-         theta[i] = (mw[i] < EPSILON ? 0.0 : theta[i] / mw[i]);
-         sum += theta[i]; 
-       }
-       assert(sum >= EPSILON);
-       for (int i = 0; i <= M; i++) theta[i] /= sum;
+
+       sprintf(thetaF, "%s.theta", statName);
+       fo = fopen(thetaF, "w");
+       fprintf(fo, "%d\n", M + 1);
+
+       // output theta'
+       for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+       fprintf(fo, "%.15g\n", theta[M]);
+       
+       //calculate expected effective lengths for each isoform
+       calcExpectedEffectiveLengths<ModelType>(model);
+       polishTheta(theta, eel, model.getMW());
  
         // output theta
         for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
@@ -617,26 +673,27 @@ void EM() {
                 sprintf(outBamF, "%s.transcript.bam", outName);
                 
                 if (bamSampling) {
-                       int local_N;
-                       int fr, to, len, id;
+                       READ_INT_TYPE local_N;
+                       HIT_INT_TYPE fr, to, len, id;
                         vector<double> arr;
-                       arr.clear();
+                       uniform01 rg(engine_type(time(NULL)));
  
-                       if (verbose) printf("Begin to sample reads from their posteriors.\n");
+                       if (verbose) cout<< "Begin to sample reads from their posteriors."<< endl;
                         for (int i = 0; i < nThreads; i++) {
                                 local_N = hitvs[i]->getN();
-                               for (int j = 0; j < local_N; j++) {
+                               for (READ_INT_TYPE j = 0; j < local_N; j++) {
                                         fr = hitvs[i]->getSAt(j);
                                         to = hitvs[i]->getSAt(j + 1);
                                         len = to - fr + 1;
-                                       arr.resize(len);
+                                       arr.assign(len, 0);
                                         arr[0] = ncpvs[i][j];
-                                       for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb();
-                                       id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1
-                                       for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0);
+                                       for (HIT_INT_TYPE k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb();
+                                       id = (arr[len - 1] < EPSILON ? -1 : sample(rg, arr, len)); // if all entries in arr are 0, let id be -1
+                                       for (HIT_INT_TYPE k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0);
                                 }
                         }
-                       if (verbose) printf("Sampling is finished.\n");
+
+                       if (verbose) cout<< "Sampling is finished."<< endl;
                 }
  
                 BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, transcripts);
@@ -651,8 +708,8 @@ int main(int argc, char* argv[]) {
         ifstream fin;
         bool quiet = false;
  
-       if (argc < 5) {
-               printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
+       if (argc < 6) {
+               printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
                 printf("  refName: reference name\n");
                 printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
                 printf("  sampleName: sample's name, including the path\n");
@@ -671,8 +728,8 @@ int main(int argc, char* argv[]) {
         strcpy(refName, argv[1]);
         read_type = atoi(argv[2]);
         strcpy(outName, argv[3]);
-       sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
-       sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
+       strcpy(imdName, argv[4]);
+       strcpy(statName, argv[5]);
  
         nThreads = 1;
  
@@ -681,7 +738,7 @@ int main(int argc, char* argv[]) {
         genGibbsOut = false;
         pt_fn_list = pt_chr_list = NULL;
  
-       for (int i = 5; i < argc; i++) {
+       for (int i = 6; i < argc; i++) {
                 if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
                 if (!strcmp(argv[i], "-b")) {
                         genBamF = true;
@@ -696,8 +753,8 @@ int main(int argc, char* argv[]) {
                 if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
                 if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
         }
-       if (nThreads <= 0) { fprintf(stderr, "Number of threads should be bigger than 0!\n"); exit(-1); }
-       //assert(nThreads > 0);
+
+       general_assert(nThreads > 0, "Number of threads should be bigger than 0!");
  
         verbose = !quiet;
  
@@ -714,13 +771,15 @@ int main(int argc, char* argv[]) {
  
         sprintf(cntF, "%s.cnt", statName);
         fin.open(cntF);
-       if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); }
+
+       general_assert(fin.is_open(), "Cannot open " + cstrtos(cntF) + "! It may not exist.");
+
         fin>>N0>>N1>>N2>>N_tot;
         fin.close();
  
-       if (N1 <= 0) { fprintf(stderr, "There are no alignable reads!\n"); exit(-1); }
+       general_assert(N1 > 0, "There are no alignable reads!");
  
-       if (nThreads > N1) nThreads = N1;
+       if ((READ_INT_TYPE)nThreads > N1) nThreads = N1;
  
         //set model parameters
         mparams.M = M;
@@ -729,7 +788,9 @@ int main(int argc, char* argv[]) {
  
         sprintf(mparamsF, "%s.mparams", imdName);
         fin.open(mparamsF);
-       if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", mparamsF); exit(-1); }
+
+       general_assert(fin.is_open(), "Cannot open " + cstrtos(mparamsF) + "It may not exist.");
+
         fin>> mparams.minL>> mparams.maxL>> mparams.probF;
         int val; // 0 or 1 , for estRSPD
         fin>>val;
@@ -749,7 +810,7 @@ int main(int argc, char* argv[]) {
  
         time_t b = time(NULL);
  
-       printTimeUsed(a, b);
+       printTimeUsed(a, b, "EM.cpp");
  
         return 0;
  }