updated some description

[rsem.git] / EM.cpp
diff --git a/EM.cpp b/EM.cpp

index 00cb8527afe966771b61408debe86a74ad0294a1..f9449a85cd3226a3f29356f10fa018e4ca908cba 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -10,6 +10,7 @@
  #include<pthread.h>
  
  #include "utils.h"
+#include "sampling.h"
  
  #include "Read.h"
  #include "SingleRead.h"
@@ -58,12 +59,15 @@ int nThreads;
  
  
  bool genBamF; // If user wants to generate bam file, true; otherwise, false.
+bool bamSampling; // true if sampling from read posterior distribution when bam file is generated
  bool updateModel, calcExpectedWeights;
  bool genGibbsOut; // generate file for Gibbs sampler
  
-char refName[STRLEN], imdName[STRLEN], outName[STRLEN];
+char refName[STRLEN], outName[STRLEN];
+char imdName[STRLEN], statName[STRLEN];
  char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
  char mparamsF[STRLEN], bmparamsF[STRLEN];
+char modelF[STRLEN], thetaF[STRLEN];
  
  char inpSamType;
  char *pt_fn_list, *pt_chr_list;
@@ -98,7 +102,7 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
                 indices[i] = new ReadIndex(readFs[i]);
         }
         for (int i = 0; i < nThreads; i++) {
-               readers[i] = new ReadReader<ReadType>(s, readFs);
+               readers[i] = new ReadReader<ReadType>(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function
                 readers[i]->setIndices(indices);
         }
  
@@ -127,7 +131,7 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
                 if (!readers[i]->locate(curnr)) { fprintf(stderr, "Read indices files do not match!\n"); exit(-1); }
                 //assert(readers[i]->locate(curnr));
  
-               while (nrLeft > ntLeft && hitvs[i]->getNHits() < nhT) {
+               while (nrLeft > ntLeft && (i == nThreads - 1 || hitvs[i]->getNHits() < nhT)) {
                         if (!hitvs[i]->read(fin)) { fprintf(stderr, "Cannot read alignments from .dat file!\n"); exit(-1); }
                         //assert(hitvs[i]->read(fin));
                         --nrLeft;
@@ -301,35 +305,19 @@ void calcExpectedEffectiveLengths(ModelType& model) {
  template<class ModelType>
  void writeResults(ModelType& model, double* counts) {
         double denom;
-       char modelF[STRLEN], thetaF[STRLEN];
         char outF[STRLEN];
         FILE *fo;
  
-       sprintf(modelF, "%s.model", outName);
+       sprintf(modelF, "%s.model", statName);
         model.write(modelF);
  
-       sprintf(thetaF, "%s.theta", outName);
-       fo = fopen(thetaF, "w");
-       fprintf(fo, "%d\n", M + 1);
-       for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
-       fprintf(fo, "%.15g\n", theta[M]);
-       fclose(fo);
-
-
-       //calculate normalized read fraction
-       double *nrf = new double[M + 1];
-       memset(nrf, 0, sizeof(double) * (M + 1));
-       denom = 1.0 - theta[0];
-       if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
-       for (int i = 1; i <= M; i++) nrf[i] = theta[i] / denom;
-
         //calculate tau values
         double *tau = new double[M + 1];
         memset(tau, 0, sizeof(double) * (M + 1));
  
         denom = 0.0;
         for (int i = 1; i <= M; i++) 
-         if (eel[i] > EPSILON) {
+         if (eel[i] >= EPSILON) {
             tau[i] = theta[i] / eel[i];
             denom += tau[i];
           }   
@@ -348,13 +336,11 @@ void writeResults(ModelType& model, double* counts) {
         }
         for (int i = 1; i <= M; i++)
                 fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++)
                 fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++) {
                 const Transcript& transcript = transcripts.getTranscriptAt(i);
-               fprintf(fo, "%s%c", transcript.getLeft().c_str(), (i < M ? '\t' : '\n'));
+               fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
         }
         fclose(fo);
  
@@ -371,12 +357,6 @@ void writeResults(ModelType& model, double* counts) {
                 for (int j = b; j < e; j++) sumC += counts[j];
                 fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
         }
-       for (int i = 0; i < m; i++) {
-               double sumN = 0.0; // sum of normalized read fraction
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) sumN += nrf[j];
-               fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n'));
-       }
         for (int i = 0; i < m; i++) {
                 double sumT = 0.0; // sum of tau values
                 int b = gi.spAt(i), e = gi.spAt(i + 1);
@@ -391,7 +371,6 @@ void writeResults(ModelType& model, double* counts) {
         }
         fclose(fo);
  
-       delete[] nrf;
         delete[] tau;
  
         if (verbose) { printf("Expression Results are written!\n"); }
@@ -418,13 +397,15 @@ void release(ReadReader<ReadType> **readers, HitContainer<HitType> **hitvs, doub
  }
  
  inline bool doesUpdateModel(int ROUND) {
-       //return false; // never update, for debugging only
-       return ROUND <= 20 || ROUND % 100 == 0;
+  //  return ROUND <= 20 || ROUND % 100 == 0;
+  return ROUND <= 10;
  }
  
  //Including initialize, algorithm and results saving
  template<class ReadType, class HitType, class ModelType>
  void EM() {
+       FILE *fo;
+
         int ROUND;
         double sum;
  
@@ -443,6 +424,7 @@ void EM() {
         void *status;
         int rc;
  
+
         //initialize boolean variables
         updateModel = calcExpectedWeights = false;
  
@@ -472,7 +454,6 @@ void EM() {
         pthread_attr_init(&attr);
         pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  
-
         ROUND = 0;
         do {
                 ++ROUND;
@@ -527,22 +508,11 @@ void EM() {
                         }
  
                 if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum);
-       } while (ROUND < MIN_ROUND || totNum > 0 && ROUND < MAX_ROUND);
+       } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND));
           //while (ROUND < MAX_ROUND);
  
         if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND);
  
-       //calculate expected effective lengths for each isoform
-       calcExpectedEffectiveLengths<ModelType>(model);
-
-       //correct theta vector
-       sum = theta[0];
-       for (int i = 1; i <= M; i++) 
-         if (eel[i] < EPSILON) { theta[i] = 0.0; }
-         else sum += theta[i];
-       if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
-       for (int i = 0; i <= M; i++) theta[i] /= sum;
-
         //generate output file used by Gibbs sampler
         if (genGibbsOut) {
                 if (model.getNeedCalcConPrb()) {
@@ -558,7 +528,7 @@ void EM() {
                 model.setNeedCalcConPrb(false);
  
                 sprintf(out_for_gibbs_F, "%s.ofg", imdName);
-               FILE *fo = fopen(out_for_gibbs_F, "w");
+               fo = fopen(out_for_gibbs_F, "w");
                 fprintf(fo, "%d %d\n", M, N0);
                 for (int i = 0; i < nThreads; i++) {
                         int numN = hitvs[i]->getN();
@@ -567,7 +537,7 @@ void EM() {
                                 int to = hitvs[i]->getSAt(j + 1);
                                 int totNum = 0;
  
-                               if (ncpvs[i][j] > 0.0) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); }
+                               if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); }
                                 for (int k = fr; k < to; k++) {
                                         HitType &hit = hitvs[i]->getHitAt(k);
                                         if (hit.getConPrb() >= EPSILON) {
@@ -582,8 +552,28 @@ void EM() {
                 fclose(fo);
         }
  
+       sprintf(thetaF, "%s.theta", statName);
+       fo = fopen(thetaF, "w");
+       fprintf(fo, "%d\n", M + 1);
+
+       // output theta'
+       for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+       fprintf(fo, "%.15g\n", theta[M]);
+       
+       //calculate expected effective lengths for each isoform
+       calcExpectedEffectiveLengths<ModelType>(model);
+
+       //correct theta vector
+       sum = theta[0];
+       for (int i = 1; i <= M; i++) 
+         if (eel[i] < EPSILON) { theta[i] = 0.0; }
+         else sum += theta[i];
+       if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
+       for (int i = 0; i <= M; i++) theta[i] /= sum;
+
         //calculate expected weights and counts using learned parameters
         updateModel = false; calcExpectedWeights = true;
+       for (int i = 0; i <= M; i++) probv[i] = theta[i];
         for (int i = 0; i < nThreads; i++) {
                 rc = pthread_create(&threads[i], &attr, E_STEP<ReadType, HitType, ModelType>, (void*)(&fparams[i]));
                 if (rc != 0) { fprintf(stderr, "Cannot create thread %d when calculate expected weights! (numbered from 0)\n", i); exit(-1); }
@@ -605,8 +595,7 @@ void EM() {
         /* destroy attribute */
         pthread_attr_destroy(&attr);
  
-       
-       //for all
+       //convert theta' to theta
                 double *mw = model.getMW();
         sum = 0.0;
         for (int i = 0; i <= M; i++) {
@@ -616,18 +605,43 @@ void EM() {
         assert(sum >= EPSILON);
         for (int i = 0; i <= M; i++) theta[i] /= sum;
  
+       // output theta
+       for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+       fprintf(fo, "%.15g\n", theta[M]);
+
+       fclose(fo);
+
         writeResults<ModelType>(model, countvs[0]);
  
         if (genBamF) {
-               sprintf(outBamF, "%s.bam", outName);
-               if (transcripts.getType() == 0) {
-                       sprintf(chr_list, "%s.chrlist", refName);
-                       pt_chr_list = (char*)(&chr_list);
+               sprintf(outBamF, "%s.transcript.bam", outName);
+               
+               if (bamSampling) {
+                       int local_N;
+                       int fr, to, len, id;
+                       vector<double> arr;
+                       arr.clear();
+
+                       if (verbose) printf("Begin to sample reads from their posteriors.\n");
+                       for (int i = 0; i < nThreads; i++) {
+                               local_N = hitvs[i]->getN();
+                               for (int j = 0; j < local_N; j++) {
+                                       fr = hitvs[i]->getSAt(j);
+                                       to = hitvs[i]->getSAt(j + 1);
+                                       len = to - fr + 1;
+                                       arr.resize(len);
+                                       arr[0] = ncpvs[i][j];
+                                       for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb();
+                                       id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1
+                                       for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0);
+                               }
+                       }
+                       if (verbose) printf("Sampling is finished.\n");
                 }
  
-               BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, pt_chr_list);
+               BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, transcripts);
                 HitWrapper<HitType> wrapper(nThreads, hitvs);
-               writer.work(wrapper, transcripts);
+               writer.work(wrapper);
         }
  
         release<ReadType, HitType, ModelType>(readers, hitvs, ncpvs, mhps);
@@ -638,15 +652,16 @@ int main(int argc, char* argv[]) {
         bool quiet = false;
  
         if (argc < 5) {
-               printf("Usage : rsem-run-em refName read_type imdName outName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
+               printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
                 printf("  refName: reference name\n");
                 printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
-               printf("  imdName: name for all upstream/downstream user-unseen files. (different files have different suffices)\n");
-               printf("  outName: name for all output files. (different files have different suffices)\n");
+               printf("  sampleName: sample's name, including the path\n");
+               printf("  sampleToken: sampleName excludes the path\n");
                 printf("  -p: number of threads which user wants to use. (default: 1)\n");
                 printf("  -b: produce bam format output file. (default: off)\n");
                 printf("  -q: set it quiet\n");
-               printf("  --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n");
+               printf("  --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n");
+               printf("  --sampling: sample each read from its posterior distribution when bam file is generated. (default: off)\n");
                 printf("// model parameters should be in imdName.mparams.\n");
                 exit(-1);
         }
@@ -655,12 +670,14 @@ int main(int argc, char* argv[]) {
  
         strcpy(refName, argv[1]);
         read_type = atoi(argv[2]);
-       strcpy(imdName, argv[3]);
-       strcpy(outName, argv[4]);
+       strcpy(outName, argv[3]);
+       sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
+       sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
  
         nThreads = 1;
  
         genBamF = false;
+       bamSampling = false;
         genGibbsOut = false;
         pt_fn_list = pt_chr_list = NULL;
  
@@ -677,6 +694,7 @@ int main(int argc, char* argv[]) {
                 }
                 if (!strcmp(argv[i], "-q")) { quiet = true; }
                 if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; }
+               if (!strcmp(argv[i], "--sampling")) { bamSampling = true; }
         }
         if (nThreads <= 0) { fprintf(stderr, "Number of threads should be bigger than 0!\n"); exit(-1); }
         //assert(nThreads > 0);
@@ -694,7 +712,7 @@ int main(int argc, char* argv[]) {
         sprintf(tiF, "%s.ti", refName);
         transcripts.readFrom(tiF);
  
-       sprintf(cntF, "%s.cnt", imdName);
+       sprintf(cntF, "%s.cnt", statName);
         fin.open(cntF);
         if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); }
         fin>>N0>>N1>>N2>>N_tot;