Added support for allele-specific expression estimation

author Bo Li <bli@cs.wisc.edu>

Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)

committer Bo Li <bli@cs.wisc.edu>

Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)
author Bo Li <bli@cs.wisc.edu>
Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)
committer Bo Li <bli@cs.wisc.edu>
Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)
diff --git a/EM.cpp b/EM.cpp

index 59783f3089b5a96c7ea67db0113f9b558818d861..bf15c2b4b6493ca3674804428bc456540f4aa345 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -44,6 +44,8 @@
  #include "HitWrapper.h"
  #include "BamWriter.h"
  
+#include "WriteResults.h"
+
  using namespace std;
  
  const double STOP_CRITERIA = 0.001;
@@ -68,7 +70,7 @@ bool genGibbsOut; // generate file for Gibbs sampler
  
  char refName[STRLEN], outName[STRLEN];
  char imdName[STRLEN], statName[STRLEN];
-char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
+char refF[STRLEN], cntF[STRLEN], tiF[STRLEN];
  char mparamsF[STRLEN];
  char modelF[STRLEN], thetaF[STRLEN];
  
@@ -83,7 +85,6 @@ vector<double> theta, eel; // eel : expected effective length
  double *probv, **countvs;
  
  Refs refs;
-GroupInfo gi;
  Transcripts transcripts;
  
  ModelParams mparams;
@@ -271,185 +272,11 @@ void* calcConProbs(void* arg) {
         return NULL;
  }
  
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
-       int lb, ub, span;
-       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-  
-       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-       clen = new double[span + 1];
-       clen[0] = 0.0;
-       for (int i = 1; i <= span; i++) {
-               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-       }
-
-       eel.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) {
-               int totLen = refs.getRef(i).getTotLen();
-               int fullLen = refs.getRef(i).getFullLen();
-               int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-               int pos2 = max(min(totLen, ub) - lb, 0);
-
-               if (pos2 == 0) { eel[i] = 0.0; continue; }
-    
-               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-               assert(eel[i] >= 0);
-               if (eel[i] < MINEEL) { eel[i] = 0.0; }
-       }
-  
-       delete[] pdf;
-       delete[] cdf;
-       delete[] clen;
-}
-
-void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
-       double sum = 0.0;
-
-       /* The reason that for noise gene, mw value is 1 is :
-        * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
-        * So the theta0 does not containing reads from any masked position
-        */
-
-       for (int i = 0; i <= M; i++) {
-               // i == 0, mw[i] == 1
-               if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
-                       theta[i] = 0.0;
-                       continue;
-               }
-               theta[i] = theta[i] / mw[i];
-               sum += theta[i];
-       }
-       // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
-       general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
-       for (int i = 0; i <= M; i++) theta[i] /= sum;
-}
-
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
-       double denom;
-       vector<double> frac;
-
-       //calculate fraction of count over all mappabile reads
-       denom = 0.0;
-       frac.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) 
-         if (eel[i] >= EPSILON) {
-           frac[i] = theta[i];
-           denom += frac[i];
-         }
-       general_assert(denom >= EPSILON, "No alignable reads?!");
-       for (int i = 1; i <= M; i++) frac[i] /= denom;
-  
-       //calculate FPKM
-       fpkm.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++)
-               if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
-       //calculate TPM
-       tpm.assign(M + 1, 0.0);
-       denom = 0.0;
-       for (int i = 1; i <= M; i++) denom += fpkm[i];
-       for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;  
-}
-
  template<class ModelType>
  void writeResults(ModelType& model, double* counts) {
-       char outF[STRLEN];
-       FILE *fo;
-
-       sprintf(modelF, "%s.model", statName);
-       model.write(modelF);
-
-       vector<int> tlens;
-       vector<double> fpkm, tpm, isopct;
-       vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
-
-       calcExpressionValues(theta, eel, tpm, fpkm);
-
-       //calculate IsoPct, etc.
-       isopct.assign(M + 1, 0.0);
-       tlens.assign(M + 1, 0);
-
-       glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
-       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
-       for (int i = 0; i < m; i++) {
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) {
-                       const Transcript& transcript = transcripts.getTranscriptAt(j);
-                       tlens[j] = transcript.getLength();
-
-                       gene_counts[i] += counts[j];
-                       gene_tpm[i] += tpm[j];
-                       gene_fpkm[i] += fpkm[j];
-               }
-
-               if (gene_tpm[i] < EPSILON) {
-                       double frac = 1.0 / (e - b);
-                       for (int j = b; j < e; j++) {
-                               glens[i] += tlens[j] * frac;
-                               gene_eels[i] += eel[j] * frac;
-                       }
-               }
-               else {
-                       for (int j = b; j < e; j++) {
-                               isopct[j] = tpm[j] / gene_tpm[i];
-                               glens[i] += tlens[j] * isopct[j];
-                               gene_eels[i] += eel[j] * isopct[j];
-                       }
-               }
-       }
-
-       //isoform level results
-       sprintf(outF, "%s.iso_res", imdName);
-       fo = fopen(outF, "w");
-       for (int i = 1; i <= M; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(i);
-               fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
-       }
-       for (int i = 1; i <= M; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(i);
-               fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
-       }
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
-       fclose(fo);
-
-       //gene level results
-       sprintf(outF, "%s.gene_res", imdName);
-       fo = fopen(outF, "w");
-       for (int i = 0; i < m; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
-               fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
-       }
-       for (int i = 0; i < m; i++) {
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) {
-                       fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n')));
-               }
-       }
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
-       fclose(fo);
-
-       if (verbose) { printf("Expression Results are written!\n"); }
+  sprintf(modelF, "%s.model", statName);
+  model.write(modelF);
+  writeResultsEM(M, refName, imdName, transcripts, theta, eel, countvs[0]);
  }
  
  template<class ReadType, class HitType, class ModelType>
@@ -658,8 +485,8 @@ void EM() {
         fprintf(fo, "%.15g\n", theta[M]);
         
         //calculate expected effective lengths for each isoform
-       calcExpectedEffectiveLengths<ModelType>(model);
-       polishTheta(theta, eel, model.getMW());
+       calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
+       polishTheta(M, theta, eel, model.getMW());
  
         // output theta
         for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
@@ -762,9 +589,6 @@ int main(int argc, char* argv[]) {
         sprintf(refF, "%s.seq", refName);
         refs.loadRefs(refF);
         M = refs.getM();
-       sprintf(groupF, "%s.grp", refName);
-       gi.load(groupF);
-       m = gi.getm();
  
         sprintf(tiF, "%s.ti", refName);
         transcripts.readFrom(tiF);
diff --git a/Gibbs.cpp b/Gibbs.cpp

index b0287a334cef604ea4385e250c7e22a8511c0424..e7a1182ce4b8ca78e5a41f59a0cdf5ddf657020a 100644 (file)
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -18,7 +18,9 @@
  #include "PairedEndQModel.h"
  
  #include "Refs.h"
+
  #include "GroupInfo.h"
+#include "WriteResults.h"
  
  using namespace std;
  
@@ -44,17 +46,16 @@ struct Item {
  int nThreads;
  
  int model_type;
-int m, M;
+int M;
  READ_INT_TYPE N0, N1;
  HIT_INT_TYPE nHits;
  double totc;
  int BURNIN, NSAMPLES, GAP;
-char imdName[STRLEN], statName[STRLEN];
-char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN];
+char thetaF[STRLEN], ofgF[STRLEN], refF[STRLEN], modelF[STRLEN];
  char cvsF[STRLEN];
  
  Refs refs;
-GroupInfo gi;
  
  vector<HIT_INT_TYPE> s;
  vector<Item> hits;
@@ -73,21 +74,16 @@ pthread_t *threads;
  pthread_attr_t attr;
  int rc;
  
-void load_data(char* reference_name, char* statName, char* imdName) {
+void load_data(char* refName, char* statName, char* imdName) {
         ifstream fin;
         string line;
         int tmpVal;
  
         //load reference file
-       sprintf(refF, "%s.seq", reference_name);
+       sprintf(refF, "%s.seq", refName);
         refs.loadRefs(refF, 1);
         M = refs.getM();
  
-       //load groupF
-       sprintf(groupF, "%s.grp", reference_name);
-       gi.load(groupF);
-       m = gi.getm();
-
         //load ofgF;
         sprintf(ofgF, "%s.ofg", imdName);
         fin.open(ofgF);
@@ -118,43 +114,12 @@ void load_data(char* reference_name, char* statName, char* imdName) {
         if (verbose) { printf("Loading Data is finished!\n"); }
  }
  
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
-       int lb, ub, span;
-       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-  
-       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-       clen = new double[span + 1];
-       clen[0] = 0.0;
-       for (int i = 1; i <= span; i++) {
-               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-       }
-
-       eel.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) {
-               int totLen = refs.getRef(i).getTotLen();
-               int fullLen = refs.getRef(i).getFullLen();
-               int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-               int pos2 = max(min(totLen, ub) - lb, 0);
-
-               if (pos2 == 0) { eel[i] = 0.0; continue; }
-    
-               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-               assert(eel[i] >= 0);
-               if (eel[i] < MINEEL) { eel[i] = 0.0; }
-       }
-  
-       delete[] pdf;
-       delete[] cdf;
-       delete[] clen;
-}
-
  template<class ModelType>
  void init_model_related(char* modelF) {
         ModelType model;
         model.read(modelF);
  
-       calcExpectedEffectiveLengths<ModelType>(model);
+       calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
         memcpy(mw, model.getMW(), sizeof(double) * (M + 1)); // otherwise, after exiting this procedure, mw becomes undefined
  }
  
@@ -220,55 +185,6 @@ void writeCountVector(FILE* fo, vector<int>& counts) {
         fprintf(fo, "%d\n", counts[M]);
  }
  
-void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
-       double sum = 0.0;
-
-       /* The reason that for noise gene, mw value is 1 is :
-        * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
-        * So the theta0 does not containing reads from any masked position
-        */
-
-       for (int i = 0; i <= M; i++) {
-               // i == 0, mw[i] == 1
-               if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
-                       theta[i] = 0.0;
-                       continue;
-               }
-               theta[i] = theta[i] / mw[i];
-               sum += theta[i];
-       }
-       // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
-       general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
-       for (int i = 0; i <= M; i++) theta[i] /= sum;
-}
-
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
-       double denom;
-       vector<double> frac;
-
-       //calculate fraction of count over all mappabile reads
-       denom = 0.0;
-       frac.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) 
-         if (eel[i] >= EPSILON) {
-           frac[i] = theta[i];
-           denom += frac[i];
-         }
-       general_assert(denom >= EPSILON, "No alignable reads?!");
-       for (int i = 1; i <= M; i++) frac[i] /= denom;
-  
-       //calculate FPKM
-       fpkm.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++)
-               if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
-       //calculate TPM
-       tpm.assign(M + 1, 0.0);
-       denom = 0.0;
-       for (int i = 1; i <= M; i++) denom += fpkm[i];
-       for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;  
-}
-
  void* Gibbs(void* arg) {
         int CHAINLEN;
         HIT_INT_TYPE len, fr, to;
@@ -320,8 +236,8 @@ void* Gibbs(void* arg) {
                         if ((ROUND - BURNIN - 1) % GAP == 0) {
                                 writeCountVector(params->fo, counts);
                                 for (int i = 0; i <= M; i++) theta[i] = counts[i] / totc;
-                               polishTheta(theta, eel, mw);
-                               calcExpressionValues(theta, eel, tpm, fpkm);
+                               polishTheta(M, theta, eel, mw);
+                               calcExpressionValues(M, theta, eel, tpm, fpkm);
                                 for (int i = 0; i <= M; i++) {
                                         params->pme_c[i] += counts[i] - 1;
                                         params->pve_c[i] += (counts[i] - 1) * (counts[i] - 1);
@@ -374,66 +290,13 @@ void release() {
         }
  }
  
-void writeResults(char* imdName) {
-       char outF[STRLEN];
-       FILE *fo;
-
-       vector<double> isopct;
-       vector<double> gene_counts, gene_tpm, gene_fpkm;
-
-       //calculate IsoPct, etc.
-       isopct.assign(M + 1, 0.0);
-       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
-       for (int i = 0; i < m; i++) {
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) {
-                       gene_counts[i] += pme_c[j];
-                       gene_tpm[i] += pme_tpm[j];
-                       gene_fpkm[i] += pme_fpkm[j];
-               }
-               if (gene_tpm[i] < EPSILON) continue;
-               for (int j = b; j < e; j++)
-                       isopct[j] = pme_tpm[j] / gene_tpm[i];
-       }
-
-       //isoform level results
-       sprintf(outF, "%s.iso_res", imdName);
-       fo = fopen(outF, "a");
-       general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
-
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
-       for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
-       fclose(fo);
-
-       //gene level results
-       sprintf(outF, "%s.gene_res", imdName);
-       fo = fopen(outF, "a");
-       general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
-
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
-       for (int i = 0; i < m; i++)
-               fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
-       fclose(fo);
-
-       if (verbose) { printf("Gibbs based expression values are written!\n"); }
-}
-
  int main(int argc, char* argv[]) {
         if (argc < 7) {
                 printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
                 exit(-1);
         }
  
+       strcpy(refName, argv[1]);
         strcpy(imdName, argv[2]);
         strcpy(statName, argv[3]);
  
@@ -459,7 +322,7 @@ int main(int argc, char* argv[]) {
                 printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
         }
  
-       load_data(argv[1], statName, imdName);
+       load_data(refName, statName, imdName);
  
         sprintf(modelF, "%s.model", statName);
         FILE *fi = fopen(modelF, "r");
@@ -491,11 +354,19 @@ int main(int argc, char* argv[]) {
  
         if (verbose) printf("Gibbs finished!\n");
         
-       writeResults(imdName);
+       writeResultsGibbs(M, refName, imdName, pme_c, pme_fpkm, pme_tpm);
  
         if (var_opt) {
                 char varF[STRLEN];
  
+               // Load group info
+               int m;
+               GroupInfo gi;
+               char groupF[STRLEN];
+               sprintf(groupF, "%s.grp", refName);
+               gi.load(groupF);
+               m = gi.getm();
+               
                 sprintf(varF, "%s.var", statName);
                 FILE *fo = fopen(varF, "w");
                 general_assert(fo != NULL, "Cannot open " + cstrtos(varF) + "!");
@@ -507,7 +378,7 @@ int main(int argc, char* argv[]) {
                 }
                 fclose(fo);
         }
-
+       
         delete mw; // delete the copy
  
         return 0;
diff --git a/README.md b/README.md

index b93678dd78421a6c1bf1bfe571c7621683f29cfd..f6b5ec857f44861c1cd4d5d62b483738e511833d 100644 (file)
--- a/README.md
+++ b/README.md
@@ -296,7 +296,7 @@ __reference_name:__ The name of RSEM references, which should be already generat
  
  __estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'. 'model_file_description.txt' provides the format and meanings of this file.    
  
-__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.   
+__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.   
  
  __theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.   
  
diff --git a/Transcript.h b/Transcript.h

index 90f6e6797b17f949b38e4776fcff7eacb8ace2ce..f7afa2b20542ebdb2f4883d1cc12631eb2b1176e 100644 (file)
--- a/Transcript.h
+++ b/Transcript.h
@@ -10,6 +10,10 @@
  
  #include "utils.h"
  
+/**
+   If no genome is provided, seqname field is used to store the allele name.
+ */
+
  struct Interval {
         int start, end;
  
@@ -49,7 +53,7 @@ public:
         }
  
         bool operator< (const Transcript& o) const {
-         return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id);
+         return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id) || (gene_id == o.gene_id && transcript_id == o.transcript_id && seqname < o.seqname);
         }
  
         const std::string& getTranscriptID() const { return transcript_id; }
diff --git a/Transcripts.h b/Transcripts.h

index 19b025eb2a0868a051db6f0530caf2b9819296a6..5c17f95352fcd91ea9e80d8dc5f1559531e2ae09 100644 (file)
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -28,7 +28,11 @@ public:
         }
  
         int getM() { return M; }
+
         int getType() { return type; }
+       void setType(int type) { this->type = type; }
+
+       bool isAlleleSpecific() { return type == 2; }
  
         const Transcript& getTranscriptAt(int pos) {
                 assert(pos > 0 && pos <= M);
@@ -60,7 +64,7 @@ public:
         void buildMappings(int, char**);
  
  private:
-       int M, type; // type 0 from genome , 1 standalone transcriptome
+       int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
         std::vector<Transcript> transcripts;
  
         std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
@@ -94,11 +98,11 @@ void Transcripts::buildMappings(int n_targets, char** target_name) {
         std::map<std::string, int> dict;
         std::map<std::string, int>::iterator iter;
  
-       general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+       general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
  
         dict.clear();
         for (int i = 1; i <= M; i++) {
-               const std::string& tid = transcripts[i].getTranscriptID();
+               const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
                 iter = dict.find(tid);
                 general_assert(iter == dict.end(), tid + " appears more than once!");
                 dict[tid] = i;
@@ -108,7 +112,7 @@ void Transcripts::buildMappings(int n_targets, char** target_name) {
         i2e.assign(M + 1, 0);
         for (int i = 0; i < n_targets; i++) {
                 iter = dict.find(std::string(target_name[i]));
-               general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!");
+               general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
                 general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
                 e2i[i + 1] = iter->second;
                 i2e[iter->second] = i + 1;
diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW

index 24ae065d19dc6a56093fba3286d7ff0729a57147..b942a635a25fb74ed249f98d1e50f838949b91a4 100644 (file)
--- a/WHAT_IS_NEW
+++ b/WHAT_IS_NEW
@@ -1,3 +1,10 @@
+RSEM v1.2.12
+
+- Enabled allele-specific expression estimation
+- Added option '--calc-pme' option to calculate posterior mean estimates only (no credibility intervals)
+
+--------------------------------------------------------------------------------------------
+
  RSEM v1.2.11
  
  - Enabled RSEM to use Bowtie 2 aligner (indel, local and discordant alignments are not supported yet)
diff --git a/WriteResults.h b/WriteResults.h

new file mode 100644 (file)

index 0000000..95aa7b0
--- /dev/null
+++ b/WriteResults.h
@@ -0,0 +1,624 @@
+#ifndef WRITERESULTS_H_
+#define WRITERESULTS_H_
+
+#include<cstdio>
+#include<vector>
+#include<string>
+#include<fstream>
+#include<algorithm>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "GroupInfo.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+#include "Refs.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+template<class ModelType>
+void calcExpectedEffectiveLengths(int M, Refs& refs, ModelType& model, std::vector<double>& eel) {
+       int lb, ub, span;
+       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
+  
+       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
+       clen = new double[span + 1];
+       clen[0] = 0.0;
+       for (int i = 1; i <= span; i++) {
+               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
+       }
+
+       eel.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++) {
+               int totLen = refs.getRef(i).getTotLen();
+               int fullLen = refs.getRef(i).getFullLen();
+               int pos1 = std::max(std::min(totLen - fullLen + 1, ub) - lb, 0);
+               int pos2 = std::max(std::min(totLen, ub) - lb, 0);
+
+               if (pos2 == 0) { eel[i] = 0.0; continue; }
+    
+               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
+               assert(eel[i] >= 0);
+               if (eel[i] < MINEEL) { eel[i] = 0.0; }
+       }
+  
+       delete[] pdf;
+       delete[] cdf;
+       delete[] clen;
+}
+
+void polishTheta(int M, std::vector<double>& theta, const std::vector<double>& eel, const double* mw) {
+       double sum = 0.0;
+
+       /* The reason that for noise gene, mw value is 1 is :
+        * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
+        * So the theta0 does not containing reads from any masked position
+        */
+
+       for (int i = 0; i <= M; i++) {
+               // i == 0, mw[i] == 1
+               if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
+                       theta[i] = 0.0;
+                       continue;
+               }
+               theta[i] = theta[i] / mw[i];
+               sum += theta[i];
+       }
+       // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
+       general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
+       for (int i = 0; i <= M; i++) theta[i] /= sum;
+}
+
+void calcExpressionValues(int M, const std::vector<double>& theta, const std::vector<double>& eel, std::vector<double>& tpm, std::vector<double>& fpkm) {
+       double denom;
+       std::vector<double> frac;
+
+       //calculate fraction of count over all mappabile reads
+       denom = 0.0;
+       frac.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++) 
+         if (eel[i] >= EPSILON) {
+           frac[i] = theta[i];
+           denom += frac[i];
+         }
+       general_assert(denom >= EPSILON, "No alignable reads?!");
+       for (int i = 1; i <= M; i++) frac[i] /= denom;
+  
+       //calculate FPKM
+       fpkm.assign(M + 1, 0.0);
+       for (int i = 1; i <= M; i++)
+               if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
+
+       //calculate TPM
+       tpm.assign(M + 1, 0.0);
+       denom = 0.0;
+       for (int i = 1; i <= M; i++) denom += fpkm[i];
+       for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;  
+}
+
+inline bool isAlleleSpecific(const char* refName, GroupInfo* gt = NULL, GroupInfo* ta = NULL) {
+  bool alleleS;
+  char gtF[STRLEN], taF[STRLEN];
+
+  sprintf(gtF, "%s.gt", refName);
+  sprintf(taF, "%s.ta", refName);
+  std::ifstream gtIF(gtF), taIF(taF);
+  alleleS = gtIF.is_open() && taIF.is_open();
+  if (gtIF.is_open()) gtIF.close();
+  if (taIF.is_open()) taIF.close();
+
+  if (alleleS) { 
+    if (gt != NULL) gt->load(gtF); 
+    if (ta != NULL) ta->load(taF); 
+  }
+
+  return alleleS;
+}
+
+void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector<double>& theta, std::vector<double>& eel, double* counts) {
+       char outF[STRLEN];
+       FILE *fo;
+
+       int m;
+       GroupInfo gi;
+       char groupF[STRLEN];
+
+       std::vector<int> tlens;
+       std::vector<double> fpkm, tpm, isopct;
+       std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+       // Load group info
+       sprintf(groupF, "%s.grp", refName);
+       gi.load(groupF);
+       m = gi.getm();
+
+       // For allele-specific expression
+       int m_trans = 0;
+       GroupInfo gt, ta;
+       std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+       bool alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific
+
+       calcExpressionValues(M, theta, eel, tpm, fpkm);
+
+       //calculate IsoPct, etc.
+       isopct.assign(M + 1, 0.0);
+       tlens.assign(M + 1, 0);
+
+       glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+       for (int i = 0; i < m; i++) {
+               int b = gi.spAt(i), e = gi.spAt(i + 1);
+               for (int j = b; j < e; j++) {
+                       const Transcript& transcript = transcripts.getTranscriptAt(j);
+                       tlens[j] = transcript.getLength();
+
+                       gene_counts[i] += counts[j];
+                       gene_tpm[i] += tpm[j];
+                       gene_fpkm[i] += fpkm[j];
+               }
+
+               if (gene_tpm[i] < EPSILON) {
+                       double frac = 1.0 / (e - b);
+                       for (int j = b; j < e; j++) {
+                               glens[i] += tlens[j] * frac;
+                               gene_eels[i] += eel[j] * frac;
+                       }
+               }
+               else {
+                       for (int j = b; j < e; j++) {
+                               isopct[j] = tpm[j] / gene_tpm[i];
+                               glens[i] += tlens[j] * isopct[j];
+                               gene_eels[i] += eel[j] * isopct[j];
+                       }
+               }
+       }
+
+       if (alleleS) {
+         m_trans = ta.getm();
+         ta_pct.assign(M + 1, 0.0);
+         trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+         trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+         
+         for (int i = 0; i < m_trans; i++) {
+               int b = ta.spAt(i), e = ta.spAt(i + 1);
+               for (int j = b; j < e; j++) {
+                       trans_counts[i] += counts[j];
+                       trans_tpm[i] += tpm[j];
+                       trans_fpkm[i] += fpkm[j];
+               }
+
+               if (trans_tpm[i] < EPSILON) {
+                       double frac = 1.0 / (e - b);
+                       for (int j = b; j < e; j++) {
+                               trans_lens[i] += tlens[j] * frac;
+                               trans_eels[i] += eel[j] * frac;
+                       }
+               }
+               else {
+                       for (int j = b; j < e; j++) {
+                               ta_pct[j] = tpm[j] / trans_tpm[i];
+                               trans_lens[i] += tlens[j] * ta_pct[j];
+                               trans_eels[i] += eel[j] * ta_pct[j];
+                       }
+               }
+         } 
+         
+         gt_pct.assign(m_trans, 0.0);
+         for (int i = 0; i < m; i++) 
+           if (gene_tpm[i] >= EPSILON) {
+             int b = gt.spAt(i), e = gt.spAt(i + 1);
+             for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+           }
+       }
+
+       if (!alleleS) {
+         //isoform level results
+         sprintf(outF, "%s.iso_res", imdName);
+         fo = fopen(outF, "w");
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+         }
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+         }
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+         fclose(fo);
+       }
+       else {
+         // allele level results
+         sprintf(outF, "%s.allele_res", imdName);
+         fo = fopen(outF, "w");
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s%c", transcript.getSeqName().c_str(), (i < M ? '\t' : '\n'));
+         }
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+         }
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+         }
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+         fclose(fo);
+
+         // isoform level results
+         sprintf(outF, "%s.iso_res", imdName);
+         fo = fopen(outF, "w");
+         for (int i = 0; i < m_trans; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+           fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+         }
+         for (int i = 0; i < m_trans; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+           fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+         }
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_lens[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_eels[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n'));
+         fclose(fo);
+       }
+
+       //gene level results
+       sprintf(outF, "%s.gene_res", imdName);
+       fo = fopen(outF, "w");
+       for (int i = 0; i < m; i++) {
+               const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
+               fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
+       }
+       for (int i = 0; i < m; i++) {
+               int b = gi.spAt(i), e = gi.spAt(i + 1);
+               std::string curtid = "", tid;
+               for (int j = b; j < e; j++) {
+                       tid = transcripts.getTranscriptAt(j).getTranscriptID();
+                       if (curtid != tid) {
+                         if (curtid != "") fprintf(fo, ",");
+                         fprintf(fo, "%s", tid.c_str());
+                         curtid = tid;
+                       }
+               }
+               fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n'));
+       }
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
+       fclose(fo);
+
+       if (verbose) { printf("Expression Results are written!\n"); }
+}
+
+void writeResultsGibbs(int M, char* refName, char* imdName, std::vector<double>& pme_c, std::vector<double>& pme_fpkm, std::vector<double>& pme_tpm) {
+       char outF[STRLEN];
+       FILE *fo;
+
+       int m;
+       GroupInfo gi;
+       char groupF[STRLEN];
+       std::vector<double> isopct;
+       std::vector<double> gene_counts, gene_tpm, gene_fpkm;
+
+       // Load group info
+       sprintf(groupF, "%s.grp", refName);
+       gi.load(groupF);
+       m = gi.getm();
+
+       // For allele-specific expression
+       int m_trans = 0;
+       GroupInfo gt, ta;
+       std::vector<double> trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+       bool alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific
+
+       //calculate IsoPct, etc.
+       isopct.assign(M + 1, 0.0);
+       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+       for (int i = 0; i < m; i++) {
+               int b = gi.spAt(i), e = gi.spAt(i + 1);
+               for (int j = b; j < e; j++) {
+                       gene_counts[i] += pme_c[j];
+                       gene_tpm[i] += pme_tpm[j];
+                       gene_fpkm[i] += pme_fpkm[j];
+               }
+               if (gene_tpm[i] < EPSILON) continue;
+               for (int j = b; j < e; j++)
+                       isopct[j] = pme_tpm[j] / gene_tpm[i];
+       }
+
+       if (alleleS) {
+         m_trans = ta.getm();
+         ta_pct.assign(M + 1, 0.0);
+         trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+         
+         for (int i = 0; i < m_trans; i++) {
+               int b = ta.spAt(i), e = ta.spAt(i + 1);
+               for (int j = b; j < e; j++) {
+                       trans_counts[i] += pme_c[j];
+                       trans_tpm[i] += pme_tpm[j];
+                       trans_fpkm[i] += pme_fpkm[j];
+               }
+               if (trans_tpm[i] < EPSILON) continue;
+               for (int j = b; j < e; j++) 
+                 ta_pct[j] = pme_tpm[j] / trans_tpm[i];        
+         }
+
+         gt_pct.assign(m_trans, 0.0);
+         for (int i = 0; i < m; i++) 
+           if (gene_tpm[i] >= EPSILON) {
+             int b = gt.spAt(i), e = gt.spAt(i + 1);
+             for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+           }
+       }
+
+       if (!alleleS) {
+         //isoform level results
+         sprintf(outF, "%s.iso_res", imdName);
+         fo = fopen(outF, "a");
+         general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+         
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+         fclose(fo);    
+       }
+       else {
+         //allele level results
+         sprintf(outF, "%s.allele_res", imdName);
+         fo = fopen(outF, "a");
+         general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+         
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n'));
+         for (int i = 1; i <= M; i++)
+           fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+         fclose(fo);
+
+         //isoform level results
+         sprintf(outF, "%s.iso_res", imdName);
+         fo = fopen(outF, "a");
+         general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+         
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n'));
+         fclose(fo);
+       }
+ 
+       //gene level results
+       sprintf(outF, "%s.gene_res", imdName);
+       fo = fopen(outF, "a");
+       general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+       for (int i = 0; i < m; i++)
+               fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
+       fclose(fo);
+
+       if (verbose) { printf("Gibbs based expression values are written!\n"); }
+}
+
+void writeResultsSimulation(int M, char* refName, char* outFN, Transcripts& transcripts, std::vector<double>& eel, std::vector<double>& counts) {
+       char outF[STRLEN];
+       FILE *fo;
+
+       int m;
+       GroupInfo gi;
+       char groupF[STRLEN];
+
+        // Load group info
+        sprintf(groupF, "%s.grp", refName);
+        gi.load(groupF);
+       m = gi.getm();
+
+       std::vector<int> tlens;
+       std::vector<double> tpm, fpkm, isopct;
+       std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+       
+        // For allele-specific expression
+       int m_trans = 0;
+        GroupInfo gt, ta;
+       std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+       bool alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific
+       
+       for (int i = 1; i <= M; i++)
+               general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
+
+       calcExpressionValues(M, counts, eel, tpm, fpkm);
+
+       //calculate IsoPct, etc.
+       isopct.assign(M + 1, 0.0);
+       tlens.assign(M + 1, 0);
+
+       glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+        for (int i = 0; i < m; i++) {
+         int b = gi.spAt(i), e = gi.spAt(i + 1);
+         for (int j = b; j < e; j++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(j);
+           tlens[j] = transcript.getLength();
+           
+           gene_counts[i] += counts[j];
+           gene_tpm[i] += tpm[j];
+           gene_fpkm[i] += fpkm[j];
+         }
+
+         if (gene_tpm[i] < EPSILON) {
+           double frac = 1.0 / (e - b);
+           for (int j = b; j < e; j++) {
+             glens[i] += tlens[j] * frac;
+             gene_eels[i] += eel[j] * frac;
+           }
+         }
+         else {
+           for (int j = b; j < e; j++) {
+             isopct[j] = tpm[j] / gene_tpm[i];
+             glens[i] += tlens[j] * isopct[j];
+             gene_eels[i] += eel[j] * isopct[j];
+           }
+         }
+        }
+
+        if (alleleS) {
+          m_trans = ta.getm();
+          ta_pct.assign(M + 1, 0.0);
+          trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+          trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+          for (int i = 0; i < m_trans; i++) {
+           int b = ta.spAt(i), e = ta.spAt(i + 1);
+           for (int j = b; j < e; j++) {
+             trans_counts[i] += counts[j];
+             trans_tpm[i] += tpm[j];
+             trans_fpkm[i] += fpkm[j];
+           }
+
+           if (trans_tpm[i] < EPSILON) {
+             double frac = 1.0 / (e - b);
+             for (int j = b; j < e; j++) {
+               trans_lens[i] += tlens[j] * frac;
+               trans_eels[i] += eel[j] * frac;
+             }
+           }
+           else {
+             for (int j = b; j < e; j++) {
+               ta_pct[j] = tpm[j] / trans_tpm[i];
+               trans_lens[i] += tlens[j] * ta_pct[j];
+               trans_eels[i] += eel[j] * ta_pct[j];
+             }
+           }
+          }
+
+          gt_pct.assign(m_trans, 0.0);
+          for (int i = 0; i < m; i++)
+            if (gene_tpm[i] >= EPSILON) {
+              int b = gt.spAt(i), e = gt.spAt(i + 1);
+              for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+            }
+        }
+
+       //allele level
+       if (alleleS) {
+         sprintf(outF, "%s.sim.alleles.results", outFN);
+         fo = fopen(outF, "w");
+         fprintf(fo, "allele_id\ttranscript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tAlleleIsoPct\tAlleleGenePct\n");
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s\t%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getSeqName().c_str(), transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+                   eel[i], counts[i], tpm[i], fpkm[i], ta_pct[i] * 1e2, isopct[i] * 1e2);
+         }
+         fclose(fo);
+       }
+
+       //isoform level
+       sprintf(outF, "%s.sim.isoforms.results", outFN);
+       fo = fopen(outF, "w");
+       fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
+       if (!alleleS) {
+         for (int i = 1; i <= M; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(i);
+           fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+                   eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
+         }
+       }
+       else {
+         for (int i = 0; i < m_trans; i++) {
+           const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+           fprintf(fo, "%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), trans_lens[i],
+                    trans_eels[i], trans_counts[i], trans_tpm[i], trans_fpkm[i], gt_pct[i] * 1e2);
+         }
+       }
+       fclose(fo);
+
+       //gene level
+       sprintf(outF, "%s.sim.genes.results", outFN);
+       fo = fopen(outF, "w");
+       fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
+       for (int i = 0; i < m; i++) {
+               int b = gi.spAt(i), e = gi.spAt(i + 1);
+               const std::string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
+               fprintf(fo, "%s\t", gene_id.c_str());
+               std::string curtid = "", tid;
+               for (int j = b; j < e; j++) {
+                       tid = transcripts.getTranscriptAt(j).getTranscriptID();
+                       if (curtid != tid) {
+                         if (curtid != "") fprintf(fo, ",");
+                         fprintf(fo, "%s", tid.c_str());
+                         curtid = tid;
+                       }
+               }
+               fprintf(fo, "\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
+       }
+       fclose(fo);
+}
+
+#endif
diff --git a/calcCI.cpp b/calcCI.cpp

index 45c76838c150490240e24a8c215cb4d05069d203..97eba64e1ea9965c3983409090dd252abb78cbad 100644 (file)
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -20,6 +20,7 @@
  
  #include "Refs.h"
  #include "GroupInfo.h"
+#include "WriteResults.h"
  
  #include "Buffer.h"
  
@@ -32,17 +33,17 @@ struct Params {
         const double *mw;
  };
  
+struct CIParams {
+       int no;
+       int start_gene_id, end_gene_id;
+};
+
  struct CIType {
         float lb, ub; // the interval is [lb, ub]
  
         CIType() { lb = ub = 0.0; }
  };
  
-struct CIParams {
-       int no;
-       int start_gene_id, end_gene_id;
-};
-
  int model_type;
  
  int nMB;
@@ -54,14 +55,20 @@ float *l_bars;
  
  char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN];
  
-CIType *iso_tpm, *gene_tpm, *iso_fpkm, *gene_fpkm;
+CIType *tpm, *fpkm;
+CIType *iso_tpm = NULL, *iso_fpkm = NULL;
+CIType *gene_tpm, *gene_fpkm;
  
  int M, m;
  Refs refs;
  GroupInfo gi;
-char imdName[STRLEN], statName[STRLEN];
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN];
  char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
  
+bool alleleS;
+int m_trans;
+GroupInfo ta;
+
  vector<double> eel; //expected effective lengths
  
  Buffer *buffer;
@@ -75,37 +82,6 @@ int rc;
  
  CIParams *ciParamsArray;
  
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
-       int lb, ub, span;
-       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = \sigma_{j=1}^{i}pdf[i]*(lb+i)
-  
-       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-       clen = new double[span + 1];
-       clen[0] = 0.0;
-       for (int i = 1; i <= span; i++) {
-               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-       }
-
-       eel.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) {
-               int totLen = refs.getRef(i).getTotLen();
-               int fullLen = refs.getRef(i).getFullLen();
-               int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-               int pos2 = max(min(totLen, ub) - lb, 0);
-
-               if (pos2 == 0) { eel[i] = 0.0; continue; }
-    
-               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-               assert(eel[i] >= 0);
-               if (eel[i] < MINEEL) { eel[i] = 0.0; }
-       }
-  
-       delete[] pdf;
-       delete[] cdf;
-       delete[] clen;
-}
-
  void* sample_theta_from_c(void* arg) {
         int *cvec;
         double *theta;
@@ -179,7 +155,7 @@ template<class ModelType>
  void sample_theta_vectors_from_count_vectors() {
         ModelType model;
         model.read(modelF);
-       calcExpectedEffectiveLengths<ModelType>(model);
+       calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
  
         int num_threads = min(nThreads, nCV);
  
@@ -269,13 +245,19 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) {
  }
  
  void* calcCI_batch(void* arg) {
-       float *itsamples, *gtsamples, *ifsamples, *gfsamples;
+       float *tsamples, *fsamples;
+       float *itsamples = NULL, *ifsamples = NULL, *gtsamples, *gfsamples;
         ifstream fin;
         CIParams *ciParams = (CIParams*)arg;
+       int curtid, curaid, tid;
  
-       itsamples = new float[nSamples];
+       tsamples = new float[nSamples];
+       fsamples = new float[nSamples];
+       if (alleleS) { 
+         itsamples = new float[nSamples];
+         ifsamples = new float[nSamples];
+       }
         gtsamples = new float[nSamples];
-       ifsamples = new float[nSamples];
         gfsamples = new float[nSamples];
  
         fin.open(tmpF, ios::binary);
@@ -284,19 +266,46 @@ void* calcCI_batch(void* arg) {
         fin.seekg(pos, ios::beg);
  
         int cnt = 0;
+       if (alleleS) {
+         curtid = curaid = -1;
+         memset(itsamples, 0, FLOATSIZE * nSamples);
+         memset(ifsamples, 0, FLOATSIZE * nSamples);
+       }
         for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) {
                 int b = gi.spAt(i), e = gi.spAt(i + 1);
                 memset(gtsamples, 0, FLOATSIZE * nSamples);
                 memset(gfsamples, 0, FLOATSIZE * nSamples);
                 for (int j = b; j < e; j++) {
+                       if (alleleS) {
+                         tid = ta.gidAt(j);
+                         if (curtid != tid) {
+                           if (curtid >= 0) {
+                             if (j - curaid > 1) {
+                               calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
+                               calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+                             }
+                             else {
+                               iso_tpm[curtid] = tpm[curaid];
+                               iso_fpkm[curtid] = fpkm[curaid];
+                             }
+                           }
+                           curtid = tid;
+                           curaid = j;
+                         }
+                       }
+
                         for (int k = 0; k < nSamples; k++) {
-                               fin.read((char*)(&itsamples[k]), FLOATSIZE);
-                               gtsamples[k] += itsamples[k];
-                               ifsamples[k] = 1e3 / l_bars[k] * itsamples[k];
-                               gfsamples[k] += ifsamples[k];
+                               fin.read((char*)(&tsamples[k]), FLOATSIZE);
+                               fsamples[k] = 1e3 / l_bars[k] * tsamples[k];
+                               if (alleleS) {
+                                 itsamples[k] += tsamples[k];
+                                 ifsamples[k] += fsamples[k];
+                               }
+                               gtsamples[k] += tsamples[k];
+                               gfsamples[k] += fsamples[k];
                         }
-                       calcCI(nSamples, itsamples, iso_tpm[j].lb, iso_tpm[j].ub);
-                       calcCI(nSamples, ifsamples, iso_fpkm[j].lb, iso_fpkm[j].ub);
+                       calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub);
+                       calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub);
                 }
  
                 if (e - b > 1) {
@@ -304,18 +313,34 @@ void* calcCI_batch(void* arg) {
                         calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub);
                 }
                 else {
-                       gene_tpm[i].lb = iso_tpm[b].lb; gene_tpm[i].ub = iso_tpm[b].ub;
-                       gene_fpkm[i].lb = iso_fpkm[b].lb; gene_fpkm[i].ub = iso_fpkm[b].ub;
+                       gene_tpm[i] = tpm[b];
+                       gene_fpkm[i] = fpkm[b];
                 }
  
                 ++cnt;
                 if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); }
         }
-
         fin.close();
  
-       delete[] itsamples;
+       if (alleleS && (curtid >= 0)) {
+         if (gi.spAt(ciParams->end_gene_id) - curaid > 1) {
+           calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
+           calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+         }
+         else {
+           iso_tpm[curtid] = tpm[curaid];
+           iso_fpkm[curtid] = fpkm[curaid];
+         }
+       }
+       
+       delete[] tsamples;
+       delete[] fsamples;
+       if (alleleS) {
+         delete[] itsamples;
+         delete[] ifsamples;
+       }
         delete[] gtsamples;
+       delete[] gfsamples;
  
         return NULL;
  }
@@ -325,9 +350,13 @@ void calculate_credibility_intervals(char* imdName) {
         char outF[STRLEN];
         int num_threads = nThreads;
  
-       iso_tpm = new CIType[M + 1];
+       tpm = new CIType[M + 1];
+       fpkm = new CIType[M + 1];
+       if (alleleS) {
+         iso_tpm = new CIType[m_trans];
+         iso_fpkm = new CIType[m_trans];
+       }
         gene_tpm = new CIType[m];
-       iso_fpkm = new CIType[M + 1];
         gene_fpkm = new CIType[m];
  
         assert(M > 0);
@@ -375,19 +404,33 @@ void calculate_credibility_intervals(char* imdName) {
  
         delete[] ciParamsArray;
  
-       //isoform level results
-       sprintf(outF, "%s.iso_res", imdName);
+       alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName);
         fo = fopen(outF, "a");
         for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < M ? '\t' : '\n'));
+         fprintf(fo, "%.6g%c", tpm[i].lb, (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < M ? '\t' : '\n'));
+         fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < M ? '\t' : '\n'));
+         fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n'));
         for (int i = 1; i <= M; i++)
-               fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < M ? '\t' : '\n'));
+         fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n'));
         fclose(fo);
  
+       if (alleleS) {
+         //isoform level results
+         sprintf(outF, "%s.iso_res", imdName);
+         fo = fopen(outF, "a");
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
+         for (int i = 0; i < m_trans; i++)
+           fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
+         fclose(fo);
+       }
+
         //gene level results
         sprintf(outF, "%s.gene_res", imdName);
         fo = fopen(outF, "a");
@@ -401,9 +444,13 @@ void calculate_credibility_intervals(char* imdName) {
                 fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
         fclose(fo);
  
-       delete[] iso_tpm;
+       delete[] tpm;
+       delete[] fpkm;
+       if (alleleS) { 
+         delete[] iso_tpm; 
+         delete[] iso_fpkm; 
+       }
         delete[] gene_tpm;
-       delete[] iso_fpkm;
         delete[] gene_fpkm;
  
         if (verbose) { printf("All credibility intervals are calculated!\n"); }
@@ -415,6 +462,7 @@ int main(int argc, char* argv[]) {
                 exit(-1);
         }
  
+       strcpy(refName, argv[1]);
         strcpy(imdName, argv[2]);
         strcpy(statName, argv[3]);
  
@@ -431,13 +479,18 @@ int main(int argc, char* argv[]) {
         }
         verbose = !quiet;
  
-       sprintf(refF, "%s.seq", argv[1]);
+       sprintf(refF, "%s.seq", refName);
         refs.loadRefs(refF, 1);
         M = refs.getM();
-       sprintf(groupF, "%s.grp", argv[1]);
+
+       sprintf(groupF, "%s.grp", refName);
         gi.load(groupF);
         m = gi.getm();
  
+       // allele-specific 
+       alleleS = isAlleleSpecific(refName, NULL, &ta);
+       if (alleleS) m_trans = ta.getm();
+
         nSamples = nCV * nSpC;
         assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples
         l_bars = new float[nSamples];
@@ -463,14 +516,6 @@ int main(int argc, char* argv[]) {
         calculate_credibility_intervals(imdName);
  
         delete l_bars;
-       /*
-       sprintf(command, "rm -f %s", tmpF);
-       int status = system(command);
-       if (status != 0) {
-               fprintf(stderr, "Cannot delete %s!\n", tmpF);
-               exit(-1);
-       }
-       */
  
         return 0;
  }
diff --git a/makefile b/makefile

index c95fd14467cb0263ac6122d321cd7af4f8e0b9bb..aec4b2ddb6b9aeb37222e7cfeea23c5a6b7a27bb 100644 (file)
--- a/makefile
+++ b/makefile
@@ -15,7 +15,7 @@ Transcripts.h : my_assert.h Transcript.h
  rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
         $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
  
-rsem-synthesis-reference-transcripts : utils.h Transcript.h Transcripts.h synthesisRef.cpp
+rsem-synthesis-reference-transcripts : utils.h my_assert.h Transcript.h Transcripts.h synthesisRef.cpp
         $(CC) -Wall -O3 synthesisRef.cpp -o rsem-synthesis-reference-transcripts
  
  BowtieRefSeqPolicy.h : RefSeqPolicy.h
@@ -82,10 +82,12 @@ BamWriter.h : sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h Pair
  
  sampling.h : boost/random.hpp
  
+WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h
+
  rsem-run-em : EM.o sam/libbam.a
         $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread
  
-EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp EM.cpp
+EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp WriteResults.h EM.cpp
         $(CC) $(COFLAGS) EM.cpp
  
  bc_aux.h : sam/bam.h
@@ -107,14 +109,14 @@ wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h
  rsem-simulate-reads : simulation.o
         $(CC) -o rsem-simulate-reads simulation.o
  
-simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp
+simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp WriteResults.h simulation.cpp
         $(CC) $(COFLAGS) simulation.cpp
  
  rsem-run-gibbs : Gibbs.o
         $(CC) -o rsem-run-gibbs Gibbs.o -lpthread
  
  #some header files are omitted
-Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp 
+Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Gibbs.cpp 
         $(CC) $(COFLAGS) Gibbs.cpp
  
  Buffer.h : my_assert.h
@@ -123,7 +125,7 @@ rsem-calculate-credibility-intervals : calcCI.o
         $(CC) -o rsem-calculate-credibility-intervals calcCI.o -lpthread
  
  #some header files are omitted
-calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Buffer.h calcCI.cpp
+calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h calcCI.cpp
         $(CC) $(COFLAGS) calcCI.cpp
  
  rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index b523eddd932313ce70da5b7d349f59f60cb7f3b1..8a7ca0e45b6d58b7d1089bd98dceb306a9cf4d79 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -51,6 +51,7 @@ my $nThreads = 1;
  my $genBamF = 1;  # default is generating transcript bam file
  my $genGenomeBamF = 0;
  my $sampling = 0;
+my $calcPME = 0;
  my $calcCI = 0;
  my $var_opt = 0; # temporarily, only for internal use
  my $quiet = 0;
@@ -80,6 +81,8 @@ my $inpF = "";
  my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
  my $gap = 32;
  
+my $alleleS = 0;
+
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "temporary-folder=s" => \$temp_dir,
            "no-qualities" => \$no_qual,
@@ -114,6 +117,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "no-bam-output" => sub { $genBamF = 0; },
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
+          "calc-pme" => \$calcPME,
            "var" => \$var_opt,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
@@ -181,6 +185,13 @@ else {
      $sampleName = $ARGV[3];
  }
  
+if (((-e "$refName.ta") && !(-e "$refName.gt")) || (!(-e "$refName.ta") && (-e "$refName.gt"))) {
+    print "Allele-specific expression related reference files are corrupted!\n";
+    exit(-1);
+}
+
+$alleleS = (-e "$refName.ta") && (-e "$refName.gt");
+
  if ($genGenomeBamF) {
      open(INPUT, "$refName.ti");
      my $line = <INPUT>; chomp($line);
@@ -329,13 +340,20 @@ if ($genBamF) {
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
-if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
+if ($calcPME || $var_opt || $calcCI) { $command .= " --gibbs-out"; }
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
-&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+if ($alleleS) {
+    &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+} 
+else {
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+}
  
  if ($genBamF) {
      $command = $dir."sam/samtools sort -@ $nThreads -m $SortMem $sampleName.transcript.bam $sampleName.transcript.sorted";
@@ -357,7 +375,7 @@ if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
-if ($calcCI || $var_opt) {
+if ($calcPME || $var_opt || $calcCI ) {
      $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
      $command .= " -p $nThreads";
      if ($var_opt) { $command .= " --var"; }
@@ -365,21 +383,43 @@ if ($calcCI || $var_opt) {
      &runCommand($command);
  }
  
-if ($calcCI) {
-    system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
-    system("mv $sampleName.genes.results $imdName.genes.results.bak1");
-    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+if ($calcPME || $calcCI) {
+    if ($alleleS) {
+       system("mv $sampleName.alleles.results $imdName.alleles.results.bak1");
+       system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+       system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+       &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+       &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+       &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    }
+    else {
+       system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+       system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+       &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+       &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    }
+}
  
+if ($calcCI) {
      $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
      $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
-    system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
-    system("mv $sampleName.genes.results $imdName.genes.results.bak2");
-    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    if ($alleleS) {
+       system("mv $sampleName.alleles.results $imdName.alleles.results.bak2");
+       system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+       system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+       &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+       &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+       &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    }
+    else {
+       system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+       system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+       &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+       &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    }
  }
  
  if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
@@ -397,7 +437,7 @@ if ($mTime) {
      print OUTPUT "Aligning reads: $time_alignment s.\n";
      print OUTPUT "Estimating expression levels: $time_rsem s.\n";
      print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
-    my $time_del = $time_end - $time_start;
+#    my $time_del = $time_end - $time_start;
  #    print OUTPUT "Delete: $time_del s.\n";
      close(OUTPUT);
  }
@@ -484,9 +524,13 @@ Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic
  
  When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
  
+=item B<--calc-pme>
+
+Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. (Default: off) 
+
  =item B<--calc-ci>
  
-Calculate 95% credibility intervals and posterior mean estimates.  (Default: off)
+Calculate 95% credibility intervals and posterior mean estimates. (Default: off)
  
  =item B<--seed-length> <int>
  
@@ -644,8 +688,9 @@ each line in the rest of this file is:
  
  transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
  
-Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set.
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
  
  'transcript_id' is the transcript name of this transcript. 'gene_id'
  is the gene name of the gene which this transcript belongs to (denote
@@ -704,8 +749,9 @@ each line in the rest of this file is:
  
  gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
  
-Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set. 
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
  
  'transcript_id(s)' is a comma-separated list of transcript_ids
  belonging to this gene. If no gene information is provided, 'gene_id'
@@ -716,6 +762,43 @@ defined as the weighted average of its transcripts' lengths and
  effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
  are just the sum of its transcripts' abundance estimates.
  
+=item B<sample_name.alleles.results>
+
+Only generated when the RSEM references are built with allele-specific
+transcripts.
+
+This file contains allele level expression estimates for
+allele-specific expression calculation. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [pme_expected_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+
+'allele_id' is the allele-specific name of this allele-specific transcript.
+
+'AlleleIsoPct' stands for allele-specific percentage on isoform
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent transcript's abundance. If its parent
+transcript has only one allele variant form, this field will be set to
+100.
+
+'AlleleGenePct' stands for allele-specific percentage on gene
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent gene's abundance.
+
+'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
+similar meanings. They are calculated based on posterior mean
+estimates.
+
+Please note that if this file is present, the fields 'length' and
+'effective_length' in 'sample_name.isoforms.results' should be
+interpreted similarly as the corresponding definitions in
+'sample_name.genes.results'.
+
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
  Only generated when --no-bam-output is not specified.
diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots

index 4ea1d87ba9ce14348dbb9d0b37173df66ee52149..5b8cc98434d70083c1573868b6162b612ed1a572 100755 (executable)
--- a/rsem-gen-transcript-plots
+++ b/rsem-gen-transcript-plots
@@ -2,83 +2,89 @@
  
  nrow_per_page <- 3 # if input_list is composed of transcript ids
  ncol_per_page <- 2 # if input_list is composed of transcript ids
-num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids
-
+num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript/allele ids
  
  exit_with_error <- function(errmsg) {
    cat(errmsg, "\n", sep = "", file = stderr())
    quit(save = "no", status = 1)
  }
  
-
  args <- commandArgs(TRUE)
-if (length(args) != 5) 
-  exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file")
+if (length(args) != 6) 
+  exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_allele_specific id_type<0,allele;1,isoform;2,gene> show_uniq output_plot_file")
  
  sample_name <- args[1]
  input_list <- args[2]
-is_gene <- as.numeric(args[3])
-show_uniq <- as.numeric(args[4])
-output_plot_file <- args[5]
+alleleS <- as.numeric(args[3])
+id_type <- as.numeric(args[4])
+show_uniq <- as.numeric(args[5])
+output_plot_file <- args[6]
  
+is_composite <- (!alleleS && (id_type == 2)) || (alleleS && (id_type > 0))
+ 
  load_readdepth_file <- function(filename) {
    data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE)
    readdepth <- split(data[, 2:3], data[, 1])
  }
  
-build_t2gmap <- function(filename) {
-  tpos <- 1   # the position of transcript_id
-  gpos <- 2   # the position of gene_id
+build_map <- function(filename) {
+  value_pos <- 1
+  if (!alleleS || (alleleS && id_type == 1)) {
+    key_pos <- 2
+  } else {
+    key_pos <- 3
+  }
  
    data <- read.delim(file = filename, sep = "\t", stringsAsFactors = FALSE)
-  tmp <- aggregate(data[tpos], data[gpos], function(x) x)
-  t2gmap <- tmp[,2]
-  names(t2gmap) <- tmp[,1]
+  tmp <- aggregate(data[value_pos], data[key_pos], function(x) x)
+  map <- tmp[,2]
+  names(map) <- tmp[,1]
  
-  t2gmap
+  map
  }
  
-make_a_plot <- function(tid) {
-  vec <- readdepth[[tid]]      
-  if (is.null(vec)) exit_with_error(paste("Unknown transcript detected,", tid, "is not included in RSEM's indices."))
+make_a_plot <- function(id) {
+  vec <- readdepth[[id]]       
+  if (is.null(vec)) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS, "allele-specific transcript", "transcript"), id))
    if (is.na(vec[[2]])) wiggle <- rep(0, vec[[1]]) else wiggle <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
    len <- length(wiggle)
    if (!show_uniq) {
      plot(wiggle, type = "h")
    } else {
-    vec <- readdepth_uniq[[tid]]
+    vec <- readdepth_uniq[[id]]
      stopifnot(!is.null(vec))
      if (is.na(vec[[2]])) wiggle_uniq <- rep(0, vec[[1]]) else wiggle_uniq <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
      stopifnot(len == length(wiggle_uniq))
      if (len != sum(wiggle >= wiggle_uniq)) {
-      cat("Warning: transcript ", tid, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", "         The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", "         This may be due to floating point arithmetics.\n", sep = "") 
+      cat("Warning: ", ifelse(alleleS, "allele-specific transcript", "transcript"), " ", id, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", "         The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", "         This may be due to floating point arithmetics.\n", sep = "") 
      }
      heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq)        
      barplot(heights, space = 0, border = NA, names.arg = 1:len, col = c("black", "red")) 
    }
-  title(main = tid) #, xlab = "Position in transcript", ylab = "Read depth")
+  title(main = id)
  }
  
-generate_a_page <- function(tids, gene_id = NULL) {
-  n <- length(tids)
-  ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page)
-  nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page)
+generate_a_page <- function(ids, title = NULL) {
+  n <- length(ids)
+  ncol <- ifelse(is_composite, floor(sqrt(n)), ncol_per_page)
+  nrow <- ifelse(is_composite, ceiling(n / ncol), nrow_per_page)
  
    par(mfrow = c(nrow, ncol), mar = c(2, 2, 2, 2))
-  if (is_gene) par(oma = c(0, 0, 3, 0)) 
-  sapply(tids, make_a_plot)
-  if (is_gene) mtext(gene_id, outer = TRUE, line = 1)
+  if (is_composite) par(oma = c(0, 0, 3, 0)) 
+  sapply(ids, make_a_plot)
+  if (is_composite) mtext(title, outer = TRUE, line = 1)
  }
  
-plot_a_transcript <- function(i) {
+plot_individual <- function(i) {
    fr <- (i - 1) * num_plots_per_page + 1
    to <- min(i * num_plots_per_page, n)
    generate_a_page(ids[fr:to])
  }
  
-plot_a_gene <- function(gene_id) {
-  if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Unknown gene detected,", gene_id, "is not included in RSEM's in indices."))
-  generate_a_page(t2gmap[[gene_id]], gene_id)
+# cid, composite id, can be either a gene id or transcript id (for allele-specific expression only)
+plot_composite <- function(cid) {
+  if (is.null(map[[cid]])) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS && id_type == 1, "transcript", "gene"), cid))
+  generate_a_page(map[[cid]], cid)
  }
  
  readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = ""))
@@ -91,24 +97,22 @@ ids <- scan(file = input_list, what = "", sep = "\n")
  
  cat("Loading files is done!\n")
  
-if (is_gene) {
-  t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = ""))
+if (is_composite) {
+  file_name <- sprintf("%s.%s.results", sample_name, ifelse(alleleS, "alleles", "isoforms"))
+  map <- build_map(file_name)
    cat("Building transcript to gene map is done!\n")
  }
  
  pdf(output_plot_file)
  
-if (!is_gene) {        
+if (!is_composite) {   
    n <- length(ids)
    ub <- (n - 1) %/% num_plots_per_page + 1
-  tmp <- sapply(1:ub, plot_a_transcript)
+  dumbvar <- sapply(1:ub, plot_individual)
  } else {
-  tmp <- sapply(ids, plot_a_gene)
+  dumbvar <- sapply(ids, plot_composite)
  }
  
  cat("Plots are generated!\n")
  
  dev.off.output <- dev.off()
-
-
-
diff --git a/rsem-plot-model b/rsem-plot-model

index 225d08f06bad85c9365fde9939632174d47122d5..e87226d224a946109c60cc3821984ec9ba1793e8 100755 (executable)
--- a/rsem-plot-model
+++ b/rsem-plot-model
@@ -146,4 +146,4 @@ barplot(pair[,2], names.arg = pair[,1],
          ylab = "Number of fragments",
          main = "Histogram of Alignments per Fragment")
  
-dev.off()
+dev.off.output <- dev.off()
diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles

index efe00ba91863d7125268d0e492d602ae6b9cbeb7..74076ec7d4acc2053bde04581a81bf514ae9c6f8 100755 (executable)
--- a/rsem-plot-transcript-wiggles
+++ b/rsem-plot-transcript-wiggles
@@ -8,17 +8,25 @@ use strict;
  
  use rsem_perl_utils;
  
-my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list
+my $gene_list = 0; # default is 0, means input is not a gene list
+my $transcript_list = 0; # default is 0, this option can only be turned on if allele-specific expression is calculated
  my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles
  my $help = 0;
  
  GetOptions("gene-list" => \$gene_list,
+          "transcript-list" => \$transcript_list,
            "show-unique" => \$show_unique,
            "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
  
  pod2usage(-verbose => 2) if ($help == 1);
  pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
  
+my $alleleS = 0;
+if (-e "$ARGV[0].alleles.results") { $alleleS = 1; }
+
+pod2usage(-msg => "--transcript-list cannot be set if allele-specific reference is not built!", -exitval => 2, -verbose => 2) if (!$alleleS && $transcript_list);
+pod2usage(-msg => "--gene-list and --transcript-list cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($gene_list && $transcript_list);
+
  my $dir = "$FindBin::Bin/";
  my $command = "";
  
@@ -46,7 +54,19 @@ if ($show_unique) {
      }
  }
  
-$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]";
+my $id_type;
+
+if ($alleleS) {
+    $id_type = 0;
+    if ($transcript_list) { $id_type = 1; }
+    if ($gene_list) { $id_type = 2; }
+} 
+else {
+    $id_type = 1;
+    if ($gene_list) { $id_type = 2; }
+}
+
+$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $alleleS $id_type $show_unique $ARGV[2]";
  &runCommand($command);
  
  __END__
@@ -85,6 +105,10 @@ The file name of the pdf file which contains all plots.
  
  The input-list is a list of gene ids. (Default: off)
  
+=item B<--transcript-list>
+
+The input-list is a list of transcript ids. This option can only be turned on if allele-specific expression is calculated. (Default: off)
+
  =item B<--show-unique>
  
  Show the wiggle plots as stacked bar plots. See description section for details. (Default: off)
@@ -97,7 +121,7 @@ Show help information.
  
  =head1 DESCRIPTION
  
-This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated.
+This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated. If allele-specific expression is calculated, the basic unit for plotting is an allele-specific transcript and plots can be grouped by either transcript ids (--transcript-list) or gene ids (--gene-list). 
  
  =head1 OUTPUT
  
@@ -105,7 +129,7 @@ This program generates transcript wiggle plots and outputs them in a pdf file. T
  
  =item B<output_plot_file>
  
-This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. 
+This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. If allele-specific expression is calculated, the basin unit becomes an allele-specific transcript and transcript ids and gene ids can be used to group allele-specific transcripts.
  
  =item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth>
  
diff --git a/rsem-prepare-reference b/rsem-prepare-reference

index 81fd13ea74edab4b31dd147db9be502e62321167..de4f061612d4b3a976147afe48e5a22594ae3f42 100755 (executable)
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -24,8 +24,11 @@ my $bowtie2_path = "";
  my $quiet = 0;
  my $help = 0;
  
+my $alleleMappingF = "";
+
  GetOptions("gtf=s" => \$gtfF,
            "transcript-to-gene-map=s" => \$mappingF,
+          "allele-to-gene-map=s" => \$alleleMappingF,
            "no-polyA" => \$no_polyA, 
            "no-polyA-subset=s" => \$subsetFile,
            "polyA-length=i" => \$polyALen,
@@ -39,6 +42,8 @@ GetOptions("gtf=s" => \$gtfF,
  
  pod2usage(-verbose => 2) if ($help == 1);
  pod2usage(-msg => "Set --no-polyA & --no-polyA-subset at the same time!", -exitval => 2, -verbose => 2) if ($no_polyA == 1 && $subsetFile ne '');
+pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne ""));
+pod2usage(-msg => "--gtf and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($alleleMappingF ne ""));
  pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
  
  if ($bowtie2) { $no_bowtie = 1; $no_ntog = 1; }
@@ -85,6 +90,7 @@ else {
      $"=" ";
      $command = $dir."rsem-synthesis-reference-transcripts $ARGV[1] $quiet";
      if ($mappingF ne "") { $command .= " 1 $mappingF"; }
+    elsif ($alleleMappingF ne "") { $command .= " 2 $alleleMappingF"; }
      else { $command .= " 0"; } 
      $command .= " @list";
      &runCommand($command);
@@ -164,6 +170,19 @@ If this option is off, then the mapping of isoforms to genes depends on whether
  
  (Default: off)
  
+=item B<--allele-to-gene-map> <file>
+
+Use information from <file> to provide gene_id and transcript_id information for each allele-specific transcript.
+Each line of <file> should be of the form:
+
+gene_id transcript_id allele_id
+
+with the fields separated by a tab character.
+
+This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the FASTA-formatted files.  
+
+(Default: off) 
+
  =item B<--no-polyA>
  
  Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts)
diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm

index b2a040f53be5ed4b53d4e278008697167dfb7dfe..4321a45ab3df0b58c2e312567a90d7c054f02a69 100644 (file)
--- a/rsem_perl_utils.pm
+++ b/rsem_perl_utils.pm
@@ -31,12 +31,13 @@ sub runCommand {
      print "\n";
  }
  
+my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "pme_expected_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
  
  my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "pme_expected_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
  
  my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "pme_expected_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
  
-# inpF, outF
+# type, inpF, outF
  sub collectResults {
      my $local_status;
      my ($inpF, $outF);
@@ -69,7 +70,8 @@ sub collectResults {
  
      my @out_arr = ();
      for (my $i = 0; $i < $n; $i++) {
-       if ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); }
+       if ($_[0] eq "allele") { push(@out_arr, $allele_title[$i]); }
+       elsif ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); }
         elsif ($_[0] eq "gene") { push(@out_arr, $gene_title[$i]); }
         else { print "A bug on 'collectResults' is detected!\n"; exit(-1); }
      }
diff --git a/simulation.cpp b/simulation.cpp

index 0073c3ec1eee70249f087e4edeb1d769c48b61a5..3eb42422c2bf8f2d28a5baf250981cd8d0a0332d 100644 (file)
--- a/simulation.cpp
+++ b/simulation.cpp
@@ -23,21 +23,22 @@
  #include "PairedEndQModel.h"
  
  #include "Refs.h"
-#include "GroupInfo.h"
  #include "Transcript.h"
  #include "Transcripts.h"
  
+#include "WriteResults.h"
+
  #include "simul.h"
  
  using namespace std;
  
-const int OFFSITE = 5;
+bool alleleS;
+int OFFSITE;
  
  READ_INT_TYPE N;
-int model_type, M, m;
+int model_type, M;
  
  Refs refs;
-GroupInfo gi;
  Transcripts transcripts;
  
  vector<double> eel;
@@ -47,8 +48,8 @@ int n_os;
  ostream *os[2];
  char outReadF[2][STRLEN];
  
-char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
-char geneResF[STRLEN], isoResF[STRLEN];
+char refName[STRLEN];
+char refF[STRLEN], tiF[STRLEN];
  
  simul sampler;
  
@@ -78,37 +79,6 @@ void genOutReadStreams(int type, char *outFN) {
                 os[i] = new ofstream(outReadF[i]);
  }
  
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
-       int lb, ub, span;
-       double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-  
-       model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-       clen = new double[span + 1];
-       clen[0] = 0.0;
-       for (int i = 1; i <= span; i++) {
-               clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-       }
-
-       eel.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++) {
-               int totLen = refs.getRef(i).getTotLen();
-               int fullLen = refs.getRef(i).getFullLen();
-               int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-               int pos2 = max(min(totLen, ub) - lb, 0);
-
-               if (pos2 == 0) { eel[i] = 0.0; continue; }
-    
-               eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-               assert(eel[i] >= 0);
-               if (eel[i] < MINEEL) { eel[i] = 0.0; }
-       }
-  
-       delete[] pdf;
-       delete[] cdf;
-       delete[] clen;
-}
-
  template<class ReadType, class ModelType>
  void simulate(char* modelF, char* resultsF) {
         ModelType model(&refs);
@@ -118,7 +88,7 @@ void simulate(char* modelF, char* resultsF) {
         model.read(modelF);
         
         //calculate eel
-       calcExpectedEffectiveLengths<ModelType>(model);
+       calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
  
         //generate theta vector
         ifstream fin(resultsF);
@@ -155,99 +125,6 @@ void simulate(char* modelF, char* resultsF) {
         cout<< "Total number of resimulation is "<< resimulation_count<< endl;
  }
  
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
-       double denom;
-       vector<double> frac;
-
-       //calculate fraction of count over all mappabile reads
-       denom = 0.0;
-       frac.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++)
-         if (eel[i] >= EPSILON) {
-           frac[i] = theta[i];
-           denom += frac[i];
-         }
-       general_assert(denom > 0, "No alignable reads?!");
-       for (int i = 1; i <= M; i++) frac[i] /= denom;
-
-       //calculate FPKM
-       fpkm.assign(M + 1, 0.0);
-       for (int i = 1; i <= M; i++)
-               if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
-       //calculate TPM
-       tpm.assign(M + 1, 0.0);
-       denom = 0.0;
-       for (int i = 1; i <= M; i++) denom += fpkm[i];
-       for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
-}
-
-void writeResFiles(char* outFN) {
-       FILE *fo;
-       vector<int> tlens;
-       vector<double> fpkm, tpm, isopct;
-       vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
-
-       for (int i = 1; i <= M; i++)
-               general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
-
-       calcExpressionValues(counts, eel, tpm, fpkm);
-
-       //calculate IsoPct, etc.
-       isopct.assign(M + 1, 0.0);
-       tlens.assign(M + 1, 0);
-
-       glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
-       gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
-       for (int i = 0; i < m; i++) {
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               for (int j = b; j < e; j++) {
-                       const Transcript& transcript = transcripts.getTranscriptAt(j);
-                       tlens[j] = transcript.getLength();
-
-                       glens[i] += tlens[j] * tpm[j];
-                       gene_eels[i] += eel[j] * tpm[j];
-                       gene_counts[i] += counts[j];
-                       gene_tpm[i] += tpm[j];
-                       gene_fpkm[i] += fpkm[j];
-               }
-
-               if (gene_tpm[i] < EPSILON) continue;
-
-               for (int j = b; j < e; j++)
-                       isopct[j] = tpm[j] / gene_tpm[i];
-               glens[i] /= gene_tpm[i];
-               gene_eels[i] /= gene_tpm[i];
-       }
-
-       //isoform level
-       sprintf(isoResF, "%s.sim.isoforms.results", outFN);
-       fo = fopen(isoResF, "w");
-       fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
-       for (int i = 1; i <= M; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(i);
-               fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
-                               eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
-       }
-       fclose(fo);
-
-       //gene level
-       sprintf(geneResF, "%s.sim.genes.results", outFN);
-       fo = fopen(geneResF, "w");
-       fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
-       for (int i = 0; i < m; i++) {
-               int b = gi.spAt(i), e = gi.spAt(i + 1);
-               const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
-               fprintf(fo, "%s\t", gene_id.c_str());
-               for (int j = b; j < e; j++) {
-                       fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\t'));
-               }
-               fprintf(fo, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
-       }
-       fclose(fo);
-}
-
  void releaseOutReadStreams() {
         for (int i = 0; i < n_os; i++) {
                 ((ofstream*)os[i])->close();
@@ -264,7 +141,7 @@ int main(int argc, char* argv[]) {
                 printf("Parameters:\n\n");
                 printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
                 printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
-               printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.\n");
+               printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.\n");
                 printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
                 printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
                 printf("output_name: Prefix for all output files.\n");
@@ -289,13 +166,14 @@ int main(int argc, char* argv[]) {
         if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
         verbose = !quiet;
  
+       strcpy(refName, argv[1]);
+       alleleS = isAlleleSpecific(refName);
+       OFFSITE = (alleleS ? 6: 5);
+
         //load basic files
         sprintf(refF, "%s.seq", argv[1]);
         refs.loadRefs(refF);
         M = refs.getM();
-       sprintf(groupF, "%s.grp", argv[1]);
-       gi.load(groupF);
-       m = gi.getm();
         sprintf(tiF, "%s.ti", argv[1]);
         transcripts.readFrom(tiF);
  
@@ -320,7 +198,7 @@ int main(int argc, char* argv[]) {
         case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
         }
  
-       writeResFiles(argv[6]);
+       writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
         releaseOutReadStreams();
  
         return 0;
diff --git a/synthesisRef.cpp b/synthesisRef.cpp

index aa0d473b6537088fcb820f246de9898fa1864ef4..bde2d5b76045b7d125ca2761e3add289784c857a 100644 (file)
--- a/synthesisRef.cpp
+++ b/synthesisRef.cpp
@@ -5,8 +5,10 @@
  #include<fstream>
  #include<sstream>
  #include<map>
+#include<vector>
  
  #include "utils.h"
+#include "my_assert.h"
  #include "Transcript.h"
  #include "Transcripts.h"
  
@@ -19,29 +21,47 @@ map<string, string>::iterator iter;
  
  Transcripts transcripts(1); // no genome, just transcript set
  char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN];
+char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele
  
-bool hasMappingFile;
+int hasMappingFile;
  char mappingFile[STRLEN];
  
  map<string, string> mi_table; // mapping info table
  map<string, string>::iterator mi_iter; //mapping info table's iterator
  
-void loadMappingInfo(char* mappingF) {
-  ifstream fin(mappingF);
-  string line, key, value;
-
-  if (!fin.is_open()) {
-         fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
-         exit(-1);
-  }
+map<string, string> mi_table2; // allele_id to transcript_id
+map<string, string>::iterator mi_iter2; // corresponding iterator
  
-  mi_table.clear();
-  while (getline(fin, line)) {
-    line = cleanStr(line);
-    if (line[0] == '#') continue;
-    istringstream strin(line);
-    strin>>value>>key;
-    mi_table[key] = value;
+void loadMappingInfo(int file_type, char* mappingF) {
+  ifstream fin(mappingF);
+  string line, key, value, value2;
+
+  general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
+
+  switch(file_type) {
+  case 1: 
+    mi_table.clear();
+    while (getline(fin, line)) {
+      line = cleanStr(line);
+      if (line[0] == '#') continue;
+      istringstream strin(line);
+      strin>>value>>key;
+      mi_table[key] = value;
+    }
+    break;
+  case 2: 
+    mi_table.clear();
+    mi_table2.clear();
+    while (getline(fin, line)) {
+      line = cleanStr(line);
+      if (line[0] == '#') continue;
+      istringstream strin(line);
+      strin>> value>> value2>> key;
+      mi_table[key] = value;
+      mi_table2[key] = value2;
+    }
+    break;
+  default: assert(false);
    }
  
    fin.close();
@@ -55,40 +75,58 @@ char check(char c) {
         return c;
  }
  
-void writeResults(char* refName) {
+void writeResults(int option, char* refName) {
         ofstream fout, fout2;
-       string cur_gene_id, name;
+       string cur_gene_id, cur_transcript_id, name;
+       vector<int> gi, gt, ta;
  
-       sprintf(groupF, "%s.grp", refName);
         sprintf(tiF, "%s.ti", refName);
-       sprintf(refFastaF, "%s.transcripts.fa", refName);
-       sprintf(chromListF, "%s.chrlist", refName);
-
         transcripts.writeTo(tiF);
         if (verbose) { printf("Transcript Information File is generated!\n"); }
  
-       fout.open(groupF);
-       cur_gene_id = "";
+       cur_gene_id = ""; gi.clear(); 
+       if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
         for (int i = 1; i <= M; i++) {
                 const Transcript& transcript = transcripts.getTranscriptAt(i);
                 if (cur_gene_id != transcript.getGeneID()) {
-                       fout<<i<<endl;
-                       cur_gene_id = transcript.getGeneID();
+                 gi.push_back(i);
+                 if (option == 2) gt.push_back((int)ta.size());
+                 cur_gene_id = transcript.getGeneID();
+               }
+               if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
+                   ta.push_back(i);
+                   cur_transcript_id = transcript.getTranscriptID();
                 }
         }
-       fout<<M + 1<<endl;
+       gi.push_back(M + 1);
+       if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }
+
+       sprintf(groupF, "%s.grp", refName);
+       fout.open(groupF);
+       for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl;
         fout.close();
         if (verbose) { printf("Group File is generated!\n"); }
  
+       if (option == 2) {
+         sprintf(gtF, "%s.gt", refName);
+         fout.open(gtF);
+         for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl;
+         fout.close();
+         sprintf(taF, "%s.ta", refName);
+         fout.open(taF);
+         for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl;
+         fout.close();
+         if (verbose) { printf("Allele-specific group files are generated!\n"); }
+       }
+
+       sprintf(refFastaF, "%s.transcripts.fa", refName);
+       sprintf(chromListF, "%s.chrlist", refName);
         fout2.open(chromListF);
         fout.open(refFastaF);
         for (int i = 1; i <= M; i++) {
-               name = transcripts.getTranscriptAt(i).getTranscriptID();
+               name = transcripts.getTranscriptAt(i).getSeqName();
                 iter = name2seq.find(name);
-               if (iter == name2seq.end()) {
-                       fprintf(stderr, "Cannot recognize transcript ID %s!\n", name.c_str());
-                       exit(-1);
-               }
+               general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!");
                 fout<<">"<<name<<endl;
                 fout<<iter->second<<endl;
  
@@ -105,19 +143,22 @@ void writeResults(char* refName) {
  
  int main(int argc, char* argv[]) {
    if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
-               printf("Usage: synthesisRef refName quiet hasMappingFile [mappingFile] reference_file_1 [reference_file_2 ...]\n");
+               printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
                 exit(-1);
         }
  
         verbose = !atoi(argv[2]);
  
-       if (hasMappingFile) { loadMappingInfo(argv[4]); }
+       if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }
+
+       // allele-specific
+       if (hasMappingFile == 2) { transcripts.setType(2); }
  
         int start = hasMappingFile ? 5 : 4;
  
         ifstream fin;
         string line, gseq;
-       string seqname, gene_id;
+       string seqname, gene_id, transcript_id;
  
         vector<Interval> vec;
  
@@ -125,7 +166,7 @@ int main(int argc, char* argv[]) {
         name2seq.clear();
         for (int i = start; i < argc; i++) {
                 fin.open(argv[i]);
-               if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); }
+               general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); 
                 getline(fin, line);
                 while ((fin) && (line[0] == '>')) {
                         istringstream strin(line.substr(1));
@@ -142,20 +183,23 @@ int main(int argc, char* argv[]) {
  
                         name2seq[seqname] = gseq;
  
+                       transcript_id = seqname;
+                       gene_id = seqname;
+
                         if (hasMappingFile) {
                               mi_iter = mi_table.find(seqname);
-                             if (mi_iter == mi_table.end()) {
-                                 fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", seqname.c_str());
-                                 exit(-1);
-                             }
-                             //assert(iter != table.end());
+                             general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
                               gene_id = mi_iter->second;
+                             if (hasMappingFile == 2) {
+                               mi_iter2 = mi_table2.find(seqname);
+                               general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
+                               transcript_id = mi_iter2->second;
+                             }
                         }
-                       else gene_id = seqname;
-
+                       
                         vec.clear();
                         vec.push_back(Interval(1, len));
-                       transcripts.add(Transcript(seqname, gene_id, seqname, '+', vec, ""));
+                       transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
                         ++M;
  
                         if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
@@ -171,7 +215,7 @@ int main(int argc, char* argv[]) {
         assert(M == transcripts.getM());
         transcripts.sort();
  
-       writeResults(argv[1]);
+       writeResults(hasMappingFile, argv[1]);
  
         return 0;
  }
author	Bo Li <bli@cs.wisc.edu>
	Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Fri, 21 Mar 2014 16:27:31 +0000 (11:27 -0500)
EM.cpp		patch \| blob \| history
Gibbs.cpp		patch \| blob \| history
README.md		patch \| blob \| history
Transcript.h		patch \| blob \| history
Transcripts.h		patch \| blob \| history
WHAT_IS_NEW		patch \| blob \| history
WriteResults.h	[new file with mode: 0644]	patch \| blob
calcCI.cpp		patch \| blob \| history
makefile		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history
rsem-gen-transcript-plots		patch \| blob \| history
rsem-plot-model		patch \| blob \| history
rsem-plot-transcript-wiggles		patch \| blob \| history
rsem-prepare-reference		patch \| blob \| history
rsem_perl_utils.pm		patch \| blob \| history
simulation.cpp		patch \| blob \| history
synthesisRef.cpp		patch \| blob \| history