From 4f7502168c3816ba3283385f093e599527e2b144 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Fri, 21 Mar 2014 11:27:31 -0500 Subject: [PATCH] Added support for allele-specific expression estimation --- EM.cpp | 192 +---------- Gibbs.cpp | 173 ++-------- README.md | 2 +- Transcript.h | 6 +- Transcripts.h | 12 +- WHAT_IS_NEW | 7 + WriteResults.h | 624 +++++++++++++++++++++++++++++++++++ calcCI.cpp | 189 +++++++---- makefile | 12 +- rsem-calculate-expression | 121 +++++-- rsem-gen-transcript-plots | 84 ++--- rsem-plot-model | 2 +- rsem-plot-transcript-wiggles | 32 +- rsem-prepare-reference | 19 ++ rsem_perl_utils.pm | 6 +- simulation.cpp | 150 +-------- synthesisRef.cpp | 134 +++++--- 17 files changed, 1100 insertions(+), 665 deletions(-) create mode 100644 WriteResults.h diff --git a/EM.cpp b/EM.cpp index 59783f3..bf15c2b 100644 --- a/EM.cpp +++ b/EM.cpp @@ -44,6 +44,8 @@ #include "HitWrapper.h" #include "BamWriter.h" +#include "WriteResults.h" + using namespace std; const double STOP_CRITERIA = 0.001; @@ -68,7 +70,7 @@ bool genGibbsOut; // generate file for Gibbs sampler char refName[STRLEN], outName[STRLEN]; char imdName[STRLEN], statName[STRLEN]; -char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN]; +char refF[STRLEN], cntF[STRLEN], tiF[STRLEN]; char mparamsF[STRLEN]; char modelF[STRLEN], thetaF[STRLEN]; @@ -83,7 +85,6 @@ vector theta, eel; // eel : expected effective length double *probv, **countvs; Refs refs; -GroupInfo gi; Transcripts transcripts; ModelParams mparams; @@ -271,185 +272,11 @@ void* calcConProbs(void* arg) { return NULL; } -template -void calcExpectedEffectiveLengths(ModelType& model) { - int lb, ub, span; - double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i) - - model.getGLD().copyTo(pdf, cdf, lb, ub, span); - clen = new double[span + 1]; - clen[0] = 0.0; - for (int i = 1; i <= span; i++) { - clen[i] = clen[i - 1] + pdf[i] * (lb + i); - } - - eel.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) { - int totLen = refs.getRef(i).getTotLen(); - int fullLen = refs.getRef(i).getFullLen(); - int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0); - int pos2 = max(min(totLen, ub) - lb, 0); - - if (pos2 == 0) { eel[i] = 0.0; continue; } - - eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); - assert(eel[i] >= 0); - if (eel[i] < MINEEL) { eel[i] = 0.0; } - } - - delete[] pdf; - delete[] cdf; - delete[] clen; -} - -void polishTheta(vector& theta, const vector& eel, const double* mw) { - double sum = 0.0; - - /* The reason that for noise gene, mw value is 1 is : - * currently, all masked positions are for poly(A) sites, which in theory should be filtered out. - * So the theta0 does not containing reads from any masked position - */ - - for (int i = 0; i <= M; i++) { - // i == 0, mw[i] == 1 - if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) { - theta[i] = 0.0; - continue; - } - theta[i] = theta[i] / mw[i]; - sum += theta[i]; - } - // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked - general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !"); - for (int i = 0; i <= M; i++) theta[i] /= sum; -} - -void calcExpressionValues(const vector& theta, const vector& eel, vector& tpm, vector& fpkm) { - double denom; - vector frac; - - //calculate fraction of count over all mappabile reads - denom = 0.0; - frac.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) { - frac[i] = theta[i]; - denom += frac[i]; - } - general_assert(denom >= EPSILON, "No alignable reads?!"); - for (int i = 1; i <= M; i++) frac[i] /= denom; - - //calculate FPKM - fpkm.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i]; - - //calculate TPM - tpm.assign(M + 1, 0.0); - denom = 0.0; - for (int i = 1; i <= M; i++) denom += fpkm[i]; - for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6; -} - template void writeResults(ModelType& model, double* counts) { - char outF[STRLEN]; - FILE *fo; - - sprintf(modelF, "%s.model", statName); - model.write(modelF); - - vector tlens; - vector fpkm, tpm, isopct; - vector glens, gene_eels, gene_counts, gene_tpm, gene_fpkm; - - calcExpressionValues(theta, eel, tpm, fpkm); - - //calculate IsoPct, etc. - isopct.assign(M + 1, 0.0); - tlens.assign(M + 1, 0); - - glens.assign(m, 0.0); gene_eels.assign(m, 0.0); - gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); - - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) { - const Transcript& transcript = transcripts.getTranscriptAt(j); - tlens[j] = transcript.getLength(); - - gene_counts[i] += counts[j]; - gene_tpm[i] += tpm[j]; - gene_fpkm[i] += fpkm[j]; - } - - if (gene_tpm[i] < EPSILON) { - double frac = 1.0 / (e - b); - for (int j = b; j < e; j++) { - glens[i] += tlens[j] * frac; - gene_eels[i] += eel[j] * frac; - } - } - else { - for (int j = b; j < e; j++) { - isopct[j] = tpm[j] / gene_tpm[i]; - glens[i] += tlens[j] * isopct[j]; - gene_eels[i] += eel[j] * isopct[j]; - } - } - } - - //isoform level results - sprintf(outF, "%s.iso_res", imdName); - fo = fopen(outF, "w"); - for (int i = 1; i <= M; i++) { - const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n')); - } - for (int i = 1; i <= M; i++) { - const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n')); - } - for (int i = 1; i <= M; i++) - fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); - fclose(fo); - - //gene level results - sprintf(outF, "%s.gene_res", imdName); - fo = fopen(outF, "w"); - for (int i = 0; i < m; i++) { - const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i)); - fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n')); - } - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) { - fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n'))); - } - } - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); - fclose(fo); - - if (verbose) { printf("Expression Results are written!\n"); } + sprintf(modelF, "%s.model", statName); + model.write(modelF); + writeResultsEM(M, refName, imdName, transcripts, theta, eel, countvs[0]); } template @@ -658,8 +485,8 @@ void EM() { fprintf(fo, "%.15g\n", theta[M]); //calculate expected effective lengths for each isoform - calcExpectedEffectiveLengths(model); - polishTheta(theta, eel, model.getMW()); + calcExpectedEffectiveLengths(M, refs, model, eel); + polishTheta(M, theta, eel, model.getMW()); // output theta for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); @@ -762,9 +589,6 @@ int main(int argc, char* argv[]) { sprintf(refF, "%s.seq", refName); refs.loadRefs(refF); M = refs.getM(); - sprintf(groupF, "%s.grp", refName); - gi.load(groupF); - m = gi.getm(); sprintf(tiF, "%s.ti", refName); transcripts.readFrom(tiF); diff --git a/Gibbs.cpp b/Gibbs.cpp index b0287a3..e7a1182 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -18,7 +18,9 @@ #include "PairedEndQModel.h" #include "Refs.h" + #include "GroupInfo.h" +#include "WriteResults.h" using namespace std; @@ -44,17 +46,16 @@ struct Item { int nThreads; int model_type; -int m, M; +int M; READ_INT_TYPE N0, N1; HIT_INT_TYPE nHits; double totc; int BURNIN, NSAMPLES, GAP; -char imdName[STRLEN], statName[STRLEN]; -char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN]; +char refName[STRLEN], imdName[STRLEN], statName[STRLEN]; +char thetaF[STRLEN], ofgF[STRLEN], refF[STRLEN], modelF[STRLEN]; char cvsF[STRLEN]; Refs refs; -GroupInfo gi; vector s; vector hits; @@ -73,21 +74,16 @@ pthread_t *threads; pthread_attr_t attr; int rc; -void load_data(char* reference_name, char* statName, char* imdName) { +void load_data(char* refName, char* statName, char* imdName) { ifstream fin; string line; int tmpVal; //load reference file - sprintf(refF, "%s.seq", reference_name); + sprintf(refF, "%s.seq", refName); refs.loadRefs(refF, 1); M = refs.getM(); - //load groupF - sprintf(groupF, "%s.grp", reference_name); - gi.load(groupF); - m = gi.getm(); - //load ofgF; sprintf(ofgF, "%s.ofg", imdName); fin.open(ofgF); @@ -118,43 +114,12 @@ void load_data(char* reference_name, char* statName, char* imdName) { if (verbose) { printf("Loading Data is finished!\n"); } } -template -void calcExpectedEffectiveLengths(ModelType& model) { - int lb, ub, span; - double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i) - - model.getGLD().copyTo(pdf, cdf, lb, ub, span); - clen = new double[span + 1]; - clen[0] = 0.0; - for (int i = 1; i <= span; i++) { - clen[i] = clen[i - 1] + pdf[i] * (lb + i); - } - - eel.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) { - int totLen = refs.getRef(i).getTotLen(); - int fullLen = refs.getRef(i).getFullLen(); - int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0); - int pos2 = max(min(totLen, ub) - lb, 0); - - if (pos2 == 0) { eel[i] = 0.0; continue; } - - eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); - assert(eel[i] >= 0); - if (eel[i] < MINEEL) { eel[i] = 0.0; } - } - - delete[] pdf; - delete[] cdf; - delete[] clen; -} - template void init_model_related(char* modelF) { ModelType model; model.read(modelF); - calcExpectedEffectiveLengths(model); + calcExpectedEffectiveLengths(M, refs, model, eel); memcpy(mw, model.getMW(), sizeof(double) * (M + 1)); // otherwise, after exiting this procedure, mw becomes undefined } @@ -220,55 +185,6 @@ void writeCountVector(FILE* fo, vector& counts) { fprintf(fo, "%d\n", counts[M]); } -void polishTheta(vector& theta, const vector& eel, const double* mw) { - double sum = 0.0; - - /* The reason that for noise gene, mw value is 1 is : - * currently, all masked positions are for poly(A) sites, which in theory should be filtered out. - * So the theta0 does not containing reads from any masked position - */ - - for (int i = 0; i <= M; i++) { - // i == 0, mw[i] == 1 - if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) { - theta[i] = 0.0; - continue; - } - theta[i] = theta[i] / mw[i]; - sum += theta[i]; - } - // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked - general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !"); - for (int i = 0; i <= M; i++) theta[i] /= sum; -} - -void calcExpressionValues(const vector& theta, const vector& eel, vector& tpm, vector& fpkm) { - double denom; - vector frac; - - //calculate fraction of count over all mappabile reads - denom = 0.0; - frac.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) { - frac[i] = theta[i]; - denom += frac[i]; - } - general_assert(denom >= EPSILON, "No alignable reads?!"); - for (int i = 1; i <= M; i++) frac[i] /= denom; - - //calculate FPKM - fpkm.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i]; - - //calculate TPM - tpm.assign(M + 1, 0.0); - denom = 0.0; - for (int i = 1; i <= M; i++) denom += fpkm[i]; - for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6; -} - void* Gibbs(void* arg) { int CHAINLEN; HIT_INT_TYPE len, fr, to; @@ -320,8 +236,8 @@ void* Gibbs(void* arg) { if ((ROUND - BURNIN - 1) % GAP == 0) { writeCountVector(params->fo, counts); for (int i = 0; i <= M; i++) theta[i] = counts[i] / totc; - polishTheta(theta, eel, mw); - calcExpressionValues(theta, eel, tpm, fpkm); + polishTheta(M, theta, eel, mw); + calcExpressionValues(M, theta, eel, tpm, fpkm); for (int i = 0; i <= M; i++) { params->pme_c[i] += counts[i] - 1; params->pve_c[i] += (counts[i] - 1) * (counts[i] - 1); @@ -374,66 +290,13 @@ void release() { } } -void writeResults(char* imdName) { - char outF[STRLEN]; - FILE *fo; - - vector isopct; - vector gene_counts, gene_tpm, gene_fpkm; - - //calculate IsoPct, etc. - isopct.assign(M + 1, 0.0); - gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); - - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) { - gene_counts[i] += pme_c[j]; - gene_tpm[i] += pme_tpm[j]; - gene_fpkm[i] += pme_fpkm[j]; - } - if (gene_tpm[i] < EPSILON) continue; - for (int j = b; j < e; j++) - isopct[j] = pme_tpm[j] / gene_tpm[i]; - } - - //isoform level results - sprintf(outF, "%s.iso_res", imdName); - fo = fopen(outF, "a"); - general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); - - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); - fclose(fo); - - //gene level results - sprintf(outF, "%s.gene_res", imdName); - fo = fopen(outF, "a"); - general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); - - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); - fclose(fo); - - if (verbose) { printf("Gibbs based expression values are written!\n"); } -} - int main(int argc, char* argv[]) { if (argc < 7) { printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n"); exit(-1); } + strcpy(refName, argv[1]); strcpy(imdName, argv[2]); strcpy(statName, argv[3]); @@ -459,7 +322,7 @@ int main(int argc, char* argv[]) { printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads); } - load_data(argv[1], statName, imdName); + load_data(refName, statName, imdName); sprintf(modelF, "%s.model", statName); FILE *fi = fopen(modelF, "r"); @@ -491,11 +354,19 @@ int main(int argc, char* argv[]) { if (verbose) printf("Gibbs finished!\n"); - writeResults(imdName); + writeResultsGibbs(M, refName, imdName, pme_c, pme_fpkm, pme_tpm); if (var_opt) { char varF[STRLEN]; + // Load group info + int m; + GroupInfo gi; + char groupF[STRLEN]; + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + sprintf(varF, "%s.var", statName); FILE *fo = fopen(varF, "w"); general_assert(fo != NULL, "Cannot open " + cstrtos(varF) + "!"); @@ -507,7 +378,7 @@ int main(int argc, char* argv[]) { } fclose(fo); } - + delete mw; // delete the copy return 0; diff --git a/README.md b/README.md index b93678d..f6b5ec8 100644 --- a/README.md +++ b/README.md @@ -296,7 +296,7 @@ __reference_name:__ The name of RSEM references, which should be already generat __estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'. 'model_file_description.txt' provides the format and meanings of this file. -__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. +__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead. __theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'. diff --git a/Transcript.h b/Transcript.h index 90f6e67..f7afa2b 100644 --- a/Transcript.h +++ b/Transcript.h @@ -10,6 +10,10 @@ #include "utils.h" +/** + If no genome is provided, seqname field is used to store the allele name. + */ + struct Interval { int start, end; @@ -49,7 +53,7 @@ public: } bool operator< (const Transcript& o) const { - return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id); + return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id) || (gene_id == o.gene_id && transcript_id == o.transcript_id && seqname < o.seqname); } const std::string& getTranscriptID() const { return transcript_id; } diff --git a/Transcripts.h b/Transcripts.h index 19b025e..5c17f95 100644 --- a/Transcripts.h +++ b/Transcripts.h @@ -28,7 +28,11 @@ public: } int getM() { return M; } + int getType() { return type; } + void setType(int type) { this->type = type; } + + bool isAlleleSpecific() { return type == 2; } const Transcript& getTranscriptAt(int pos) { assert(pos > 0 && pos <= M); @@ -60,7 +64,7 @@ public: void buildMappings(int, char**); private: - int M, type; // type 0 from genome , 1 standalone transcriptome + int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific std::vector transcripts; std::vector e2i, i2e; // external sid to internal sid, internal sid to external sid @@ -94,11 +98,11 @@ void Transcripts::buildMappings(int n_targets, char** target_name) { std::map dict; std::map::iterator iter; - general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!"); + general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!"); dict.clear(); for (int i = 1; i <= M; i++) { - const std::string& tid = transcripts[i].getTranscriptID(); + const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID(); iter = dict.find(tid); general_assert(iter == dict.end(), tid + " appears more than once!"); dict[tid] = i; @@ -108,7 +112,7 @@ void Transcripts::buildMappings(int n_targets, char** target_name) { i2e.assign(M + 1, 0); for (int i = 0; i < n_targets; i++) { iter = dict.find(std::string(target_name[i])); - general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!"); + general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!"); general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!"); e2i[i + 1] = iter->second; i2e[iter->second] = i + 1; diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index 24ae065..b942a63 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -1,3 +1,10 @@ +RSEM v1.2.12 + +- Enabled allele-specific expression estimation +- Added option '--calc-pme' option to calculate posterior mean estimates only (no credibility intervals) + +-------------------------------------------------------------------------------------------- + RSEM v1.2.11 - Enabled RSEM to use Bowtie 2 aligner (indel, local and discordant alignments are not supported yet) diff --git a/WriteResults.h b/WriteResults.h new file mode 100644 index 0000000..95aa7b0 --- /dev/null +++ b/WriteResults.h @@ -0,0 +1,624 @@ +#ifndef WRITERESULTS_H_ +#define WRITERESULTS_H_ + +#include +#include +#include +#include +#include + +#include "utils.h" +#include "my_assert.h" +#include "GroupInfo.h" +#include "Transcript.h" +#include "Transcripts.h" +#include "Refs.h" + +#include "Model.h" +#include "SingleModel.h" +#include "SingleQModel.h" +#include "PairedEndModel.h" +#include "PairedEndQModel.h" + +template +void calcExpectedEffectiveLengths(int M, Refs& refs, ModelType& model, std::vector& eel) { + int lb, ub, span; + double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i) + + model.getGLD().copyTo(pdf, cdf, lb, ub, span); + clen = new double[span + 1]; + clen[0] = 0.0; + for (int i = 1; i <= span; i++) { + clen[i] = clen[i - 1] + pdf[i] * (lb + i); + } + + eel.assign(M + 1, 0.0); + for (int i = 1; i <= M; i++) { + int totLen = refs.getRef(i).getTotLen(); + int fullLen = refs.getRef(i).getFullLen(); + int pos1 = std::max(std::min(totLen - fullLen + 1, ub) - lb, 0); + int pos2 = std::max(std::min(totLen, ub) - lb, 0); + + if (pos2 == 0) { eel[i] = 0.0; continue; } + + eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); + assert(eel[i] >= 0); + if (eel[i] < MINEEL) { eel[i] = 0.0; } + } + + delete[] pdf; + delete[] cdf; + delete[] clen; +} + +void polishTheta(int M, std::vector& theta, const std::vector& eel, const double* mw) { + double sum = 0.0; + + /* The reason that for noise gene, mw value is 1 is : + * currently, all masked positions are for poly(A) sites, which in theory should be filtered out. + * So the theta0 does not containing reads from any masked position + */ + + for (int i = 0; i <= M; i++) { + // i == 0, mw[i] == 1 + if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) { + theta[i] = 0.0; + continue; + } + theta[i] = theta[i] / mw[i]; + sum += theta[i]; + } + // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked + general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !"); + for (int i = 0; i <= M; i++) theta[i] /= sum; +} + +void calcExpressionValues(int M, const std::vector& theta, const std::vector& eel, std::vector& tpm, std::vector& fpkm) { + double denom; + std::vector frac; + + //calculate fraction of count over all mappabile reads + denom = 0.0; + frac.assign(M + 1, 0.0); + for (int i = 1; i <= M; i++) + if (eel[i] >= EPSILON) { + frac[i] = theta[i]; + denom += frac[i]; + } + general_assert(denom >= EPSILON, "No alignable reads?!"); + for (int i = 1; i <= M; i++) frac[i] /= denom; + + //calculate FPKM + fpkm.assign(M + 1, 0.0); + for (int i = 1; i <= M; i++) + if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i]; + + //calculate TPM + tpm.assign(M + 1, 0.0); + denom = 0.0; + for (int i = 1; i <= M; i++) denom += fpkm[i]; + for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6; +} + +inline bool isAlleleSpecific(const char* refName, GroupInfo* gt = NULL, GroupInfo* ta = NULL) { + bool alleleS; + char gtF[STRLEN], taF[STRLEN]; + + sprintf(gtF, "%s.gt", refName); + sprintf(taF, "%s.ta", refName); + std::ifstream gtIF(gtF), taIF(taF); + alleleS = gtIF.is_open() && taIF.is_open(); + if (gtIF.is_open()) gtIF.close(); + if (taIF.is_open()) taIF.close(); + + if (alleleS) { + if (gt != NULL) gt->load(gtF); + if (ta != NULL) ta->load(taF); + } + + return alleleS; +} + +void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector& theta, std::vector& eel, double* counts) { + char outF[STRLEN]; + FILE *fo; + + int m; + GroupInfo gi; + char groupF[STRLEN]; + + std::vector tlens; + std::vector fpkm, tpm, isopct; + std::vector glens, gene_eels, gene_counts, gene_tpm, gene_fpkm; + + // Load group info + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + + // For allele-specific expression + int m_trans = 0; + GroupInfo gt, ta; + std::vector trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct; + + bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific + + calcExpressionValues(M, theta, eel, tpm, fpkm); + + //calculate IsoPct, etc. + isopct.assign(M + 1, 0.0); + tlens.assign(M + 1, 0); + + glens.assign(m, 0.0); gene_eels.assign(m, 0.0); + gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); + + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + for (int j = b; j < e; j++) { + const Transcript& transcript = transcripts.getTranscriptAt(j); + tlens[j] = transcript.getLength(); + + gene_counts[i] += counts[j]; + gene_tpm[i] += tpm[j]; + gene_fpkm[i] += fpkm[j]; + } + + if (gene_tpm[i] < EPSILON) { + double frac = 1.0 / (e - b); + for (int j = b; j < e; j++) { + glens[i] += tlens[j] * frac; + gene_eels[i] += eel[j] * frac; + } + } + else { + for (int j = b; j < e; j++) { + isopct[j] = tpm[j] / gene_tpm[i]; + glens[i] += tlens[j] * isopct[j]; + gene_eels[i] += eel[j] * isopct[j]; + } + } + } + + if (alleleS) { + m_trans = ta.getm(); + ta_pct.assign(M + 1, 0.0); + trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0); + trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0); + + for (int i = 0; i < m_trans; i++) { + int b = ta.spAt(i), e = ta.spAt(i + 1); + for (int j = b; j < e; j++) { + trans_counts[i] += counts[j]; + trans_tpm[i] += tpm[j]; + trans_fpkm[i] += fpkm[j]; + } + + if (trans_tpm[i] < EPSILON) { + double frac = 1.0 / (e - b); + for (int j = b; j < e; j++) { + trans_lens[i] += tlens[j] * frac; + trans_eels[i] += eel[j] * frac; + } + } + else { + for (int j = b; j < e; j++) { + ta_pct[j] = tpm[j] / trans_tpm[i]; + trans_lens[i] += tlens[j] * ta_pct[j]; + trans_eels[i] += eel[j] * ta_pct[j]; + } + } + } + + gt_pct.assign(m_trans, 0.0); + for (int i = 0; i < m; i++) + if (gene_tpm[i] >= EPSILON) { + int b = gt.spAt(i), e = gt.spAt(i + 1); + for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i]; + } + } + + if (!alleleS) { + //isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "w"); + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n')); + } + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n')); + } + for (int i = 1; i <= M; i++) + fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); + fclose(fo); + } + else { + // allele level results + sprintf(outF, "%s.allele_res", imdName); + fo = fopen(outF, "w"); + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s%c", transcript.getSeqName().c_str(), (i < M ? '\t' : '\n')); + } + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n')); + } + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n')); + } + for (int i = 1; i <= M; i++) + fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); + fclose(fo); + + // isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "w"); + for (int i = 0; i < m_trans; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i)); + fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < m_trans - 1 ? '\t' : '\n')); + } + for (int i = 0; i < m_trans; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i)); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m_trans - 1 ? '\t' : '\n')); + } + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_lens[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_eels[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n')); + fclose(fo); + } + + //gene level results + sprintf(outF, "%s.gene_res", imdName); + fo = fopen(outF, "w"); + for (int i = 0; i < m; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i)); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n')); + } + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + std::string curtid = "", tid; + for (int j = b; j < e; j++) { + tid = transcripts.getTranscriptAt(j).getTranscriptID(); + if (curtid != tid) { + if (curtid != "") fprintf(fo, ","); + fprintf(fo, "%s", tid.c_str()); + curtid = tid; + } + } + fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n')); + } + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); + fclose(fo); + + if (verbose) { printf("Expression Results are written!\n"); } +} + +void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& pme_c, std::vector& pme_fpkm, std::vector& pme_tpm) { + char outF[STRLEN]; + FILE *fo; + + int m; + GroupInfo gi; + char groupF[STRLEN]; + std::vector isopct; + std::vector gene_counts, gene_tpm, gene_fpkm; + + // Load group info + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + + // For allele-specific expression + int m_trans = 0; + GroupInfo gt, ta; + std::vector trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct; + + bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific + + //calculate IsoPct, etc. + isopct.assign(M + 1, 0.0); + gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); + + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + for (int j = b; j < e; j++) { + gene_counts[i] += pme_c[j]; + gene_tpm[i] += pme_tpm[j]; + gene_fpkm[i] += pme_fpkm[j]; + } + if (gene_tpm[i] < EPSILON) continue; + for (int j = b; j < e; j++) + isopct[j] = pme_tpm[j] / gene_tpm[i]; + } + + if (alleleS) { + m_trans = ta.getm(); + ta_pct.assign(M + 1, 0.0); + trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0); + + for (int i = 0; i < m_trans; i++) { + int b = ta.spAt(i), e = ta.spAt(i + 1); + for (int j = b; j < e; j++) { + trans_counts[i] += pme_c[j]; + trans_tpm[i] += pme_tpm[j]; + trans_fpkm[i] += pme_fpkm[j]; + } + if (trans_tpm[i] < EPSILON) continue; + for (int j = b; j < e; j++) + ta_pct[j] = pme_tpm[j] / trans_tpm[i]; + } + + gt_pct.assign(m_trans, 0.0); + for (int i = 0; i < m; i++) + if (gene_tpm[i] >= EPSILON) { + int b = gt.spAt(i), e = gt.spAt(i + 1); + for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i]; + } + } + + if (!alleleS) { + //isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "a"); + general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); + + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); + fclose(fo); + } + else { + //allele level results + sprintf(outF, "%s.allele_res", imdName); + fo = fopen(outF, "a"); + general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); + + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n')); + fclose(fo); + + //isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "a"); + general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); + + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n')); + fclose(fo); + } + + //gene level results + sprintf(outF, "%s.gene_res", imdName); + fo = fopen(outF, "a"); + general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); + + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); + fclose(fo); + + if (verbose) { printf("Gibbs based expression values are written!\n"); } +} + +void writeResultsSimulation(int M, char* refName, char* outFN, Transcripts& transcripts, std::vector& eel, std::vector& counts) { + char outF[STRLEN]; + FILE *fo; + + int m; + GroupInfo gi; + char groupF[STRLEN]; + + // Load group info + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + + std::vector tlens; + std::vector tpm, fpkm, isopct; + std::vector glens, gene_eels, gene_counts, gene_tpm, gene_fpkm; + + // For allele-specific expression + int m_trans = 0; + GroupInfo gt, ta; + std::vector trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct; + + bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific + + for (int i = 1; i <= M; i++) + general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!"); + + calcExpressionValues(M, counts, eel, tpm, fpkm); + + //calculate IsoPct, etc. + isopct.assign(M + 1, 0.0); + tlens.assign(M + 1, 0); + + glens.assign(m, 0.0); gene_eels.assign(m, 0.0); + gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); + + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + for (int j = b; j < e; j++) { + const Transcript& transcript = transcripts.getTranscriptAt(j); + tlens[j] = transcript.getLength(); + + gene_counts[i] += counts[j]; + gene_tpm[i] += tpm[j]; + gene_fpkm[i] += fpkm[j]; + } + + if (gene_tpm[i] < EPSILON) { + double frac = 1.0 / (e - b); + for (int j = b; j < e; j++) { + glens[i] += tlens[j] * frac; + gene_eels[i] += eel[j] * frac; + } + } + else { + for (int j = b; j < e; j++) { + isopct[j] = tpm[j] / gene_tpm[i]; + glens[i] += tlens[j] * isopct[j]; + gene_eels[i] += eel[j] * isopct[j]; + } + } + } + + if (alleleS) { + m_trans = ta.getm(); + ta_pct.assign(M + 1, 0.0); + trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0); + trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0); + + for (int i = 0; i < m_trans; i++) { + int b = ta.spAt(i), e = ta.spAt(i + 1); + for (int j = b; j < e; j++) { + trans_counts[i] += counts[j]; + trans_tpm[i] += tpm[j]; + trans_fpkm[i] += fpkm[j]; + } + + if (trans_tpm[i] < EPSILON) { + double frac = 1.0 / (e - b); + for (int j = b; j < e; j++) { + trans_lens[i] += tlens[j] * frac; + trans_eels[i] += eel[j] * frac; + } + } + else { + for (int j = b; j < e; j++) { + ta_pct[j] = tpm[j] / trans_tpm[i]; + trans_lens[i] += tlens[j] * ta_pct[j]; + trans_eels[i] += eel[j] * ta_pct[j]; + } + } + } + + gt_pct.assign(m_trans, 0.0); + for (int i = 0; i < m; i++) + if (gene_tpm[i] >= EPSILON) { + int b = gt.spAt(i), e = gt.spAt(i + 1); + for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i]; + } + } + + //allele level + if (alleleS) { + sprintf(outF, "%s.sim.alleles.results", outFN); + fo = fopen(outF, "w"); + fprintf(fo, "allele_id\ttranscript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tAlleleIsoPct\tAlleleGenePct\n"); + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s\t%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getSeqName().c_str(), transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i], + eel[i], counts[i], tpm[i], fpkm[i], ta_pct[i] * 1e2, isopct[i] * 1e2); + } + fclose(fo); + } + + //isoform level + sprintf(outF, "%s.sim.isoforms.results", outFN); + fo = fopen(outF, "w"); + fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n"); + if (!alleleS) { + for (int i = 1; i <= M; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i], + eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2); + } + } + else { + for (int i = 0; i < m_trans; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i)); + fprintf(fo, "%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), trans_lens[i], + trans_eels[i], trans_counts[i], trans_tpm[i], trans_fpkm[i], gt_pct[i] * 1e2); + } + } + fclose(fo); + + //gene level + sprintf(outF, "%s.sim.genes.results", outFN); + fo = fopen(outF, "w"); + fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n"); + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + const std::string& gene_id = transcripts.getTranscriptAt(b).getGeneID(); + fprintf(fo, "%s\t", gene_id.c_str()); + std::string curtid = "", tid; + for (int j = b; j < e; j++) { + tid = transcripts.getTranscriptAt(j).getTranscriptID(); + if (curtid != tid) { + if (curtid != "") fprintf(fo, ","); + fprintf(fo, "%s", tid.c_str()); + curtid = tid; + } + } + fprintf(fo, "\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]); + } + fclose(fo); +} + +#endif diff --git a/calcCI.cpp b/calcCI.cpp index 45c7683..97eba64 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -20,6 +20,7 @@ #include "Refs.h" #include "GroupInfo.h" +#include "WriteResults.h" #include "Buffer.h" @@ -32,17 +33,17 @@ struct Params { const double *mw; }; +struct CIParams { + int no; + int start_gene_id, end_gene_id; +}; + struct CIType { float lb, ub; // the interval is [lb, ub] CIType() { lb = ub = 0.0; } }; -struct CIParams { - int no; - int start_gene_id, end_gene_id; -}; - int model_type; int nMB; @@ -54,14 +55,20 @@ float *l_bars; char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN]; -CIType *iso_tpm, *gene_tpm, *iso_fpkm, *gene_fpkm; +CIType *tpm, *fpkm; +CIType *iso_tpm = NULL, *iso_fpkm = NULL; +CIType *gene_tpm, *gene_fpkm; int M, m; Refs refs; GroupInfo gi; -char imdName[STRLEN], statName[STRLEN]; +char refName[STRLEN], imdName[STRLEN], statName[STRLEN]; char modelF[STRLEN], groupF[STRLEN], refF[STRLEN]; +bool alleleS; +int m_trans; +GroupInfo ta; + vector eel; //expected effective lengths Buffer *buffer; @@ -75,37 +82,6 @@ int rc; CIParams *ciParamsArray; -template -void calcExpectedEffectiveLengths(ModelType& model) { - int lb, ub, span; - double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = \sigma_{j=1}^{i}pdf[i]*(lb+i) - - model.getGLD().copyTo(pdf, cdf, lb, ub, span); - clen = new double[span + 1]; - clen[0] = 0.0; - for (int i = 1; i <= span; i++) { - clen[i] = clen[i - 1] + pdf[i] * (lb + i); - } - - eel.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) { - int totLen = refs.getRef(i).getTotLen(); - int fullLen = refs.getRef(i).getFullLen(); - int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0); - int pos2 = max(min(totLen, ub) - lb, 0); - - if (pos2 == 0) { eel[i] = 0.0; continue; } - - eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); - assert(eel[i] >= 0); - if (eel[i] < MINEEL) { eel[i] = 0.0; } - } - - delete[] pdf; - delete[] cdf; - delete[] clen; -} - void* sample_theta_from_c(void* arg) { int *cvec; double *theta; @@ -179,7 +155,7 @@ template void sample_theta_vectors_from_count_vectors() { ModelType model; model.read(modelF); - calcExpectedEffectiveLengths(model); + calcExpectedEffectiveLengths(M, refs, model, eel); int num_threads = min(nThreads, nCV); @@ -269,13 +245,19 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) { } void* calcCI_batch(void* arg) { - float *itsamples, *gtsamples, *ifsamples, *gfsamples; + float *tsamples, *fsamples; + float *itsamples = NULL, *ifsamples = NULL, *gtsamples, *gfsamples; ifstream fin; CIParams *ciParams = (CIParams*)arg; + int curtid, curaid, tid; - itsamples = new float[nSamples]; + tsamples = new float[nSamples]; + fsamples = new float[nSamples]; + if (alleleS) { + itsamples = new float[nSamples]; + ifsamples = new float[nSamples]; + } gtsamples = new float[nSamples]; - ifsamples = new float[nSamples]; gfsamples = new float[nSamples]; fin.open(tmpF, ios::binary); @@ -284,19 +266,46 @@ void* calcCI_batch(void* arg) { fin.seekg(pos, ios::beg); int cnt = 0; + if (alleleS) { + curtid = curaid = -1; + memset(itsamples, 0, FLOATSIZE * nSamples); + memset(ifsamples, 0, FLOATSIZE * nSamples); + } for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) { int b = gi.spAt(i), e = gi.spAt(i + 1); memset(gtsamples, 0, FLOATSIZE * nSamples); memset(gfsamples, 0, FLOATSIZE * nSamples); for (int j = b; j < e; j++) { + if (alleleS) { + tid = ta.gidAt(j); + if (curtid != tid) { + if (curtid >= 0) { + if (j - curaid > 1) { + calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub); + calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub); + } + else { + iso_tpm[curtid] = tpm[curaid]; + iso_fpkm[curtid] = fpkm[curaid]; + } + } + curtid = tid; + curaid = j; + } + } + for (int k = 0; k < nSamples; k++) { - fin.read((char*)(&itsamples[k]), FLOATSIZE); - gtsamples[k] += itsamples[k]; - ifsamples[k] = 1e3 / l_bars[k] * itsamples[k]; - gfsamples[k] += ifsamples[k]; + fin.read((char*)(&tsamples[k]), FLOATSIZE); + fsamples[k] = 1e3 / l_bars[k] * tsamples[k]; + if (alleleS) { + itsamples[k] += tsamples[k]; + ifsamples[k] += fsamples[k]; + } + gtsamples[k] += tsamples[k]; + gfsamples[k] += fsamples[k]; } - calcCI(nSamples, itsamples, iso_tpm[j].lb, iso_tpm[j].ub); - calcCI(nSamples, ifsamples, iso_fpkm[j].lb, iso_fpkm[j].ub); + calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub); + calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub); } if (e - b > 1) { @@ -304,18 +313,34 @@ void* calcCI_batch(void* arg) { calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub); } else { - gene_tpm[i].lb = iso_tpm[b].lb; gene_tpm[i].ub = iso_tpm[b].ub; - gene_fpkm[i].lb = iso_fpkm[b].lb; gene_fpkm[i].ub = iso_fpkm[b].ub; + gene_tpm[i] = tpm[b]; + gene_fpkm[i] = fpkm[b]; } ++cnt; if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); } } - fin.close(); - delete[] itsamples; + if (alleleS && (curtid >= 0)) { + if (gi.spAt(ciParams->end_gene_id) - curaid > 1) { + calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub); + calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub); + } + else { + iso_tpm[curtid] = tpm[curaid]; + iso_fpkm[curtid] = fpkm[curaid]; + } + } + + delete[] tsamples; + delete[] fsamples; + if (alleleS) { + delete[] itsamples; + delete[] ifsamples; + } delete[] gtsamples; + delete[] gfsamples; return NULL; } @@ -325,9 +350,13 @@ void calculate_credibility_intervals(char* imdName) { char outF[STRLEN]; int num_threads = nThreads; - iso_tpm = new CIType[M + 1]; + tpm = new CIType[M + 1]; + fpkm = new CIType[M + 1]; + if (alleleS) { + iso_tpm = new CIType[m_trans]; + iso_fpkm = new CIType[m_trans]; + } gene_tpm = new CIType[m]; - iso_fpkm = new CIType[M + 1]; gene_fpkm = new CIType[m]; assert(M > 0); @@ -375,19 +404,33 @@ void calculate_credibility_intervals(char* imdName) { delete[] ciParamsArray; - //isoform level results - sprintf(outF, "%s.iso_res", imdName); + alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName); fo = fopen(outF, "a"); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", tpm[i].lb, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n')); fclose(fo); + if (alleleS) { + //isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "a"); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n')); + fclose(fo); + } + //gene level results sprintf(outF, "%s.gene_res", imdName); fo = fopen(outF, "a"); @@ -401,9 +444,13 @@ void calculate_credibility_intervals(char* imdName) { fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n')); fclose(fo); - delete[] iso_tpm; + delete[] tpm; + delete[] fpkm; + if (alleleS) { + delete[] iso_tpm; + delete[] iso_fpkm; + } delete[] gene_tpm; - delete[] iso_fpkm; delete[] gene_fpkm; if (verbose) { printf("All credibility intervals are calculated!\n"); } @@ -415,6 +462,7 @@ int main(int argc, char* argv[]) { exit(-1); } + strcpy(refName, argv[1]); strcpy(imdName, argv[2]); strcpy(statName, argv[3]); @@ -431,13 +479,18 @@ int main(int argc, char* argv[]) { } verbose = !quiet; - sprintf(refF, "%s.seq", argv[1]); + sprintf(refF, "%s.seq", refName); refs.loadRefs(refF, 1); M = refs.getM(); - sprintf(groupF, "%s.grp", argv[1]); + + sprintf(groupF, "%s.grp", refName); gi.load(groupF); m = gi.getm(); + // allele-specific + alleleS = isAlleleSpecific(refName, NULL, &ta); + if (alleleS) m_trans = ta.getm(); + nSamples = nCV * nSpC; assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples l_bars = new float[nSamples]; @@ -463,14 +516,6 @@ int main(int argc, char* argv[]) { calculate_credibility_intervals(imdName); delete l_bars; - /* - sprintf(command, "rm -f %s", tmpF); - int status = system(command); - if (status != 0) { - fprintf(stderr, "Cannot delete %s!\n", tmpF); - exit(-1); - } - */ return 0; } diff --git a/makefile b/makefile index c95fd14..aec4b2d 100644 --- a/makefile +++ b/makefile @@ -15,7 +15,7 @@ Transcripts.h : my_assert.h Transcript.h rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts -rsem-synthesis-reference-transcripts : utils.h Transcript.h Transcripts.h synthesisRef.cpp +rsem-synthesis-reference-transcripts : utils.h my_assert.h Transcript.h Transcripts.h synthesisRef.cpp $(CC) -Wall -O3 synthesisRef.cpp -o rsem-synthesis-reference-transcripts BowtieRefSeqPolicy.h : RefSeqPolicy.h @@ -82,10 +82,12 @@ BamWriter.h : sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h Pair sampling.h : boost/random.hpp +WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h + rsem-run-em : EM.o sam/libbam.a $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread -EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp EM.cpp +EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp WriteResults.h EM.cpp $(CC) $(COFLAGS) EM.cpp bc_aux.h : sam/bam.h @@ -107,14 +109,14 @@ wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h rsem-simulate-reads : simulation.o $(CC) -o rsem-simulate-reads simulation.o -simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp +simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp WriteResults.h simulation.cpp $(CC) $(COFLAGS) simulation.cpp rsem-run-gibbs : Gibbs.o $(CC) -o rsem-run-gibbs Gibbs.o -lpthread #some header files are omitted -Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp +Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Gibbs.cpp $(CC) $(COFLAGS) Gibbs.cpp Buffer.h : my_assert.h @@ -123,7 +125,7 @@ rsem-calculate-credibility-intervals : calcCI.o $(CC) -o rsem-calculate-credibility-intervals calcCI.o -lpthread #some header files are omitted -calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Buffer.h calcCI.cpp +calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h calcCI.cpp $(CC) $(COFLAGS) calcCI.cpp rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a diff --git a/rsem-calculate-expression b/rsem-calculate-expression index b523edd..8a7ca0e 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -51,6 +51,7 @@ my $nThreads = 1; my $genBamF = 1; # default is generating transcript bam file my $genGenomeBamF = 0; my $sampling = 0; +my $calcPME = 0; my $calcCI = 0; my $var_opt = 0; # temporarily, only for internal use my $quiet = 0; @@ -80,6 +81,8 @@ my $inpF = ""; my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = (); my $gap = 32; +my $alleleS = 0; + GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "temporary-folder=s" => \$temp_dir, "no-qualities" => \$no_qual, @@ -114,6 +117,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "no-bam-output" => sub { $genBamF = 0; }, "output-genome-bam" => \$genGenomeBamF, "sampling-for-bam" => \$sampling, + "calc-pme" => \$calcPME, "var" => \$var_opt, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, @@ -181,6 +185,13 @@ else { $sampleName = $ARGV[3]; } +if (((-e "$refName.ta") && !(-e "$refName.gt")) || (!(-e "$refName.ta") && (-e "$refName.gt"))) { + print "Allele-specific expression related reference files are corrupted!\n"; + exit(-1); +} + +$alleleS = (-e "$refName.ta") && (-e "$refName.gt"); + if ($genGenomeBamF) { open(INPUT, "$refName.ti"); my $line = ; chomp($line); @@ -329,13 +340,20 @@ if ($genBamF) { else { $command .= " 0"; } if ($sampling) { $command .= " --sampling"; } } -if ($calcCI || $var_opt) { $command .= " --gibbs-out"; } +if ($calcPME || $var_opt || $calcCI) { $command .= " --gibbs-out"; } if ($quiet) { $command .= " -q"; } &runCommand($command); -&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level -&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level +if ($alleleS) { + &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level +} +else { + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level +} if ($genBamF) { $command = $dir."sam/samtools sort -@ $nThreads -m $SortMem $sampleName.transcript.bam $sampleName.transcript.sorted"; @@ -357,7 +375,7 @@ if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; } if ($mTime) { $time_start = time(); } -if ($calcCI || $var_opt) { +if ($calcPME || $var_opt || $calcCI ) { $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP"; $command .= " -p $nThreads"; if ($var_opt) { $command .= " --var"; } @@ -365,21 +383,43 @@ if ($calcCI || $var_opt) { &runCommand($command); } -if ($calcCI) { - system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); - system("mv $sampleName.genes.results $imdName.genes.results.bak1"); - &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level +if ($calcPME || $calcCI) { + if ($alleleS) { + system("mv $sampleName.alleles.results $imdName.alleles.results.bak1"); + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); + system("mv $sampleName.genes.results $imdName.genes.results.bak1"); + &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level + } + else { + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); + system("mv $sampleName.genes.results $imdName.genes.results.bak1"); + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level + } +} +if ($calcCI) { $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB"; $command .= " -p $nThreads"; if ($quiet) { $command .= " -q"; } &runCommand($command); - system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); - system("mv $sampleName.genes.results $imdName.genes.results.bak2"); - &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level - &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level + if ($alleleS) { + system("mv $sampleName.alleles.results $imdName.alleles.results.bak2"); + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); + system("mv $sampleName.genes.results $imdName.genes.results.bak2"); + &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level + } + else { + system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); + system("mv $sampleName.genes.results $imdName.genes.results.bak2"); + &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level + &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level + } } if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } @@ -397,7 +437,7 @@ if ($mTime) { print OUTPUT "Aligning reads: $time_alignment s.\n"; print OUTPUT "Estimating expression levels: $time_rsem s.\n"; print OUTPUT "Calculating credibility intervals: $time_ci s.\n"; - my $time_del = $time_end - $time_start; +# my $time_del = $time_end - $time_start; # print OUTPUT "Delete: $time_del s.\n"; close(OUTPUT); } @@ -484,9 +524,13 @@ Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off) +=item B<--calc-pme> + +Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. (Default: off) + =item B<--calc-ci> -Calculate 95% credibility intervals and posterior mean estimates. (Default: off) +Calculate 95% credibility intervals and posterior mean estimates. (Default: off) =item B<--seed-length> @@ -644,8 +688,9 @@ each line in the rest of this file is: transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] -Fields are separated by the tab character. Fields within "[]" are only -presented if '--calc-ci' is set. +Fields are separated by the tab character. Fields within "[]" are +optional. They will not be presented if neither '--calc-pme' nor +'--calc-ci' is set. 'transcript_id' is the transcript name of this transcript. 'gene_id' is the gene name of the gene which this transcript belongs to (denote @@ -704,8 +749,9 @@ each line in the rest of this file is: gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] -Fields are separated by the tab character. Fields within "[]" are only -presented if '--calc-ci' is set. +Fields are separated by the tab character. Fields within "[]" are +optional. They will not be presented if neither '--calc-pme' nor +'--calc-ci' is set. 'transcript_id(s)' is a comma-separated list of transcript_ids belonging to this gene. If no gene information is provided, 'gene_id' @@ -716,6 +762,43 @@ defined as the weighted average of its transcripts' lengths and effective lengths (weighted by 'IsoPct'). A gene's abundance estimates are just the sum of its transcripts' abundance estimates. +=item B + +Only generated when the RSEM references are built with allele-specific +transcripts. + +This file contains allele level expression estimates for +allele-specific expression calculation. The first line +contains column names separated by the tab character. The format of +each line in the rest of this file is: + +allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [pme_expected_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] + +Fields are separated by the tab character. Fields within "[]" are +optional. They will not be presented if neither '--calc-pme' nor +'--calc-ci' is set. + +'allele_id' is the allele-specific name of this allele-specific transcript. + +'AlleleIsoPct' stands for allele-specific percentage on isoform +level. It is the percentage of this allele-specific transcript's +abundance over its parent transcript's abundance. If its parent +transcript has only one allele variant form, this field will be set to +100. + +'AlleleGenePct' stands for allele-specific percentage on gene +level. It is the percentage of this allele-specific transcript's +abundance over its parent gene's abundance. + +'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have +similar meanings. They are calculated based on posterior mean +estimates. + +Please note that if this file is present, the fields 'length' and +'effective_length' in 'sample_name.isoforms.results' should be +interpreted similarly as the corresponding definitions in +'sample_name.genes.results'. + =item B Only generated when --no-bam-output is not specified. diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots index 4ea1d87..5b8cc98 100755 --- a/rsem-gen-transcript-plots +++ b/rsem-gen-transcript-plots @@ -2,83 +2,89 @@ nrow_per_page <- 3 # if input_list is composed of transcript ids ncol_per_page <- 2 # if input_list is composed of transcript ids -num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids - +num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript/allele ids exit_with_error <- function(errmsg) { cat(errmsg, "\n", sep = "", file = stderr()) quit(save = "no", status = 1) } - args <- commandArgs(TRUE) -if (length(args) != 5) - exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file") +if (length(args) != 6) + exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_allele_specific id_type<0,allele;1,isoform;2,gene> show_uniq output_plot_file") sample_name <- args[1] input_list <- args[2] -is_gene <- as.numeric(args[3]) -show_uniq <- as.numeric(args[4]) -output_plot_file <- args[5] +alleleS <- as.numeric(args[3]) +id_type <- as.numeric(args[4]) +show_uniq <- as.numeric(args[5]) +output_plot_file <- args[6] +is_composite <- (!alleleS && (id_type == 2)) || (alleleS && (id_type > 0)) + load_readdepth_file <- function(filename) { data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE) readdepth <- split(data[, 2:3], data[, 1]) } -build_t2gmap <- function(filename) { - tpos <- 1 # the position of transcript_id - gpos <- 2 # the position of gene_id +build_map <- function(filename) { + value_pos <- 1 + if (!alleleS || (alleleS && id_type == 1)) { + key_pos <- 2 + } else { + key_pos <- 3 + } data <- read.delim(file = filename, sep = "\t", stringsAsFactors = FALSE) - tmp <- aggregate(data[tpos], data[gpos], function(x) x) - t2gmap <- tmp[,2] - names(t2gmap) <- tmp[,1] + tmp <- aggregate(data[value_pos], data[key_pos], function(x) x) + map <- tmp[,2] + names(map) <- tmp[,1] - t2gmap + map } -make_a_plot <- function(tid) { - vec <- readdepth[[tid]] - if (is.null(vec)) exit_with_error(paste("Unknown transcript detected,", tid, "is not included in RSEM's indices.")) +make_a_plot <- function(id) { + vec <- readdepth[[id]] + if (is.null(vec)) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS, "allele-specific transcript", "transcript"), id)) if (is.na(vec[[2]])) wiggle <- rep(0, vec[[1]]) else wiggle <- as.numeric(unlist(strsplit(vec[[2]], split = " "))) len <- length(wiggle) if (!show_uniq) { plot(wiggle, type = "h") } else { - vec <- readdepth_uniq[[tid]] + vec <- readdepth_uniq[[id]] stopifnot(!is.null(vec)) if (is.na(vec[[2]])) wiggle_uniq <- rep(0, vec[[1]]) else wiggle_uniq <- as.numeric(unlist(strsplit(vec[[2]], split = " "))) stopifnot(len == length(wiggle_uniq)) if (len != sum(wiggle >= wiggle_uniq)) { - cat("Warning: transcript ", tid, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", " The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", " This may be due to floating point arithmetics.\n", sep = "") + cat("Warning: ", ifelse(alleleS, "allele-specific transcript", "transcript"), " ", id, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", " The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", " This may be due to floating point arithmetics.\n", sep = "") } heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq) barplot(heights, space = 0, border = NA, names.arg = 1:len, col = c("black", "red")) } - title(main = tid) #, xlab = "Position in transcript", ylab = "Read depth") + title(main = id) } -generate_a_page <- function(tids, gene_id = NULL) { - n <- length(tids) - ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page) - nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page) +generate_a_page <- function(ids, title = NULL) { + n <- length(ids) + ncol <- ifelse(is_composite, floor(sqrt(n)), ncol_per_page) + nrow <- ifelse(is_composite, ceiling(n / ncol), nrow_per_page) par(mfrow = c(nrow, ncol), mar = c(2, 2, 2, 2)) - if (is_gene) par(oma = c(0, 0, 3, 0)) - sapply(tids, make_a_plot) - if (is_gene) mtext(gene_id, outer = TRUE, line = 1) + if (is_composite) par(oma = c(0, 0, 3, 0)) + sapply(ids, make_a_plot) + if (is_composite) mtext(title, outer = TRUE, line = 1) } -plot_a_transcript <- function(i) { +plot_individual <- function(i) { fr <- (i - 1) * num_plots_per_page + 1 to <- min(i * num_plots_per_page, n) generate_a_page(ids[fr:to]) } -plot_a_gene <- function(gene_id) { - if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Unknown gene detected,", gene_id, "is not included in RSEM's in indices.")) - generate_a_page(t2gmap[[gene_id]], gene_id) +# cid, composite id, can be either a gene id or transcript id (for allele-specific expression only) +plot_composite <- function(cid) { + if (is.null(map[[cid]])) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS && id_type == 1, "transcript", "gene"), cid)) + generate_a_page(map[[cid]], cid) } readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = "")) @@ -91,24 +97,22 @@ ids <- scan(file = input_list, what = "", sep = "\n") cat("Loading files is done!\n") -if (is_gene) { - t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = "")) +if (is_composite) { + file_name <- sprintf("%s.%s.results", sample_name, ifelse(alleleS, "alleles", "isoforms")) + map <- build_map(file_name) cat("Building transcript to gene map is done!\n") } pdf(output_plot_file) -if (!is_gene) { +if (!is_composite) { n <- length(ids) ub <- (n - 1) %/% num_plots_per_page + 1 - tmp <- sapply(1:ub, plot_a_transcript) + dumbvar <- sapply(1:ub, plot_individual) } else { - tmp <- sapply(ids, plot_a_gene) + dumbvar <- sapply(ids, plot_composite) } cat("Plots are generated!\n") dev.off.output <- dev.off() - - - diff --git a/rsem-plot-model b/rsem-plot-model index 225d08f..e87226d 100755 --- a/rsem-plot-model +++ b/rsem-plot-model @@ -146,4 +146,4 @@ barplot(pair[,2], names.arg = pair[,1], ylab = "Number of fragments", main = "Histogram of Alignments per Fragment") -dev.off() +dev.off.output <- dev.off() diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles index efe00ba..74076ec 100755 --- a/rsem-plot-transcript-wiggles +++ b/rsem-plot-transcript-wiggles @@ -8,17 +8,25 @@ use strict; use rsem_perl_utils; -my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list +my $gene_list = 0; # default is 0, means input is not a gene list +my $transcript_list = 0; # default is 0, this option can only be turned on if allele-specific expression is calculated my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles my $help = 0; GetOptions("gene-list" => \$gene_list, + "transcript-list" => \$transcript_list, "show-unique" => \$show_unique, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3); +my $alleleS = 0; +if (-e "$ARGV[0].alleles.results") { $alleleS = 1; } + +pod2usage(-msg => "--transcript-list cannot be set if allele-specific reference is not built!", -exitval => 2, -verbose => 2) if (!$alleleS && $transcript_list); +pod2usage(-msg => "--gene-list and --transcript-list cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($gene_list && $transcript_list); + my $dir = "$FindBin::Bin/"; my $command = ""; @@ -46,7 +54,19 @@ if ($show_unique) { } } -$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]"; +my $id_type; + +if ($alleleS) { + $id_type = 0; + if ($transcript_list) { $id_type = 1; } + if ($gene_list) { $id_type = 2; } +} +else { + $id_type = 1; + if ($gene_list) { $id_type = 2; } +} + +$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $alleleS $id_type $show_unique $ARGV[2]"; &runCommand($command); __END__ @@ -85,6 +105,10 @@ The file name of the pdf file which contains all plots. The input-list is a list of gene ids. (Default: off) +=item B<--transcript-list> + +The input-list is a list of transcript ids. This option can only be turned on if allele-specific expression is calculated. (Default: off) + =item B<--show-unique> Show the wiggle plots as stacked bar plots. See description section for details. (Default: off) @@ -97,7 +121,7 @@ Show help information. =head1 DESCRIPTION -This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated. +This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated. If allele-specific expression is calculated, the basic unit for plotting is an allele-specific transcript and plots can be grouped by either transcript ids (--transcript-list) or gene ids (--gene-list). =head1 OUTPUT @@ -105,7 +129,7 @@ This program generates transcript wiggle plots and outputs them in a pdf file. T =item B -This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. +This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. If allele-specific expression is calculated, the basin unit becomes an allele-specific transcript and transcript ids and gene ids can be used to group allele-specific transcripts. =item B diff --git a/rsem-prepare-reference b/rsem-prepare-reference index 81fd13e..de4f061 100755 --- a/rsem-prepare-reference +++ b/rsem-prepare-reference @@ -24,8 +24,11 @@ my $bowtie2_path = ""; my $quiet = 0; my $help = 0; +my $alleleMappingF = ""; + GetOptions("gtf=s" => \$gtfF, "transcript-to-gene-map=s" => \$mappingF, + "allele-to-gene-map=s" => \$alleleMappingF, "no-polyA" => \$no_polyA, "no-polyA-subset=s" => \$subsetFile, "polyA-length=i" => \$polyALen, @@ -39,6 +42,8 @@ GetOptions("gtf=s" => \$gtfF, pod2usage(-verbose => 2) if ($help == 1); pod2usage(-msg => "Set --no-polyA & --no-polyA-subset at the same time!", -exitval => 2, -verbose => 2) if ($no_polyA == 1 && $subsetFile ne ''); +pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne "")); +pod2usage(-msg => "--gtf and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($alleleMappingF ne "")); pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); if ($bowtie2) { $no_bowtie = 1; $no_ntog = 1; } @@ -85,6 +90,7 @@ else { $"=" "; $command = $dir."rsem-synthesis-reference-transcripts $ARGV[1] $quiet"; if ($mappingF ne "") { $command .= " 1 $mappingF"; } + elsif ($alleleMappingF ne "") { $command .= " 2 $alleleMappingF"; } else { $command .= " 0"; } $command .= " @list"; &runCommand($command); @@ -164,6 +170,19 @@ If this option is off, then the mapping of isoforms to genes depends on whether (Default: off) +=item B<--allele-to-gene-map> + +Use information from to provide gene_id and transcript_id information for each allele-specific transcript. +Each line of should be of the form: + +gene_id transcript_id allele_id + +with the fields separated by a tab character. + +This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the FASTA-formatted files. + +(Default: off) + =item B<--no-polyA> Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts) diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm index b2a040f..4321a45 100644 --- a/rsem_perl_utils.pm +++ b/rsem_perl_utils.pm @@ -31,12 +31,13 @@ sub runCommand { print "\n"; } +my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "pme_expected_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "pme_expected_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "pme_expected_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); -# inpF, outF +# type, inpF, outF sub collectResults { my $local_status; my ($inpF, $outF); @@ -69,7 +70,8 @@ sub collectResults { my @out_arr = (); for (my $i = 0; $i < $n; $i++) { - if ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); } + if ($_[0] eq "allele") { push(@out_arr, $allele_title[$i]); } + elsif ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); } elsif ($_[0] eq "gene") { push(@out_arr, $gene_title[$i]); } else { print "A bug on 'collectResults' is detected!\n"; exit(-1); } } diff --git a/simulation.cpp b/simulation.cpp index 0073c3e..3eb4242 100644 --- a/simulation.cpp +++ b/simulation.cpp @@ -23,21 +23,22 @@ #include "PairedEndQModel.h" #include "Refs.h" -#include "GroupInfo.h" #include "Transcript.h" #include "Transcripts.h" +#include "WriteResults.h" + #include "simul.h" using namespace std; -const int OFFSITE = 5; +bool alleleS; +int OFFSITE; READ_INT_TYPE N; -int model_type, M, m; +int model_type, M; Refs refs; -GroupInfo gi; Transcripts transcripts; vector eel; @@ -47,8 +48,8 @@ int n_os; ostream *os[2]; char outReadF[2][STRLEN]; -char refF[STRLEN], groupF[STRLEN], tiF[STRLEN]; -char geneResF[STRLEN], isoResF[STRLEN]; +char refName[STRLEN]; +char refF[STRLEN], tiF[STRLEN]; simul sampler; @@ -78,37 +79,6 @@ void genOutReadStreams(int type, char *outFN) { os[i] = new ofstream(outReadF[i]); } -template -void calcExpectedEffectiveLengths(ModelType& model) { - int lb, ub, span; - double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i) - - model.getGLD().copyTo(pdf, cdf, lb, ub, span); - clen = new double[span + 1]; - clen[0] = 0.0; - for (int i = 1; i <= span; i++) { - clen[i] = clen[i - 1] + pdf[i] * (lb + i); - } - - eel.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) { - int totLen = refs.getRef(i).getTotLen(); - int fullLen = refs.getRef(i).getFullLen(); - int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0); - int pos2 = max(min(totLen, ub) - lb, 0); - - if (pos2 == 0) { eel[i] = 0.0; continue; } - - eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); - assert(eel[i] >= 0); - if (eel[i] < MINEEL) { eel[i] = 0.0; } - } - - delete[] pdf; - delete[] cdf; - delete[] clen; -} - template void simulate(char* modelF, char* resultsF) { ModelType model(&refs); @@ -118,7 +88,7 @@ void simulate(char* modelF, char* resultsF) { model.read(modelF); //calculate eel - calcExpectedEffectiveLengths(model); + calcExpectedEffectiveLengths(M, refs, model, eel); //generate theta vector ifstream fin(resultsF); @@ -155,99 +125,6 @@ void simulate(char* modelF, char* resultsF) { cout<< "Total number of resimulation is "<< resimulation_count<< endl; } -void calcExpressionValues(const vector& theta, const vector& eel, vector& tpm, vector& fpkm) { - double denom; - vector frac; - - //calculate fraction of count over all mappabile reads - denom = 0.0; - frac.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) { - frac[i] = theta[i]; - denom += frac[i]; - } - general_assert(denom > 0, "No alignable reads?!"); - for (int i = 1; i <= M; i++) frac[i] /= denom; - - //calculate FPKM - fpkm.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) - if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i]; - - //calculate TPM - tpm.assign(M + 1, 0.0); - denom = 0.0; - for (int i = 1; i <= M; i++) denom += fpkm[i]; - for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6; -} - -void writeResFiles(char* outFN) { - FILE *fo; - vector tlens; - vector fpkm, tpm, isopct; - vector glens, gene_eels, gene_counts, gene_tpm, gene_fpkm; - - for (int i = 1; i <= M; i++) - general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!"); - - calcExpressionValues(counts, eel, tpm, fpkm); - - //calculate IsoPct, etc. - isopct.assign(M + 1, 0.0); - tlens.assign(M + 1, 0); - - glens.assign(m, 0.0); gene_eels.assign(m, 0.0); - gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); - - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) { - const Transcript& transcript = transcripts.getTranscriptAt(j); - tlens[j] = transcript.getLength(); - - glens[i] += tlens[j] * tpm[j]; - gene_eels[i] += eel[j] * tpm[j]; - gene_counts[i] += counts[j]; - gene_tpm[i] += tpm[j]; - gene_fpkm[i] += fpkm[j]; - } - - if (gene_tpm[i] < EPSILON) continue; - - for (int j = b; j < e; j++) - isopct[j] = tpm[j] / gene_tpm[i]; - glens[i] /= gene_tpm[i]; - gene_eels[i] /= gene_tpm[i]; - } - - //isoform level - sprintf(isoResF, "%s.sim.isoforms.results", outFN); - fo = fopen(isoResF, "w"); - fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n"); - for (int i = 1; i <= M; i++) { - const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i], - eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2); - } - fclose(fo); - - //gene level - sprintf(geneResF, "%s.sim.genes.results", outFN); - fo = fopen(geneResF, "w"); - fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n"); - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1); - const string& gene_id = transcripts.getTranscriptAt(b).getGeneID(); - fprintf(fo, "%s\t", gene_id.c_str()); - for (int j = b; j < e; j++) { - fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\t')); - } - fprintf(fo, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]); - } - fclose(fo); -} - void releaseOutReadStreams() { for (int i = 0; i < n_os; i++) { ((ofstream*)os[i])->close(); @@ -264,7 +141,7 @@ int main(int argc, char* argv[]) { printf("Parameters:\n\n"); printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n"); printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n"); - printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.\n"); + printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.\n"); printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n"); printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n"); printf("output_name: Prefix for all output files.\n"); @@ -289,13 +166,14 @@ int main(int argc, char* argv[]) { if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true; verbose = !quiet; + strcpy(refName, argv[1]); + alleleS = isAlleleSpecific(refName); + OFFSITE = (alleleS ? 6: 5); + //load basic files sprintf(refF, "%s.seq", argv[1]); refs.loadRefs(refF); M = refs.getM(); - sprintf(groupF, "%s.grp", argv[1]); - gi.load(groupF); - m = gi.getm(); sprintf(tiF, "%s.ti", argv[1]); transcripts.readFrom(tiF); @@ -320,7 +198,7 @@ int main(int argc, char* argv[]) { case 3: simulate(argv[2], argv[3]); break; } - writeResFiles(argv[6]); + writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts); releaseOutReadStreams(); return 0; diff --git a/synthesisRef.cpp b/synthesisRef.cpp index aa0d473..bde2d5b 100644 --- a/synthesisRef.cpp +++ b/synthesisRef.cpp @@ -5,8 +5,10 @@ #include #include #include +#include #include "utils.h" +#include "my_assert.h" #include "Transcript.h" #include "Transcripts.h" @@ -19,29 +21,47 @@ map::iterator iter; Transcripts transcripts(1); // no genome, just transcript set char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN]; +char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele -bool hasMappingFile; +int hasMappingFile; char mappingFile[STRLEN]; map mi_table; // mapping info table map::iterator mi_iter; //mapping info table's iterator -void loadMappingInfo(char* mappingF) { - ifstream fin(mappingF); - string line, key, value; - - if (!fin.is_open()) { - fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF); - exit(-1); - } +map mi_table2; // allele_id to transcript_id +map::iterator mi_iter2; // corresponding iterator - mi_table.clear(); - while (getline(fin, line)) { - line = cleanStr(line); - if (line[0] == '#') continue; - istringstream strin(line); - strin>>value>>key; - mi_table[key] = value; +void loadMappingInfo(int file_type, char* mappingF) { + ifstream fin(mappingF); + string line, key, value, value2; + + general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist."); + + switch(file_type) { + case 1: + mi_table.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>>value>>key; + mi_table[key] = value; + } + break; + case 2: + mi_table.clear(); + mi_table2.clear(); + while (getline(fin, line)) { + line = cleanStr(line); + if (line[0] == '#') continue; + istringstream strin(line); + strin>> value>> value2>> key; + mi_table[key] = value; + mi_table2[key] = value2; + } + break; + default: assert(false); } fin.close(); @@ -55,40 +75,58 @@ char check(char c) { return c; } -void writeResults(char* refName) { +void writeResults(int option, char* refName) { ofstream fout, fout2; - string cur_gene_id, name; + string cur_gene_id, cur_transcript_id, name; + vector gi, gt, ta; - sprintf(groupF, "%s.grp", refName); sprintf(tiF, "%s.ti", refName); - sprintf(refFastaF, "%s.transcripts.fa", refName); - sprintf(chromListF, "%s.chrlist", refName); - transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } - fout.open(groupF); - cur_gene_id = ""; + cur_gene_id = ""; gi.clear(); + if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { - fout<"<second< [mappingFile] reference_file_1 [reference_file_2 ...]\n"); exit(-1); } verbose = !atoi(argv[2]); - if (hasMappingFile) { loadMappingInfo(argv[4]); } + if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); } + + // allele-specific + if (hasMappingFile == 2) { transcripts.setType(2); } int start = hasMappingFile ? 5 : 4; ifstream fin; string line, gseq; - string seqname, gene_id; + string seqname, gene_id, transcript_id; vector vec; @@ -125,7 +166,7 @@ int main(int argc, char* argv[]) { name2seq.clear(); for (int i = start; i < argc; i++) { fin.open(argv[i]); - if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } + general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist."); getline(fin, line); while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); @@ -142,20 +183,23 @@ int main(int argc, char* argv[]) { name2seq[seqname] = gseq; + transcript_id = seqname; + gene_id = seqname; + if (hasMappingFile) { mi_iter = mi_table.find(seqname); - if (mi_iter == mi_table.end()) { - fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", seqname.c_str()); - exit(-1); - } - //assert(iter != table.end()); + general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!"); gene_id = mi_iter->second; + if (hasMappingFile == 2) { + mi_iter2 = mi_table2.find(seqname); + general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!"); + transcript_id = mi_iter2->second; + } } - else gene_id = seqname; - + vec.clear(); vec.push_back(Interval(1, len)); - transcripts.add(Transcript(seqname, gene_id, seqname, '+', vec, "")); + transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, "")); ++M; if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); } @@ -171,7 +215,7 @@ int main(int argc, char* argv[]) { assert(M == transcripts.getM()); transcripts.sort(); - writeResults(argv[1]); + writeResults(hasMappingFile, argv[1]); return 0; } -- 2.39.2