#include "HitWrapper.h"
#include "BamWriter.h"
+#include "WriteResults.h"
+
using namespace std;
const double STOP_CRITERIA = 0.001;
char refName[STRLEN], outName[STRLEN];
char imdName[STRLEN], statName[STRLEN];
-char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
+char refF[STRLEN], cntF[STRLEN], tiF[STRLEN];
char mparamsF[STRLEN];
char modelF[STRLEN], thetaF[STRLEN];
double *probv, **countvs;
Refs refs;
-GroupInfo gi;
Transcripts transcripts;
ModelParams mparams;
return NULL;
}
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
- int lb, ub, span;
- double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-
- model.getGLD().copyTo(pdf, cdf, lb, ub, span);
- clen = new double[span + 1];
- clen[0] = 0.0;
- for (int i = 1; i <= span; i++) {
- clen[i] = clen[i - 1] + pdf[i] * (lb + i);
- }
-
- eel.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++) {
- int totLen = refs.getRef(i).getTotLen();
- int fullLen = refs.getRef(i).getFullLen();
- int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
- int pos2 = max(min(totLen, ub) - lb, 0);
-
- if (pos2 == 0) { eel[i] = 0.0; continue; }
-
- eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
- assert(eel[i] >= 0);
- if (eel[i] < MINEEL) { eel[i] = 0.0; }
- }
-
- delete[] pdf;
- delete[] cdf;
- delete[] clen;
-}
-
-void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
- double sum = 0.0;
-
- /* The reason that for noise gene, mw value is 1 is :
- * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
- * So the theta0 does not containing reads from any masked position
- */
-
- for (int i = 0; i <= M; i++) {
- // i == 0, mw[i] == 1
- if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
- theta[i] = 0.0;
- continue;
- }
- theta[i] = theta[i] / mw[i];
- sum += theta[i];
- }
- // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
- general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
- for (int i = 0; i <= M; i++) theta[i] /= sum;
-}
-
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
- double denom;
- vector<double> frac;
-
- //calculate fraction of count over all mappabile reads
- denom = 0.0;
- frac.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) {
- frac[i] = theta[i];
- denom += frac[i];
- }
- general_assert(denom >= EPSILON, "No alignable reads?!");
- for (int i = 1; i <= M; i++) frac[i] /= denom;
-
- //calculate FPKM
- fpkm.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
- //calculate TPM
- tpm.assign(M + 1, 0.0);
- denom = 0.0;
- for (int i = 1; i <= M; i++) denom += fpkm[i];
- for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
-}
-
template<class ModelType>
void writeResults(ModelType& model, double* counts) {
- char outF[STRLEN];
- FILE *fo;
-
- sprintf(modelF, "%s.model", statName);
- model.write(modelF);
-
- vector<int> tlens;
- vector<double> fpkm, tpm, isopct;
- vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
-
- calcExpressionValues(theta, eel, tpm, fpkm);
-
- //calculate IsoPct, etc.
- isopct.assign(M + 1, 0.0);
- tlens.assign(M + 1, 0);
-
- glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
- gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
- for (int i = 0; i < m; i++) {
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- for (int j = b; j < e; j++) {
- const Transcript& transcript = transcripts.getTranscriptAt(j);
- tlens[j] = transcript.getLength();
-
- gene_counts[i] += counts[j];
- gene_tpm[i] += tpm[j];
- gene_fpkm[i] += fpkm[j];
- }
-
- if (gene_tpm[i] < EPSILON) {
- double frac = 1.0 / (e - b);
- for (int j = b; j < e; j++) {
- glens[i] += tlens[j] * frac;
- gene_eels[i] += eel[j] * frac;
- }
- }
- else {
- for (int j = b; j < e; j++) {
- isopct[j] = tpm[j] / gene_tpm[i];
- glens[i] += tlens[j] * isopct[j];
- gene_eels[i] += eel[j] * isopct[j];
- }
- }
- }
-
- //isoform level results
- sprintf(outF, "%s.iso_res", imdName);
- fo = fopen(outF, "w");
- for (int i = 1; i <= M; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
- }
- for (int i = 1; i <= M; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
- }
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
- fclose(fo);
-
- //gene level results
- sprintf(outF, "%s.gene_res", imdName);
- fo = fopen(outF, "w");
- for (int i = 0; i < m; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
- fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
- }
- for (int i = 0; i < m; i++) {
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- for (int j = b; j < e; j++) {
- fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : (i < m - 1 ? '\t' :'\n')));
- }
- }
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
- fclose(fo);
-
- if (verbose) { printf("Expression Results are written!\n"); }
+ sprintf(modelF, "%s.model", statName);
+ model.write(modelF);
+ writeResultsEM(M, refName, imdName, transcripts, theta, eel, countvs[0]);
}
template<class ReadType, class HitType, class ModelType>
fprintf(fo, "%.15g\n", theta[M]);
//calculate expected effective lengths for each isoform
- calcExpectedEffectiveLengths<ModelType>(model);
- polishTheta(theta, eel, model.getMW());
+ calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
+ polishTheta(M, theta, eel, model.getMW());
// output theta
for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
sprintf(refF, "%s.seq", refName);
refs.loadRefs(refF);
M = refs.getM();
- sprintf(groupF, "%s.grp", refName);
- gi.load(groupF);
- m = gi.getm();
sprintf(tiF, "%s.ti", refName);
transcripts.readFrom(tiF);
#include "PairedEndQModel.h"
#include "Refs.h"
+
#include "GroupInfo.h"
+#include "WriteResults.h"
using namespace std;
int nThreads;
int model_type;
-int m, M;
+int M;
READ_INT_TYPE N0, N1;
HIT_INT_TYPE nHits;
double totc;
int BURNIN, NSAMPLES, GAP;
-char imdName[STRLEN], statName[STRLEN];
-char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN];
+char thetaF[STRLEN], ofgF[STRLEN], refF[STRLEN], modelF[STRLEN];
char cvsF[STRLEN];
Refs refs;
-GroupInfo gi;
vector<HIT_INT_TYPE> s;
vector<Item> hits;
pthread_attr_t attr;
int rc;
-void load_data(char* reference_name, char* statName, char* imdName) {
+void load_data(char* refName, char* statName, char* imdName) {
ifstream fin;
string line;
int tmpVal;
//load reference file
- sprintf(refF, "%s.seq", reference_name);
+ sprintf(refF, "%s.seq", refName);
refs.loadRefs(refF, 1);
M = refs.getM();
- //load groupF
- sprintf(groupF, "%s.grp", reference_name);
- gi.load(groupF);
- m = gi.getm();
-
//load ofgF;
sprintf(ofgF, "%s.ofg", imdName);
fin.open(ofgF);
if (verbose) { printf("Loading Data is finished!\n"); }
}
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
- int lb, ub, span;
- double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-
- model.getGLD().copyTo(pdf, cdf, lb, ub, span);
- clen = new double[span + 1];
- clen[0] = 0.0;
- for (int i = 1; i <= span; i++) {
- clen[i] = clen[i - 1] + pdf[i] * (lb + i);
- }
-
- eel.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++) {
- int totLen = refs.getRef(i).getTotLen();
- int fullLen = refs.getRef(i).getFullLen();
- int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
- int pos2 = max(min(totLen, ub) - lb, 0);
-
- if (pos2 == 0) { eel[i] = 0.0; continue; }
-
- eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
- assert(eel[i] >= 0);
- if (eel[i] < MINEEL) { eel[i] = 0.0; }
- }
-
- delete[] pdf;
- delete[] cdf;
- delete[] clen;
-}
-
template<class ModelType>
void init_model_related(char* modelF) {
ModelType model;
model.read(modelF);
- calcExpectedEffectiveLengths<ModelType>(model);
+ calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
memcpy(mw, model.getMW(), sizeof(double) * (M + 1)); // otherwise, after exiting this procedure, mw becomes undefined
}
fprintf(fo, "%d\n", counts[M]);
}
-void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
- double sum = 0.0;
-
- /* The reason that for noise gene, mw value is 1 is :
- * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
- * So the theta0 does not containing reads from any masked position
- */
-
- for (int i = 0; i <= M; i++) {
- // i == 0, mw[i] == 1
- if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
- theta[i] = 0.0;
- continue;
- }
- theta[i] = theta[i] / mw[i];
- sum += theta[i];
- }
- // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
- general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
- for (int i = 0; i <= M; i++) theta[i] /= sum;
-}
-
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
- double denom;
- vector<double> frac;
-
- //calculate fraction of count over all mappabile reads
- denom = 0.0;
- frac.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) {
- frac[i] = theta[i];
- denom += frac[i];
- }
- general_assert(denom >= EPSILON, "No alignable reads?!");
- for (int i = 1; i <= M; i++) frac[i] /= denom;
-
- //calculate FPKM
- fpkm.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
- //calculate TPM
- tpm.assign(M + 1, 0.0);
- denom = 0.0;
- for (int i = 1; i <= M; i++) denom += fpkm[i];
- for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
-}
-
void* Gibbs(void* arg) {
int CHAINLEN;
HIT_INT_TYPE len, fr, to;
if ((ROUND - BURNIN - 1) % GAP == 0) {
writeCountVector(params->fo, counts);
for (int i = 0; i <= M; i++) theta[i] = counts[i] / totc;
- polishTheta(theta, eel, mw);
- calcExpressionValues(theta, eel, tpm, fpkm);
+ polishTheta(M, theta, eel, mw);
+ calcExpressionValues(M, theta, eel, tpm, fpkm);
for (int i = 0; i <= M; i++) {
params->pme_c[i] += counts[i] - 1;
params->pve_c[i] += (counts[i] - 1) * (counts[i] - 1);
}
}
-void writeResults(char* imdName) {
- char outF[STRLEN];
- FILE *fo;
-
- vector<double> isopct;
- vector<double> gene_counts, gene_tpm, gene_fpkm;
-
- //calculate IsoPct, etc.
- isopct.assign(M + 1, 0.0);
- gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
- for (int i = 0; i < m; i++) {
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- for (int j = b; j < e; j++) {
- gene_counts[i] += pme_c[j];
- gene_tpm[i] += pme_tpm[j];
- gene_fpkm[i] += pme_fpkm[j];
- }
- if (gene_tpm[i] < EPSILON) continue;
- for (int j = b; j < e; j++)
- isopct[j] = pme_tpm[j] / gene_tpm[i];
- }
-
- //isoform level results
- sprintf(outF, "%s.iso_res", imdName);
- fo = fopen(outF, "a");
- general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
-
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
- fclose(fo);
-
- //gene level results
- sprintf(outF, "%s.gene_res", imdName);
- fo = fopen(outF, "a");
- general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
-
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
- fclose(fo);
-
- if (verbose) { printf("Gibbs based expression values are written!\n"); }
-}
-
int main(int argc, char* argv[]) {
if (argc < 7) {
printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
exit(-1);
}
+ strcpy(refName, argv[1]);
strcpy(imdName, argv[2]);
strcpy(statName, argv[3]);
printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
}
- load_data(argv[1], statName, imdName);
+ load_data(refName, statName, imdName);
sprintf(modelF, "%s.model", statName);
FILE *fi = fopen(modelF, "r");
if (verbose) printf("Gibbs finished!\n");
- writeResults(imdName);
+ writeResultsGibbs(M, refName, imdName, pme_c, pme_fpkm, pme_tpm);
if (var_opt) {
char varF[STRLEN];
+ // Load group info
+ int m;
+ GroupInfo gi;
+ char groupF[STRLEN];
+ sprintf(groupF, "%s.grp", refName);
+ gi.load(groupF);
+ m = gi.getm();
+
sprintf(varF, "%s.var", statName);
FILE *fo = fopen(varF, "w");
general_assert(fo != NULL, "Cannot open " + cstrtos(varF) + "!");
}
fclose(fo);
}
-
+
delete mw; // delete the copy
return 0;
__estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'. 'model_file_description.txt' provides the format and meanings of this file.
-__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.
+__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.
__theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.
#include "utils.h"
+/**
+ If no genome is provided, seqname field is used to store the allele name.
+ */
+
struct Interval {
int start, end;
}
bool operator< (const Transcript& o) const {
- return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id);
+ return gene_id < o.gene_id || (gene_id == o.gene_id && transcript_id < o.transcript_id) || (gene_id == o.gene_id && transcript_id == o.transcript_id && seqname < o.seqname);
}
const std::string& getTranscriptID() const { return transcript_id; }
}
int getM() { return M; }
+
int getType() { return type; }
+ void setType(int type) { this->type = type; }
+
+ bool isAlleleSpecific() { return type == 2; }
const Transcript& getTranscriptAt(int pos) {
assert(pos > 0 && pos <= M);
void buildMappings(int, char**);
private:
- int M, type; // type 0 from genome , 1 standalone transcriptome
+ int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
std::vector<Transcript> transcripts;
std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
std::map<std::string, int> dict;
std::map<std::string, int>::iterator iter;
- general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+ general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
dict.clear();
for (int i = 1; i <= M; i++) {
- const std::string& tid = transcripts[i].getTranscriptID();
+ const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
iter = dict.find(tid);
general_assert(iter == dict.end(), tid + " appears more than once!");
dict[tid] = i;
i2e.assign(M + 1, 0);
for (int i = 0; i < n_targets; i++) {
iter = dict.find(std::string(target_name[i]));
- general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!");
+ general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
e2i[i + 1] = iter->second;
i2e[iter->second] = i + 1;
+RSEM v1.2.12
+
+- Enabled allele-specific expression estimation
+- Added option '--calc-pme' option to calculate posterior mean estimates only (no credibility intervals)
+
+--------------------------------------------------------------------------------------------
+
RSEM v1.2.11
- Enabled RSEM to use Bowtie 2 aligner (indel, local and discordant alignments are not supported yet)
--- /dev/null
+#ifndef WRITERESULTS_H_
+#define WRITERESULTS_H_
+
+#include<cstdio>
+#include<vector>
+#include<string>
+#include<fstream>
+#include<algorithm>
+
+#include "utils.h"
+#include "my_assert.h"
+#include "GroupInfo.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+#include "Refs.h"
+
+#include "Model.h"
+#include "SingleModel.h"
+#include "SingleQModel.h"
+#include "PairedEndModel.h"
+#include "PairedEndQModel.h"
+
+template<class ModelType>
+void calcExpectedEffectiveLengths(int M, Refs& refs, ModelType& model, std::vector<double>& eel) {
+ int lb, ub, span;
+ double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
+
+ model.getGLD().copyTo(pdf, cdf, lb, ub, span);
+ clen = new double[span + 1];
+ clen[0] = 0.0;
+ for (int i = 1; i <= span; i++) {
+ clen[i] = clen[i - 1] + pdf[i] * (lb + i);
+ }
+
+ eel.assign(M + 1, 0.0);
+ for (int i = 1; i <= M; i++) {
+ int totLen = refs.getRef(i).getTotLen();
+ int fullLen = refs.getRef(i).getFullLen();
+ int pos1 = std::max(std::min(totLen - fullLen + 1, ub) - lb, 0);
+ int pos2 = std::max(std::min(totLen, ub) - lb, 0);
+
+ if (pos2 == 0) { eel[i] = 0.0; continue; }
+
+ eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
+ assert(eel[i] >= 0);
+ if (eel[i] < MINEEL) { eel[i] = 0.0; }
+ }
+
+ delete[] pdf;
+ delete[] cdf;
+ delete[] clen;
+}
+
+void polishTheta(int M, std::vector<double>& theta, const std::vector<double>& eel, const double* mw) {
+ double sum = 0.0;
+
+ /* The reason that for noise gene, mw value is 1 is :
+ * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
+ * So the theta0 does not containing reads from any masked position
+ */
+
+ for (int i = 0; i <= M; i++) {
+ // i == 0, mw[i] == 1
+ if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
+ theta[i] = 0.0;
+ continue;
+ }
+ theta[i] = theta[i] / mw[i];
+ sum += theta[i];
+ }
+ // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
+ general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
+ for (int i = 0; i <= M; i++) theta[i] /= sum;
+}
+
+void calcExpressionValues(int M, const std::vector<double>& theta, const std::vector<double>& eel, std::vector<double>& tpm, std::vector<double>& fpkm) {
+ double denom;
+ std::vector<double> frac;
+
+ //calculate fraction of count over all mappabile reads
+ denom = 0.0;
+ frac.assign(M + 1, 0.0);
+ for (int i = 1; i <= M; i++)
+ if (eel[i] >= EPSILON) {
+ frac[i] = theta[i];
+ denom += frac[i];
+ }
+ general_assert(denom >= EPSILON, "No alignable reads?!");
+ for (int i = 1; i <= M; i++) frac[i] /= denom;
+
+ //calculate FPKM
+ fpkm.assign(M + 1, 0.0);
+ for (int i = 1; i <= M; i++)
+ if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
+
+ //calculate TPM
+ tpm.assign(M + 1, 0.0);
+ denom = 0.0;
+ for (int i = 1; i <= M; i++) denom += fpkm[i];
+ for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
+}
+
+inline bool isAlleleSpecific(const char* refName, GroupInfo* gt = NULL, GroupInfo* ta = NULL) {
+ bool alleleS;
+ char gtF[STRLEN], taF[STRLEN];
+
+ sprintf(gtF, "%s.gt", refName);
+ sprintf(taF, "%s.ta", refName);
+ std::ifstream gtIF(gtF), taIF(taF);
+ alleleS = gtIF.is_open() && taIF.is_open();
+ if (gtIF.is_open()) gtIF.close();
+ if (taIF.is_open()) taIF.close();
+
+ if (alleleS) {
+ if (gt != NULL) gt->load(gtF);
+ if (ta != NULL) ta->load(taF);
+ }
+
+ return alleleS;
+}
+
+void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts& transcripts, std::vector<double>& theta, std::vector<double>& eel, double* counts) {
+ char outF[STRLEN];
+ FILE *fo;
+
+ int m;
+ GroupInfo gi;
+ char groupF[STRLEN];
+
+ std::vector<int> tlens;
+ std::vector<double> fpkm, tpm, isopct;
+ std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+ // Load group info
+ sprintf(groupF, "%s.grp", refName);
+ gi.load(groupF);
+ m = gi.getm();
+
+ // For allele-specific expression
+ int m_trans = 0;
+ GroupInfo gt, ta;
+ std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+ bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific
+
+ calcExpressionValues(M, theta, eel, tpm, fpkm);
+
+ //calculate IsoPct, etc.
+ isopct.assign(M + 1, 0.0);
+ tlens.assign(M + 1, 0);
+
+ glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+ gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(j);
+ tlens[j] = transcript.getLength();
+
+ gene_counts[i] += counts[j];
+ gene_tpm[i] += tpm[j];
+ gene_fpkm[i] += fpkm[j];
+ }
+
+ if (gene_tpm[i] < EPSILON) {
+ double frac = 1.0 / (e - b);
+ for (int j = b; j < e; j++) {
+ glens[i] += tlens[j] * frac;
+ gene_eels[i] += eel[j] * frac;
+ }
+ }
+ else {
+ for (int j = b; j < e; j++) {
+ isopct[j] = tpm[j] / gene_tpm[i];
+ glens[i] += tlens[j] * isopct[j];
+ gene_eels[i] += eel[j] * isopct[j];
+ }
+ }
+ }
+
+ if (alleleS) {
+ m_trans = ta.getm();
+ ta_pct.assign(M + 1, 0.0);
+ trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+ trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+ for (int i = 0; i < m_trans; i++) {
+ int b = ta.spAt(i), e = ta.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ trans_counts[i] += counts[j];
+ trans_tpm[i] += tpm[j];
+ trans_fpkm[i] += fpkm[j];
+ }
+
+ if (trans_tpm[i] < EPSILON) {
+ double frac = 1.0 / (e - b);
+ for (int j = b; j < e; j++) {
+ trans_lens[i] += tlens[j] * frac;
+ trans_eels[i] += eel[j] * frac;
+ }
+ }
+ else {
+ for (int j = b; j < e; j++) {
+ ta_pct[j] = tpm[j] / trans_tpm[i];
+ trans_lens[i] += tlens[j] * ta_pct[j];
+ trans_eels[i] += eel[j] * ta_pct[j];
+ }
+ }
+ }
+
+ gt_pct.assign(m_trans, 0.0);
+ for (int i = 0; i < m; i++)
+ if (gene_tpm[i] >= EPSILON) {
+ int b = gt.spAt(i), e = gt.spAt(i + 1);
+ for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+ }
+ }
+
+ if (!alleleS) {
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "w");
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+ }
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+ }
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+ }
+ else {
+ // allele level results
+ sprintf(outF, "%s.allele_res", imdName);
+ fo = fopen(outF, "w");
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s%c", transcript.getSeqName().c_str(), (i < M ? '\t' : '\n'));
+ }
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < M ? '\t' : '\n'));
+ }
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
+ }
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%d%c", tlens[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", eel[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+
+ // isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "w");
+ for (int i = 0; i < m_trans; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+ fprintf(fo, "%s%c", transcript.getTranscriptID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+ }
+ for (int i = 0; i < m_trans; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+ fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m_trans - 1 ? '\t' : '\n'));
+ }
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_lens[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_eels[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n'));
+ fclose(fo);
+ }
+
+ //gene level results
+ sprintf(outF, "%s.gene_res", imdName);
+ fo = fopen(outF, "w");
+ for (int i = 0; i < m; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(gi.spAt(i));
+ fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < m - 1 ? '\t' : '\n'));
+ }
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ std::string curtid = "", tid;
+ for (int j = b; j < e; j++) {
+ tid = transcripts.getTranscriptAt(j).getTranscriptID();
+ if (curtid != tid) {
+ if (curtid != "") fprintf(fo, ",");
+ fprintf(fo, "%s", tid.c_str());
+ curtid = tid;
+ }
+ }
+ fprintf(fo, "%c", (i < m - 1 ? '\t' : '\n'));
+ }
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", glens[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_eels[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
+ fclose(fo);
+
+ if (verbose) { printf("Expression Results are written!\n"); }
+}
+
+void writeResultsGibbs(int M, char* refName, char* imdName, std::vector<double>& pme_c, std::vector<double>& pme_fpkm, std::vector<double>& pme_tpm) {
+ char outF[STRLEN];
+ FILE *fo;
+
+ int m;
+ GroupInfo gi;
+ char groupF[STRLEN];
+ std::vector<double> isopct;
+ std::vector<double> gene_counts, gene_tpm, gene_fpkm;
+
+ // Load group info
+ sprintf(groupF, "%s.grp", refName);
+ gi.load(groupF);
+ m = gi.getm();
+
+ // For allele-specific expression
+ int m_trans = 0;
+ GroupInfo gt, ta;
+ std::vector<double> trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+ bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific
+
+ //calculate IsoPct, etc.
+ isopct.assign(M + 1, 0.0);
+ gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ gene_counts[i] += pme_c[j];
+ gene_tpm[i] += pme_tpm[j];
+ gene_fpkm[i] += pme_fpkm[j];
+ }
+ if (gene_tpm[i] < EPSILON) continue;
+ for (int j = b; j < e; j++)
+ isopct[j] = pme_tpm[j] / gene_tpm[i];
+ }
+
+ if (alleleS) {
+ m_trans = ta.getm();
+ ta_pct.assign(M + 1, 0.0);
+ trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+ for (int i = 0; i < m_trans; i++) {
+ int b = ta.spAt(i), e = ta.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ trans_counts[i] += pme_c[j];
+ trans_tpm[i] += pme_tpm[j];
+ trans_fpkm[i] += pme_fpkm[j];
+ }
+ if (trans_tpm[i] < EPSILON) continue;
+ for (int j = b; j < e; j++)
+ ta_pct[j] = pme_tpm[j] / trans_tpm[i];
+ }
+
+ gt_pct.assign(m_trans, 0.0);
+ for (int i = 0; i < m; i++)
+ if (gene_tpm[i] >= EPSILON) {
+ int b = gt.spAt(i), e = gt.spAt(i + 1);
+ for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+ }
+ }
+
+ if (!alleleS) {
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+ }
+ else {
+ //allele level results
+ sprintf(outF, "%s.allele_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", ta_pct[i] * 1e2, (i < M ? '\t' : '\n'));
+ for (int i = 1; i <= M; i++)
+ fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
+ fclose(fo);
+
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", trans_fpkm[i], (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.2f%c", gt_pct[i] * 1e2, (i < m_trans - 1 ? '\t' : '\n'));
+ fclose(fo);
+ }
+
+ //gene level results
+ sprintf(outF, "%s.gene_res", imdName);
+ fo = fopen(outF, "a");
+ general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
+
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m; i++)
+ fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
+ fclose(fo);
+
+ if (verbose) { printf("Gibbs based expression values are written!\n"); }
+}
+
+void writeResultsSimulation(int M, char* refName, char* outFN, Transcripts& transcripts, std::vector<double>& eel, std::vector<double>& counts) {
+ char outF[STRLEN];
+ FILE *fo;
+
+ int m;
+ GroupInfo gi;
+ char groupF[STRLEN];
+
+ // Load group info
+ sprintf(groupF, "%s.grp", refName);
+ gi.load(groupF);
+ m = gi.getm();
+
+ std::vector<int> tlens;
+ std::vector<double> tpm, fpkm, isopct;
+ std::vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+ // For allele-specific expression
+ int m_trans = 0;
+ GroupInfo gt, ta;
+ std::vector<double> trans_lens, trans_eels, trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct;
+
+ bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific
+
+ for (int i = 1; i <= M; i++)
+ general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
+
+ calcExpressionValues(M, counts, eel, tpm, fpkm);
+
+ //calculate IsoPct, etc.
+ isopct.assign(M + 1, 0.0);
+ tlens.assign(M + 1, 0);
+
+ glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+ gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(j);
+ tlens[j] = transcript.getLength();
+
+ gene_counts[i] += counts[j];
+ gene_tpm[i] += tpm[j];
+ gene_fpkm[i] += fpkm[j];
+ }
+
+ if (gene_tpm[i] < EPSILON) {
+ double frac = 1.0 / (e - b);
+ for (int j = b; j < e; j++) {
+ glens[i] += tlens[j] * frac;
+ gene_eels[i] += eel[j] * frac;
+ }
+ }
+ else {
+ for (int j = b; j < e; j++) {
+ isopct[j] = tpm[j] / gene_tpm[i];
+ glens[i] += tlens[j] * isopct[j];
+ gene_eels[i] += eel[j] * isopct[j];
+ }
+ }
+ }
+
+ if (alleleS) {
+ m_trans = ta.getm();
+ ta_pct.assign(M + 1, 0.0);
+ trans_lens.assign(m_trans, 0.0); trans_eels.assign(m_trans, 0.0);
+ trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0);
+
+ for (int i = 0; i < m_trans; i++) {
+ int b = ta.spAt(i), e = ta.spAt(i + 1);
+ for (int j = b; j < e; j++) {
+ trans_counts[i] += counts[j];
+ trans_tpm[i] += tpm[j];
+ trans_fpkm[i] += fpkm[j];
+ }
+
+ if (trans_tpm[i] < EPSILON) {
+ double frac = 1.0 / (e - b);
+ for (int j = b; j < e; j++) {
+ trans_lens[i] += tlens[j] * frac;
+ trans_eels[i] += eel[j] * frac;
+ }
+ }
+ else {
+ for (int j = b; j < e; j++) {
+ ta_pct[j] = tpm[j] / trans_tpm[i];
+ trans_lens[i] += tlens[j] * ta_pct[j];
+ trans_eels[i] += eel[j] * ta_pct[j];
+ }
+ }
+ }
+
+ gt_pct.assign(m_trans, 0.0);
+ for (int i = 0; i < m; i++)
+ if (gene_tpm[i] >= EPSILON) {
+ int b = gt.spAt(i), e = gt.spAt(i + 1);
+ for (int j = b; j < e; j++) gt_pct[j] = trans_tpm[j] / gene_tpm[i];
+ }
+ }
+
+ //allele level
+ if (alleleS) {
+ sprintf(outF, "%s.sim.alleles.results", outFN);
+ fo = fopen(outF, "w");
+ fprintf(fo, "allele_id\ttranscript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tAlleleIsoPct\tAlleleGenePct\n");
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s\t%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getSeqName().c_str(), transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+ eel[i], counts[i], tpm[i], fpkm[i], ta_pct[i] * 1e2, isopct[i] * 1e2);
+ }
+ fclose(fo);
+ }
+
+ //isoform level
+ sprintf(outF, "%s.sim.isoforms.results", outFN);
+ fo = fopen(outF, "w");
+ fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
+ if (!alleleS) {
+ for (int i = 1; i <= M; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i);
+ fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+ eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
+ }
+ }
+ else {
+ for (int i = 0; i < m_trans; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(ta.spAt(i));
+ fprintf(fo, "%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), trans_lens[i],
+ trans_eels[i], trans_counts[i], trans_tpm[i], trans_fpkm[i], gt_pct[i] * 1e2);
+ }
+ }
+ fclose(fo);
+
+ //gene level
+ sprintf(outF, "%s.sim.genes.results", outFN);
+ fo = fopen(outF, "w");
+ fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
+ for (int i = 0; i < m; i++) {
+ int b = gi.spAt(i), e = gi.spAt(i + 1);
+ const std::string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
+ fprintf(fo, "%s\t", gene_id.c_str());
+ std::string curtid = "", tid;
+ for (int j = b; j < e; j++) {
+ tid = transcripts.getTranscriptAt(j).getTranscriptID();
+ if (curtid != tid) {
+ if (curtid != "") fprintf(fo, ",");
+ fprintf(fo, "%s", tid.c_str());
+ curtid = tid;
+ }
+ }
+ fprintf(fo, "\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
+ }
+ fclose(fo);
+}
+
+#endif
#include "Refs.h"
#include "GroupInfo.h"
+#include "WriteResults.h"
#include "Buffer.h"
const double *mw;
};
+struct CIParams {
+ int no;
+ int start_gene_id, end_gene_id;
+};
+
struct CIType {
float lb, ub; // the interval is [lb, ub]
CIType() { lb = ub = 0.0; }
};
-struct CIParams {
- int no;
- int start_gene_id, end_gene_id;
-};
-
int model_type;
int nMB;
char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN];
-CIType *iso_tpm, *gene_tpm, *iso_fpkm, *gene_fpkm;
+CIType *tpm, *fpkm;
+CIType *iso_tpm = NULL, *iso_fpkm = NULL;
+CIType *gene_tpm, *gene_fpkm;
int M, m;
Refs refs;
GroupInfo gi;
-char imdName[STRLEN], statName[STRLEN];
+char refName[STRLEN], imdName[STRLEN], statName[STRLEN];
char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
+bool alleleS;
+int m_trans;
+GroupInfo ta;
+
vector<double> eel; //expected effective lengths
Buffer *buffer;
CIParams *ciParamsArray;
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
- int lb, ub, span;
- double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = \sigma_{j=1}^{i}pdf[i]*(lb+i)
-
- model.getGLD().copyTo(pdf, cdf, lb, ub, span);
- clen = new double[span + 1];
- clen[0] = 0.0;
- for (int i = 1; i <= span; i++) {
- clen[i] = clen[i - 1] + pdf[i] * (lb + i);
- }
-
- eel.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++) {
- int totLen = refs.getRef(i).getTotLen();
- int fullLen = refs.getRef(i).getFullLen();
- int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
- int pos2 = max(min(totLen, ub) - lb, 0);
-
- if (pos2 == 0) { eel[i] = 0.0; continue; }
-
- eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
- assert(eel[i] >= 0);
- if (eel[i] < MINEEL) { eel[i] = 0.0; }
- }
-
- delete[] pdf;
- delete[] cdf;
- delete[] clen;
-}
-
void* sample_theta_from_c(void* arg) {
int *cvec;
double *theta;
void sample_theta_vectors_from_count_vectors() {
ModelType model;
model.read(modelF);
- calcExpectedEffectiveLengths<ModelType>(model);
+ calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
int num_threads = min(nThreads, nCV);
}
void* calcCI_batch(void* arg) {
- float *itsamples, *gtsamples, *ifsamples, *gfsamples;
+ float *tsamples, *fsamples;
+ float *itsamples = NULL, *ifsamples = NULL, *gtsamples, *gfsamples;
ifstream fin;
CIParams *ciParams = (CIParams*)arg;
+ int curtid, curaid, tid;
- itsamples = new float[nSamples];
+ tsamples = new float[nSamples];
+ fsamples = new float[nSamples];
+ if (alleleS) {
+ itsamples = new float[nSamples];
+ ifsamples = new float[nSamples];
+ }
gtsamples = new float[nSamples];
- ifsamples = new float[nSamples];
gfsamples = new float[nSamples];
fin.open(tmpF, ios::binary);
fin.seekg(pos, ios::beg);
int cnt = 0;
+ if (alleleS) {
+ curtid = curaid = -1;
+ memset(itsamples, 0, FLOATSIZE * nSamples);
+ memset(ifsamples, 0, FLOATSIZE * nSamples);
+ }
for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) {
int b = gi.spAt(i), e = gi.spAt(i + 1);
memset(gtsamples, 0, FLOATSIZE * nSamples);
memset(gfsamples, 0, FLOATSIZE * nSamples);
for (int j = b; j < e; j++) {
+ if (alleleS) {
+ tid = ta.gidAt(j);
+ if (curtid != tid) {
+ if (curtid >= 0) {
+ if (j - curaid > 1) {
+ calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
+ calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+ }
+ else {
+ iso_tpm[curtid] = tpm[curaid];
+ iso_fpkm[curtid] = fpkm[curaid];
+ }
+ }
+ curtid = tid;
+ curaid = j;
+ }
+ }
+
for (int k = 0; k < nSamples; k++) {
- fin.read((char*)(&itsamples[k]), FLOATSIZE);
- gtsamples[k] += itsamples[k];
- ifsamples[k] = 1e3 / l_bars[k] * itsamples[k];
- gfsamples[k] += ifsamples[k];
+ fin.read((char*)(&tsamples[k]), FLOATSIZE);
+ fsamples[k] = 1e3 / l_bars[k] * tsamples[k];
+ if (alleleS) {
+ itsamples[k] += tsamples[k];
+ ifsamples[k] += fsamples[k];
+ }
+ gtsamples[k] += tsamples[k];
+ gfsamples[k] += fsamples[k];
}
- calcCI(nSamples, itsamples, iso_tpm[j].lb, iso_tpm[j].ub);
- calcCI(nSamples, ifsamples, iso_fpkm[j].lb, iso_fpkm[j].ub);
+ calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub);
+ calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub);
}
if (e - b > 1) {
calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub);
}
else {
- gene_tpm[i].lb = iso_tpm[b].lb; gene_tpm[i].ub = iso_tpm[b].ub;
- gene_fpkm[i].lb = iso_fpkm[b].lb; gene_fpkm[i].ub = iso_fpkm[b].ub;
+ gene_tpm[i] = tpm[b];
+ gene_fpkm[i] = fpkm[b];
}
++cnt;
if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); }
}
-
fin.close();
- delete[] itsamples;
+ if (alleleS && (curtid >= 0)) {
+ if (gi.spAt(ciParams->end_gene_id) - curaid > 1) {
+ calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
+ calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
+ }
+ else {
+ iso_tpm[curtid] = tpm[curaid];
+ iso_fpkm[curtid] = fpkm[curaid];
+ }
+ }
+
+ delete[] tsamples;
+ delete[] fsamples;
+ if (alleleS) {
+ delete[] itsamples;
+ delete[] ifsamples;
+ }
delete[] gtsamples;
+ delete[] gfsamples;
return NULL;
}
char outF[STRLEN];
int num_threads = nThreads;
- iso_tpm = new CIType[M + 1];
+ tpm = new CIType[M + 1];
+ fpkm = new CIType[M + 1];
+ if (alleleS) {
+ iso_tpm = new CIType[m_trans];
+ iso_fpkm = new CIType[m_trans];
+ }
gene_tpm = new CIType[m];
- iso_fpkm = new CIType[M + 1];
gene_fpkm = new CIType[m];
assert(M > 0);
delete[] ciParamsArray;
- //isoform level results
- sprintf(outF, "%s.iso_res", imdName);
+ alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName);
fo = fopen(outF, "a");
for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < M ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", tpm[i].lb, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < M ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < M ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < M ? '\t' : '\n'));
+ fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n'));
fclose(fo);
+ if (alleleS) {
+ //isoform level results
+ sprintf(outF, "%s.iso_res", imdName);
+ fo = fopen(outF, "a");
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
+ for (int i = 0; i < m_trans; i++)
+ fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
+ fclose(fo);
+ }
+
//gene level results
sprintf(outF, "%s.gene_res", imdName);
fo = fopen(outF, "a");
fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
fclose(fo);
- delete[] iso_tpm;
+ delete[] tpm;
+ delete[] fpkm;
+ if (alleleS) {
+ delete[] iso_tpm;
+ delete[] iso_fpkm;
+ }
delete[] gene_tpm;
- delete[] iso_fpkm;
delete[] gene_fpkm;
if (verbose) { printf("All credibility intervals are calculated!\n"); }
exit(-1);
}
+ strcpy(refName, argv[1]);
strcpy(imdName, argv[2]);
strcpy(statName, argv[3]);
}
verbose = !quiet;
- sprintf(refF, "%s.seq", argv[1]);
+ sprintf(refF, "%s.seq", refName);
refs.loadRefs(refF, 1);
M = refs.getM();
- sprintf(groupF, "%s.grp", argv[1]);
+
+ sprintf(groupF, "%s.grp", refName);
gi.load(groupF);
m = gi.getm();
+ // allele-specific
+ alleleS = isAlleleSpecific(refName, NULL, &ta);
+ if (alleleS) m_trans = ta.getm();
+
nSamples = nCV * nSpC;
assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples
l_bars = new float[nSamples];
calculate_credibility_intervals(imdName);
delete l_bars;
- /*
- sprintf(command, "rm -f %s", tmpF);
- int status = system(command);
- if (status != 0) {
- fprintf(stderr, "Cannot delete %s!\n", tmpF);
- exit(-1);
- }
- */
return 0;
}
rsem-extract-reference-transcripts : utils.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
$(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
-rsem-synthesis-reference-transcripts : utils.h Transcript.h Transcripts.h synthesisRef.cpp
+rsem-synthesis-reference-transcripts : utils.h my_assert.h Transcript.h Transcripts.h synthesisRef.cpp
$(CC) -Wall -O3 synthesisRef.cpp -o rsem-synthesis-reference-transcripts
BowtieRefSeqPolicy.h : RefSeqPolicy.h
sampling.h : boost/random.hpp
+WriteResults.h : utils.h my_assert.h GroupInfo.h Transcript.h Transcripts.h RefSeq.h Refs.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h
+
rsem-run-em : EM.o sam/libbam.a
$(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread
-EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp EM.cpp
+EM.o : utils.h my_assert.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp WriteResults.h EM.cpp
$(CC) $(COFLAGS) EM.cpp
bc_aux.h : sam/bam.h
rsem-simulate-reads : simulation.o
$(CC) -o rsem-simulate-reads simulation.o
-simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp
+simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp WriteResults.h simulation.cpp
$(CC) $(COFLAGS) simulation.cpp
rsem-run-gibbs : Gibbs.o
$(CC) -o rsem-run-gibbs Gibbs.o -lpthread
#some header files are omitted
-Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp
+Gibbs.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Gibbs.cpp
$(CC) $(COFLAGS) Gibbs.cpp
Buffer.h : my_assert.h
$(CC) -o rsem-calculate-credibility-intervals calcCI.o -lpthread
#some header files are omitted
-calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Buffer.h calcCI.cpp
+calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h WriteResults.h Buffer.h calcCI.cpp
$(CC) $(COFLAGS) calcCI.cpp
rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a
my $genBamF = 1; # default is generating transcript bam file
my $genGenomeBamF = 0;
my $sampling = 0;
+my $calcPME = 0;
my $calcCI = 0;
my $var_opt = 0; # temporarily, only for internal use
my $quiet = 0;
my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
my $gap = 32;
+my $alleleS = 0;
+
GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
"temporary-folder=s" => \$temp_dir,
"no-qualities" => \$no_qual,
"no-bam-output" => sub { $genBamF = 0; },
"output-genome-bam" => \$genGenomeBamF,
"sampling-for-bam" => \$sampling,
+ "calc-pme" => \$calcPME,
"var" => \$var_opt,
"calc-ci" => \$calcCI,
"ci-memory=i" => \$NMB,
$sampleName = $ARGV[3];
}
+if (((-e "$refName.ta") && !(-e "$refName.gt")) || (!(-e "$refName.ta") && (-e "$refName.gt"))) {
+ print "Allele-specific expression related reference files are corrupted!\n";
+ exit(-1);
+}
+
+$alleleS = (-e "$refName.ta") && (-e "$refName.gt");
+
if ($genGenomeBamF) {
open(INPUT, "$refName.ti");
my $line = <INPUT>; chomp($line);
else { $command .= " 0"; }
if ($sampling) { $command .= " --sampling"; }
}
-if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
+if ($calcPME || $var_opt || $calcCI) { $command .= " --gibbs-out"; }
if ($quiet) { $command .= " -q"; }
&runCommand($command);
-&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+if ($alleleS) {
+ &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+}
+else {
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+}
if ($genBamF) {
$command = $dir."sam/samtools sort -@ $nThreads -m $SortMem $sampleName.transcript.bam $sampleName.transcript.sorted";
if ($mTime) { $time_start = time(); }
-if ($calcCI || $var_opt) {
+if ($calcPME || $var_opt || $calcCI ) {
$command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
$command .= " -p $nThreads";
if ($var_opt) { $command .= " --var"; }
&runCommand($command);
}
-if ($calcCI) {
- system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
- system("mv $sampleName.genes.results $imdName.genes.results.bak1");
- &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
- &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+if ($calcPME || $calcCI) {
+ if ($alleleS) {
+ system("mv $sampleName.alleles.results $imdName.alleles.results.bak1");
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+ &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ }
+ else {
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ }
+}
+if ($calcCI) {
$command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
$command .= " -p $nThreads";
if ($quiet) { $command .= " -q"; }
&runCommand($command);
- system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
- system("mv $sampleName.genes.results $imdName.genes.results.bak2");
- &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
- &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ if ($alleleS) {
+ system("mv $sampleName.alleles.results $imdName.alleles.results.bak2");
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+ &collectResults("allele", "$imdName.allele_res", "$sampleName.alleles.results"); # allele level
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ }
+ else {
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+ &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+ }
}
if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
print OUTPUT "Aligning reads: $time_alignment s.\n";
print OUTPUT "Estimating expression levels: $time_rsem s.\n";
print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
- my $time_del = $time_end - $time_start;
+# my $time_del = $time_end - $time_start;
# print OUTPUT "Delete: $time_del s.\n";
close(OUTPUT);
}
When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
+=item B<--calc-pme>
+
+Run RSEM's collapsed Gibbs sampler to calculate posterior mean estimates. (Default: off)
+
=item B<--calc-ci>
-Calculate 95% credibility intervals and posterior mean estimates. (Default: off)
+Calculate 95% credibility intervals and posterior mean estimates. (Default: off)
=item B<--seed-length> <int>
transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
-Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set.
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
'transcript_id' is the transcript name of this transcript. 'gene_id'
is the gene name of the gene which this transcript belongs to (denote
gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
-Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set.
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
'transcript_id(s)' is a comma-separated list of transcript_ids
belonging to this gene. If no gene information is provided, 'gene_id'
effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
are just the sum of its transcripts' abundance estimates.
+=item B<sample_name.alleles.results>
+
+Only generated when the RSEM references are built with allele-specific
+transcripts.
+
+This file contains allele level expression estimates for
+allele-specific expression calculation. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [pme_expected_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+
+Fields are separated by the tab character. Fields within "[]" are
+optional. They will not be presented if neither '--calc-pme' nor
+'--calc-ci' is set.
+
+'allele_id' is the allele-specific name of this allele-specific transcript.
+
+'AlleleIsoPct' stands for allele-specific percentage on isoform
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent transcript's abundance. If its parent
+transcript has only one allele variant form, this field will be set to
+100.
+
+'AlleleGenePct' stands for allele-specific percentage on gene
+level. It is the percentage of this allele-specific transcript's
+abundance over its parent gene's abundance.
+
+'AlleleIsoPct_from_pme_TPM' and 'AlleleGenePct_from_pme_TPM' have
+similar meanings. They are calculated based on posterior mean
+estimates.
+
+Please note that if this file is present, the fields 'length' and
+'effective_length' in 'sample_name.isoforms.results' should be
+interpreted similarly as the corresponding definitions in
+'sample_name.genes.results'.
+
=item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
Only generated when --no-bam-output is not specified.
nrow_per_page <- 3 # if input_list is composed of transcript ids
ncol_per_page <- 2 # if input_list is composed of transcript ids
-num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids
-
+num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript/allele ids
exit_with_error <- function(errmsg) {
cat(errmsg, "\n", sep = "", file = stderr())
quit(save = "no", status = 1)
}
-
args <- commandArgs(TRUE)
-if (length(args) != 5)
- exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file")
+if (length(args) != 6)
+ exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_allele_specific id_type<0,allele;1,isoform;2,gene> show_uniq output_plot_file")
sample_name <- args[1]
input_list <- args[2]
-is_gene <- as.numeric(args[3])
-show_uniq <- as.numeric(args[4])
-output_plot_file <- args[5]
+alleleS <- as.numeric(args[3])
+id_type <- as.numeric(args[4])
+show_uniq <- as.numeric(args[5])
+output_plot_file <- args[6]
+is_composite <- (!alleleS && (id_type == 2)) || (alleleS && (id_type > 0))
+
load_readdepth_file <- function(filename) {
data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE)
readdepth <- split(data[, 2:3], data[, 1])
}
-build_t2gmap <- function(filename) {
- tpos <- 1 # the position of transcript_id
- gpos <- 2 # the position of gene_id
+build_map <- function(filename) {
+ value_pos <- 1
+ if (!alleleS || (alleleS && id_type == 1)) {
+ key_pos <- 2
+ } else {
+ key_pos <- 3
+ }
data <- read.delim(file = filename, sep = "\t", stringsAsFactors = FALSE)
- tmp <- aggregate(data[tpos], data[gpos], function(x) x)
- t2gmap <- tmp[,2]
- names(t2gmap) <- tmp[,1]
+ tmp <- aggregate(data[value_pos], data[key_pos], function(x) x)
+ map <- tmp[,2]
+ names(map) <- tmp[,1]
- t2gmap
+ map
}
-make_a_plot <- function(tid) {
- vec <- readdepth[[tid]]
- if (is.null(vec)) exit_with_error(paste("Unknown transcript detected,", tid, "is not included in RSEM's indices."))
+make_a_plot <- function(id) {
+ vec <- readdepth[[id]]
+ if (is.null(vec)) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS, "allele-specific transcript", "transcript"), id))
if (is.na(vec[[2]])) wiggle <- rep(0, vec[[1]]) else wiggle <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
len <- length(wiggle)
if (!show_uniq) {
plot(wiggle, type = "h")
} else {
- vec <- readdepth_uniq[[tid]]
+ vec <- readdepth_uniq[[id]]
stopifnot(!is.null(vec))
if (is.na(vec[[2]])) wiggle_uniq <- rep(0, vec[[1]]) else wiggle_uniq <- as.numeric(unlist(strsplit(vec[[2]], split = " ")))
stopifnot(len == length(wiggle_uniq))
if (len != sum(wiggle >= wiggle_uniq)) {
- cat("Warning: transcript ", tid, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", " The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", " This may be due to floating point arithmetics.\n", sep = "")
+ cat("Warning: ", ifelse(alleleS, "allele-specific transcript", "transcript"), " ", id, " has position(s) that read covarege with multireads is smaller than read covarge without multireads.\n", " The 1-based position(s) is(are) : ", which(wiggle < wiggle_uniq), ".\n", " This may be due to floating point arithmetics.\n", sep = "")
}
heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq)
barplot(heights, space = 0, border = NA, names.arg = 1:len, col = c("black", "red"))
}
- title(main = tid) #, xlab = "Position in transcript", ylab = "Read depth")
+ title(main = id)
}
-generate_a_page <- function(tids, gene_id = NULL) {
- n <- length(tids)
- ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page)
- nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page)
+generate_a_page <- function(ids, title = NULL) {
+ n <- length(ids)
+ ncol <- ifelse(is_composite, floor(sqrt(n)), ncol_per_page)
+ nrow <- ifelse(is_composite, ceiling(n / ncol), nrow_per_page)
par(mfrow = c(nrow, ncol), mar = c(2, 2, 2, 2))
- if (is_gene) par(oma = c(0, 0, 3, 0))
- sapply(tids, make_a_plot)
- if (is_gene) mtext(gene_id, outer = TRUE, line = 1)
+ if (is_composite) par(oma = c(0, 0, 3, 0))
+ sapply(ids, make_a_plot)
+ if (is_composite) mtext(title, outer = TRUE, line = 1)
}
-plot_a_transcript <- function(i) {
+plot_individual <- function(i) {
fr <- (i - 1) * num_plots_per_page + 1
to <- min(i * num_plots_per_page, n)
generate_a_page(ids[fr:to])
}
-plot_a_gene <- function(gene_id) {
- if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Unknown gene detected,", gene_id, "is not included in RSEM's in indices."))
- generate_a_page(t2gmap[[gene_id]], gene_id)
+# cid, composite id, can be either a gene id or transcript id (for allele-specific expression only)
+plot_composite <- function(cid) {
+ if (is.null(map[[cid]])) exit_with_error(sprintf("Unknown %s detected, %s is not included in RSEM's indices.", ifelse(alleleS && id_type == 1, "transcript", "gene"), cid))
+ generate_a_page(map[[cid]], cid)
}
readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = ""))
cat("Loading files is done!\n")
-if (is_gene) {
- t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = ""))
+if (is_composite) {
+ file_name <- sprintf("%s.%s.results", sample_name, ifelse(alleleS, "alleles", "isoforms"))
+ map <- build_map(file_name)
cat("Building transcript to gene map is done!\n")
}
pdf(output_plot_file)
-if (!is_gene) {
+if (!is_composite) {
n <- length(ids)
ub <- (n - 1) %/% num_plots_per_page + 1
- tmp <- sapply(1:ub, plot_a_transcript)
+ dumbvar <- sapply(1:ub, plot_individual)
} else {
- tmp <- sapply(ids, plot_a_gene)
+ dumbvar <- sapply(ids, plot_composite)
}
cat("Plots are generated!\n")
dev.off.output <- dev.off()
-
-
-
ylab = "Number of fragments",
main = "Histogram of Alignments per Fragment")
-dev.off()
+dev.off.output <- dev.off()
use rsem_perl_utils;
-my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list
+my $gene_list = 0; # default is 0, means input is not a gene list
+my $transcript_list = 0; # default is 0, this option can only be turned on if allele-specific expression is calculated
my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles
my $help = 0;
GetOptions("gene-list" => \$gene_list,
+ "transcript-list" => \$transcript_list,
"show-unique" => \$show_unique,
"h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+my $alleleS = 0;
+if (-e "$ARGV[0].alleles.results") { $alleleS = 1; }
+
+pod2usage(-msg => "--transcript-list cannot be set if allele-specific reference is not built!", -exitval => 2, -verbose => 2) if (!$alleleS && $transcript_list);
+pod2usage(-msg => "--gene-list and --transcript-list cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($gene_list && $transcript_list);
+
my $dir = "$FindBin::Bin/";
my $command = "";
}
}
-$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]";
+my $id_type;
+
+if ($alleleS) {
+ $id_type = 0;
+ if ($transcript_list) { $id_type = 1; }
+ if ($gene_list) { $id_type = 2; }
+}
+else {
+ $id_type = 1;
+ if ($gene_list) { $id_type = 2; }
+}
+
+$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $alleleS $id_type $show_unique $ARGV[2]";
&runCommand($command);
__END__
The input-list is a list of gene ids. (Default: off)
+=item B<--transcript-list>
+
+The input-list is a list of transcript ids. This option can only be turned on if allele-specific expression is calculated. (Default: off)
+
=item B<--show-unique>
Show the wiggle plots as stacked bar plots. See description section for details. (Default: off)
=head1 DESCRIPTION
-This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated.
+This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated. If allele-specific expression is calculated, the basic unit for plotting is an allele-specific transcript and plots can be grouped by either transcript ids (--transcript-list) or gene ids (--gene-list).
=head1 OUTPUT
=item B<output_plot_file>
-This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth.
+This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. If allele-specific expression is calculated, the basin unit becomes an allele-specific transcript and transcript ids and gene ids can be used to group allele-specific transcripts.
=item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth>
my $quiet = 0;
my $help = 0;
+my $alleleMappingF = "";
+
GetOptions("gtf=s" => \$gtfF,
"transcript-to-gene-map=s" => \$mappingF,
+ "allele-to-gene-map=s" => \$alleleMappingF,
"no-polyA" => \$no_polyA,
"no-polyA-subset=s" => \$subsetFile,
"polyA-length=i" => \$polyALen,
pod2usage(-verbose => 2) if ($help == 1);
pod2usage(-msg => "Set --no-polyA & --no-polyA-subset at the same time!", -exitval => 2, -verbose => 2) if ($no_polyA == 1 && $subsetFile ne '');
+pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne ""));
+pod2usage(-msg => "--gtf and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($alleleMappingF ne ""));
pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
if ($bowtie2) { $no_bowtie = 1; $no_ntog = 1; }
$"=" ";
$command = $dir."rsem-synthesis-reference-transcripts $ARGV[1] $quiet";
if ($mappingF ne "") { $command .= " 1 $mappingF"; }
+ elsif ($alleleMappingF ne "") { $command .= " 2 $alleleMappingF"; }
else { $command .= " 0"; }
$command .= " @list";
&runCommand($command);
(Default: off)
+=item B<--allele-to-gene-map> <file>
+
+Use information from <file> to provide gene_id and transcript_id information for each allele-specific transcript.
+Each line of <file> should be of the form:
+
+gene_id transcript_id allele_id
+
+with the fields separated by a tab character.
+
+This option is designed for quantifying allele-specific expression. It is only valid if '--gtf' option is not specified. allele_id should be the sequence names presented in the FASTA-formatted files.
+
+(Default: off)
+
=item B<--no-polyA>
Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts)
print "\n";
}
+my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "pme_expected_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "pme_expected_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "pme_expected_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound");
-# inpF, outF
+# type, inpF, outF
sub collectResults {
my $local_status;
my ($inpF, $outF);
my @out_arr = ();
for (my $i = 0; $i < $n; $i++) {
- if ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); }
+ if ($_[0] eq "allele") { push(@out_arr, $allele_title[$i]); }
+ elsif ($_[0] eq "isoform") { push(@out_arr, $transcript_title[$i]); }
elsif ($_[0] eq "gene") { push(@out_arr, $gene_title[$i]); }
else { print "A bug on 'collectResults' is detected!\n"; exit(-1); }
}
#include "PairedEndQModel.h"
#include "Refs.h"
-#include "GroupInfo.h"
#include "Transcript.h"
#include "Transcripts.h"
+#include "WriteResults.h"
+
#include "simul.h"
using namespace std;
-const int OFFSITE = 5;
+bool alleleS;
+int OFFSITE;
READ_INT_TYPE N;
-int model_type, M, m;
+int model_type, M;
Refs refs;
-GroupInfo gi;
Transcripts transcripts;
vector<double> eel;
ostream *os[2];
char outReadF[2][STRLEN];
-char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
-char geneResF[STRLEN], isoResF[STRLEN];
+char refName[STRLEN];
+char refF[STRLEN], tiF[STRLEN];
simul sampler;
os[i] = new ofstream(outReadF[i]);
}
-template<class ModelType>
-void calcExpectedEffectiveLengths(ModelType& model) {
- int lb, ub, span;
- double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
-
- model.getGLD().copyTo(pdf, cdf, lb, ub, span);
- clen = new double[span + 1];
- clen[0] = 0.0;
- for (int i = 1; i <= span; i++) {
- clen[i] = clen[i - 1] + pdf[i] * (lb + i);
- }
-
- eel.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++) {
- int totLen = refs.getRef(i).getTotLen();
- int fullLen = refs.getRef(i).getFullLen();
- int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
- int pos2 = max(min(totLen, ub) - lb, 0);
-
- if (pos2 == 0) { eel[i] = 0.0; continue; }
-
- eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
- assert(eel[i] >= 0);
- if (eel[i] < MINEEL) { eel[i] = 0.0; }
- }
-
- delete[] pdf;
- delete[] cdf;
- delete[] clen;
-}
-
template<class ReadType, class ModelType>
void simulate(char* modelF, char* resultsF) {
ModelType model(&refs);
model.read(modelF);
//calculate eel
- calcExpectedEffectiveLengths<ModelType>(model);
+ calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
//generate theta vector
ifstream fin(resultsF);
cout<< "Total number of resimulation is "<< resimulation_count<< endl;
}
-void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
- double denom;
- vector<double> frac;
-
- //calculate fraction of count over all mappabile reads
- denom = 0.0;
- frac.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) {
- frac[i] = theta[i];
- denom += frac[i];
- }
- general_assert(denom > 0, "No alignable reads?!");
- for (int i = 1; i <= M; i++) frac[i] /= denom;
-
- //calculate FPKM
- fpkm.assign(M + 1, 0.0);
- for (int i = 1; i <= M; i++)
- if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
-
- //calculate TPM
- tpm.assign(M + 1, 0.0);
- denom = 0.0;
- for (int i = 1; i <= M; i++) denom += fpkm[i];
- for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
-}
-
-void writeResFiles(char* outFN) {
- FILE *fo;
- vector<int> tlens;
- vector<double> fpkm, tpm, isopct;
- vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
-
- for (int i = 1; i <= M; i++)
- general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
-
- calcExpressionValues(counts, eel, tpm, fpkm);
-
- //calculate IsoPct, etc.
- isopct.assign(M + 1, 0.0);
- tlens.assign(M + 1, 0);
-
- glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
- gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
-
- for (int i = 0; i < m; i++) {
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- for (int j = b; j < e; j++) {
- const Transcript& transcript = transcripts.getTranscriptAt(j);
- tlens[j] = transcript.getLength();
-
- glens[i] += tlens[j] * tpm[j];
- gene_eels[i] += eel[j] * tpm[j];
- gene_counts[i] += counts[j];
- gene_tpm[i] += tpm[j];
- gene_fpkm[i] += fpkm[j];
- }
-
- if (gene_tpm[i] < EPSILON) continue;
-
- for (int j = b; j < e; j++)
- isopct[j] = tpm[j] / gene_tpm[i];
- glens[i] /= gene_tpm[i];
- gene_eels[i] /= gene_tpm[i];
- }
-
- //isoform level
- sprintf(isoResF, "%s.sim.isoforms.results", outFN);
- fo = fopen(isoResF, "w");
- fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
- for (int i = 1; i <= M; i++) {
- const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
- eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
- }
- fclose(fo);
-
- //gene level
- sprintf(geneResF, "%s.sim.genes.results", outFN);
- fo = fopen(geneResF, "w");
- fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
- for (int i = 0; i < m; i++) {
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
- fprintf(fo, "%s\t", gene_id.c_str());
- for (int j = b; j < e; j++) {
- fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\t'));
- }
- fprintf(fo, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
- }
- fclose(fo);
-}
-
void releaseOutReadStreams() {
for (int i = 0; i < n_os; i++) {
((ofstream*)os[i])->close();
printf("Parameters:\n\n");
printf("reference_name: The name of RSEM references, which should be already generated by 'rsem-prepare-reference'\n");
printf("estimated_model_file: This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model'\n");
- printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required.\n");
+ printf("estimated_isoform_results: This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. If the RSEM references built are aware of allele-specific transcripts, 'sample_name.alleles.results' should be used instead.\n");
printf("theta0: This parameter determines the fraction of reads that are coming from background \"noise\" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'.\n");
printf("N: The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'.\n");
printf("output_name: Prefix for all output files.\n");
if (argc == 8 && !strcmp(argv[7], "-q")) quiet = true;
verbose = !quiet;
+ strcpy(refName, argv[1]);
+ alleleS = isAlleleSpecific(refName);
+ OFFSITE = (alleleS ? 6: 5);
+
//load basic files
sprintf(refF, "%s.seq", argv[1]);
refs.loadRefs(refF);
M = refs.getM();
- sprintf(groupF, "%s.grp", argv[1]);
- gi.load(groupF);
- m = gi.getm();
sprintf(tiF, "%s.ti", argv[1]);
transcripts.readFrom(tiF);
case 3: simulate<PairedEndReadQ, PairedEndQModel>(argv[2], argv[3]); break;
}
- writeResFiles(argv[6]);
+ writeResultsSimulation(M, refName, argv[6], transcripts, eel, counts);
releaseOutReadStreams();
return 0;
#include<fstream>
#include<sstream>
#include<map>
+#include<vector>
#include "utils.h"
+#include "my_assert.h"
#include "Transcript.h"
#include "Transcripts.h"
Transcripts transcripts(1); // no genome, just transcript set
char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN];
+char gtF[STRLEN], taF[STRLEN]; // group info between gene and transcript, transcript and allele
-bool hasMappingFile;
+int hasMappingFile;
char mappingFile[STRLEN];
map<string, string> mi_table; // mapping info table
map<string, string>::iterator mi_iter; //mapping info table's iterator
-void loadMappingInfo(char* mappingF) {
- ifstream fin(mappingF);
- string line, key, value;
-
- if (!fin.is_open()) {
- fprintf(stderr, "Cannot open %s! It may not exist.\n", mappingF);
- exit(-1);
- }
+map<string, string> mi_table2; // allele_id to transcript_id
+map<string, string>::iterator mi_iter2; // corresponding iterator
- mi_table.clear();
- while (getline(fin, line)) {
- line = cleanStr(line);
- if (line[0] == '#') continue;
- istringstream strin(line);
- strin>>value>>key;
- mi_table[key] = value;
+void loadMappingInfo(int file_type, char* mappingF) {
+ ifstream fin(mappingF);
+ string line, key, value, value2;
+
+ general_assert(fin.is_open(), "Cannot open " + cstrtos(mappingF) + "! It may not exist.");
+
+ switch(file_type) {
+ case 1:
+ mi_table.clear();
+ while (getline(fin, line)) {
+ line = cleanStr(line);
+ if (line[0] == '#') continue;
+ istringstream strin(line);
+ strin>>value>>key;
+ mi_table[key] = value;
+ }
+ break;
+ case 2:
+ mi_table.clear();
+ mi_table2.clear();
+ while (getline(fin, line)) {
+ line = cleanStr(line);
+ if (line[0] == '#') continue;
+ istringstream strin(line);
+ strin>> value>> value2>> key;
+ mi_table[key] = value;
+ mi_table2[key] = value2;
+ }
+ break;
+ default: assert(false);
}
fin.close();
return c;
}
-void writeResults(char* refName) {
+void writeResults(int option, char* refName) {
ofstream fout, fout2;
- string cur_gene_id, name;
+ string cur_gene_id, cur_transcript_id, name;
+ vector<int> gi, gt, ta;
- sprintf(groupF, "%s.grp", refName);
sprintf(tiF, "%s.ti", refName);
- sprintf(refFastaF, "%s.transcripts.fa", refName);
- sprintf(chromListF, "%s.chrlist", refName);
-
transcripts.writeTo(tiF);
if (verbose) { printf("Transcript Information File is generated!\n"); }
- fout.open(groupF);
- cur_gene_id = "";
+ cur_gene_id = ""; gi.clear();
+ if (option == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
for (int i = 1; i <= M; i++) {
const Transcript& transcript = transcripts.getTranscriptAt(i);
if (cur_gene_id != transcript.getGeneID()) {
- fout<<i<<endl;
- cur_gene_id = transcript.getGeneID();
+ gi.push_back(i);
+ if (option == 2) gt.push_back((int)ta.size());
+ cur_gene_id = transcript.getGeneID();
+ }
+ if ((option == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
+ ta.push_back(i);
+ cur_transcript_id = transcript.getTranscriptID();
}
}
- fout<<M + 1<<endl;
+ gi.push_back(M + 1);
+ if (option == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }
+
+ sprintf(groupF, "%s.grp", refName);
+ fout.open(groupF);
+ for (int i = 0; i < (int)gi.size(); i++) fout<< gi[i]<< endl;
fout.close();
if (verbose) { printf("Group File is generated!\n"); }
+ if (option == 2) {
+ sprintf(gtF, "%s.gt", refName);
+ fout.open(gtF);
+ for (int i = 0; i < (int)gt.size(); i++) fout<< gt[i]<< endl;
+ fout.close();
+ sprintf(taF, "%s.ta", refName);
+ fout.open(taF);
+ for (int i = 0; i < (int)ta.size(); i++) fout<< ta[i]<< endl;
+ fout.close();
+ if (verbose) { printf("Allele-specific group files are generated!\n"); }
+ }
+
+ sprintf(refFastaF, "%s.transcripts.fa", refName);
+ sprintf(chromListF, "%s.chrlist", refName);
fout2.open(chromListF);
fout.open(refFastaF);
for (int i = 1; i <= M; i++) {
- name = transcripts.getTranscriptAt(i).getTranscriptID();
+ name = transcripts.getTranscriptAt(i).getSeqName();
iter = name2seq.find(name);
- if (iter == name2seq.end()) {
- fprintf(stderr, "Cannot recognize transcript ID %s!\n", name.c_str());
- exit(-1);
- }
+ general_assert(iter != name2seq.end(), "Cannot recognize sequence ID" + name + "!");
fout<<">"<<name<<endl;
fout<<iter->second<<endl;
int main(int argc, char* argv[]) {
if (argc < 5 || ((hasMappingFile = atoi(argv[3])) && argc < 6)) {
- printf("Usage: synthesisRef refName quiet hasMappingFile [mappingFile] reference_file_1 [reference_file_2 ...]\n");
+ printf("Usage: synthesisRef refName quiet hasMappingFile<0,no;1,yes;2,allele-specific> [mappingFile] reference_file_1 [reference_file_2 ...]\n");
exit(-1);
}
verbose = !atoi(argv[2]);
- if (hasMappingFile) { loadMappingInfo(argv[4]); }
+ if (hasMappingFile) { loadMappingInfo(hasMappingFile, argv[4]); }
+
+ // allele-specific
+ if (hasMappingFile == 2) { transcripts.setType(2); }
int start = hasMappingFile ? 5 : 4;
ifstream fin;
string line, gseq;
- string seqname, gene_id;
+ string seqname, gene_id, transcript_id;
vector<Interval> vec;
name2seq.clear();
for (int i = start; i < argc; i++) {
fin.open(argv[i]);
- if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); }
+ general_assert(fin.is_open(), "Cannot open " + cstrtos(argv[i]) + "! It may not exist.");
getline(fin, line);
while ((fin) && (line[0] == '>')) {
istringstream strin(line.substr(1));
name2seq[seqname] = gseq;
+ transcript_id = seqname;
+ gene_id = seqname;
+
if (hasMappingFile) {
mi_iter = mi_table.find(seqname);
- if (mi_iter == mi_table.end()) {
- fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", seqname.c_str());
- exit(-1);
- }
- //assert(iter != table.end());
+ general_assert(mi_iter != mi_table.end(), "Mapping Info is not correct, cannot find " + seqname + "'s gene_id!");
gene_id = mi_iter->second;
+ if (hasMappingFile == 2) {
+ mi_iter2 = mi_table2.find(seqname);
+ general_assert(mi_iter2 != mi_table2.end(), "Mapping Info is not correct, cannot find allele " + seqname + "'s transcript_id!");
+ transcript_id = mi_iter2->second;
+ }
}
- else gene_id = seqname;
-
+
vec.clear();
vec.push_back(Interval(1, len));
- transcripts.add(Transcript(seqname, gene_id, seqname, '+', vec, ""));
+ transcripts.add(Transcript(transcript_id, gene_id, seqname, '+', vec, ""));
++M;
if (verbose && M % 1000000 == 0) { printf("%d sequences are processed!\n", M); }
assert(M == transcripts.getM());
transcripts.sort();
- writeResults(argv[1]);
+ writeResults(hasMappingFile, argv[1]);
return 0;
}