X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=simulation.cpp;h=1288c65178afa360f89f4cff3ee5a2f78b7de5cf;hb=1d786dca8b4c9c1e2176a28a669d2d16cd93e395;hp=2511f33693ce42a3751d07542dca2ce8139a0f54;hpb=58f823f5be6dfbe00896fc44ac3aac5e881e9c5c;p=rsem.git

diff --git a/simulation.cpp b/simulation.cpp
index 2511f33..1288c65 100644
--- a/simulation.cpp
+++ b/simulation.cpp
@@ -9,7 +9,7 @@
 #include<vector>
 
 #include "utils.h"
-
+#include "my_assert.h"
 #include "Read.h"
 #include "SingleRead.h"
 #include "SingleReadQ.h"
@@ -31,15 +31,17 @@
 
 using namespace std;
 
-int N;
+const int OFFSITE = 5;
+
+READ_INT_TYPE N;
 int model_type, M, m;
 
 Refs refs;
 GroupInfo gi;
 Transcripts transcripts;
 
-double *theta, *counts;
 vector<double> eel;
+vector<double> theta, counts;
 
 int n_os;
 ostream *os[2];
@@ -78,34 +80,33 @@ void genOutReadStreams(int type, char *outFN) {
 
 template<class ModelType>
 void calcExpectedEffectiveLengths(ModelType& model) {
-  int lb, ub, span;
-  double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
+	int lb, ub, span;
+	double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
   
-  model.getGLD().copyTo(pdf, cdf, lb, ub, span);
-  clen = new double[span + 1];
-  clen[0] = 0.0;
-  for (int i = 1; i <= span; i++) {
-    clen[i] = clen[i - 1] + pdf[i] * (lb + i);
-  }
-
-  eel.clear();
-  eel.resize(M + 1, 0.0);
-  for (int i = 1; i <= M; i++) {
-    int totLen = refs.getRef(i).getTotLen();
-    int fullLen = refs.getRef(i).getFullLen();
-    int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
-    int pos2 = max(min(totLen, ub) - lb, 0);
-
-    if (pos2 == 0) { eel[i] = 0.0; continue; }
+	model.getGLD().copyTo(pdf, cdf, lb, ub, span);
+	clen = new double[span + 1];
+	clen[0] = 0.0;
+	for (int i = 1; i <= span; i++) {
+		clen[i] = clen[i - 1] + pdf[i] * (lb + i);
+	}
+
+	eel.assign(M + 1, 0.0);
+	for (int i = 1; i <= M; i++) {
+		int totLen = refs.getRef(i).getTotLen();
+		int fullLen = refs.getRef(i).getFullLen();
+		int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
+		int pos2 = max(min(totLen, ub) - lb, 0);
+
+		if (pos2 == 0) { eel[i] = 0.0; continue; }
     
-    eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
-    assert(eel[i] >= 0);
-    if (eel[i] < MINEEL) { eel[i] = 0.0; }
-  }
+		eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
+		assert(eel[i] >= 0);
+		if (eel[i] < MINEEL) { eel[i] = 0.0; }
+	}
   
-  delete[] pdf;
-  delete[] cdf;
-  delete[] clen;
+	delete[] pdf;
+	delete[] cdf;
+	delete[] clen;
 }
 
 template<class ReadType, class ModelType>
@@ -122,101 +123,129 @@ void simulate(char* modelF, char* resultsF) {
 	//generate theta vector
 	ifstream fin(resultsF);
 	string line;
-	double tau;
+	double tpm;
 	double denom = 0.0;
+	getline(fin, line); // read the first line, which is just column names
 	for (int i = 1; i <= M; i++) {
 	  getline(fin, line);
 	  size_t pos = 0;
-	  for (int j = 0; j < 3; j++) pos = line.find_first_of('\t', pos) + 1;
+	  for (int j = 0; j < OFFSITE; j++) pos = line.find_first_of('\t', pos) + 1;
 	  size_t pos2 = line.find_first_of('\t', pos);
 	  if (pos2 == string::npos) pos2 = line.length();
-	  tau = atof(line.substr(pos, pos2 - pos).c_str());
-	  theta[i] = tau * eel[i];
+	  tpm = atof(line.substr(pos, pos2 - pos).c_str());
+	  theta[i] = tpm * eel[i]; // during simulation, there is no check for effL < 0. The reason is for that case, eel[i] here = 0 and therefore no chance to sample from it
 	  denom += theta[i];
 	}
 	assert(denom > EPSILON);
 	fin.close();
 	for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]);
 	
-	int tmp2140 = 0;
+	READ_INT_TYPE resimulation_count = 0;
 
 	//simulating...
 	model.startSimulation(&sampler, theta);
-	for (int i = 0; i < N; i++) {
-	  while (!model.simulate(i, read, sid)) { ++tmp2140; }//;
+	for (READ_INT_TYPE i = 0; i < N; i++) {
+		while (!model.simulate(i, read, sid)) { ++resimulation_count; }
 		read.write(n_os, os);
 		++counts[sid];
-		if ((i + 1) % 1000000 == 0 && verbose) printf("GEN %d\n", i + 1);
+		if ((i + 1) % 1000000 == 0 && verbose) cout<<"GEN "<< i + 1<< endl;
 	}
 	model.finishSimulation();
 
-	printf("Total number of resimulation is %d\n", tmp2140);
+	cout<< "Total number of resimulation is "<< resimulation_count<< endl;
 }
 
-void writeResFiles(char* outFN) {
-	FILE *fo;
+void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
 	double denom;
+	vector<double> frac;
 
-	//calculate normalized read fraction
-	double *nrf = new double[M + 1];
-	memset(nrf, 0, sizeof(double) * (M + 1));
+	//calculate fraction of count over all mappabile reads
 	denom = 0.0;
+	frac.assign(M + 1, 0.0);
 	for (int i = 1; i <= M; i++)
-	  if (eel[i] > EPSILON) {
-	    nrf[i] = counts[i];
-	    denom += nrf[i];
+	  if (eel[i] >= EPSILON) {
+	    frac[i] = theta[i];
+	    denom += frac[i];
 	  }
-	  else {
-	    if (counts[i] > EPSILON) { printf("Warning: An isoform which EEL is less than %.6g gets sampled!\n", MINEEL); }
-	  }
-	assert(denom > 0.0);
-	for (int i = 1; i <= M; i++) nrf[i] /= denom;
+	general_assert(denom > 0, "No alignable reads?!");
+	for (int i = 1; i <= M; i++) frac[i] /= denom;
+
+	//calculate FPKM
+	fpkm.assign(M + 1, 0.0);
+	for (int i = 1; i <= M; i++)
+		if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
 
-	//calculate tau values
-	double *tau = new double[M + 1];
-	memset(tau, 0, sizeof(double) * (M + 1));
+	//calculate TPM
+	tpm.assign(M + 1, 0.0);
 	denom = 0.0;
-	for (int i = 1; i <= M; i++) 
-	  if (eel[i] > EPSILON) {
-	    tau[i] = nrf[i] / eel[i];
-	    denom += tau[i];
-	  }   
-	assert(denom > 0.0);
-	for (int i = 1; i <= M; i++) tau[i] /= denom;
+	for (int i = 1; i <= M; i++) denom += fpkm[i];
+	for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
+}
+
+void writeResFiles(char* outFN) {
+	FILE *fo;
+	vector<int> tlens;
+	vector<double> fpkm, tpm, isopct;
+	vector<double> glens, gene_eels, gene_counts, gene_tpm, gene_fpkm;
+
+	for (int i = 1; i <= M; i++)
+		general_assert(eel[i] > EPSILON || counts[i] <= EPSILON, "An isoform whose effecitve length < " + ftos(MINEEL, 6) + " got sampled!");
+
+	calcExpressionValues(counts, eel, tpm, fpkm);
+
+	//calculate IsoPct, etc.
+	isopct.assign(M + 1, 0.0);
+	tlens.assign(M + 1, 0);
+
+	glens.assign(m, 0.0); gene_eels.assign(m, 0.0);
+	gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
+
+	for (int i = 0; i < m; i++) {
+		int b = gi.spAt(i), e = gi.spAt(i + 1);
+		for (int j = b; j < e; j++) {
+			const Transcript& transcript = transcripts.getTranscriptAt(j);
+			tlens[j] = transcript.getLength();
+
+			glens[i] += tlens[j] * tpm[j];
+			gene_eels[i] += eel[j] * tpm[j];
+			gene_counts[i] += counts[j];
+			gene_tpm[i] += tpm[j];
+			gene_fpkm[i] += fpkm[j];
+		}
+
+		if (gene_tpm[i] < EPSILON) continue;
+
+		for (int j = b; j < e; j++)
+			isopct[j] = tpm[j] / gene_tpm[i];
+		glens[i] /= gene_tpm[i];
+		gene_eels[i] /= gene_tpm[i];
+	}
 
 	//isoform level
 	sprintf(isoResF, "%s.sim.isoforms.results", outFN);
 	fo = fopen(isoResF, "w");
+	fprintf(fo, "transcript_id\tgene_id\tlength\teffective_length\tcount\tTPM\tFPKM\tIsoPct\n");
 	for (int i = 1; i <= M; i++) {
 		const Transcript& transcript = transcripts.getTranscriptAt(i);
-		fprintf(fo, "%s\t%.2f\t%.15g\t%.15g", transcript.getTranscriptID().c_str(), counts[i], nrf[i], tau[i]);
-		
-		if (transcript.getLeft() != "") { fprintf(fo, "\t%s", transcript.getLeft().c_str()); }
-		fprintf(fo, "\n");
+		fprintf(fo, "%s\t%s\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str(), tlens[i],
+				eel[i], counts[i], tpm[i], fpkm[i], isopct[i] * 1e2);
 	}
 	fclose(fo);
 
 	//gene level
 	sprintf(geneResF, "%s.sim.genes.results", outFN);
 	fo = fopen(geneResF, "w");
+	fprintf(fo, "gene_id\ttranscript_id(s)\tlength\teffective_length\tcount\tTPM\tFPKM\n");
 	for (int i = 0; i < m; i++) {
-	  double sum_c = 0.0, sum_nrf = 0.0, sum_tau = 0.0;
 		int b = gi.spAt(i), e = gi.spAt(i + 1);
-		for (int j = b; j < e; j++) {
-			sum_c += counts[j];
-			sum_nrf += nrf[j];
-			sum_tau += tau[j];
-		}
 		const string& gene_id = transcripts.getTranscriptAt(b).getGeneID();
-		fprintf(fo, "%s\t%.2f\t%.15g\t%.15g\t", gene_id.c_str(), sum_c, sum_nrf, sum_tau);
+		fprintf(fo, "%s\t", gene_id.c_str());
 		for (int j = b; j < e; j++) {
-			fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\n'));
+			fprintf(fo, "%s%c", transcripts.getTranscriptAt(j).getTranscriptID().c_str(), (j < e - 1 ? ',' : '\t'));
 		}
+		fprintf(fo, "%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", glens[i], gene_eels[i], gene_counts[i], gene_tpm[i], gene_fpkm[i]);
 	}
 	fclose(fo);
-
-	delete[] nrf;
-	delete[] tau;
 }
 
 void releaseOutReadStreams() {
@@ -251,17 +280,16 @@ int main(int argc, char* argv[]) {
 	//read model type from modelF
 	fi = fopen(argv[2], "r");
 	if (fi == NULL) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[2]); exit(-1); }
-	fscanf(fi, "%d", &model_type);
+	assert(fscanf(fi, "%d", &model_type) == 1);
 	fclose(fi);
 
-	theta = new double[M + 1];
+	theta.assign(M + 1, 0.0);
 	theta[0] = atof(argv[4]);
 	N = atoi(argv[5]);
 
 	genOutReadStreams(model_type, argv[6]);
 
-	counts = new double[M + 1];
-	memset(counts, 0, sizeof(double) * (M + 1));
+	counts.assign(M + 1, 0.0);
 
 	switch(model_type) {
 	case 0: simulate<SingleRead, SingleModel>(argv[2], argv[3]); break;
@@ -273,8 +301,5 @@ int main(int argc, char* argv[]) {
 	writeResFiles(argv[6]);
 	releaseOutReadStreams();
 
-	delete[] theta;
-	delete[] counts;
-
 	return 0;
 }