From a42d21510e502fe5cdd21a5d7ad023b33d2e6b99 Mon Sep 17 00:00:00 2001
From: Bo Li <bli@cs.wisc.edu>
Date: Thu, 17 Mar 2011 05:39:07 -0500
Subject: [PATCH] Fixed bugs in the Gibbs sampler. Fixed a bug which will set
 probF to 0 if the protocol is not forward strand specific.

---
 EM.cpp                    | 61 ++++++++++++++++++++++++---------------
 Gibbs.cpp                 |  3 +-
 rsem-calculate-expression |  2 +-
 rsem-plot-model           |  1 +
 4 files changed, 41 insertions(+), 26 deletions(-)
diff --git a/EM.cpp b/EM.cpp
index e381b47..318b7eb 100644
--- a/EM.cpp
+++ b/EM.cpp
@@ -64,6 +64,7 @@ bool genGibbsOut; // generate file for Gibbs sampler
 char refName[STRLEN], imdName[STRLEN], outName[STRLEN];
 char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
 char mparamsF[STRLEN], bmparamsF[STRLEN];
+char modelF[STRLEN], thetaF[STRLEN];
 
 char inpSamType;
 char *pt_fn_list, *pt_chr_list;
@@ -301,20 +302,12 @@ void calcExpectedEffectiveLengths(ModelType& model) {
 template<class ModelType>
 void writeResults(ModelType& model, double* counts) {
 	double denom;
-	char modelF[STRLEN], thetaF[STRLEN];
 	char outF[STRLEN];
 	FILE *fo;
 
 	sprintf(modelF, "%s.model", outName);
 	model.write(modelF);
 
-	sprintf(thetaF, "%s.theta", outName);
-	fo = fopen(thetaF, "w");
-	fprintf(fo, "%d\n", M + 1);
-	for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
-	fprintf(fo, "%.15g\n", theta[M]);
-	fclose(fo);
-
 	//calculate tau values
 	double *tau = new double[M + 1];
 	memset(tau, 0, sizeof(double) * (M + 1));
@@ -408,6 +401,8 @@ inline bool doesUpdateModel(int ROUND) {
 //Including initialize, algorithm and results saving
 template<class ReadType, class HitType, class ModelType>
 void EM() {
+	FILE *fo;
+
 	int ROUND;
 	double sum;
 
@@ -426,6 +421,7 @@ void EM() {
 	void *status;
 	int rc;
 
+
 	//initialize boolean variables
 	updateModel = calcExpectedWeights = false;
 
@@ -514,17 +510,6 @@ void EM() {
 
 	if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND);
 
-	//calculate expected effective lengths for each isoform
-	calcExpectedEffectiveLengths<ModelType>(model);
-
-	//correct theta vector
-	sum = theta[0];
-	for (int i = 1; i <= M; i++) 
-	  if (eel[i] < EPSILON) { theta[i] = 0.0; }
-	  else sum += theta[i];
-	if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
-	for (int i = 0; i <= M; i++) theta[i] /= sum;
-
 	//generate output file used by Gibbs sampler
 	if (genGibbsOut) {
 		if (model.getNeedCalcConPrb()) {
@@ -540,7 +525,7 @@ void EM() {
 		model.setNeedCalcConPrb(false);
 
 		sprintf(out_for_gibbs_F, "%s.ofg", imdName);
-		FILE *fo = fopen(out_for_gibbs_F, "w");
+		fo = fopen(out_for_gibbs_F, "w");
 		fprintf(fo, "%d %d\n", M, N0);
 		for (int i = 0; i < nThreads; i++) {
 			int numN = hitvs[i]->getN();
@@ -549,7 +534,7 @@ void EM() {
 				int to = hitvs[i]->getSAt(j + 1);
 				int totNum = 0;
 
-				if (ncpvs[i][j] > 0.0) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); }
+				if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); }
 				for (int k = fr; k < to; k++) {
 					HitType &hit = hitvs[i]->getHitAt(k);
 					if (hit.getConPrb() >= EPSILON) {
@@ -562,8 +547,33 @@ void EM() {
 			}
 		}
 		fclose(fo);
+
+		char scoreF[STRLEN];
+		sprintf(scoreF, "%s.ns", imdName);
+		fo = fopen(scoreF, "w");
+		fprintf(fo, "%.15g\n", model.getLogP());
+		fclose(fo);
 	}
 
+	sprintf(thetaF, "%s.theta", outName);
+	fo = fopen(thetaF, "w");
+	fprintf(fo, "%d\n", M + 1);
+
+	// output theta'
+	for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+	fprintf(fo, "%.15g\n", theta[M]);
+	
+	//calculate expected effective lengths for each isoform
+	calcExpectedEffectiveLengths<ModelType>(model);
+
+	//correct theta vector
+	sum = theta[0];
+	for (int i = 1; i <= M; i++) 
+	  if (eel[i] < EPSILON) { theta[i] = 0.0; }
+	  else sum += theta[i];
+	if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
+	for (int i = 0; i <= M; i++) theta[i] /= sum;
+
 	//calculate expected weights and counts using learned parameters
 	updateModel = false; calcExpectedWeights = true;
 	for (int i = 0; i < nThreads; i++) {
@@ -587,8 +597,7 @@ void EM() {
 	/* destroy attribute */
 	pthread_attr_destroy(&attr);
 
-	
-	//for all
+	//convert theta' to theta
        	double *mw = model.getMW();
 	sum = 0.0;
 	for (int i = 0; i <= M; i++) {
@@ -598,6 +607,12 @@ void EM() {
 	assert(sum >= EPSILON);
 	for (int i = 0; i <= M; i++) theta[i] /= sum;
 
+	// output theta
+	for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]);
+	fprintf(fo, "%.15g\n", theta[M]);
+
+	fclose(fo);
+
 	writeResults<ModelType>(model, countvs[0]);
 
 	if (genBamF) {
diff --git a/Gibbs.cpp b/Gibbs.cpp
index 4b1e2d5..7727f37 100644
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -200,8 +200,7 @@ void Gibbs(char* imdName) {
 		}
 
 		if (ROUND > BURNIN) {
-			if ((ROUND - BURNIN -1) % GAP == 0) writeCountVector(fo);
-			writeCountVector(fo);
+			if ((ROUND - BURNIN - 1) % GAP == 0) writeCountVector(fo);
 			for (int i = 0; i <= M; i++) { 
 			  pme_c[i] += counts[i] - 1;
 			  pme_theta[i] += counts[i] / totc;
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 0f22282..7826ec7 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -173,7 +173,7 @@ if (!$is_sam && !$is_bam) {
     if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
     
     if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
-    elsif ($probF = 0.0) { $command .= " --nofw"; }
+    elsif ($probF == 0.0) { $command .= " --nofw"; }
 
     $command .= " -p $nThreads -a -m $maxHits -S";
     if ($quiet) { $command .= " --quiet"; }
diff --git a/rsem-plot-model b/rsem-plot-model
index 5546ff6..c0eaa1b 100755
--- a/rsem-plot-model
+++ b/rsem-plot-model
@@ -43,6 +43,7 @@ if (bval == 1) {
   y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
   par(cex.axis = 0.7)
   barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
+  par(cex.axis = 1.0)
 }
 strvec <- readLines(con, n = 1)
 
-- 
2.39.2