From a42d21510e502fe5cdd21a5d7ad023b33d2e6b99 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 17 Mar 2011 05:39:07 -0500 Subject: [PATCH] Fixed bugs in the Gibbs sampler. Fixed a bug which will set probF to 0 if the protocol is not forward strand specific. --- EM.cpp | 61 ++++++++++++++++++++++++--------------- Gibbs.cpp | 3 +- rsem-calculate-expression | 2 +- rsem-plot-model | 1 + 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/EM.cpp b/EM.cpp index e381b47..318b7eb 100644 --- a/EM.cpp +++ b/EM.cpp @@ -64,6 +64,7 @@ bool genGibbsOut; // generate file for Gibbs sampler char refName[STRLEN], imdName[STRLEN], outName[STRLEN]; char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN]; char mparamsF[STRLEN], bmparamsF[STRLEN]; +char modelF[STRLEN], thetaF[STRLEN]; char inpSamType; char *pt_fn_list, *pt_chr_list; @@ -301,20 +302,12 @@ void calcExpectedEffectiveLengths(ModelType& model) { template void writeResults(ModelType& model, double* counts) { double denom; - char modelF[STRLEN], thetaF[STRLEN]; char outF[STRLEN]; FILE *fo; sprintf(modelF, "%s.model", outName); model.write(modelF); - sprintf(thetaF, "%s.theta", outName); - fo = fopen(thetaF, "w"); - fprintf(fo, "%d\n", M + 1); - for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); - fprintf(fo, "%.15g\n", theta[M]); - fclose(fo); - //calculate tau values double *tau = new double[M + 1]; memset(tau, 0, sizeof(double) * (M + 1)); @@ -408,6 +401,8 @@ inline bool doesUpdateModel(int ROUND) { //Including initialize, algorithm and results saving template void EM() { + FILE *fo; + int ROUND; double sum; @@ -426,6 +421,7 @@ void EM() { void *status; int rc; + //initialize boolean variables updateModel = calcExpectedWeights = false; @@ -514,17 +510,6 @@ void EM() { if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND); - //calculate expected effective lengths for each isoform - calcExpectedEffectiveLengths(model); - - //correct theta vector - sum = theta[0]; - for (int i = 1; i <= M; i++) - if (eel[i] < EPSILON) { theta[i] = 0.0; } - else sum += theta[i]; - if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); } - for (int i = 0; i <= M; i++) theta[i] /= sum; - //generate output file used by Gibbs sampler if (genGibbsOut) { if (model.getNeedCalcConPrb()) { @@ -540,7 +525,7 @@ void EM() { model.setNeedCalcConPrb(false); sprintf(out_for_gibbs_F, "%s.ofg", imdName); - FILE *fo = fopen(out_for_gibbs_F, "w"); + fo = fopen(out_for_gibbs_F, "w"); fprintf(fo, "%d %d\n", M, N0); for (int i = 0; i < nThreads; i++) { int numN = hitvs[i]->getN(); @@ -549,7 +534,7 @@ void EM() { int to = hitvs[i]->getSAt(j + 1); int totNum = 0; - if (ncpvs[i][j] > 0.0) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); } + if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); } for (int k = fr; k < to; k++) { HitType &hit = hitvs[i]->getHitAt(k); if (hit.getConPrb() >= EPSILON) { @@ -562,8 +547,33 @@ void EM() { } } fclose(fo); + + char scoreF[STRLEN]; + sprintf(scoreF, "%s.ns", imdName); + fo = fopen(scoreF, "w"); + fprintf(fo, "%.15g\n", model.getLogP()); + fclose(fo); } + sprintf(thetaF, "%s.theta", outName); + fo = fopen(thetaF, "w"); + fprintf(fo, "%d\n", M + 1); + + // output theta' + for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); + fprintf(fo, "%.15g\n", theta[M]); + + //calculate expected effective lengths for each isoform + calcExpectedEffectiveLengths(model); + + //correct theta vector + sum = theta[0]; + for (int i = 1; i <= M; i++) + if (eel[i] < EPSILON) { theta[i] = 0.0; } + else sum += theta[i]; + if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); } + for (int i = 0; i <= M; i++) theta[i] /= sum; + //calculate expected weights and counts using learned parameters updateModel = false; calcExpectedWeights = true; for (int i = 0; i < nThreads; i++) { @@ -587,8 +597,7 @@ void EM() { /* destroy attribute */ pthread_attr_destroy(&attr); - - //for all + //convert theta' to theta double *mw = model.getMW(); sum = 0.0; for (int i = 0; i <= M; i++) { @@ -598,6 +607,12 @@ void EM() { assert(sum >= EPSILON); for (int i = 0; i <= M; i++) theta[i] /= sum; + // output theta + for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); + fprintf(fo, "%.15g\n", theta[M]); + + fclose(fo); + writeResults(model, countvs[0]); if (genBamF) { diff --git a/Gibbs.cpp b/Gibbs.cpp index 4b1e2d5..7727f37 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -200,8 +200,7 @@ void Gibbs(char* imdName) { } if (ROUND > BURNIN) { - if ((ROUND - BURNIN -1) % GAP == 0) writeCountVector(fo); - writeCountVector(fo); + if ((ROUND - BURNIN - 1) % GAP == 0) writeCountVector(fo); for (int i = 0; i <= M; i++) { pme_c[i] += counts[i] - 1; pme_theta[i] += counts[i] / totc; diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 0f22282..7826ec7 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -173,7 +173,7 @@ if (!$is_sam && !$is_bam) { if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; } if ($strand_specific || $probF == 1.0) { $command .= " --norc"; } - elsif ($probF = 0.0) { $command .= " --nofw"; } + elsif ($probF == 0.0) { $command .= " --nofw"; } $command .= " -p $nThreads -a -m $maxHits -S"; if ($quiet) { $command .= " --quiet"; } diff --git a/rsem-plot-model b/rsem-plot-model index 5546ff6..c0eaa1b 100755 --- a/rsem-plot-model +++ b/rsem-plot-model @@ -43,6 +43,7 @@ if (bval == 1) { y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]]) par(cex.axis = 0.7) barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability") + par(cex.axis = 1.0) } strvec <- readLines(con, n = 1) -- 2.39.2