From da57529b92adbb7ae74a89861cb39fb35ac7c62d Mon Sep 17 00:00:00 2001 From: Bo Li Date: Fri, 6 Jun 2014 03:50:21 -0500 Subject: [PATCH] Added posterior standard deviation of counts as output if either '--calc-pme' or '--calc-ci' is set --- Gibbs.cpp | 130 +++++++++++++++++++++++++++----------- WriteResults.h | 30 ++++----- rsem-calculate-expression | 21 +++--- rsem_perl_utils.pm | 6 +- 4 files changed, 118 insertions(+), 69 deletions(-) diff --git a/Gibbs.cpp b/Gibbs.cpp index 7e26d86..7215c10 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -25,14 +25,15 @@ using namespace std; struct Params { - int no, nsamples; - FILE *fo; - engine_type *engine; - double *pme_c, *pve_c; //posterior mean and variance vectors on counts + int no, nsamples; + FILE *fo; + engine_type *engine; + double *pme_c, *pve_c; //posterior mean and variance vectors on counts double *pme_tpm, *pme_fpkm; + + double *pve_c_genes, *pve_c_trans; }; - struct Item { int sid; double conprb; @@ -66,7 +67,6 @@ double *mw; vector pme_c, pve_c; //global posterior mean and variance vectors on counts vector pme_tpm, pme_fpkm; -bool var_opt; bool quiet; Params *paramsArray; @@ -77,6 +77,27 @@ int rc; bool hasSeed; seedType seed; +int m; +char groupF[STRLEN]; +GroupInfo gi; + +bool alleleS; +int m_trans; +GroupInfo gt, ta; +vector pve_c_genes, pve_c_trans; + +void load_group_info(char* refName) { + // Load group info + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + + alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific + m_trans = (alleleS ? ta.getm() : 0); + + if (verbose) { printf("Loading group information is finished!\n"); } +} + void load_data(char* refName, char* statName, char* imdName) { ifstream fin; string line; @@ -114,7 +135,7 @@ void load_data(char* refName, char* statName, char* imdName) { totc = N0 + N1 + (M + 1); - if (verbose) { printf("Loading Data is finished!\n"); } + if (verbose) { printf("Loading data is finished!\n"); } } template @@ -157,6 +178,15 @@ void init() { memset(paramsArray[i].pme_tpm, 0, sizeof(double) * (M + 1)); paramsArray[i].pme_fpkm = new double[M + 1]; memset(paramsArray[i].pme_fpkm, 0, sizeof(double) * (M + 1)); + + paramsArray[i].pve_c_genes = new double[m]; + memset(paramsArray[i].pve_c_genes, 0, sizeof(double) * m); + + paramsArray[i].pve_c_trans = NULL; + if (alleleS) { + paramsArray[i].pve_c_trans = new double[m_trans]; + memset(paramsArray[i].pve_c_trans, 0, sizeof(double) * m_trans); + } } engineFactory::finish(); @@ -245,10 +275,25 @@ void* Gibbs(void* arg) { calcExpressionValues(M, theta, eel, tpm, fpkm); for (int i = 0; i <= M; i++) { params->pme_c[i] += counts[i] - 1; - params->pve_c[i] += (counts[i] - 1) * (counts[i] - 1); + params->pve_c[i] += double(counts[i] - 1) * (counts[i] - 1); params->pme_tpm[i] += tpm[i]; params->pme_fpkm[i] += fpkm[i]; } + + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + double count = 0.0; + for (int j = b; j < e; j++) count += counts[j] - 1; + params->pve_c_genes[i] += count * count; + } + + if (alleleS) + for (int i = 0; i < m_trans; i++) { + int b = ta.spAt(i), e = ta.spAt(i + 1); + double count = 0.0; + for (int j = b; j < e; j++) count += counts[j] - 1; + params->pve_c_trans[i] += count * count; + } } } @@ -270,6 +315,11 @@ void release() { pve_c.assign(M + 1, 0); pme_tpm.assign(M + 1, 0); pme_fpkm.assign(M + 1, 0); + + pve_c_genes.assign(m, 0); + pve_c_trans.clear(); + if (alleleS) pve_c_trans.assign(m_trans, 0); + for (int i = 0; i < nThreads; i++) { fclose(paramsArray[i].fo); delete paramsArray[i].engine; @@ -279,25 +329,53 @@ void release() { pme_tpm[j] += paramsArray[i].pme_tpm[j]; pme_fpkm[j] += paramsArray[i].pme_fpkm[j]; } + + for (int j = 0; j < m; j++) + pve_c_genes[j] += paramsArray[i].pve_c_genes[j]; + + if (alleleS) + for (int j = 0; j < m_trans; j++) + pve_c_trans[j] += paramsArray[i].pve_c_trans[j]; + delete[] paramsArray[i].pme_c; delete[] paramsArray[i].pve_c; delete[] paramsArray[i].pme_tpm; delete[] paramsArray[i].pme_fpkm; + + delete[] paramsArray[i].pve_c_genes; + if (alleleS) delete[] paramsArray[i].pve_c_trans; } delete[] paramsArray; - for (int i = 0; i <= M; i++) { pme_c[i] /= NSAMPLES; - pve_c[i] = (pve_c[i] - NSAMPLES * pme_c[i] * pme_c[i]) / (NSAMPLES - 1); + pve_c[i] = (pve_c[i] - double(NSAMPLES) * pme_c[i] * pme_c[i]) / double(NSAMPLES - 1); + if (pve_c[i] < 0.0) pve_c[i] = 0.0; pme_tpm[i] /= NSAMPLES; pme_fpkm[i] /= NSAMPLES; } + + for (int i = 0; i < m; i++) { + int b = gi.spAt(i), e = gi.spAt(i + 1); + double pme_c_gene = 0.0; + for (int j = b; j < e; j++) pme_c_gene += pme_c[j]; + pve_c_genes[i] = (pve_c_genes[i] - double(NSAMPLES) * pme_c_gene * pme_c_gene) / double(NSAMPLES - 1); + if (pve_c_genes[i] < 0.0) pve_c_genes[i] = 0.0; + } + + if (alleleS) + for (int i = 0; i < m_trans; i++) { + int b = ta.spAt(i), e = ta.spAt(i + 1); + double pme_c_tran = 0.0; + for (int j = b; j < e; j++) pme_c_tran += pme_c[j]; + pve_c_trans[i] = (pve_c_trans[i] - double(NSAMPLES) * pme_c_tran * pme_c_tran) / double(NSAMPLES - 1); + if (pve_c_trans[i] < 0.0) pve_c_trans[i] = 0.0; + } } int main(int argc, char* argv[]) { if (argc < 7) { - printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [--seed seed] [-q]\n"); + printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--seed seed] [-q]\n"); exit(-1); } @@ -310,13 +388,11 @@ int main(int argc, char* argv[]) { GAP = atoi(argv[6]); nThreads = 1; - var_opt = false; hasSeed = false; quiet = false; for (int i = 7; i < argc; i++) { if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]); - if (!strcmp(argv[i], "--var")) var_opt = true; if (!strcmp(argv[i], "--seed")) { hasSeed = true; int len = strlen(argv[i + 1]); @@ -334,6 +410,7 @@ int main(int argc, char* argv[]) { printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads); } + load_group_info(refName); load_data(refName, statName, imdName); sprintf(modelF, "%s.model", statName); @@ -366,31 +443,8 @@ int main(int argc, char* argv[]) { if (verbose) printf("Gibbs finished!\n"); - writeResultsGibbs(M, refName, imdName, pme_c, pme_fpkm, pme_tpm); - - if (var_opt) { - char varF[STRLEN]; - - // Load group info - int m; - GroupInfo gi; - char groupF[STRLEN]; - sprintf(groupF, "%s.grp", refName); - gi.load(groupF); - m = gi.getm(); - - sprintf(varF, "%s.var", statName); - FILE *fo = fopen(varF, "w"); - general_assert(fo != NULL, "Cannot open " + cstrtos(varF) + "!"); - for (int i = 0; i < m; i++) { - int b = gi.spAt(i), e = gi.spAt(i + 1), number_of_isoforms = e - b; - for (int j = b; j < e; j++) { - fprintf(fo, "%s\t%d\t%.15g\t%.15g\n", refs.getRef(j).getName().c_str(), number_of_isoforms, pme_c[j], pve_c[j]); - } - } - fclose(fo); - } - + writeResultsGibbs(M, m, m_trans, gi, gt, ta, alleleS, imdName, pme_c, pme_fpkm, pme_tpm, pve_c, pve_c_genes, pve_c_trans); + delete mw; // delete the copy return 0; diff --git a/WriteResults.h b/WriteResults.h index 95aa7b0..79ba5b2 100644 --- a/WriteResults.h +++ b/WriteResults.h @@ -1,6 +1,7 @@ #ifndef WRITERESULTS_H_ #define WRITERESULTS_H_ +#include #include #include #include @@ -336,28 +337,16 @@ void writeResultsEM(int M, const char* refName, const char* imdName, Transcripts if (verbose) { printf("Expression Results are written!\n"); } } -void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& pme_c, std::vector& pme_fpkm, std::vector& pme_tpm) { +void writeResultsGibbs(int M, int m, int m_trans, GroupInfo& gi, GroupInfo >, GroupInfo &ta, bool alleleS, char* imdName, std::vector& pme_c, std::vector& pme_fpkm, std::vector& pme_tpm, std::vector& pve_c, std::vector& pve_c_genes, std::vector& pve_c_trans) { char outF[STRLEN]; FILE *fo; - int m; - GroupInfo gi; - char groupF[STRLEN]; std::vector isopct; std::vector gene_counts, gene_tpm, gene_fpkm; - // Load group info - sprintf(groupF, "%s.grp", refName); - gi.load(groupF); - m = gi.getm(); - // For allele-specific expression - int m_trans = 0; - GroupInfo gt, ta; std::vector trans_counts, trans_tpm, trans_fpkm, ta_pct, gt_pct; - bool alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific - //calculate IsoPct, etc. isopct.assign(M + 1, 0.0); gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0); @@ -375,7 +364,6 @@ void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& } if (alleleS) { - m_trans = ta.getm(); ta_pct.assign(M + 1, 0.0); trans_counts.assign(m_trans, 0.0); trans_tpm.assign(m_trans, 0.0); trans_fpkm.assign(m_trans, 0.0); @@ -407,6 +395,8 @@ void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", sqrt(pve_c[i]), (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) @@ -423,6 +413,8 @@ void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.2f%c", sqrt(pve_c[i]), (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) @@ -440,6 +432,8 @@ void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& for (int i = 0; i < m_trans; i++) fprintf(fo, "%.2f%c", trans_counts[i], (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.2f%c", sqrt(pve_c_trans[i]), (i < m_trans - 1 ? '\t' : '\n')); for (int i = 0; i < m_trans; i++) fprintf(fo, "%.2f%c", trans_tpm[i], (i < m_trans - 1 ? '\t' : '\n')); for (int i = 0; i < m_trans; i++) @@ -455,11 +449,13 @@ void writeResultsGibbs(int M, char* refName, char* imdName, std::vector& general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!"); for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.2f%c", sqrt(pve_c_genes[i]), (i < m - 1 ? '\t' : '\n')); for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n')); for (int i = 0; i < m; i++) - fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n')); fclose(fo); if (verbose) { printf("Gibbs based expression values are written!\n"); } diff --git a/rsem-calculate-expression b/rsem-calculate-expression index d6cb5f2..ccfc643 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -57,7 +57,6 @@ my $genGenomeBamF = 0; my $sampling = 0; my $calcPME = 0; my $calcCI = 0; -my $var_opt = 0; # temporarily, only for internal use my $quiet = 0; my $help = 0; @@ -124,7 +123,6 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "output-genome-bam" => \$genGenomeBamF, "sampling-for-bam" => \$sampling, "calc-pme" => \$calcPME, - "var" => \$var_opt, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, "samtools-sort-mem=s" => \$SortMem, @@ -355,7 +353,7 @@ if ($genBamF) { if ($sampling) { $command .= " --sampling"; } if ($seed ne "NULL") { $command .= " --seed $seeds[0]"; } } -if ($calcPME || $var_opt || $calcCI) { $command .= " --gibbs-out"; } +if ($calcPME || $calcCI) { $command .= " --gibbs-out"; } if ($quiet) { $command .= " -q"; } &runCommand($command); @@ -390,10 +388,9 @@ if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; } if ($mTime) { $time_start = time(); } -if ($calcPME || $var_opt || $calcCI ) { +if ($calcPME || $calcCI ) { $command = "rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP"; $command .= " -p $nThreads"; - if ($var_opt) { $command .= " --var"; } if ($seed ne "NULL") { $command .= " --seed $seeds[1]"; } if ($quiet) { $command .= " -q"; } &runCommand($command); @@ -707,7 +704,7 @@ File containing isoform level expression estimates. The first line contains column names separated by the tab character. The format of each line in the rest of this file is: -transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] +transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] Fields are separated by the tab character. Fields within "[]" are optional. They will not be presented if neither '--calc-pme' nor @@ -753,9 +750,11 @@ transcript's abandunce over its parent gene's abandunce. If its parent gene has only one isoform or the gene information is not provided, this field will be set to 100. -'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean -estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is -the isoform percentage calculated from 'pme_TPM' values. +'posterior_mean_count', 'pme_TPM', 'pme_FPKM' are posterior mean +estimates calculated by RSEM's Gibbs +sampler. 'posterior_standard_deviation_of_count' is the posterior +standard deviation of counts. 'IsoPct_from_pme_TPM' is the isoform +percentage calculated from 'pme_TPM' values. 'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and 'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95% @@ -768,7 +767,7 @@ File containing gene level expression estimates. The first line contains column names separated by the tab character. The format of each line in the rest of this file is: -gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] +gene_id transcript_id(s) length effective_length expected_count TPM FPKM [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] Fields are separated by the tab character. Fields within "[]" are optional. They will not be presented if neither '--calc-pme' nor @@ -793,7 +792,7 @@ allele-specific expression calculation. The first line contains column names separated by the tab character. The format of each line in the rest of this file is: -allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [pme_expected_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] +allele_id transcript_id gene_id length effective_length expected_count TPM FPKM AlleleIsoPct AlleleGenePct [posterior_mean_count posterior_standard_deviation_of_count pme_TPM pme_FPKM AlleleIsoPct_from_pme_TPM AlleleGenePct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound] Fields are separated by the tab character. Fields within "[]" are optional. They will not be presented if neither '--calc-pme' nor diff --git a/rsem_perl_utils.pm b/rsem_perl_utils.pm index 078d119..01a3576 100644 --- a/rsem_perl_utils.pm +++ b/rsem_perl_utils.pm @@ -31,11 +31,11 @@ sub runCommand { print "\n"; } -my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "pme_expected_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); +my @allele_title = ("allele_id", "transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "AlleleIsoPct", "AlleleGenePct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "AlleleIsoPct_from_pme_TPM", "AlleleGenePct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); -my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "pme_expected_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); +my @transcript_title = ("transcript_id", "gene_id", "length", "effective_length", "expected_count", "TPM", "FPKM", "IsoPct", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "IsoPct_from_pme_TPM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); -my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "pme_expected_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); +my @gene_title = ("gene_id", "transcript_id(s)", "length", "effective_length", "expected_count", "TPM", "FPKM", "posterior_mean_count", "posterior_standard_deviation_of_count", "pme_TPM", "pme_FPKM", "TPM_ci_lower_bound", "TPM_ci_upper_bound", "FPKM_ci_lower_bound", "FPKM_ci_upper_bound"); # type, inpF, outF sub collectResults { -- 2.39.2