From fea934ddfe218ae83b4421d32a8f6b941f004487 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Mon, 16 Jun 2014 03:23:12 -0500 Subject: [PATCH] Allowed for a subset of reference sequences to be declared in an input SAM/BAM file; For any transcript not declared in the SAM/BAM file, its PME estimates and credibility intervals are set to zero. --- Gibbs.cpp | 53 +++++++++++++++++++++++---------------- Makefile | 2 +- SamParser.h | 6 ++--- Transcripts.h | 25 ++++++++++++++---- calcCI.cpp | 16 +++++++----- parseIt.cpp | 2 +- rsem-calculate-expression | 2 +- 7 files changed, 68 insertions(+), 38 deletions(-) diff --git a/Gibbs.cpp b/Gibbs.cpp index d5e0b9c..7563dd4 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -64,6 +64,7 @@ vector hits; vector eel; double *mw; +vector pseudo_counts; vector pme_c, pve_c; //global posterior mean and variance vectors on counts vector pme_tpm, pme_fpkm; @@ -86,18 +87,6 @@ int m_trans; GroupInfo gt, ta; vector pve_c_genes, pve_c_trans; -void load_group_info(char* refName) { - // Load group info - sprintf(groupF, "%s.grp", refName); - gi.load(groupF); - m = gi.getm(); - - alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific - m_trans = (alleleS ? ta.getm() : 0); - - if (verbose) { printf("Loading group information is finished!\n"); } -} - void load_data(char* refName, char* statName, char* imdName) { ifstream fin; string line; @@ -138,6 +127,29 @@ void load_data(char* refName, char* statName, char* imdName) { if (verbose) { printf("Loading data is finished!\n"); } } +void load_group_info(char* refName) { + // Load group info + sprintf(groupF, "%s.grp", refName); + gi.load(groupF); + m = gi.getm(); + + alleleS = isAlleleSpecific(refName, >, &ta); // if allele-specific + m_trans = (alleleS ? ta.getm() : 0); + + if (verbose) { printf("Loading group information is finished!\n"); } +} + +// Load imdName.omit and initialize the pseudo count vector. +void load_omit_info(const char* imdName) { + char omitF[STRLEN]; + sprintf(omitF, "%s.omit", imdName); + FILE *fi = fopen(omitF, "r"); + pseudo_counts.assign(M + 1, 1); + int tid; + while (fscanf(fi, "%d", &tid) == 1) pseudo_counts[tid] = 0; + fclose(fi); +} + template void init_model_related(char* modelF) { ModelType model; @@ -206,7 +218,7 @@ void sampleTheta(engine_type& engine, vector& theta) { theta.assign(M + 1, 0); denom = 0.0; for (int i = 0; i <= M; i++) { - theta[i] = gmg(); + theta[i] = (pseudo_counts[i] > 0 ? gmg() : 0.0); denom += theta[i]; } assert(denom > EPSILON); @@ -226,7 +238,7 @@ void* Gibbs(void* arg) { Params *params = (Params*)arg; vector theta, tpm, fpkm; - vector z, counts; + vector z, counts(pseudo_counts); vector arr; uniform_01_generator rg(*params->engine, uniform_01_dist()); @@ -235,8 +247,6 @@ void* Gibbs(void* arg) { sampleTheta(*params->engine, theta); z.assign(N1, 0); - - counts.assign(M + 1, 1); // 1 pseudo count counts[0] += N0; for (READ_INT_TYPE i = 0; i < N1; i++) { @@ -274,8 +284,8 @@ void* Gibbs(void* arg) { polishTheta(M, theta, eel, mw); calcExpressionValues(M, theta, eel, tpm, fpkm); for (int i = 0; i <= M; i++) { - params->pme_c[i] += counts[i] - 1; - params->pve_c[i] += double(counts[i] - 1) * (counts[i] - 1); + params->pme_c[i] += counts[i] - pseudo_counts[i]; + params->pve_c[i] += double(counts[i] - pseudo_counts[i]) * (counts[i] - pseudo_counts[i]); params->pme_tpm[i] += tpm[i]; params->pme_fpkm[i] += fpkm[i]; } @@ -283,7 +293,7 @@ void* Gibbs(void* arg) { for (int i = 0; i < m; i++) { int b = gi.spAt(i), e = gi.spAt(i + 1); double count = 0.0; - for (int j = b; j < e; j++) count += counts[j] - 1; + for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j]; params->pve_c_genes[i] += count * count; } @@ -291,7 +301,7 @@ void* Gibbs(void* arg) { for (int i = 0; i < m_trans; i++) { int b = ta.spAt(i), e = ta.spAt(i + 1); double count = 0.0; - for (int j = b; j < e; j++) count += counts[j] - 1; + for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j]; params->pve_c_trans[i] += count * count; } } @@ -410,8 +420,9 @@ int main(int argc, char* argv[]) { printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads); } - load_group_info(refName); load_data(refName, statName, imdName); + load_group_info(refName); + load_omit_info(imdName); sprintf(modelF, "%s.model", statName); FILE *fi = fopen(modelF, "r"); diff --git a/Makefile b/Makefile index a9a705f..54e2603 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ sam/libbam.a : Transcript.h : utils.h -Transcripts.h : my_assert.h Transcript.h +Transcripts.h : utils.h my_assert.h Transcript.h rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts diff --git a/SamParser.h b/SamParser.h index e91f9c3..81d4097 100644 --- a/SamParser.h +++ b/SamParser.h @@ -25,7 +25,7 @@ class SamParser { public: - SamParser(char, const char*, Transcripts&, const char* = 0); + SamParser(char, const char*, const char*, Transcripts&, const char*); ~SamParser(); /** @@ -80,7 +80,7 @@ private: char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads // aux, if not 0, points to the file name of fn_list -SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) +SamParser::SamParser(char inpType, const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName) : transcripts(transcripts) { switch(inpType) { @@ -93,7 +93,7 @@ SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, c header = sam_in->header; general_assert(header != 0, "Fail to parse sam header!"); - transcripts.buildMappings(header->n_targets, header->target_name); + transcripts.buildMappings(header->n_targets, header->target_name, imdName); b = bam_init1(); b2 = bam_init1(); diff --git a/Transcripts.h b/Transcripts.h index 033669b..db8e64a 100644 --- a/Transcripts.h +++ b/Transcripts.h @@ -13,6 +13,7 @@ #include #include +#include "utils.h" #include "my_assert.h" #include "Transcript.h" @@ -68,7 +69,7 @@ public: return transcripts[getInternalSid(eid)]; } - void buildMappings(int, char**); + void buildMappings(int, char**, const char* = NULL); private: int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific @@ -101,29 +102,43 @@ void Transcripts::writeTo(const char* outF) { fout.close(); } -void Transcripts::buildMappings(int n_targets, char** target_name) { +void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) { std::map dict; std::map::iterator iter; + std::vector appeared; - general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!"); + general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!"); + general_assert(n_targets <= M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!"); + if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M); dict.clear(); for (int i = 1; i <= M; i++) { const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID(); iter = dict.find(tid); - general_assert(iter == dict.end(), tid + " appears more than once!"); + general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!"); dict[tid] = i; } e2i.assign(M + 1, 0); i2e.assign(M + 1, 0); + appeared.assign(M + 1, false); for (int i = 0; i < n_targets; i++) { iter = dict.find(std::string(target_name[i])); general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!"); - general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!"); + general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!"); e2i[i + 1] = iter->second; i2e[iter->second] = i + 1; iter->second = -1; + appeared[e2i[i + 1]] = true; + } + + if (imdName != NULL) { + char omitF[STRLEN]; + sprintf(omitF, "%s.omit", imdName); + FILE *fo = fopen(omitF, "w"); + for (int i = 1; i <= M; i++) + if (!appeared[i]) fprintf(fo, "%d\n", i); + fclose(fo); } } diff --git a/calcCI.cpp b/calcCI.cpp index 86b3937..7cb58ff 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -106,18 +106,22 @@ void* sample_theta_from_c(void* arg) { int cnt = 0; while (fscanf(fi, "%d", &cvec[0]) == 1) { for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1); + assert(cvec[0] > 0); ++cnt; for (int j = 0; j <= M; j++) { - gammas[j] = new gamma_dist(cvec[j]); - rgs[j] = new gamma_generator(*(params->engine), *gammas[j]); + gammas[j] = NULL; rgs[j] = NULL; + if (cvec[j] > 0) { + gammas[j] = new gamma_dist(cvec[j]); + rgs[j] = new gamma_generator(*(params->engine), *gammas[j]); + } } - + for (int i = 0; i < nSpC; i++) { double sum = 0.0; for (int j = 0; j <= M; j++) { - theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0); + theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0); sum += theta[j]; } assert(sum >= EPSILON); @@ -138,8 +142,8 @@ void* sample_theta_from_c(void* arg) { } for (int j = 0; j <= M; j++) { - delete gammas[j]; - delete rgs[j]; + if (gammas[j] != NULL) delete gammas[j]; + if (rgs[j] != NULL) delete rgs[j]; } if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); } diff --git a/parseIt.cpp b/parseIt.cpp index 3373ef2..91a4c14 100644 --- a/parseIt.cpp +++ b/parseIt.cpp @@ -52,7 +52,7 @@ void init(const char* imdName, char alignFType, const char* alignF) { char* aux = 0; if (strcmp(fn_list, "")) aux = fn_list; - parser = new SamParser(alignFType, alignF, transcripts, aux); + parser = new SamParser(alignFType, alignF, aux, transcripts, imdName); memset(cat, 0, sizeof(cat)); memset(readOutFs, 0, sizeof(readOutFs)); diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 16593c6..094c6e5 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -684,7 +684,7 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil =item B<--ci-number-of-samples-per-count-vector> -The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling (\Theta | C) for each sampled count vector. This option controls how many \Theta vectors are sampled per sampled count vector. (Default: 50) +The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled count vector. This option controls how many Theta vectors are sampled per sampled count vector. (Default: 50) =item B<--samtools-sort-mem> -- 2.39.2