From fea934ddfe218ae83b4421d32a8f6b941f004487 Mon Sep 17 00:00:00 2001
From: Bo Li <bli@cs.wisc.edu>
Date: Mon, 16 Jun 2014 03:23:12 -0500
Subject: [PATCH] Allowed for a subset of reference sequences to be declared in
 an input SAM/BAM file; For any transcript not declared in the SAM/BAM file,
 its PME estimates and credibility intervals are set to zero.

---
 Gibbs.cpp                 | 53 +++++++++++++++++++++++----------------
 Makefile                  |  2 +-
 SamParser.h               |  6 ++---
 Transcripts.h             | 25 ++++++++++++++----
 calcCI.cpp                | 16 +++++++-----
 parseIt.cpp               |  2 +-
 rsem-calculate-expression |  2 +-
 7 files changed, 68 insertions(+), 38 deletions(-)
diff --git a/Gibbs.cpp b/Gibbs.cpp
index d5e0b9c..7563dd4 100644
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -64,6 +64,7 @@ vector<Item> hits;
 vector<double> eel;
 double *mw;
 
+vector<int> pseudo_counts;
 vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
 vector<double> pme_tpm, pme_fpkm;
 
@@ -86,18 +87,6 @@ int m_trans;
 GroupInfo gt, ta;
 vector<double> pve_c_genes, pve_c_trans;
 
-void load_group_info(char* refName) {
-  // Load group info
-  sprintf(groupF, "%s.grp", refName);
-  gi.load(groupF);
-  m = gi.getm();
-  
-  alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific 
-  m_trans = (alleleS ? ta.getm() : 0);
-
-  if (verbose) { printf("Loading group information is finished!\n"); }
-}
-
 void load_data(char* refName, char* statName, char* imdName) {
 	ifstream fin;
 	string line;
@@ -138,6 +127,29 @@ void load_data(char* refName, char* statName, char* imdName) {
 	if (verbose) { printf("Loading data is finished!\n"); }
 }
 
+void load_group_info(char* refName) {
+  // Load group info
+  sprintf(groupF, "%s.grp", refName);
+  gi.load(groupF);
+  m = gi.getm();
+  
+  alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific 
+  m_trans = (alleleS ? ta.getm() : 0);
+
+  if (verbose) { printf("Loading group information is finished!\n"); }
+}
+
+// Load imdName.omit and initialize the pseudo count vector.
+void load_omit_info(const char* imdName) {
+  char omitF[STRLEN];
+  sprintf(omitF, "%s.omit", imdName);
+  FILE *fi = fopen(omitF, "r");
+  pseudo_counts.assign(M + 1, 1);
+  int tid;
+  while (fscanf(fi, "%d", &tid) == 1) pseudo_counts[tid] = 0;
+  fclose(fi);
+}
+
 template<class ModelType>
 void init_model_related(char* modelF) {
 	ModelType model;
@@ -206,7 +218,7 @@ void sampleTheta(engine_type& engine, vector<double>& theta) {
 	theta.assign(M + 1, 0);
 	denom = 0.0;
 	for (int i = 0; i <= M; i++) {
-		theta[i] = gmg();
+		theta[i] = (pseudo_counts[i] > 0 ? gmg() : 0.0);
 		denom += theta[i];
 	}
 	assert(denom > EPSILON);
@@ -226,7 +238,7 @@ void* Gibbs(void* arg) {
 	Params *params = (Params*)arg;
 
 	vector<double> theta, tpm, fpkm;
-	vector<int> z, counts;
+	vector<int> z, counts(pseudo_counts);
 	vector<double> arr;
 
 	uniform_01_generator rg(*params->engine, uniform_01_dist());
@@ -235,8 +247,6 @@ void* Gibbs(void* arg) {
 	sampleTheta(*params->engine, theta);
 
 	z.assign(N1, 0);
-
-	counts.assign(M + 1, 1); // 1 pseudo count
 	counts[0] += N0;
 
 	for (READ_INT_TYPE i = 0; i < N1; i++) {
@@ -274,8 +284,8 @@ void* Gibbs(void* arg) {
 				polishTheta(M, theta, eel, mw);
 				calcExpressionValues(M, theta, eel, tpm, fpkm);
 				for (int i = 0; i <= M; i++) {
-					params->pme_c[i] += counts[i] - 1;
-					params->pve_c[i] += double(counts[i] - 1) * (counts[i] - 1);
+					params->pme_c[i] += counts[i] - pseudo_counts[i];
+					params->pve_c[i] += double(counts[i] - pseudo_counts[i]) * (counts[i] - pseudo_counts[i]);
 					params->pme_tpm[i] += tpm[i];
 					params->pme_fpkm[i] += fpkm[i];
 				}
@@ -283,7 +293,7 @@ void* Gibbs(void* arg) {
 				for (int i = 0; i < m; i++) {
 				  int b = gi.spAt(i), e = gi.spAt(i + 1);
 				  double count = 0.0;
-				  for (int j = b; j < e; j++) count += counts[j] - 1;
+				  for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
 				  params->pve_c_genes[i] += count * count;
 				}
 
@@ -291,7 +301,7 @@ void* Gibbs(void* arg) {
 				  for (int i = 0; i < m_trans; i++) {
 				    int b = ta.spAt(i), e = ta.spAt(i + 1);
 				    double count = 0.0;
-				    for (int j = b; j < e; j++) count += counts[j] - 1;
+				    for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
 				    params->pve_c_trans[i] += count * count;
 				  }
 			}
@@ -410,8 +420,9 @@ int main(int argc, char* argv[]) {
 		printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
 	}
 
-	load_group_info(refName);
 	load_data(refName, statName, imdName);
+	load_group_info(refName);
+	load_omit_info(imdName);
 
 	sprintf(modelF, "%s.model", statName);
 	FILE *fi = fopen(modelF, "r");
diff --git a/Makefile b/Makefile
index a9a705f..54e2603 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ sam/libbam.a :
 
 Transcript.h : utils.h
 
-Transcripts.h : my_assert.h Transcript.h
+Transcripts.h : utils.h my_assert.h Transcript.h
 
 rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
 	$(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
diff --git a/SamParser.h b/SamParser.h
index e91f9c3..81d4097 100644
--- a/SamParser.h
+++ b/SamParser.h
@@ -25,7 +25,7 @@
 
 class SamParser {
 public:
-	SamParser(char, const char*, Transcripts&, const char* = 0);
+	SamParser(char, const char*, const char*, Transcripts&, const char*);
 	~SamParser();
 
 	/**
@@ -80,7 +80,7 @@ private:
 char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
 
 // aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
+SamParser::SamParser(char inpType, const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName)
 	: transcripts(transcripts)
 {
 	switch(inpType) {
@@ -93,7 +93,7 @@ SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, c
 	header = sam_in->header;
 	general_assert(header != 0, "Fail to parse sam header!");
 
-	transcripts.buildMappings(header->n_targets, header->target_name);
+	transcripts.buildMappings(header->n_targets, header->target_name, imdName);
 
 	b = bam_init1();
 	b2 = bam_init1();
diff --git a/Transcripts.h b/Transcripts.h
index 033669b..db8e64a 100644
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -13,6 +13,7 @@
 #include<map>
 #include<string>
 
+#include "utils.h"
 #include "my_assert.h"
 #include "Transcript.h"
 
@@ -68,7 +69,7 @@ public:
 		return transcripts[getInternalSid(eid)];
 	}
 
-	void buildMappings(int, char**);
+	void buildMappings(int, char**, const char* = NULL);
 
 private:
 	int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
@@ -101,29 +102,43 @@ void Transcripts::writeTo(const char* outF) {
 	fout.close();
 }
 
-void Transcripts::buildMappings(int n_targets, char** target_name) {
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
 	std::map<std::string, int> dict;
 	std::map<std::string, int>::iterator iter;
+	std::vector<bool> appeared;
 
-	general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+	general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+	general_assert(n_targets <=  M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+	if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
 
 	dict.clear();
 	for (int i = 1; i <= M; i++) {
 		const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
 		iter = dict.find(tid);
-		general_assert(iter == dict.end(), tid + " appears more than once!");
+		general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
 		dict[tid] = i;
 	}
 
 	e2i.assign(M + 1, 0);
 	i2e.assign(M + 1, 0);
+	appeared.assign(M + 1, false);
 	for (int i = 0; i < n_targets; i++) {
 		iter = dict.find(std::string(target_name[i]));
 		general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
-		general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+		general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
 		e2i[i + 1] = iter->second;
 		i2e[iter->second] = i + 1;
 		iter->second = -1;
+		appeared[e2i[i + 1]] = true;
+	}
+
+	if (imdName != NULL) {
+	  char omitF[STRLEN];
+	  sprintf(omitF, "%s.omit", imdName);
+	  FILE *fo = fopen(omitF, "w");
+	  for (int i = 1; i <= M; i++) 
+	    if (!appeared[i]) fprintf(fo, "%d\n", i);
+	  fclose(fo);
 	}
 }
 
diff --git a/calcCI.cpp b/calcCI.cpp
index 86b3937..7cb58ff 100644
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -106,18 +106,22 @@ void* sample_theta_from_c(void* arg) {
 	int cnt = 0;
 	while (fscanf(fi, "%d", &cvec[0]) == 1) {
 		for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
+		assert(cvec[0] > 0);
 
 		++cnt;
 
 		for (int j = 0; j <= M; j++) {
-			gammas[j] = new gamma_dist(cvec[j]);
-			rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+		  gammas[j] = NULL; rgs[j] = NULL;
+		  if (cvec[j] > 0) {
+		    gammas[j] = new gamma_dist(cvec[j]);
+		    rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+		  }
 		}
-
+		  
 		for (int i = 0; i < nSpC; i++) {
 			double sum = 0.0;
 			for (int j = 0; j <= M; j++) {
-				theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
+				theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
 				sum += theta[j];
 			}
 			assert(sum >= EPSILON);
@@ -138,8 +142,8 @@ void* sample_theta_from_c(void* arg) {
 		}
 
 		for (int j = 0; j <= M; j++) {
-			delete gammas[j];
-			delete rgs[j];
+		  if (gammas[j] != NULL) delete gammas[j];
+		  if (rgs[j] != NULL) delete rgs[j];
 		}
 
 		if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
diff --git a/parseIt.cpp b/parseIt.cpp
index 3373ef2..91a4c14 100644
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -52,7 +52,7 @@ void init(const char* imdName, char alignFType, const char* alignF) {
 
 	char* aux = 0;
 	if (strcmp(fn_list, "")) aux = fn_list;
-	parser = new SamParser(alignFType, alignF, transcripts, aux);
+	parser = new SamParser(alignFType, alignF, aux, transcripts, imdName);
 
 	memset(cat, 0, sizeof(cat));
 	memset(readOutFs, 0, sizeof(readOutFs));
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 16593c6..094c6e5 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -684,7 +684,7 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil
 
 =item B<--ci-number-of-samples-per-count-vector> <int>
 
-The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling (\Theta | C) for each sampled count vector. This option controls how many \Theta vectors are sampled per sampled count vector. (Default: 50)
+The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled count vector. This option controls how many Theta vectors are sampled per sampled count vector. (Default: 50)
 
 =item B<--samtools-sort-mem> <string>
 
-- 
2.39.5