X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=Transcripts.h;h=db8e64a879d50bdf1e2e11e8df1037a1e62c5467;hp=5c17f95352fcd91ea9e80d8dc5f1559531e2ae09;hb=92b24279a3ecc72946e7e7c23149ad0d181f373a;hpb=4f7502168c3816ba3283385f093e599527e2b144

diff --git a/Transcripts.h b/Transcripts.h
index 5c17f95..db8e64a 100644
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -13,10 +13,10 @@
 #include<map>
 #include<string>
 
+#include "utils.h"
 #include "my_assert.h"
 #include "Transcript.h"
 
-
 class Transcripts {
 public:
 	Transcripts(int type = 0) {
@@ -29,6 +29,14 @@ public:
 
 	int getM() { return M; }
 
+	// used in shrinking the transcripts
+	void setM(int M) { this->M = M; transcripts.resize(M + 1); } 
+	
+	void move(int from, int to) {
+	  assert(from >= to);
+	  if (from > to) transcripts[to] = transcripts[from];
+	}
+	
 	int getType() { return type; }
 	void setType(int type) { this->type = type; }
 
@@ -61,7 +69,7 @@ public:
 		return transcripts[getInternalSid(eid)];
 	}
 
-	void buildMappings(int, char**);
+	void buildMappings(int, char**, const char* = NULL);
 
 private:
 	int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
@@ -94,29 +102,43 @@ void Transcripts::writeTo(const char* outF) {
 	fout.close();
 }
 
-void Transcripts::buildMappings(int n_targets, char** target_name) {
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
 	std::map<std::string, int> dict;
 	std::map<std::string, int>::iterator iter;
+	std::vector<bool> appeared;
 
-	general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+	general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+	general_assert(n_targets <=  M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+	if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
 
 	dict.clear();
 	for (int i = 1; i <= M; i++) {
 		const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
 		iter = dict.find(tid);
-		general_assert(iter == dict.end(), tid + " appears more than once!");
+		general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
 		dict[tid] = i;
 	}
 
 	e2i.assign(M + 1, 0);
 	i2e.assign(M + 1, 0);
+	appeared.assign(M + 1, false);
 	for (int i = 0; i < n_targets; i++) {
 		iter = dict.find(std::string(target_name[i]));
 		general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
-		general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+		general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
 		e2i[i + 1] = iter->second;
 		i2e[iter->second] = i + 1;
 		iter->second = -1;
+		appeared[e2i[i + 1]] = true;
+	}
+
+	if (imdName != NULL) {
+	  char omitF[STRLEN];
+	  sprintf(omitF, "%s.omit", imdName);
+	  FILE *fo = fopen(omitF, "w");
+	  for (int i = 1; i <= M; i++) 
+	    if (!appeared[i]) fprintf(fo, "%d\n", i);
+	  fclose(fo);
 	}
 }