For genome BAM, modified MD tag accordingly

[rsem.git] / Transcripts.h
diff --git a/Transcripts.h b/Transcripts.h

index 2750e0c36394d3c3be5cb7d63dfb74f7ae5687a3..033669b740e97737ce4f5f4bc3a25f016a083910 100644 (file)
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -10,20 +10,36 @@
  #include<fstream>
  #include<vector>
  #include<algorithm>
+#include<map>
+#include<string>
  
+#include "my_assert.h"
  #include "Transcript.h"
  
-
  class Transcripts {
  public:
         Transcripts(int type = 0) {
                 M = 0; this->type = type;
                 transcripts.clear();
                 transcripts.push_back(Transcript());
+
+               e2i.clear(); i2e.clear();
         }
  
         int getM() { return M; }
+
+       // used in shrinking the transcripts
+       void setM(int M) { this->M = M; transcripts.resize(M + 1); } 
+       
+       void move(int from, int to) {
+         assert(from >= to);
+         if (from > to) transcripts[to] = transcripts[from];
+       }
+       
         int getType() { return type; }
+       void setType(int type) { this->type = type; }
+
+       bool isAlleleSpecific() { return type == 2; }
  
         const Transcript& getTranscriptAt(int pos) {
                 assert(pos > 0 && pos <= M);
@@ -42,9 +58,23 @@ public:
         void readFrom(const char*);
         void writeTo(const char*);
  
+       //Eid: external sid
+       int getInternalSid(int eid) {
+               assert(eid > 0 && eid <= M);
+               return e2i[eid];
+       }
+
+       const Transcript& getTranscriptViaEid(int eid) {
+               return transcripts[getInternalSid(eid)];
+       }
+
+       void buildMappings(int, char**);
+
  private:
-       int M, type; // type 0 from genome , 1 standalone transcriptome
+       int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
         std::vector<Transcript> transcripts;
+
+       std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
  };
  
  void Transcripts::readFrom(const char* inpF) {
@@ -55,8 +85,7 @@ void Transcripts::readFrom(const char* inpF) {
  
         fin>>M>>type;
         getline(fin, line);
-       transcripts.clear();
-       transcripts.resize(M + 1);
+       transcripts.assign(M + 1, Transcript());
         for (int i = 1; i <= M; i++) {
                 transcripts[i].read(fin);
         }
@@ -72,4 +101,30 @@ void Transcripts::writeTo(const char* outF) {
         fout.close();
  }
  
+void Transcripts::buildMappings(int n_targets, char** target_name) {
+       std::map<std::string, int> dict;
+       std::map<std::string, int>::iterator iter;
+
+       general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+
+       dict.clear();
+       for (int i = 1; i <= M; i++) {
+               const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
+               iter = dict.find(tid);
+               general_assert(iter == dict.end(), tid + " appears more than once!");
+               dict[tid] = i;
+       }
+
+       e2i.assign(M + 1, 0);
+       i2e.assign(M + 1, 0);
+       for (int i = 0; i < n_targets; i++) {
+               iter = dict.find(std::string(target_name[i]));
+               general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
+               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+               e2i[i + 1] = iter->second;
+               i2e[iter->second] = i + 1;
+               iter->second = -1;
+       }
+}
+
  #endif /* TRANSCRIPTS_H_ */