Imported Upstream version 1.2.17

[rsem.git] / Transcripts.h
diff --git a/Transcripts.h b/Transcripts.h

index 19b025eb2a0868a051db6f0530caf2b9819296a6..db8e64a879d50bdf1e2e11e8df1037a1e62c5467 100644 (file)
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -13,10 +13,10 @@
  #include<map>
  #include<string>
  
  #include<map>
  #include<string>
  
+#include "utils.h"
  #include "my_assert.h"
  #include "Transcript.h"
  
  #include "my_assert.h"
  #include "Transcript.h"
  
-
  class Transcripts {
  public:
         Transcripts(int type = 0) {
  class Transcripts {
  public:
         Transcripts(int type = 0) {
@@ -28,7 +28,19 @@ public:
         }
  
         int getM() { return M; }
         }
  
         int getM() { return M; }
+
+       // used in shrinking the transcripts
+       void setM(int M) { this->M = M; transcripts.resize(M + 1); } 
+       
+       void move(int from, int to) {
+         assert(from >= to);
+         if (from > to) transcripts[to] = transcripts[from];
+       }
+       
         int getType() { return type; }
         int getType() { return type; }
+       void setType(int type) { this->type = type; }
+
+       bool isAlleleSpecific() { return type == 2; }
  
         const Transcript& getTranscriptAt(int pos) {
                 assert(pos > 0 && pos <= M);
  
         const Transcript& getTranscriptAt(int pos) {
                 assert(pos > 0 && pos <= M);
@@ -57,10 +69,10 @@ public:
                 return transcripts[getInternalSid(eid)];
         }
  
                 return transcripts[getInternalSid(eid)];
         }
  
-       void buildMappings(int, char**);
+       void buildMappings(int, char**, const char* = NULL);
  
  private:
  
  private:
-       int M, type; // type 0 from genome , 1 standalone transcriptome
+       int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
         std::vector<Transcript> transcripts;
  
         std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
         std::vector<Transcript> transcripts;
  
         std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
@@ -90,29 +102,43 @@ void Transcripts::writeTo(const char* outF) {
         fout.close();
  }
  
         fout.close();
  }
  
-void Transcripts::buildMappings(int n_targets, char** target_name) {
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
         std::map<std::string, int> dict;
         std::map<std::string, int>::iterator iter;
         std::map<std::string, int> dict;
         std::map<std::string, int>::iterator iter;
+       std::vector<bool> appeared;
  
  
-       general_assert(n_targets == M, "Number of transcripts does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+       general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+       general_assert(n_targets <=  M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+       if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
  
         dict.clear();
         for (int i = 1; i <= M; i++) {
  
         dict.clear();
         for (int i = 1; i <= M; i++) {
-               const std::string& tid = transcripts[i].getTranscriptID();
+               const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
                 iter = dict.find(tid);
                 iter = dict.find(tid);
-               general_assert(iter == dict.end(), tid + " appears more than once!");
+               general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
                 dict[tid] = i;
         }
  
         e2i.assign(M + 1, 0);
         i2e.assign(M + 1, 0);
                 dict[tid] = i;
         }
  
         e2i.assign(M + 1, 0);
         i2e.assign(M + 1, 0);
+       appeared.assign(M + 1, false);
         for (int i = 0; i < n_targets; i++) {
                 iter = dict.find(std::string(target_name[i]));
         for (int i = 0; i < n_targets; i++) {
                 iter = dict.find(std::string(target_name[i]));
-               general_assert(iter != dict.end(), "RSEM can not recognize transcript " + cstrtos(target_name[i]) + "!");
-               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+               general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
+               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
                 e2i[i + 1] = iter->second;
                 i2e[iter->second] = i + 1;
                 iter->second = -1;
                 e2i[i + 1] = iter->second;
                 i2e[iter->second] = i + 1;
                 iter->second = -1;
+               appeared[e2i[i + 1]] = true;
+       }
+
+       if (imdName != NULL) {
+         char omitF[STRLEN];
+         sprintf(omitF, "%s.omit", imdName);
+         FILE *fo = fopen(omitF, "w");
+         for (int i = 1; i <= M; i++) 
+           if (!appeared[i]) fprintf(fo, "%d\n", i);
+         fclose(fo);
         }
  }
  
         }
  }