2 * transcripts are numbered from 1. 0 is reserved for noise isoform
17 #include "my_assert.h"
18 #include "Transcript.h"
22 Transcripts(int type = 0) {
23 M = 0; this->type = type;
25 transcripts.push_back(Transcript());
27 e2i.clear(); i2e.clear();
30 int getM() { return M; }
32 // used in shrinking the transcripts
33 void setM(int M) { this->M = M; transcripts.resize(M + 1); }
35 void move(int from, int to) {
37 if (from > to) transcripts[to] = transcripts[from];
40 int getType() { return type; }
41 void setType(int type) { this->type = type; }
43 bool isAlleleSpecific() { return type == 2; }
45 const Transcript& getTranscriptAt(int pos) {
46 assert(pos > 0 && pos <= M);
47 return transcripts[pos];
50 void add(const Transcript& transcript) {
51 transcripts.push_back(transcript);
56 std::sort(transcripts.begin(), transcripts.end());
59 void readFrom(const char*);
60 void writeTo(const char*);
63 int getInternalSid(int eid) {
64 assert(eid > 0 && eid <= M);
68 const Transcript& getTranscriptViaEid(int eid) {
69 return transcripts[getInternalSid(eid)];
72 void buildMappings(int, char**, const char* = NULL);
75 int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
76 std::vector<Transcript> transcripts;
78 std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
81 void Transcripts::readFrom(const char* inpF) {
83 std::ifstream fin(inpF);
85 if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
89 transcripts.assign(M + 1, Transcript());
90 for (int i = 1; i <= M; i++) {
91 transcripts[i].read(fin);
96 void Transcripts::writeTo(const char* outF) {
97 std::ofstream fout(outF);
98 fout<<M<<" "<<type<<std::endl;
99 for (int i = 1; i <= M; i++) {
100 transcripts[i].write(fout);
105 void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
106 std::map<std::string, int> dict;
107 std::map<std::string, int>::iterator iter;
108 std::vector<bool> appeared;
110 general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
111 general_assert(n_targets <= M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
112 if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
115 for (int i = 1; i <= M; i++) {
116 const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
117 iter = dict.find(tid);
118 general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
122 e2i.assign(M + 1, 0);
123 i2e.assign(M + 1, 0);
124 appeared.assign(M + 1, false);
125 for (int i = 0; i < n_targets; i++) {
126 iter = dict.find(std::string(target_name[i]));
127 general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
128 general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
129 e2i[i + 1] = iter->second;
130 i2e[iter->second] = i + 1;
132 appeared[e2i[i + 1]] = true;
135 if (imdName != NULL) {
137 sprintf(omitF, "%s.omit", imdName);
138 FILE *fo = fopen(omitF, "w");
139 for (int i = 1; i <= M; i++)
140 if (!appeared[i]) fprintf(fo, "%d\n", i);
145 #endif /* TRANSCRIPTS_H_ */