#include<fstream>
#include<vector>
#include<algorithm>
+#include<map>
+#include<string>
+#include "my_assert.h"
#include "Transcript.h"
-
class Transcripts {
public:
Transcripts(int type = 0) {
M = 0; this->type = type;
transcripts.clear();
transcripts.push_back(Transcript());
+
+ e2i.clear(); i2e.clear();
}
int getM() { return M; }
+
+ // used in shrinking the transcripts
+ void setM(int M) { this->M = M; transcripts.resize(M + 1); }
+
+ void move(int from, int to) {
+ assert(from >= to);
+ if (from > to) transcripts[to] = transcripts[from];
+ }
+
int getType() { return type; }
+ void setType(int type) { this->type = type; }
+
+ bool isAlleleSpecific() { return type == 2; }
const Transcript& getTranscriptAt(int pos) {
assert(pos > 0 && pos <= M);
void readFrom(const char*);
void writeTo(const char*);
+ //Eid: external sid
+ int getInternalSid(int eid) {
+ assert(eid > 0 && eid <= M);
+ return e2i[eid];
+ }
+
+ const Transcript& getTranscriptViaEid(int eid) {
+ return transcripts[getInternalSid(eid)];
+ }
+
+ void buildMappings(int, char**);
+
private:
- int M, type; // type 0 from genome , 1 standalone transcriptome
+ int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific
std::vector<Transcript> transcripts;
+
+ std::vector<int> e2i, i2e; // external sid to internal sid, internal sid to external sid
};
void Transcripts::readFrom(const char* inpF) {
fin>>M>>type;
getline(fin, line);
- transcripts.clear();
- transcripts.resize(M + 1);
+ transcripts.assign(M + 1, Transcript());
for (int i = 1; i <= M; i++) {
transcripts[i].read(fin);
}
fout.close();
}
+void Transcripts::buildMappings(int n_targets, char** target_name) {
+ std::map<std::string, int> dict;
+ std::map<std::string, int>::iterator iter;
+
+ general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+
+ dict.clear();
+ for (int i = 1; i <= M; i++) {
+ const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
+ iter = dict.find(tid);
+ general_assert(iter == dict.end(), tid + " appears more than once!");
+ dict[tid] = i;
+ }
+
+ e2i.assign(M + 1, 0);
+ i2e.assign(M + 1, 0);
+ for (int i = 0; i < n_targets; i++) {
+ iter = dict.find(std::string(target_name[i]));
+ general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
+ general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+ e2i[i + 1] = iter->second;
+ i2e[iter->second] = i + 1;
+ iter->second = -1;
+ }
+}
+
#endif /* TRANSCRIPTS_H_ */