X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=Refs.h;h=711f64b6c01f2a5ae8c334d66ade1757ea9f9f27;hp=f411622f0cfb74c974db580e8e16091f1503d16d;hb=b64d62d49f9b0446f10f87a2aadcde2f36854ab6;hpb=3ec78aa9af79921c44d62b65f88865a4b65880be diff --git a/Refs.h b/Refs.h index f411622..711f64b 100644 --- a/Refs.h +++ b/Refs.h @@ -20,6 +20,7 @@ class Refs { Refs() { M = 0; seqs.clear(); + has_polyA = false; } ~Refs() {} @@ -36,6 +37,8 @@ class Refs { std::vector& getRefs() { return seqs; } // may be slow, for copying the whole thing + bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added + //lim : >=0 If mismatch > lim , return; -1 find all mismatches int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) { int nMis = 0; // number of mismatches @@ -52,7 +55,7 @@ class Refs { } bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) { - if (sid <= 0 || sid > M || dir != 0 && dir != 1 || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false; + if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false; const std::string& seq = seqs[sid].getSeq(dir); return countMismatch(seq, pos, readseq, LEN, C) <= C; } @@ -73,7 +76,7 @@ class Refs { private: int M; // # of isoforms, id starts from 1 std::vector seqs; // reference sequences, starts from 1; 0 is for noise gene - + bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false }; //inpF in fasta format @@ -81,25 +84,29 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { //read standard fasta format here std::ifstream fin; std::string tag, line, rawseq; - void* pt; // istream& is indeed a pointer, that's why I can use void* here seqs.clear(); seqs.push_back(RefSeq()); // noise isoform M = 0; + has_polyA = false; fin.open(inpF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + getline(fin, line); + while ((fin) && (line[0] == '>')) { tag = line.substr(1); rawseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { + while((getline(fin, line)) && (line[0] != '>')) { rawseq += line; } - assert(rawseq.size() > 0); + if (rawseq.size() <= 0) { + printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); + continue; + } ++M; seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag))); + has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen(); } fin.close(); @@ -118,6 +125,7 @@ void Refs::loadRefs(char *inpF, int option) { seqs.push_back(RefSeq()); M = 0; + has_polyA = false; bool success; do { @@ -125,6 +133,7 @@ void Refs::loadRefs(char *inpF, int option) { if (success) { seqs.push_back(seq); ++M; + has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen(); } } while (success);