X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=Refs.h;h=008d2755e3aa56862566363523028f330de69c5b;hp=41fcf061112bc4b6ca1cc96cb4395ca4a819e944;hb=683863b75f8d8bef2461039a6911b0e9619cc113;hpb=a95154919f950f86de9104b2b9dcf1f0c7e83387 diff --git a/Refs.h b/Refs.h index 41fcf06..008d275 100644 --- a/Refs.h +++ b/Refs.h @@ -20,6 +20,7 @@ class Refs { Refs() { M = 0; seqs.clear(); + has_polyA = false; } ~Refs() {} @@ -36,6 +37,8 @@ class Refs { std::vector& getRefs() { return seqs; } // may be slow, for copying the whole thing + bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added + //lim : >=0 If mismatch > lim , return; -1 find all mismatches int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) { int nMis = 0; // number of mismatches @@ -52,7 +55,7 @@ class Refs { } bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) { - if (sid <= 0 || sid > M || dir != 0 && dir != 1 || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false; + if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false; const std::string& seq = seqs[sid].getSeq(dir); return countMismatch(seq, pos, readseq, LEN, C) <= C; } @@ -73,7 +76,7 @@ class Refs { private: int M; // # of isoforms, id starts from 1 std::vector seqs; // reference sequences, starts from 1; 0 is for noise gene - + bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false }; //inpF in fasta format @@ -87,6 +90,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { seqs.push_back(RefSeq()); // noise isoform M = 0; + has_polyA = false; fin.open(inpF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } @@ -97,9 +101,13 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { while((pt = getline(fin, line)) && line[0] != '>') { rawseq += line; } - assert(rawseq.size() > 0); + if (rawseq.size() <= 0) { + printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); + continue; + } ++M; seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag))); + has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen(); } fin.close(); @@ -107,7 +115,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { } //inpF in fasta format, with sequence all in one line together -//option 0 read all, 1 do not read sequences and names +//option 0 read all, 1 do not read sequences void Refs::loadRefs(char *inpF, int option) { std::ifstream fin; RefSeq seq; @@ -118,6 +126,7 @@ void Refs::loadRefs(char *inpF, int option) { seqs.push_back(RefSeq()); M = 0; + has_polyA = false; bool success; do { @@ -125,6 +134,7 @@ void Refs::loadRefs(char *inpF, int option) { if (success) { seqs.push_back(seq); ++M; + has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen(); } } while (success);