X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=SingleReadQ.h;h=9fd3dd18156b74674e5f19a43a50e937403181df;hp=976637fc269d79fa484cb0033421e7f7a0769c49;hb=7cd7abcad92a44bbd5d8d1b2bb3a871cd479c0bf;hpb=86e650e9577999a7ba00ab454d1f6bf674b0ea70 diff --git a/SingleReadQ.h b/SingleReadQ.h index 976637f..9fd3dd1 100644 --- a/SingleReadQ.h +++ b/SingleReadQ.h @@ -12,50 +12,46 @@ #include "Read.h" class SingleReadQ : public Read { - public: - SingleReadQ() { readseq = qscore = ""; len = 0; } - SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) { - this->name = name; - this->readseq = readseq; - this->qscore = qscore; - this->len = readseq.length(); - - calc_lq(); - } - - bool read(int argc, std::istream* argv[], int flags = 7); - void write(int argc, std::ostream* argv[]); - - int getReadLength() const { return len; } - const std::string& getReadSeq() const { return readseq; } - const std::string& getQScore() const { return qscore; } - - private: - int len; // read length - std::string readseq, qscore; // qscore : quality scores - - void calc_lq(); +public: + SingleReadQ() { readseq = qscore = ""; len = 0; } + SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) { + this->name = name; + this->readseq = readseq; + this->qscore = qscore; + this->len = readseq.length(); + } + + bool read(int argc, std::istream* argv[], int flags = 7); + void write(int argc, std::ostream* argv[]); + + int getReadLength() const { return len; } + const std::string& getReadSeq() const { return readseq; } + const std::string& getQScore() const { return qscore; } + + void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false + +private: + int len; // read length + std::string readseq, qscore; // qscore : quality scores }; bool SingleReadQ::read(int argc, std::istream* argv[], int flags) { - std::string line; - - assert(argc == 1); - if (!getline((*argv[0]), line)) return false; - if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } - name = ""; - if (flags & 4) { name = line.substr(1); } - if (!getline((*argv[0]), readseq)) return false; - len = readseq.length(); - if (!(flags & 1)) { readseq = ""; } - if (!getline((*argv[0]), line)) return false; - if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } - if (!getline((*argv[0]), qscore)) return false; - if (!(flags & 2)) { qscore = ""; } - - if (flags & 1) calc_lq(); - - return true; + std::string line; + + assert(argc == 1); + if (!getline((*argv[0]), line)) return false; + if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } + name = ""; + if (flags & 4) { name = line.substr(1); } + if (!getline((*argv[0]), readseq)) return false; + len = readseq.length(); + if (!(flags & 1)) { readseq = ""; } + if (!getline((*argv[0]), line)) return false; + if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } + if (!getline((*argv[0]), qscore)) return false; + if (!(flags & 2)) { qscore = ""; } + + return true; } void SingleReadQ::write(int argc, std::ostream* argv[]) { @@ -63,32 +59,39 @@ void SingleReadQ::write(int argc, std::ostream* argv[]) { (*argv[0])<<"@"<= len - OLEN) ++numTO; - } - } - - if (numA >= threshold_1) { - low_quality = (numAO >= threshold_2); - } - else if (numT >= threshold_1) { - low_quality = (numTO >= threshold_2); - } - else low_quality = false; +//calculate if this read is low quality +void SingleReadQ::calc_lq(bool hasPolyA, int seedLen) { + low_quality = false; + if (len < seedLen) { low_quality = true; return; } + + // if no polyA, no need to do the following calculation + if (!hasPolyA) return; + + assert(readseq != ""); + + int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region + int threshold_1, threshold_2; + + threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5); + threshold_2 = (OLEN - 1) / 2 + 1; + for (int i = 0; i < len; i++) { + if (readseq[i] == 'A') { + ++numA; + if (i < OLEN) ++numAO; + } + if (readseq[i] == 'T') { + ++numT; + if (i >= len - OLEN) ++numTO; + } + } + + if (numA >= threshold_1) { + low_quality = (numAO >= threshold_2); + } + else if (numT >= threshold_1) { + low_quality = (numTO >= threshold_2); + } + else low_quality = false; } #endif