X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=SamParser.h;h=e91f9c379539fc7ada0f8682bcfc7c957489d67f;hp=a36af3a0d11a7133b27c5ee4fe83ae7996b9f631;hb=636b82d9f60ebcbec7ef1b73ba23bbbacfd8b36a;hpb=a95154919f950f86de9104b2b9dcf1f0c7e83387 diff --git a/SamParser.h b/SamParser.h index a36af3a..e91f9c3 100644 --- a/SamParser.h +++ b/SamParser.h @@ -1,4 +1,4 @@ -/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT single read or paired-end read */ +/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */ #ifndef SAMPARSER_H_ #define SAMPARSER_H_ @@ -10,7 +10,10 @@ #include "sam/bam.h" #include "sam/sam.h" + #include "utils.h" +#include "my_assert.h" + #include "SingleRead.h" #include "SingleReadQ.h" #include "PairedEndRead.h" @@ -18,9 +21,11 @@ #include "SingleHit.h" #include "PairedEndHit.h" +#include "Transcripts.h" + class SamParser { public: - SamParser(char, const char*, const char* = 0); + SamParser(char, const char*, Transcripts&, const char* = 0); ~SamParser(); /** @@ -46,6 +51,8 @@ private: bam_header_t *header; bam1_t *b, *b2; + Transcripts& transcripts; + //tag used by aligner static char rtTag[STRLEN]; @@ -64,24 +71,32 @@ private: //0 ~ N0 1 ~ N1 2 ~ N2 int getReadType(const bam1_t*); int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads + + bool check(bam1_t *b) { + return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT)); + } }; char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads // aux, if not 0, points to the file name of fn_list -SamParser::SamParser(char inpType, const char* inpF, const char* aux) { +SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) + : transcripts(transcripts) +{ switch(inpType) { case 'b': sam_in = samopen(inpF, "rb", aux); break; case 's': sam_in = samopen(inpF, "r", aux); break; default: assert(false); } - if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } - header = sam_in->header; - if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); } + general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist."); + header = sam_in->header; + general_assert(header != 0, "Fail to parse sam header!"); - b = bam_init1(); - b2 = bam_init1(); + transcripts.buildMappings(header->n_targets, header->target_name); + + b = bam_init1(); + b2 = bam_init1(); } SamParser::~SamParser() { @@ -97,8 +112,7 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) { bool canR = (samread(sam_in, b) >= 0); if (!canR) return -1; - if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); } - //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004) + general_assert(!(b->core.flag & 0x0001), "Find a paired end read in the file!"); int readType = getReadType(b); std::string name = getName(b); @@ -110,11 +124,13 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) { else val = 5; if (readType == 1) { + if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (getDir(b) > 0) { - hit = SingleHit(b->core.tid + 1, b->core.pos); + hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos); } else { - hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); + hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); } } @@ -126,8 +142,7 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) { bool canR = (samread(sam_in, b) >= 0); if (!canR) return -1; - if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); } - //assert(!(b->core.flag & 0x0001)); //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004) + general_assert(!(b->core.flag & 0x0001), "Find a paired end read in the file!"); int readType = getReadType(b); std::string name = getName(b); @@ -139,11 +154,13 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) { else val = 5; if (readType == 1) { + if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (getDir(b) > 0) { - hit = SingleHit(b->core.tid + 1, b->core.pos); + hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos); } else { - hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); + hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); } } @@ -156,21 +173,22 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) { bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0)); if (!canR) return -1; - if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) { - fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n"); - exit(-1); - } - //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); + general_assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001), \ + "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)"); bam1_t *mp1 = NULL, *mp2 = NULL; - if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) { + // If lose mate info, discard. is it necessary? + if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4; + // If only one mate is mapped, discard + if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4; + + if (b->core.flag & 0x0040) { mp1 = b; mp2 = b2; } - else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + else { mp1 = b2; mp2 = b; } - else return 4; // If lose mate info, discard. is it necessary? int readType = getReadType(mp1, mp2); std::string name = getName(mp1); @@ -184,16 +202,18 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) { else val = 5; if (readType == 1) { + if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (mp1->core.tid != mp2->core.tid) { fprintf(stderr, "The two reads do not come from the same pair!"); exit(-1); } //assert(mp1->core.tid == mp2->core.tid); if (getDir(mp1) > 0) { - hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); + hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); } else { - hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); + hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); } } @@ -205,21 +225,22 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) { bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0)); if (!canR) return -1; - if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) { - fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n"); - exit(-1); - } - //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); + general_assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001), \ + "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)"); bam1_t *mp1 = NULL, *mp2 = NULL; - if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) { + // If lose mate info, discard. is it necessary? + if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4; + // If only one mate is mapped, discard + if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4; + + if (b->core.flag & 0x0040) { mp1 = b; mp2 = b2; } - else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + else { mp1 = b2; mp2 = b; } - else return 4; int readType = getReadType(mp1, mp2); std::string name = getName(mp1); @@ -233,16 +254,18 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) { else val = 5; if (readType == 1) { + if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (mp1->core.tid != mp2->core.tid) { fprintf(stderr, "The two reads do not come from the same pair!"); exit(-1); } //assert(mp1->core.tid == mp2->core.tid); if (getDir(mp1) > 0) { - hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); + hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); } else { - hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); + hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); } } @@ -318,10 +341,17 @@ inline int SamParser::getReadType(const bam1_t* b) { return (bam_aux2i(p) > 0 ? 2 : 0); } - //For paired-end reads, do not print out type 2 reads inline int SamParser::getReadType(const bam1_t* b, const bam1_t* b2) { - if ((b->core.flag & 0x0002) && (b2->core.flag & 0x0002)) return 1; + if (!(b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) return 1; + + if (!strcmp(rtTag, "")) return 0; + + uint8_t *p = bam_aux_get(b, rtTag); + if (p != NULL && bam_aux2i(p) > 0) return 2; + + p = bam_aux_get(b2, rtTag); + if (p != NULL && bam_aux2i(p) > 0) return 2; return 0; }