X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=SamParser.h;h=c5a503554b8c9cf4716b722d47da5f8c5db8540c;hb=97554bbac838f2ed578d81f98e421dac0669e74e;hp=425593a91d8c48f72f28783abafc090e0f5c6920;hpb=3ec78aa9af79921c44d62b65f88865a4b65880be;p=rsem.git diff --git a/SamParser.h b/SamParser.h index 425593a..c5a5035 100644 --- a/SamParser.h +++ b/SamParser.h @@ -12,9 +12,7 @@ #include "sam/sam.h" #include "utils.h" - -#include "RefSeq.h" -#include "Refs.h" +#include "my_assert.h" #include "SingleRead.h" #include "SingleReadQ.h" @@ -23,9 +21,11 @@ #include "SingleHit.h" #include "PairedEndHit.h" +#include "Transcripts.h" + class SamParser { public: - SamParser(char, const char*, Refs&, const char* = 0); + SamParser(char, const char*, Transcripts&, const char* = 0); ~SamParser(); /** @@ -51,6 +51,7 @@ private: bam_header_t *header; bam1_t *b, *b2; + Transcripts& transcripts; //tag used by aligner static char rtTag[STRLEN]; @@ -79,33 +80,23 @@ private: char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads // aux, if not 0, points to the file name of fn_list -SamParser::SamParser(char inpType, const char* inpF, Refs& refs, const char* aux) { +SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) + : transcripts(transcripts) +{ switch(inpType) { case 'b': sam_in = samopen(inpF, "rb", aux); break; case 's': sam_in = samopen(inpF, "r", aux); break; default: assert(false); } - if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } - header = sam_in->header; - if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); } - - // Check if the reference used for aligner is the transcript set RSEM generated - if (refs.getM() != header->n_targets) { - fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n"); - exit(-1); - } - for (int i = 0; i < header->n_targets; i++) { - const RefSeq& refseq = refs.getRef(i + 1); - // If update int to long, chance the (int) conversion - if (refseq.getName().compare(header->target_name[i]) != 0 || refseq.getTotLen() != (int)header->target_len[i]) { - fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n"); - exit(-1); - } - } - - b = bam_init1(); - b2 = bam_init1(); + general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist."); + header = sam_in->header; + general_assert(header != 0, "Fail to parse sam header!"); + + transcripts.buildMappings(header->n_targets, header->target_name); + + b = bam_init1(); + b2 = bam_init1(); } SamParser::~SamParser() { @@ -137,10 +128,10 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) { if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } if (getDir(b) > 0) { - hit = SingleHit(b->core.tid + 1, b->core.pos); + hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos); } else { - hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); + hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); } } @@ -168,10 +159,10 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) { if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } if (getDir(b) > 0) { - hit = SingleHit(b->core.tid + 1, b->core.pos); + hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos); } else { - hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); + hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq); } } @@ -220,10 +211,10 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) { } //assert(mp1->core.tid == mp2->core.tid); if (getDir(mp1) > 0) { - hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); + hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); } else { - hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); + hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); } } @@ -271,10 +262,10 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) { } //assert(mp1->core.tid == mp2->core.tid); if (getDir(mp1) > 0) { - hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); + hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos); } else { - hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); + hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos); } }