X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=BamConverter.h;h=d2b8b7324df6d61b59e3f03d92dbe0a4941a3bff;hp=cca0eae895fd347b243cc8f2f315be471c64498c;hb=HEAD;hpb=0a534802ee9fa3d488995a68621ff04f0fc6be7f diff --git a/BamConverter.h b/BamConverter.h index cca0eae..d2b8b73 100644 --- a/BamConverter.h +++ b/BamConverter.h @@ -14,6 +14,7 @@ #include "sam_rsem_cvt.h" #include "utils.h" +#include "my_assert.h" #include "bc_aux.h" #include "Transcript.h" #include "Transcripts.h" @@ -38,25 +39,49 @@ private: void writeCollapsedLines(); void flipSeq(uint8_t*, int); void flipQual(uint8_t*, int); - void addXSTag(bam1_t*, const Transcript&); + void modifyTags(bam1_t*, const Transcript&); // modify MD tag and XS tag if needed }; BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts) : transcripts(transcripts) { - if (transcripts.getType() != 0) - exitWithError("Genome information is not provided! RSEM cannot convert the transcript bam file!"); + general_assert(transcripts.getType() == 0, "Genome information is not provided! RSEM cannot convert the transcript bam file!"); in = samopen(inpF, "rb", NULL); assert(in != 0); + transcripts.buildMappings(in->header->n_targets, in->header->target_name); + bam_header_t *out_header = sam_header_read2(chr_list); + refmap.clear(); for (int i = 0; i < out_header->n_targets; i++) { refmap[out_header->target_name[i]] = i; } - append_header_text(out_header, in->header->text, in->header->l_text); + if (in->header->l_text > 0) { + char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n"; + int comment_len = strlen(comment); + + //Filter @SQ fields if the BAM file is user provided + char *text = in->header->text; + int l_text = in->header->l_text; + char *new_text = new char[l_text + comment_len]; + int pos = 0, s = 0; + while (pos < l_text) { + if ((pos + 2 < l_text) && (text[pos] == '@') && (text[pos + 1] == 'S') && (text[pos + 2] == 'Q')) { + pos += 3; + while (pos < l_text && text[pos] != '\n') ++pos; + } + else new_text[s++] = text[pos]; + ++pos; + } + strncpy(new_text + s, comment, comment_len); + s += comment_len; + + append_header_text(out_header, new_text, s); + delete[] new_text; + } out = samopen(outF, "wb", out_header); assert(out != 0); @@ -74,7 +99,7 @@ void BamConverter::process() { std::string cqname; bool isPaired = false; - int cnt = 0; + HIT_INT_TYPE cnt = 0; cqname = ""; b = bam_init1(); b2 = bam_init1(); @@ -83,32 +108,58 @@ void BamConverter::process() { ++cnt; isPaired = (b->core.flag & 0x0001) > 0; if (isPaired) { - assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid); - assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing - ++cnt; + assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001)); + assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); + assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))); + ++cnt; } if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } // at least one segment is not properly mapped - if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue; + bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004)); + + if (isPaired && notgood) general_assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004), cstrtos(bam1_qname(b)) + "'s two mates are not all marked as unalignable!"); + + if (!notgood) { + // for collapsing + if (isPaired) { + assert(b->core.tid == b2->core.tid); + general_assert(b->core.tid == b2->core.tid, cstrtos(bam1_qname(b)) + "'s two mates are aligned to two different transcripts!"); + if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + bam1_t *tmp = b; b = b2; b2 = tmp; + } + } - const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1); + const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1); - convert(b, transcript); - if (isPaired) { - convert(b2, transcript); - b->core.mpos = b2->core.pos; - b2->core.mpos = b->core.pos; + convert(b, transcript); + if (isPaired) { + convert(b2, transcript); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; + } + + if (cqname != bam1_qname(b)) { + writeCollapsedLines(); + cqname = bam1_qname(b); + collapseMap.init(isPaired); + } + + uint8_t *p = bam_aux_get(b, "ZW"); + float prb = (p != NULL? bam_aux2f(p) : 1.0); + collapseMap.insert(b, b2, prb); } + else { + assert(cqname != bam1_qname(b)); - if (cqname != bam1_qname(b)) { writeCollapsedLines(); cqname = bam1_qname(b); collapseMap.init(isPaired); - } - collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW"))); + samwrite(out, b); + if (isPaired) samwrite(out, b2); + } } writeCollapsedLines(); @@ -123,7 +174,7 @@ void BamConverter::convert(bam1_t* b, const Transcript& transcript) { int pos = b->core.pos; int readlen = b->core.l_qseq; - if (readlen == 0) exitWithError("One alignment line has SEQ field as *. RSEM does not support this currently!"); + general_assert(readlen > 0, "One alignment line has SEQ field as *. RSEM does not support this currently!"); iter = refmap.find(transcript.getSeqName()); assert(iter != refmap.end()); @@ -159,29 +210,32 @@ void BamConverter::convert(bam1_t* b, const Transcript& transcript) { b->core.n_cigar = core_n_cigar; b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); - addXSTag(b, transcript); // check if need to add XS tag, if need, add it + modifyTags(b, transcript); // check if need to add XS tag, if need, add it } inline void BamConverter::writeCollapsedLines() { bam1_t *tmp_b = NULL,*tmp_b2 = NULL; float prb; bool isPaired; + uint8_t *p; if (!collapseMap.empty(isPaired)) { while (collapseMap.next(tmp_b, tmp_b2, prb)) { - memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); - tmp_b->core.qual = getMAPQ(prb); - if (tmp_b->core.qual > 0) { - samwrite(out, tmp_b); - if (isPaired) { - memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); - tmp_b2->core.qual = tmp_b->core.qual; - samwrite(out, tmp_b2); - } + p = bam_aux_get(tmp_b, "ZW"); + if (p != NULL) { + memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b->core.qual = getMAPQ(prb); + } + // otherwise, just use the MAPQ score of the orignal alignment + + samwrite(out, tmp_b); + if (isPaired) { + if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b2->core.qual = tmp_b->core.qual; + samwrite(out, tmp_b2); } bam_destroy1(tmp_b); if (isPaired) bam_destroy1(tmp_b2); - } } } @@ -217,14 +271,58 @@ inline void BamConverter::flipQual(uint8_t* q, int readlen) { } } -inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) { - uint32_t* p = bam1_cigar(b); +inline void BamConverter::modifyTags(bam1_t* b, const Transcript& transcript) { + char strand = transcript.getStrand(); + uint8_t *s = NULL; + + if (strand == '-') { + s = bam_aux_get(b, "MD"); + if ((s != NULL) && (*(s) == 'Z') && (bam_aux2Z(s) != NULL)) { + char *mis = bam_aux2Z(s); + int len = strlen(mis); + char *tmp = new char[len]; + int cur_type = -1, fr = -1, type, base; + for (int i = 0; i < len; i++) { + type = (mis[i] >= '0' && mis[i] <= '9'); + if (cur_type != type) { + switch(cur_type) { + case 0: + base = len - 1; + if (mis[fr] == '^') { tmp[len - i] = mis[fr]; ++fr; ++base; } + for (int j = fr; j < i; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]); + break; + case 1: + base = len - i - fr; + for (int j = fr; j < i; j++) tmp[base + j] = mis[j]; + break; + } + cur_type = type; + fr = i; + } + } + switch(cur_type) { + case 0: + base = len - 1; + if (mis[fr] == '^') { tmp[0] = mis[fr]; ++fr; ++base; } + for (int j = fr; j < len; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]); + break; + case 1: + for (int j = fr; j < len; j++) tmp[j - fr] = mis[j]; + break; + } + strncpy(mis, tmp, len); + delete[] tmp; + } + } + + // append XS:A field if necessary + s = bam_aux_get(b, "XS"); + if (s != NULL) bam_aux_del(b, s); bool hasN = false; + uint32_t* p = bam1_cigar(b); for (int i = 0; i < (int)b->core.n_cigar; i++) if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; } - if (!hasN) return; - char strand = transcript.getStrand(); - bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); + if (hasN) bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); } #endif /* BAMCONVERTER_H_ */