X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=BamWriter.h;h=2a325d6cb46901760ef1cebe01b5cbbba1136ee7;hp=12ab087d668966f905a495f4e608a777e157ebc0;hb=412c1a2821c5a4cbe2e68e4e9f4e2026a86d25f7;hpb=fb2aa1ca9d00710943155ef3abbbdd87df116e4a diff --git a/BamWriter.h b/BamWriter.h index 12ab087..2a325d6 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "sam/bam.h" @@ -14,6 +15,8 @@ #include "sam_rsem_aux.h" #include "sam_rsem_cvt.h" +#include "utils.h" + #include "SingleHit.h" #include "PairedEndHit.h" @@ -52,9 +55,6 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const //generate output's header bam_header_t *out_header = bam_header_dwt(in->header); - for (int i = 0; i < out_header->n_targets; i++) { - out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength(); // transcript length without poly(A) tail - } std::ostringstream strout; strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n"; @@ -76,71 +76,73 @@ void BamWriter::work(HitWrapper wrapper) { bam1_t *b; SingleHit *hit; - int cnt = 0; + HIT_INT_TYPE cnt = 0; b = bam_init1(); while (samread(in, b) >= 0) { ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; } - if (b->core.flag & 0x0004) continue; + if (!(b->core.flag & 0x0004)) { + hit = wrapper.getNextHit(); + assert(hit != NULL); - hit = wrapper.getNextHit(); - assert(hit != NULL); + assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); + convert(b, hit->getConPrb()); + } - assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); - convert(b, hit->getConPrb()); - if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 + //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 + samwrite(out, b); } assert(wrapper.getNextHit() == NULL); bam_destroy1(b); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::work(HitWrapper wrapper) { bam1_t *b, *b2; PairedEndHit *hit; - int cnt = 0; + HIT_INT_TYPE cnt = 0; b = bam_init1(); b2 = bam_init1(); while (samread(in, b) >= 0 && samread(in, b2) >= 0) { cnt += 2; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } - - //mate info is not complete, skip - if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue; - //unalignable reads, skip - if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue; - - //swap if b is mate 2 - if (b->core.flag & 0x0080) { - assert(b2->core.flag & 0x0040); - bam1_t *tmp = b; - b = b2; b2 = tmp; - } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; } - hit = wrapper.getNextHit(); - assert(hit != NULL); + assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); + assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))); - assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); - assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid()); + //unalignable reads, skip + bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004); - convert(b, hit->getConPrb()); - convert(b2, hit->getConPrb()); + if (!notgood) { + //swap if b is mate 2 + if (b->core.flag & 0x0080) { + assert(b2->core.flag & 0x0040); + bam1_t *tmp = b; + b = b2; b2 = tmp; + } - b->core.mpos = b2->core.pos; - b2->core.mpos = b->core.pos; + hit = wrapper.getNextHit(); + assert(hit != NULL); - if (b->core.qual > 0) { - samwrite(out, b); - samwrite(out, b2); + assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); + assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid()); + + convert(b, hit->getConPrb()); + convert(b2, hit->getConPrb()); + + // omit "b->core.mpos = b2->core.pos; b2->core.mpos = b->core.pos" for the reason that it is possible that only one mate is alignable } + + samwrite(out, b); + samwrite(out, b2); } assert(wrapper.getNextHit() == NULL); @@ -148,37 +150,11 @@ void BamWriter::work(HitWrapper wrapper) { bam_destroy1(b); bam_destroy1(b2); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::convert(bam1_t *b, double prb) { - const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1); - - int pos = b->core.pos; - int readlen = b->core.l_qseq; - - std::vector data; - data.clear(); - - int core_pos, core_n_cigar; - std::vector vec; - vec.assign(1, Interval(1, transcript.getLength())); - // make an artificial chromosome coordinates for the transcript to get new CIGAR strings - tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data); - assert(core_pos >= 0); - - int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4; - b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len; - expand_data_size(b); - uint8_t* pt = b->data + b->core.l_qname; - memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len); - for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; } - - b->core.pos = core_pos; - b->core.n_cigar = core_n_cigar; b->core.qual = getMAPQ(prb); - b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); - float val = (float)prb; bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val); }