X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=BamWriter.h;h=f664710245ff265e184f922c642c0d3efc74a29a;hp=b0aeee026bb7525df2446639c5702ad843eea209;hb=683863b75f8d8bef2461039a6911b0e9619cc113;hpb=946f9a6adb2a82048c8453d44693cd3838d32939 diff --git a/BamWriter.h b/BamWriter.h index b0aeee0..f664710 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "sam/bam.h" @@ -14,6 +15,8 @@ #include "sam_rsem_aux.h" #include "sam_rsem_cvt.h" +#include "utils.h" + #include "SingleHit.h" #include "PairedEndHit.h" @@ -47,23 +50,13 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const } assert(in != 0); + //build mappings from external sid to internal sid + transcripts.buildMappings(in->header->n_targets, in->header->target_name); + //generate output's header bam_header_t *out_header = bam_header_dwt(in->header); - - if (out_header->n_targets != transcripts.getM()) { - fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM()); - exit(-1); - } - for (int i = 0; i < out_header->n_targets; i++) { - const Transcript& transcript = transcripts.getTranscriptAt(i + 1); - if (out_header->target_name[i] != transcript.getTranscriptID()) { - fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i); - fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]); - fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str()); - exit(-1); - } - out_header->target_len[i] = transcript.getLength(); // transcript length without poly(A) tail + out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength(); // transcript length without poly(A) tail } std::ostringstream strout; @@ -86,28 +79,30 @@ void BamWriter::work(HitWrapper wrapper) { bam1_t *b; SingleHit *hit; - int cnt = 0; + HIT_INT_TYPE cnt = 0; b = bam_init1(); while (samread(in, b) >= 0) { ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; } - if (b->core.flag & 0x0004) continue; + if (!(b->core.flag & 0x0004)) { + hit = wrapper.getNextHit(); + assert(hit != NULL); - hit = wrapper.getNextHit(); - assert(hit != NULL); + assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); + convert(b, hit->getConPrb()); + } - assert(b->core.tid + 1 == hit->getSid()); - convert(b, hit->getConPrb()); - if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 + //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 + samwrite(out, b); } assert(wrapper.getNextHit() == NULL); bam_destroy1(b); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::work(HitWrapper wrapper) { @@ -121,33 +116,52 @@ void BamWriter::work(HitWrapper wrapper) { while (samread(in, b) >= 0 && samread(in, b2) >= 0) { cnt += 2; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; } - if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue; + assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001)); + assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040)); - //swap if b is mate 2 - if (b->core.flag & 0x0080) { - assert(b2->core.flag & 0x0040); - bam1_t *tmp = b; - b = b2; b2 = tmp; - } + //unalignable reads, skip + bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004); - hit = wrapper.getNextHit(); - assert(hit != NULL); + if (!notgood) { + //swap if b is mate 2 + if (b->core.flag & 0x0080) { + assert(b2->core.flag & 0x0040); + bam1_t *tmp = b; + b = b2; b2 = tmp; + } - assert(b->core.tid + 1 == hit->getSid()); - assert(b2->core.tid + 1 == hit->getSid()); + hit = wrapper.getNextHit(); + assert(hit != NULL); - convert(b, hit->getConPrb()); - convert(b2, hit->getConPrb()); + assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid()); + assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid()); - b->core.mpos = b2->core.pos; - b2->core.mpos = b->core.pos; + convert(b, hit->getConPrb()); + convert(b2, hit->getConPrb()); - if (b->core.qual > 0) { - samwrite(out, b); - samwrite(out, b2); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; } + + /* + else { + // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:! + char exclamation = '!'; + if (!(b->core.flag & 0x0004)) { + b->core.flag |= 0x0004; + bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation); + } + if (!(b2->core.flag & 0x0004)) { + b2->core.flag |= 0x0004; + bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation); + } + } + */ + + samwrite(out, b); + samwrite(out, b2); } assert(wrapper.getNextHit() == NULL); @@ -155,12 +169,11 @@ void BamWriter::work(HitWrapper wrapper) { bam_destroy1(b); bam_destroy1(b2); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::convert(bam1_t *b, double prb) { - int sid = b->core.tid + 1; - const Transcript& transcript = transcripts.getTranscriptAt(sid); + const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1); int pos = b->core.pos; int readlen = b->core.l_qseq;