]> git.donarmstrong.com Git - rsem.git/blobdiff - BamWriter.h
Renamed makefile as Makefile
[rsem.git] / BamWriter.h
index b0aeee026bb7525df2446639c5702ad843eea209..2a325d6cb46901760ef1cebe01b5cbbba1136ee7 100644 (file)
@@ -7,6 +7,7 @@
 #include<cassert>
 #include<string>
 #include<sstream>
+#include<iostream>
 
 #include <stdint.h>
 #include "sam/bam.h"
@@ -14,6 +15,8 @@
 #include "sam_rsem_aux.h"
 #include "sam_rsem_cvt.h"
 
+#include "utils.h"
+
 #include "SingleHit.h"
 #include "PairedEndHit.h"
 
@@ -47,25 +50,12 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const
        }
        assert(in != 0);
 
+       //build mappings from external sid to internal sid
+       transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
        //generate output's header
        bam_header_t *out_header = bam_header_dwt(in->header);
 
-       if (out_header->n_targets != transcripts.getM()) {
-               fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM());
-               exit(-1);
-       }
-
-       for (int i = 0; i < out_header->n_targets; i++) {
-               const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
-               if (out_header->target_name[i] != transcript.getTranscriptID()) {
-                       fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i);
-                       fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]);
-                       fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str());
-                       exit(-1);
-               }
-               out_header->target_len[i] = transcript.getLength();  // transcript length without poly(A) tail
-       }
-
        std::ostringstream strout;
        strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
        std::string content = strout.str();
@@ -86,68 +76,73 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
        bam1_t *b;
        SingleHit *hit;
 
-       int cnt = 0;
+       HIT_INT_TYPE cnt = 0;
 
        b = bam_init1();
 
        while (samread(in, b) >= 0) {
                ++cnt;
-               if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
+               if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
 
-               if (b->core.flag & 0x0004) continue;
+               if (!(b->core.flag & 0x0004)) {
+                 hit = wrapper.getNextHit();
+                 assert(hit != NULL);
 
-               hit = wrapper.getNextHit();
-               assert(hit != NULL);
+                 assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+                 convert(b, hit->getConPrb());
+               }
 
-               assert(b->core.tid + 1 == hit->getSid());
-               convert(b, hit->getConPrb());
-               if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+               //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+               samwrite(out, b);
        }
 
        assert(wrapper.getNextHit() == NULL);
 
        bam_destroy1(b);
-       if (verbose) { printf("Bam output file is generated!\n"); }
+       if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
 }
 
 void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
        bam1_t *b, *b2;
        PairedEndHit *hit;
 
-       int cnt = 0;
+       HIT_INT_TYPE cnt = 0;
 
        b = bam_init1();
        b2 = bam_init1();
 
        while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
                cnt += 2;
-               if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
+               if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
 
-               if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue;
+               assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+               assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)));
 
-               //swap if b is mate 2
-               if (b->core.flag & 0x0080) {
-                       assert(b2->core.flag & 0x0040);
-                       bam1_t *tmp = b;
-                       b = b2; b2 = tmp;
-               }
+               //unalignable reads, skip               
+               bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004);
 
-               hit = wrapper.getNextHit();
-               assert(hit != NULL);
+               if (!notgood) {
+                       //swap if b is mate 2
+                       if (b->core.flag & 0x0080) {
+                               assert(b2->core.flag & 0x0040);
+                               bam1_t *tmp = b;
+                               b = b2; b2 = tmp;
+                       }
 
-               assert(b->core.tid + 1 == hit->getSid());
-               assert(b2->core.tid + 1 == hit->getSid());
+                       hit = wrapper.getNextHit();
+                       assert(hit != NULL);
 
-               convert(b, hit->getConPrb());
-               convert(b2, hit->getConPrb());
+                       assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+                       assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
 
-               b->core.mpos = b2->core.pos;
-               b2->core.mpos = b->core.pos;
+                       convert(b, hit->getConPrb());
+                       convert(b2, hit->getConPrb());
 
-               if (b->core.qual > 0) {
-                       samwrite(out, b);
-                       samwrite(out, b2);
+                       // omit "b->core.mpos = b2->core.pos; b2->core.mpos = b->core.pos" for the reason that it is possible that only one mate is alignable 
                }
+
+               samwrite(out, b);
+               samwrite(out, b2);
        }
 
        assert(wrapper.getNextHit() == NULL);
@@ -155,38 +150,11 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
        bam_destroy1(b);
        bam_destroy1(b2);
 
-       if (verbose) { printf("Bam output file is generated!\n"); }
+       if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
 }
 
 void BamWriter::convert(bam1_t *b, double prb) {
-       int sid = b->core.tid + 1;
-       const Transcript& transcript = transcripts.getTranscriptAt(sid);
-
-       int pos = b->core.pos;
-       int readlen = b->core.l_qseq;
-
-       std::vector<uint32_t> data;
-       data.clear();
-
-       int core_pos, core_n_cigar;
-       std::vector<Interval> vec;
-       vec.assign(1, Interval(1, transcript.getLength()));
-       // make an artificial chromosome coordinates for the transcript to get new CIGAR strings
-       tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data);
-       assert(core_pos >= 0);
-
-       int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
-       b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
-       expand_data_size(b);
-       uint8_t* pt = b->data + b->core.l_qname;
-       memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
-       for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
-
-       b->core.pos = core_pos;
-       b->core.n_cigar = core_n_cigar;
        b->core.qual = getMAPQ(prb);
-       b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
-
        float val = (float)prb;
        bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
 }