]> git.donarmstrong.com Git - samtools.git/blobdiff - bam_pileup.c
* aggressive gapped aligner is implemented in calmd.
[samtools.git] / bam_pileup.c
index 5a87746f4d5418dfd663e260d531ffacf28eb695..db170e8f15114e21c63cd11849b2d78a05996ea6 100644 (file)
@@ -59,11 +59,13 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
        bam1_t *b = p->b;
        bam1_core_t *c = &b->core;
        uint32_t x = c->pos, y = 0;
-       int ret = 1, is_restart = 1;
+       int ret = 1, is_restart = 1, first_op, last_op;
 
        if (c->flag&BAM_FUNMAP) return 0; // unmapped read
        assert(x <= pos); // otherwise a bug
        p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+       first_op = bam1_cigar(b)[0] & BAM_CIGAR_MASK;
+       last_op = bam1_cigar(b)[c->n_cigar-1] & BAM_CIGAR_MASK;
        for (k = 0; k < c->n_cigar; ++k) {
                int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
                int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
@@ -71,32 +73,45 @@ static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
                        if (x + l > pos) { // overlap with pos
                                p->indel = p->is_del = 0;
                                p->qpos = y + (pos - x);
-                               if (x == pos && is_restart) p->is_head = 1;
+                               if (x == pos && is_restart && first_op != BAM_CDEL) p->is_head = 1;
                                if (x + l - 1 == pos) { // come to the end of a match
-                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                       int has_next_match = 0;
+                                       unsigned i;
+                                       for (i = k + 1; i < c->n_cigar; ++i) {
+                                               uint32_t cigar = bam1_cigar(b)[i];
+                                               int opi = cigar&BAM_CIGAR_MASK;
+                                               if (opi == BAM_CMATCH) {
+                                                       has_next_match = 1;
+                                                       break;
+                                               } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break;
+                                       }
+                                       if (!has_next_match && last_op != BAM_CDEL) p->is_tail = 1;
+                                       if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s)
                                                uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
                                                int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
                                                if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
                                                else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
-                                               if (op_next == BAM_CDEL || op_next == BAM_CINS) {
-                                                       if (k + 2 < c->n_cigar) op_next = bam1_cigar(b)[k+2]&BAM_CIGAR_MASK;
-                                                       else p->is_tail = 1;
+                                               else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding
+                                                       cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK;
+                                                       if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                                       else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
                                                }
-                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
-                                                       p->is_tail = 1; // tail
-                                       } else p->is_tail = 1; // this is the last operation; set tail
+                                       }
                                }
                        }
                        x += l; y += l;
                } else if (op == BAM_CDEL) { // then set ->is_del
+                       if (k == 0 && x == pos) p->is_head = 1;
+                       else if (x + l - 1 == pos && k == c->n_cigar - 1) p->is_tail = 1;
                        if (x + l > pos) {
                                p->indel = 0; p->is_del = 1;
-                               p->qpos = y + (pos - x);
+                               p->qpos = y;
                        }
                        x += l;
                } else if (op == BAM_CREF_SKIP) x += l;
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
-               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (is_restart) is_restart ^= (op == BAM_CMATCH);
+               else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
                if (x > pos) {
                        if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
                        break;
@@ -116,11 +131,15 @@ struct __bam_plp_t {
        mempool_t *mp;
        lbnode_t *head, *tail, *dummy;
        int32_t tid, pos, max_tid, max_pos;
-       int is_eof, flag_mask, max_plp;
+       int is_eof, flag_mask, max_plp, error;
        bam_pileup1_t *plp;
+       // for the "auto" interface only
+       bam1_t *b;
+       bam_plp_auto_f func;
+       void *data;
 };
 
-bam_plp_t bam_plp_init(void)
+bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
 {
        bam_plp_t iter;
        iter = calloc(1, sizeof(struct __bam_plp_t));
@@ -129,11 +148,29 @@ bam_plp_t bam_plp_init(void)
        iter->dummy = mp_alloc(iter->mp);
        iter->max_tid = iter->max_pos = -1;
        iter->flag_mask = BAM_DEF_MASK;
+       if (func) {
+               iter->func = func;
+               iter->data = data;
+               iter->b = bam_init1();
+       }
        return iter;
 }
 
-const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_n_plp, int *_tid, int *_pos)
+void bam_plp_destroy(bam_plp_t iter)
+{
+       mp_free(iter->mp, iter->dummy);
+       mp_free(iter->mp, iter->head);
+       if (iter->mp->cnt != 0)
+               fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
+       mp_destroy(iter->mp);
+       if (iter->b) bam_destroy1(iter->b);
+       free(iter->plp);
+       free(iter);
+}
+
+const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
 {
+       if (iter->error) { *_n_plp = -1; return 0; }
        *_n_plp = 0;
        if (iter->is_eof && iter->head->next == 0) return 0;
        while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
@@ -159,6 +196,7 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_n_plp, int *_tid, int *_
                if (iter->head->next) {
                        if (iter->tid > iter->head->b.core.tid) {
                                fprintf(stderr, "[%s] unsorted input. Pileup aborts.\n", __func__);
+                               iter->error = 1;
                                *_n_plp = -1;
                                return 0;
                        }
@@ -177,6 +215,7 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_n_plp, int *_tid, int *_
 
 int bam_plp_push(bam_plp_t iter, const bam1_t *b)
 {
+       if (iter->error) return -1;
        if (b) {
                if (b->core.tid < 0) return 0;
                if (b->core.flag & iter->flag_mask) return 0;
@@ -184,10 +223,12 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b)
                iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
                if (b->core.tid < iter->max_tid) {
                        fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
+                       iter->error = 1;
                        return -1;
                }
                if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
                        fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n");
+                       iter->error = 1;
                        return -1;
                }
                iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
@@ -199,6 +240,27 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b)
        return 0;
 }
 
+const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
+{
+       const bam_pileup1_t *plp;
+       if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
+       if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+       else {
+               *_n_plp = 0;
+               if (iter->is_eof) return 0;
+               while (iter->func(iter->data, iter->b) >= 0) {
+                       if (bam_plp_push(iter, iter->b) < 0) {
+                               *_n_plp = -1;
+                               return 0;
+                       }
+                       if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+               }
+               bam_plp_push(iter, 0);
+               if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+               return 0;
+       }
+}
+
 void bam_plp_reset(bam_plp_t iter)
 {
        lbnode_t *p, *q;
@@ -218,17 +280,6 @@ void bam_plp_set_mask(bam_plp_t iter, int mask)
        iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);
 }
 
-void bam_plp_destroy(bam_plp_t iter)
-{
-       mp_free(iter->mp, iter->dummy);
-       mp_free(iter->mp, iter->head);
-       if (iter->mp->cnt != 0)
-               fprintf(stderr, "[bam_plp_destroy] memory leak: %d. Continue anyway.\n", iter->mp->cnt);
-       mp_destroy(iter->mp);
-       free(iter->plp);
-       free(iter);
-}
-
 /*****************
  * callback APIs *
  *****************/
@@ -263,7 +314,7 @@ bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
 {
        bam_plbuf_t *buf;
        buf = calloc(1, sizeof(bam_plbuf_t));
-       buf->iter = bam_plp_init();
+       buf->iter = bam_plp_init(0, 0);
        buf->func = func;
        buf->data = data;
        return buf;
@@ -281,7 +332,69 @@ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
        const bam_pileup1_t *plp;
        ret = bam_plp_push(buf->iter, b);
        if (ret < 0) return ret;
-       while ((plp = bam_plp_next(buf->iter, &n_plp, &tid, &pos)) != 0)
+       while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0)
                buf->func(tid, pos, n_plp, plp, buf->data);
        return 0;
 }
+
+/***********
+ * mpileup *
+ ***********/
+
+struct __bam_mplp_t {
+       int n;
+       uint64_t min, *pos;
+       bam_plp_t *iter;
+       int *n_plp;
+       const bam_pileup1_t **plp;
+};
+
+bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
+{
+       int i;
+       bam_mplp_t iter;
+       iter = calloc(1, sizeof(struct __bam_mplp_t));
+       iter->pos = calloc(n, 8);
+       iter->n_plp = calloc(n, sizeof(int));
+       iter->plp = calloc(n, sizeof(void*));
+       iter->iter = calloc(n, sizeof(void*));
+       iter->n = n;
+       iter->min = (uint64_t)-1;
+       for (i = 0; i < n; ++i) {
+               iter->iter[i] = bam_plp_init(func, data[i]);
+               iter->pos[i] = iter->min;
+       }
+       return iter;
+}
+
+void bam_mplp_destroy(bam_mplp_t iter)
+{
+       int i;
+       for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
+       free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp);
+       free(iter);
+}
+
+int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
+{
+       int i, ret = 0;
+       uint64_t new_min = (uint64_t)-1;
+       for (i = 0; i < iter->n; ++i) {
+               if (iter->pos[i] == iter->min) {
+                       int tid, pos;
+                       iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
+                       iter->pos[i] = (uint64_t)tid<<32 | pos;
+               }
+               if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i];
+       }
+       iter->min = new_min;
+       if (new_min == (uint64_t)-1) return 0;
+       *_tid = new_min>>32; *_pos = (uint32_t)new_min;
+       for (i = 0; i < iter->n; ++i) {
+               if (iter->pos[i] == iter->min) {
+                       n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
+                       ++ret;
+               } else n_plp[i] = 0, plp[i] = 0;
+       }
+       return ret;
+}