]> git.donarmstrong.com Git - samtools.git/commitdiff
* this revision is UNSTABLE
authorHeng Li <lh3@live.co.uk>
Fri, 5 Nov 2010 04:19:14 +0000 (04:19 +0000)
committerHeng Li <lh3@live.co.uk>
Fri, 5 Nov 2010 04:19:14 +0000 (04:19 +0000)
 * indel caller seems working, but it is very insensitive and has
   several things I do not quite understand.

bam2bcf.c
bam2bcf.h
bam_plcmd.c

index d5a8dd440286cc2b6d0b119df6e6940d8ea05bbf..ab5bfcd20d4a6233109821846842d9996aab93a9 100644 (file)
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -98,6 +98,7 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
        // fill the bases array
        memset(r, 0, sizeof(bcf_callret1_t));
        r->indelreg = 10000;
+       n_ins = n_del = 0;
        for (i = n = 0; i < _n; ++i) {
                const bam_pileup1_t *p = pl + i;
                int q, b, mapQ, indelQ, is_diff, min_dist;
@@ -132,6 +133,7 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
                                        if (qual[j] > BAM2BCF_INDELREG_THRES && indelreg == 0)
                                                indelreg = j - y + 1;
                                }
+                               if (r->indelreg > indelreg) r->indelreg = indelreg;
                        } else {
                                for (j = y; j <= p->qpos; ++j)
                                        if (max_left < qual[j]) max_left = qual[j];
@@ -143,6 +145,11 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
                        // estimate the sequencing error rate
                        seqQ = bca->openQ;
                        if (p->indel != 0) seqQ += bca->extQ * (abs(p->indel) - 1); // FIXME: better to model homopolymer
+                       if (p->indel > 0) {
+                               ++n_ins; r->ins_len += p->indel;
+                       } else if (p->indel < 0) {
+                               ++n_del; r->del_len += -p->indel;
+                       }
                        if (p->indel != 0) { // a different model for tandem repeats
                                uint8_t *seq = bam1_seq(p->b);
                                int tandemQ, qb = bam1_seqi(seq, p->qpos), l;
@@ -181,6 +188,9 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
                r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
        }
        r->depth = n;
+       r->ins_len = n_ins? r->ins_len / n_ins : 0;
+       r->del_len = n_del? r->del_len / n_del : 0;
+       if (r->indelreg >= 10000) r->indelreg = 0;
        // glfgen
        errmod_cal(bca->e, n, 2, bca->bases, r->p);
        return r->depth;        
@@ -191,6 +201,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
        int ref4, i, j, qsum[4];
        int64_t tmp;
        call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+       call->ins_len = call->del_len = 0; call->indelreg = 0;
        if (ref4 > 4) ref4 = 4;
        // calculate qsum
        memset(qsum, 0, 4 * sizeof(int));
@@ -253,6 +264,66 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
        return 0;
 }
 
+int bcf_call_combine_gap(int n, const bcf_callret1_t *calls, bcf_call_t *call)
+{
+       int i, j, n_ins, n_del;
+       // combine annotations
+       call->ori_ref = 4;
+       memset(call->anno, 0, 16 * sizeof(int));
+       call->ins_len = call->del_len = 0; call->indelreg = 10000;
+       for (i = call->depth = 0, n_ins = n_del = 0; i < n; ++i) {
+               const bcf_callret1_t *r = calls + i;
+               if (r->depth > 0) {
+                       call->depth += r->depth;
+                       if (r->ins_len > 0) {
+                               call->ins_len += r->ins_len;
+                               ++n_ins;
+                       }
+                       if (r->del_len > 0) {
+                               call->del_len += r->del_len;
+                               ++n_del;
+                       }
+                       if (r->indelreg > 0 && call->indelreg > r->indelreg)
+                               call->indelreg = r->indelreg;
+                       for (j = 0; j < 16; ++j) call->anno[j] += r->anno[j];
+               }
+       }
+       if (call->depth == 0) return 0; // no indels
+       call->ins_len = n_ins? call->ins_len / n_ins : 0;
+       call->del_len = n_del? call->del_len / n_del : 0;
+       //
+       for (i = 0; i < 5; ++i) call->a[i] = -1;
+       call->a[0] = 0; call->a[1] = 1;
+       call->unseen = -1;
+       call->n_alleles = 2;
+       // set the PL array
+       if (call->n < n) {
+               call->n = n;
+               call->PL = realloc(call->PL, 15 * n);
+       }
+       {
+               int g[3];
+               double sum_min = 0.;
+               g[0] = 0; g[1] = 1; g[2] = 3;
+               for (i = 0; i < n; ++i) {
+                       uint8_t *PL = call->PL + 3 * i;
+                       const bcf_callret1_t *r = calls + i;
+                       float min = 1e37;
+                       for (j = 0; j < 3; ++j)
+                               if (min > r->p[g[j]]) min = r->p[g[j]];
+                       sum_min += min;
+                       for (j = 0; j < 3; ++j) {
+                               int y;
+                               y = (int)(r->p[g[j]] - min + .499);
+                               if (y > 255) y = 255;
+                               PL[j] = y;
+                       }
+               }
+               call->shift = (int)(sum_min + .499);
+       }
+       return 0;
+}
+
 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP)
 {
        extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
@@ -262,11 +333,19 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
        b->tid = tid; b->pos = pos; b->qual = 0;
        s.s = b->str; s.m = b->m_str; s.l = 0;
        kputc('\0', &s);
-       kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
-       for (i = 1; i < 5; ++i) {
-               if (bc->a[i] < 0) break;
-               if (i > 1) kputc(',', &s);
-               kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+       if (bc->ins_len > 0 || bc->del_len > 0) { // an indel
+               for (i = 0; i < bc->indelreg; ++i) kputc('N', &s);
+               kputc('\0', &s);
+               if (bc->ins_len > 0 && bc->del_len > 0) kputs("<INDEL>", &s);
+               else if (bc->ins_len > 0) kputs("<INS>", &s);
+               else if (bc->del_len > 0) kputs("<DEL>", &s);
+       } else { // SNP
+               kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
+               for (i = 1; i < 5; ++i) {
+                       if (bc->a[i] < 0) break;
+                       if (i > 1) kputc(',', &s);
+                       kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+               }
        }
        kputc('\0', &s);
        kputc('\0', &s);
index 92f104507952d5197a6bde2bd326ff16595cf558..b6514c05c2b5633a93104524bcd4fa8533c3ba80 100644 (file)
--- a/bam2bcf.h
+++ b/bam2bcf.h
@@ -16,8 +16,8 @@ typedef struct __bcf_callaux_t {
 } bcf_callaux_t;
 
 typedef struct {
-       int depth;
-       float indelreg, ins_len, del_len;
+       int depth, indelreg;
+       float ins_len, del_len;
        int qsum[4];
        int anno[16];
        float p[25];
@@ -26,7 +26,9 @@ typedef struct {
 typedef struct {
        int a[5]; // alleles: ref, alt, alt2, alt3
        int n, n_alleles, shift, ori_ref, unseen;
-       int anno[16], depth;
+       int indelreg;
+       float ins_len, del_len;
+       int depth, anno[16];
        uint8_t *PL;
 } bcf_call_t;
 
@@ -38,6 +40,7 @@ extern "C" {
        void bcf_call_destroy(bcf_callaux_t *bca);
        int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r);
        int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call);
+       int bcf_call_combine_gap(int n, const bcf_callret1_t *calls, bcf_call_t *call);
        int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP);
        int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t *bca, bcf_callret1_t *r);
 
index ed230e8fb8c763388f57a272acee2ca5f96e7503..32d050d68413aa18e6ba4ea688615579a539bf2b 100644 (file)
@@ -742,6 +742,14 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
                                if (i < gplp.n) { // at least one of the read contains a gap
                                        for (i = 0; i < gplp.n; ++i)
                                                bcf_call_glfgen_gap(pos, gplp.n_plp[i], gplp.plp[i], bca, bcr + i);
+                                       bcf_call_combine_gap(gplp.n, bcr, &bc);
+                                       if (bc.depth > 0) {
+                                               b = calloc(1, sizeof(bcf1_t));
+                                               bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+                                                                        (conf->flag&MPLP_FMT_SP));
+                                               bcf_write(bp, bh, b);
+                                               bcf_destroy(b);
+                                       }
                                }
                        }
                } else {