git.donarmstrong.com Git - samtools.git/blobdiff - bam_maqcns.c
A bug fix, "samtools view" is now working again.
[samtools.git] / bam_maqcns.c
index c8009aa91e1b273cfdfdcba3cd84942e3a9a3b42..f36b0ee2ab443affe0635866a8d593c5cb54fdf7 100644 (file)
@@ -4,6 +4,8 @@
 #include "ksort.h"
 KSORT_INIT_GENERIC(uint32_t)
 
+#define MAX_WINDOW 33
+
 typedef struct __bmc_aux_t {
        int max;
        uint32_t *info;
@@ -12,9 +14,11 @@ typedef struct __bmc_aux_t {
 typedef struct {
        float esum[4], fsum[4];
        uint32_t c[4];
-       uint32_t mapQ_max;
+       uint32_t rms_mapQ;
 } glf_call_aux_t;
 
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
 /*
   P(<b1,b2>) = \theta \sum_{i=1}^{N-1} 1/i
   P(D|<b1,b2>) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2]
@@ -104,6 +108,7 @@ bam_maqcns_t *bam_maqcns_init()
        bm->theta = 0.85;
        bm->n_hap = 2;
        bm->eta = 0.03;
+       bm->cap_mapQ = 60;
        return bm;
 }
 
@@ -125,6 +130,7 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
        int i, j, k, w[8], c, n;
        glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
        float p[16], min_p = 1e30;
+       uint64_t rms;
 
        g->ref_base = ref_base;
        if (_n == 0) return g;
@@ -137,13 +143,14 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
        }
        for (i = n = 0; i < _n; ++i) {
                const bam_pileup1_t *p = pl + i;
-               uint32_t q, x = 0;
+               uint32_t q, x = 0, qq;
                if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
                q = (uint32_t)bam1_qual(p->b)[p->qpos];
                x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
                if (p->b->core.qual < q) q = p->b->core.qual;
                x |= q << 24;
-               q = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+               qq = bam1_seqi(bam1_seq(p->b), p->qpos);
+               q = bam_nt16_nt4_table[qq? qq : ref_base];
                if (!p->is_del && q < 4) x |= 1 << 21 | q << 16;
                bm->aux->info[n++] = x;
        }
@@ -151,9 +158,10 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
        // generate esum and fsum
        b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
        for (k = 0; k != 8; ++k) w[k] = 0;
-       b->mapQ_max = 0;
+       rms = 0;
        for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
                uint32_t info = bm->aux->info[j];
+               int tmp;
                if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
                k = info>>16&7;
                if (info>>24 > 0) {
@@ -162,8 +170,10 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
                        if (w[k] < 0xff) ++w[k];
                        ++b->c[k&3];
                }
-               if (b->mapQ_max < (info&0x7f)) b->mapQ_max = info&0x7f;
+               tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ;
+               rms += tmp * tmp;
        }
+       b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499);
        // rescale ->c[]
        for (j = c = 0; j != 4; ++j) c += b->c[j];
        if (c > 255) {
@@ -203,8 +213,27 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam
                        if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
        }
 
+       { // fix p[k<<2|k]
+               float max1, max2, min1, min2;
+               int max_k, min_k;
+               max_k = min_k = -1;
+               max1 = max2 = -1.0; min1 = min2 = 1e30;
+               for (k = 0; k < 4; ++k) {
+                       if (b->esum[k] > max1) {
+                               max2 = max1; max1 = b->esum[k]; max_k = k;
+                       } else if (b->esum[k] > max2) max2 = b->esum[k];
+               }
+               for (k = 0; k < 4; ++k) {
+                       if (p[k<<2|k] < min1) {
+                               min2 = min1; min1 = p[k<<2|k]; min_k = k;
+                       } else if (p[k<<2|k] < min2) min2 = p[k<<2|k];
+               }
+               if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2))
+                       p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
+       }
+
        // convert necessary information to glf1_t
-       g->ref_base = ref_base; g->max_mapQ = b->mapQ_max;
+       g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
        g->depth = n > 16777215? 16777215 : n;
        for (j = 0; j != 4; ++j)
                for (k = j; k < 4; ++k)
@@ -260,6 +289,9 @@ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm)
 bam_maqindel_opt_t *bam_maqindel_opt_init()
 {
        bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+       mi->q_indel = 40;
+       mi->r_indel = 0.00015;
+       //
        mi->mm_penalty = 3;
        mi->indel_err = 4;
        mi->ambi_thres = 10;
@@ -269,24 +301,28 @@ bam_maqindel_opt_t *bam_maqindel_opt_init()
 void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
 {
        if (mir == 0) return;
-       free(mir->s1); free(mir->s2); free(mir);
+       free(mir->s[0]); free(mir->s[1]); free(mir);
 }
 
 #define MINUS_CONST 0x10000000
 
-bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref)
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                int _n_types, int *_types)
 {
        int i, j, n_types, *types, left, right;
        bam_maqindel_ret_t *ret = 0;
-       for (i = 0; i < n; ++i) {
-               const bam_pileup1_t *p = pl + i;
-               if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+       // if there is no proposed indel, check if there is an indel from the alignment
+       if (_n_types == 0) {
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+               }
+               if (i == n) return 0; // no indel
        }
-       if (i == n) return 0; // no indel
        { // calculate how many types of indels are available (set n_types and types)
                int m;
                uint32_t *aux;
-               aux = (uint32_t*)calloc(n+1, 4);
+               aux = (uint32_t*)calloc(n + _n_types + 1, 4);
                m = 0;
                aux[m++] = MINUS_CONST; // zero indel is always a type
                for (i = 0; i < n; ++i) {
@@ -294,9 +330,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
                                aux[m++] = MINUS_CONST + p->indel;
                }
+               if (_n_types) // then also add this to aux[]
+                       for (i = 0; i < _n_types; ++i)
+                               if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
                ks_introsort(uint32_t, m, aux);
-               n_types = 1;
-               for (i = 1; i < m; ++i)
+               // squeeze out identical types
+               for (i = 1, n_types = 1; i < m; ++i)
                        if (aux[i] != aux[i-1]) ++n_types;
                types = (int*)calloc(n_types, sizeof(int));
                j = 0;
@@ -318,10 +357,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                if (seg.tend > right) right = seg.tend;
                        }
                }
+               if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW;
+               if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW;
        }
        { // the core part
                char *ref2, *inscns = 0;
-               int k, l, *score, max_ins = types[n_types-1];
+               int k, l, *score, *pscore, max_ins = types[n_types-1];
                ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
                if (max_ins > 0) { // get the consensus of inserted sequences
                        int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
@@ -338,7 +379,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                        }
                                }
                        }
-                       // construct the consensus
+                       // construct the consensus of inserted sequence
                        inscns = (char*)calloc(n_types * max_ins, sizeof(char));
                        for (i = 0; i < n_types; ++i) {
                                for (j = 0; j < types[i]; ++j) {
@@ -356,6 +397,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                }
                // calculate score
                score = (int*)calloc(n_types * n, sizeof(int));
+               pscore = (int*)calloc(n_types * n, sizeof(int));
                for (i = 0; i < n_types; ++i) {
                        // write ref2
                        for (k = 0, j = left; j <= pos; ++j)
@@ -370,29 +412,36 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                const bam_pileup1_t *p = pl + j;
                                uint32_t *cigar;
                                bam1_core_t *c = &p->b->core;
-                               int s;
+                               int s, ps;
                                bam_segreg_t seg;
                                if (c->flag&BAM_FUNMAP) continue;
                                cigar = bam1_cigar(p->b);
                                bam_segreg(pos, c, cigar, &seg);
-                               for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
+                               for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
                                        int cq = bam1_seqi(bam1_seq(p->b), l), ct;
-                                       ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug
-                                       if (cq < 15 && ct < 15)
+                                       // in the following line, "<" will happen if reads are too long
+                                       ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15;
+                                       if (cq < 15 && ct < 15) {
                                                s += cq == ct? 1 : -mi->mm_penalty;
+                                               if (cq != ct) ps += bam1_qual(p->b)[l];
+                                       }
                                }
-                               score[i*n + j] = s;
+                               score[i*n + j] = s; pscore[i*n + j] = ps;
                                if (types[i] != 0) { // then try the other way to calculate the score
-                                       for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
+                                       for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
                                                int cq = bam1_seqi(bam1_seq(p->b), l), ct;
-                                               ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15;
-                                               if (cq < 15 && ct < 15)
+                                               ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15;
+                                               if (cq < 15 && ct < 15) {
                                                        s += cq == ct? 1 : -mi->mm_penalty;
+                                                       if (cq != ct) ps += bam1_qual(p->b)[l];
+                                               }
                                        }
                                }
                                if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
-                               if (types[i] != 0) score[i*n+j] -= mi->indel_err;
-                               //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]);
+                               if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
+                               //if (types[i] != 0) score[i*n+j] -= mi->indel_err;
+                               //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j,
+                               //         score[i*n+j], pscore[i*n+j]);
                        }
                }
                { // get final result
@@ -401,7 +450,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        sum = (int*)calloc(n_types, sizeof(int));
                        for (i = 0; i < n_types; ++i)
                                for (j = 0; j < n; ++j)
-                                       sum[i] += score[i*n+j];
+                                       sum[i] += -pscore[i*n+j];
                        max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
                        for (i = 0; i < n_types; ++i) {
                                if (sum[i] > max1) {
@@ -414,37 +463,72 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        // write ret
                        ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
                        ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];
-                       ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1);
-                       ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);
+                       ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       // write indel sequence
                        if (ret->indel1 > 0) {
-                               ret->s1[0] = '+';
+                               ret->s[0][0] = '+';
                                for (k = 0; k < ret->indel1; ++k)
-                                       ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+                                       ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
                        } else if (ret->indel1 < 0) {
-                               ret->s1[0] = '-';
+                               ret->s[0][0] = '-';
                                for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)
-                                       ret->s1[k+1] = ref[pos + k + 1];
-                       } else ret->s1[0] = '*';
+                                       ret->s[0][k+1] = ref[pos + k + 1];
+                       } else ret->s[0][0] = '*';
                        if (ret->indel2 > 0) {
-                               ret->s2[0] = '+';
+                               ret->s[1][0] = '+';
                                for (k = 0; k < ret->indel2; ++k)
-                                       ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+                                       ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
                        } else if (ret->indel2 < 0) {
-                               ret->s2[0] = '-';
+                               ret->s[1][0] = '-';
                                for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
-                                       ret->s2[k+1] = ref[pos + k + 1];
-                       } else ret->s2[0] = '*';
+                                       ret->s[1][k+1] = ref[pos + k + 1];
+                       } else ret->s[1][0] = '*';
+                       // write count
+                       for (i = 0; i < n; ++i) {
+                               const bam_pileup1_t *p = pl + i;
+                               if (p->indel == ret->indel1) ++ret->cnt1;
+                               else if (p->indel == ret->indel2) ++ret->cnt2;
+                               else ++ret->cnt_anti;
+                       }
+                       // write gl[]
+                       ret->gl[0] = ret->gl[1] = 0;
                        for (j = 0; j < n; ++j) {
-                               if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti;
-                               else {
-                                       int diff = score[max1_i*n+j] - score[max2_i*n+j];
-                                       if (diff > mi->ambi_thres) ++ret->cnt1;
-                                       else if (diff < -mi->ambi_thres) ++ret->cnt2;
-                                       else ++ret->cnt_ambi;
+                               int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+                               //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
+                               if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel;
+                               else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
+                       }
+                       // write cnt_ref and cnt_ambi
+                       if (max1_i != 0 && max2_i != 0) {
+                               for (j = 0; j < n; ++j) {
+                                       int diff1 = score[j] - score[max1_i * n + j];
+                                       int diff2 = score[j] - score[max2_i * n + j];
+                                       if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref;
+                                       else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi;
                                }
                        }
                }
-               free(score); free(ref2); free(inscns);
+               free(score); free(pscore); free(ref2); free(inscns);
+       }
+       { // call genotype
+               int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+               int min1, min2, min1_i;
+               q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel;
+               q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel;
+               q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel;
+               min1 = min2 = 0x7fffffff; min1_i = -1;
+               for (i = 0; i < 3; ++i) {
+                       if (q[i] < min1) {
+                               min2 = min1; min1 = q[i]; min1_i = i;
+                       } else if (q[i] < min2) min2 = q[i];
+               }
+               ret->gt = min1_i;
+               ret->q_cns = min2 - min1;
+               // set q_ref
+               if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+               else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+               if (ret->q_ref < 0) ret->q_ref = 0;
        }
        free(types);
        return ret;