X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam_maqcns.c;h=464288ae7da0ac74f1443bbb55c03684927cb391;hb=8cab5e6afd245a6dbbb1580dc37cd3ee36e55078;hp=65ed64c7031b030f84d9ac8ffc317ba8e5d942c1;hpb=d97f59c04fe0e183a66ad5b2f24a39a4b92f7f25;p=samtools.git diff --git a/bam_maqcns.c b/bam_maqcns.c index 65ed64c..464288a 100644 --- a/bam_maqcns.c +++ b/bam_maqcns.c @@ -4,6 +4,8 @@ #include "ksort.h" KSORT_INIT_GENERIC(uint32_t) +#define MAX_WINDOW 33 + typedef struct __bmc_aux_t { int max; uint32_t *info; @@ -12,7 +14,7 @@ typedef struct __bmc_aux_t { typedef struct { float esum[4], fsum[4]; uint32_t c[4]; - uint32_t mapQ_max; + uint32_t rms_mapQ; } glf_call_aux_t; char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; @@ -106,6 +108,7 @@ bam_maqcns_t *bam_maqcns_init() bm->theta = 0.85; bm->n_hap = 2; bm->eta = 0.03; + bm->cap_mapQ = 60; return bm; } @@ -127,6 +130,7 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam int i, j, k, w[8], c, n; glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); float p[16], min_p = 1e30; + uint64_t rms; g->ref_base = ref_base; if (_n == 0) return g; @@ -139,13 +143,14 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam } for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; - uint32_t q, x = 0; + uint32_t q, x = 0, qq; if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; q = (uint32_t)bam1_qual(p->b)[p->qpos]; x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; if (p->b->core.qual < q) q = p->b->core.qual; x |= q << 24; - q = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + qq = bam1_seqi(bam1_seq(p->b), p->qpos); + q = bam_nt16_nt4_table[qq? qq : ref_base]; if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; bm->aux->info[n++] = x; } @@ -153,9 +158,10 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam // generate esum and fsum b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); for (k = 0; k != 8; ++k) w[k] = 0; - b->mapQ_max = 0; + rms = 0; for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint32_t info = bm->aux->info[j]; + int tmp; if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); k = info>>16&7; if (info>>24 > 0) { @@ -164,8 +170,10 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam if (w[k] < 0xff) ++w[k]; ++b->c[k&3]; } - if (b->mapQ_max < (info&0x7f)) b->mapQ_max = info&0x7f; + tmp = (int)(info&0x7f) < bm->cap_mapQ? (int)(info&0x7f) : bm->cap_mapQ; + rms += tmp * tmp; } + b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); // rescale ->c[] for (j = c = 0; j != 4; ++j) c += b->c[j]; if (c > 255) { @@ -205,8 +213,27 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; } + { // fix p[k<<2|k] + float max1, max2, min1, min2; + int max_k, min_k; + max_k = min_k = -1; + max1 = max2 = -1.0; min1 = min2 = 1e30; + for (k = 0; k < 4; ++k) { + if (b->esum[k] > max1) { + max2 = max1; max1 = b->esum[k]; max_k = k; + } else if (b->esum[k] > max2) max2 = b->esum[k]; + } + for (k = 0; k < 4; ++k) { + if (p[k<<2|k] < min1) { + min2 = min1; min1 = p[k<<2|k]; min_k = k; + } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; + } + if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) + p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; + } + // convert necessary information to glf1_t - g->ref_base = ref_base; g->max_mapQ = b->mapQ_max; + g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ; g->depth = n > 16777215? 16777215 : n; for (j = 0; j != 4; ++j) for (k = j; k < 4; ++k) @@ -262,6 +289,9 @@ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) bam_maqindel_opt_t *bam_maqindel_opt_init() { bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); + mi->q_indel = 40; + mi->r_indel = 0.00015; + // mi->mm_penalty = 3; mi->indel_err = 4; mi->ambi_thres = 10; @@ -271,24 +301,28 @@ bam_maqindel_opt_t *bam_maqindel_opt_init() void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) { if (mir == 0) return; - free(mir->s1); free(mir->s2); free(mir); + free(mir->s[0]); free(mir->s[1]); free(mir); } #define MINUS_CONST 0x10000000 -bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref) +bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types) { int i, j, n_types, *types, left, right; bam_maqindel_ret_t *ret = 0; - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + // if there is no proposed indel, check if there is an indel from the alignment + if (_n_types == 0) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + } + if (i == n) return 0; // no indel } - if (i == n) return 0; // no indel { // calculate how many types of indels are available (set n_types and types) int m; uint32_t *aux; - aux = (uint32_t*)calloc(n+1, 4); + aux = (uint32_t*)calloc(n + _n_types + 1, 4); m = 0; aux[m++] = MINUS_CONST; // zero indel is always a type for (i = 0; i < n; ++i) { @@ -296,9 +330,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) aux[m++] = MINUS_CONST + p->indel; } + if (_n_types) // then also add this to aux[] + for (i = 0; i < _n_types; ++i) + if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; ks_introsort(uint32_t, m, aux); - n_types = 1; - for (i = 1; i < m; ++i) + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; types = (int*)calloc(n_types, sizeof(int)); j = 0; @@ -320,10 +357,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c if (seg.tend > right) right = seg.tend; } } + if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW; + if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW; } { // the core part char *ref2, *inscns = 0; - int k, l, *score, max_ins = types[n_types-1]; + int k, l, *score, *pscore, max_ins = types[n_types-1]; ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1); if (max_ins > 0) { // get the consensus of inserted sequences int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); @@ -340,7 +379,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } } } - // construct the consensus + // construct the consensus of inserted sequence inscns = (char*)calloc(n_types * max_ins, sizeof(char)); for (i = 0; i < n_types; ++i) { for (j = 0; j < types[i]; ++j) { @@ -358,6 +397,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } // calculate score score = (int*)calloc(n_types * n, sizeof(int)); + pscore = (int*)calloc(n_types * n, sizeof(int)); for (i = 0; i < n_types; ++i) { // write ref2 for (k = 0, j = left; j <= pos; ++j) @@ -372,29 +412,36 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c const bam_pileup1_t *p = pl + j; uint32_t *cigar; bam1_core_t *c = &p->b->core; - int s; + int s, ps; bam_segreg_t seg; if (c->flag&BAM_FUNMAP) continue; cigar = bam1_cigar(p->b); bam_segreg(pos, c, cigar, &seg); - for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { + for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { int cq = bam1_seqi(bam1_seq(p->b), l), ct; - ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug - if (cq < 15 && ct < 15) + // in the following line, "<" will happen if reads are too long + ct = c->pos + l - seg.qbeg >= left? ref2[c->pos + l - seg.qbeg - left] : 15; + if (cq < 15 && ct < 15) { s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } } - score[i*n + j] = s; + score[i*n + j] = s; pscore[i*n + j] = ps; if (types[i] != 0) { // then try the other way to calculate the score - for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { + for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { int cq = bam1_seqi(bam1_seq(p->b), l), ct; - ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15; - if (cq < 15 && ct < 15) + ct = c->pos + l - seg.qbeg + types[i] >= left? ref2[c->pos + l - seg.qbeg + types[i] - left] : 15; + if (cq < 15 && ct < 15) { s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } } } if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores + if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; if (types[i] != 0) score[i*n+j] -= mi->indel_err; - //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]); + //printf("%d, %d, %d, %d, %d, %d, %d\n", p->b->core.pos + 1, seg.qbeg, i, types[i], j, + // score[i*n+j], pscore[i*n+j]); } } { // get final result @@ -403,7 +450,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c sum = (int*)calloc(n_types, sizeof(int)); for (i = 0; i < n_types; ++i) for (j = 0; j < n; ++j) - sum[i] += score[i*n+j]; + sum[i] += -pscore[i*n+j]; max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; for (i = 0; i < n_types; ++i) { if (sum[i] > max1) { @@ -416,37 +463,63 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c // write ret ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; - ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1); - ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1); + ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); + ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1); + // write indel sequence if (ret->indel1 > 0) { - ret->s1[0] = '+'; + ret->s[0][0] = '+'; for (k = 0; k < ret->indel1; ++k) - ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; + ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; } else if (ret->indel1 < 0) { - ret->s1[0] = '-'; + ret->s[0][0] = '-'; for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) - ret->s1[k+1] = ref[pos + k + 1]; - } else ret->s1[0] = '*'; + ret->s[0][k+1] = ref[pos + k + 1]; + } else ret->s[0][0] = '*'; if (ret->indel2 > 0) { - ret->s2[0] = '+'; + ret->s[1][0] = '+'; for (k = 0; k < ret->indel2; ++k) - ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; + ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; } else if (ret->indel2 < 0) { - ret->s2[0] = '-'; + ret->s[1][0] = '-'; for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) - ret->s2[k+1] = ref[pos + k + 1]; - } else ret->s2[0] = '*'; + ret->s[1][k+1] = ref[pos + k + 1]; + } else ret->s[1][0] = '*'; + // write count + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel == ret->indel1) ++ret->cnt1; + else if (p->indel == ret->indel2) ++ret->cnt2; + else ++ret->cnt_anti; + } + // write gl[] + ret->gl[0] = ret->gl[1] = 0; for (j = 0; j < n; ++j) { - if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti; - else { - int diff = score[max1_i*n+j] - score[max2_i*n+j]; - if (diff > mi->ambi_thres) ++ret->cnt1; - else if (diff < -mi->ambi_thres) ++ret->cnt2; - else ++ret->cnt_ambi; - } + int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; + //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2); + if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; + else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel; } } - free(score); free(ref2); free(inscns); + free(score); free(pscore); free(ref2); free(inscns); + } + { // call genotype + int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); + int min1, min2, min1_i; + q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; + q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; + q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; + min1 = min2 = 0x7fffffff; min1_i = -1; + for (i = 0; i < 3; ++i) { + if (q[i] < min1) { + min2 = min1; min1 = q[i]; min1_i = i; + } else if (q[i] < min2) min2 = q[i]; + } + ret->gt = min1_i; + ret->q_cns = min2 - min1; + // set q_ref + if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; + else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; + if (ret->q_ref < 0) ret->q_ref = 0; } free(types); return ret;