From 4c8358db36b9d83a4aaa176a8f2c072ef5cc534d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 28 Sep 2010 19:44:16 +0000 Subject: [PATCH] prob_realn() seems working! --- bam_md.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++- bam_plcmd.c | 3 ++- kaln.c | 23 ++++++++++---------- 3 files changed, 73 insertions(+), 13 deletions(-) diff --git a/bam_md.c b/bam_md.c index 746b2c0..7023046 100644 --- a/bam_md.c +++ b/bam_md.c @@ -261,6 +261,63 @@ int bam_realn(bam1_t *b, const char *ref) return 0; } +int bam_prob_realn(bam1_t *b, const char *ref) +{ + int k, i, bw, x, y, yb, ye, xb, xe; + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + ka_probpar_t conf = ka_probpar_def; + // find the start and end of the alignment + if (c->flag & BAM_FUNMAP) return -1; + x = c->pos, y = 0, yb = ye = xb = xe = -1; + for (k = 0; k < c->n_cigar; ++k) { + int op, l; + op = cigar[k]&0xf; l = cigar[k]>>4; + if (op == BAM_CMATCH) { + if (yb < 0) yb = y; + if (xb < 0) xb = x; + ye = y + l; xe = x + l; + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + else if (op == BAM_CREF_SKIP) return -1; + } + // set bandwidth and the start and the end + bw = 7; + if (abs((xe - xb) - (ye - yb)) > bw) + bw = abs((xe - xb) - (ye - yb)) + 3; + conf.bw = bw; + xb -= yb + bw/2; if (xb < 0) xb = 0; + xe += c->l_qseq - ye + bw/2; + if (xe - xb - c->l_qseq > bw) + xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; + { // glocal + uint8_t *s, *r, *q, *seq = bam1_seq(b), *qual = bam1_qual(b); + int *state; + s = calloc(c->l_qseq, 1); + for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; + r = calloc(xe - xb, 1); + for (i = xb; i < xe; ++i) + r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; + state = calloc(c->l_qseq, sizeof(int)); + q = calloc(c->l_qseq, 1); + ka_prob_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH) { + for (i = y; i < y + l; ++i) { + if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) qual[i] = 0; + else qual[i] = qual[i] < q[i]? qual[i] : q[i]; + } + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + free(s); free(r); free(q); free(state); + } + return 0; +} + int bam_fillmd(int argc, char *argv[]) { int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0, is_realn, capQ = 0; @@ -318,7 +375,8 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } - if (is_realn) bam_realn(b, ref); +// if (is_realn) bam_realn(b, ref); + if (is_realn) bam_prob_realn(b, ref); if (capQ > 10) { int q = bam_cap_mapQ(b, ref, capQ); if (b->core.qual > q) b->core.qual = q; diff --git a/bam_plcmd.c b/bam_plcmd.c index 358e9f9..af43918 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -483,6 +483,7 @@ typedef struct { static int mplp_func(void *data, bam1_t *b) { extern int bam_realn(bam1_t *b, const char *ref); + extern int bam_prob_realn(bam1_t *b, const char *ref); extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); mplp_aux_t *ma = (mplp_aux_t*)data; int ret, skip = 0; @@ -491,7 +492,7 @@ static int mplp_func(void *data, bam1_t *b) ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); if (ret < 0) break; skip = 0; - if (has_ref && (ma->flag&MPLP_REALN)) bam_realn(b, ma->ref); + if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn(b, ma->ref); if (has_ref && ma->capQ_thres > 10) { int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres); if (q < 0) skip = 1; diff --git a/kaln.c b/kaln.c index e25ebb5..d8edb91 100644 --- a/kaln.c +++ b/kaln.c @@ -371,9 +371,9 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const return cigar; } -/************************ - * Probabilistic glocal * - ************************/ +/***************************************** + * Probabilistic banded glocal alignment * + *****************************************/ static float g_qual2prob[256]; @@ -495,7 +495,7 @@ int ka_prob_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_ // b[l_query-1..1] for (i = l_query - 1; i >= 1; --i) { int beg = 1, end = l_ref, x, _beg, _end; - double *bi = b[i], *bi1 = b[i+1]; + double *bi = b[i], *bi1 = b[i+1], y = (i > 1); x = i - bw; beg = beg > x? beg : x; x = i + bw; end = end < x? end : x; for (k = end; k >= beg; --k) { @@ -507,12 +507,13 @@ int ka_prob_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_ bi[u+0] = e * m[0] * bi1[v11+0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; bi[u+1] = e * m[3] * bi1[v11+0] + .25 * m[4] * bi1[v10+1]; // FIXME: I do not know why I need this (i>1) factor, but only with it the result makes sense. - bi[u+2] = (e * m[6] * bi1[v11+0] + m[8] * bi[v01+2]) * (i > 1); + bi[u+2] = (e * m[6] * bi1[v11+0] + m[8] * bi[v01+2]) * y; // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG } // rescale set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; - for (k = _beg; k <= _end; ++k) bi[k] /= s[i]; + y = s[i]; + for (k = _beg; k <= _end; ++k) bi[k] /= y; } { // b[0] int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; @@ -531,21 +532,21 @@ int ka_prob_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_ /*** MAP ***/ for (i = 1; i <= l_query; ++i) { double sum = 0., *fi = f[i], *bi = b[i], max = 0.; - int beg = 0, end = l_ref, x, max_k = -1; + int beg = 1, end = l_ref, x, max_k = -1; x = i - bw; beg = beg > x? beg : x; x = i + bw; end = end < x? end : x; for (k = beg; k <= end; ++k) { int u; double z; set_u(u, bw, i, k); - z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = k<<2 | 0; sum += z; - z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = k<<2 | 1; sum += z; + z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; + z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; } max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state) state[i-1] = max_k; - if (q) q[i-1] = -4.343 * log(1. - max); + if (q) k = -4.343 * log(1. - max), q[i-1] = k > 100? 99 : k; #ifdef _MAIN - fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%d)~%lg\n", pb, sum, i, max_k>>2, max_k&3, max); // DEBUG + fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%d)~%lg\n", pb, sum, i-1, max_k>>2, max_k&3, max); // DEBUG #endif } /*** free ***/ -- 2.39.2