return 0;
}
+/**
+ * Cap the base qualities of one mapped read using the probabilistic banded
+ * glocal aligner.
+ *
+ * The read is handed to ka_prob_glocal() together with a window of `ref`
+ * around its current alignment. Afterwards, for every base that lies inside
+ * a CIGAR match (M) run:
+ *   - the quality is set to 0 when the state reported for that base has
+ *     non-zero low two bits, or when its remaining bits name a window
+ *     position different from the one implied by the CIGAR;
+ *   - otherwise the quality becomes min(original quality, q[] value).
+ * Bases in insertions and soft clips are left untouched.
+ *
+ * @param b    alignment record; its quality array is modified in place
+ * @param ref  reference bases, indexed in the same coordinates as b->core.pos
+ * @return 0 when the qualities were processed; -1 when the read was skipped
+ *         (unmapped, CIGAR contains a reference skip, CIGAR has no M
+ *         operation, M runs extend past l_qseq, empty read, or an allocation
+ *         failed). Qualities are not modified when -1 is returned.
+ *
+ * NOTE(review): the length of `ref` is not available here, so the upper end
+ * of the window (xe) cannot be clamped; callers must supply a reference that
+ * covers the read plus the band padding -- TODO confirm at the call sites.
+ */
+int bam_prob_realn(bam1_t *b, const char *ref)
+{
+	int k, i, bw, x, y, yb, ye, xb, xe, ret = -1;
+	uint32_t *cigar = bam1_cigar(b);
+	bam1_core_t *c = &b->core;
+	ka_probpar_t conf = ka_probpar_def;
+	uint8_t *s = NULL, *r = NULL, *q = NULL, *seq, *qual;
+	int *state = NULL;
+
+	if (c->flag & BAM_FUNMAP) return -1;
+	// find the start and end of the alignment:
+	// [xb,xe) on the reference and [yb,ye) on the query span the M runs
+	x = c->pos, y = 0, yb = ye = xb = xe = -1;
+	for (k = 0; k < c->n_cigar; ++k) {
+		int op = cigar[k]&0xf, l = cigar[k]>>4;
+		if (op == BAM_CMATCH) {
+			if (yb < 0) yb = y;
+			if (xb < 0) xb = x;
+			ye = y + l; xe = x + l;
+			x += l; y += l;
+		} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+		else if (op == BAM_CDEL) x += l;
+		else if (op == BAM_CREF_SKIP) return -1; // spliced reads are not handled
+	}
+	// Nothing to do without an M run; also refuse records whose M runs would
+	// index past the per-base arrays (all of them hold l_qseq entries).
+	if (xb < 0 || c->l_qseq <= 0 || ye > c->l_qseq) return -1;
+	// set bandwidth and the start and the end
+	bw = 7;
+	if (abs((xe - xb) - (ye - yb)) > bw)
+		bw = abs((xe - xb) - (ye - yb)) + 3; // widen the band to fit the net indel length
+	conf.bw = bw;
+	// extend the window to cover clipped/inserted ends plus half a band
+	xb -= yb + bw/2; if (xb < 0) xb = 0;
+	xe += c->l_qseq - ye + bw/2;
+	// Trim an over-long window. The comma operator updates xb first, so the
+	// amount removed from xe is computed from the already-updated xb.
+	if (xe - xb - c->l_qseq > bw)
+		xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
+
+	// glocal
+	seq = bam1_seq(b); qual = bam1_qual(b);
+	s = calloc((size_t)c->l_qseq, 1);
+	r = calloc((size_t)(xe - xb), 1);
+	state = calloc((size_t)c->l_qseq, sizeof *state);
+	q = calloc((size_t)c->l_qseq, 1);
+	if (s == NULL || r == NULL || state == NULL || q == NULL) goto done;
+	// query and reference window, both converted through the nt4 table
+	for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+	for (i = xb; i < xe; ++i) // go through unsigned char: plain char may be signed
+		r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(unsigned char)ref[i]]];
+	ka_prob_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
+	// walk the CIGAR again and update the quality of every matched base
+	for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+		int op = cigar[k]&0xf, l = cigar[k]>>4;
+		if (op == BAM_CMATCH) {
+			for (i = y; i < y + l; ++i) {
+				// disagreement between the reported state and the CIGAR: zero the quality
+				if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) qual[i] = 0;
+				else qual[i] = qual[i] < q[i]? qual[i] : q[i];
+			}
+			x += l; y += l;
+		} else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+		else if (op == BAM_CDEL) x += l;
+	}
+	ret = 0;
+done:
+	free(s); free(r); free(q); free(state); // free(NULL) is a no-op
+	return ret;
+}
+
+
int bam_fillmd(int argc, char *argv[])
{
int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0, is_realn, capQ = 0;
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
fp->header->target_name[tid]);
}
- if (is_realn) bam_realn(b, ref);
+// if (is_realn) bam_realn(b, ref);
+ if (is_realn) bam_prob_realn(b, ref);
if (capQ > 10) {
int q = bam_cap_mapQ(b, ref, capQ);
if (b->core.qual > q) b->core.qual = q;
static int mplp_func(void *data, bam1_t *b)
{
extern int bam_realn(bam1_t *b, const char *ref);
+ extern int bam_prob_realn(bam1_t *b, const char *ref);
extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
mplp_aux_t *ma = (mplp_aux_t*)data;
int ret, skip = 0;
ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
if (ret < 0) break;
skip = 0;
- if (has_ref && (ma->flag&MPLP_REALN)) bam_realn(b, ma->ref);
+ if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn(b, ma->ref);
if (has_ref && ma->capQ_thres > 10) {
int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres);
if (q < 0) skip = 1;
return cigar;
}
-/************************
- * Probabilistic glocal *
- ************************/
+/*****************************************
+ * Probabilistic banded glocal alignment *
+ *****************************************/
static float g_qual2prob[256];
// b[l_query-1..1]
for (i = l_query - 1; i >= 1; --i) {
int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1];
+ double *bi = b[i], *bi1 = b[i+1], y = (i > 1);
x = i - bw; beg = beg > x? beg : x;
x = i + bw; end = end < x? end : x;
for (k = end; k >= beg; --k) {
bi[u+0] = e * m[0] * bi1[v11+0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2];
bi[u+1] = e * m[3] * bi1[v11+0] + .25 * m[4] * bi1[v10+1];
// FIXME: I do not know why I need this (i>1) factor, but only with it the result makes sense.
- bi[u+2] = (e * m[6] * bi1[v11+0] + m[8] * bi[v01+2]) * (i > 1);
+ bi[u+2] = (e * m[6] * bi1[v11+0] + m[8] * bi[v01+2]) * y;
// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
}
// rescale
set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg; k <= _end; ++k) bi[k] /= s[i];
+ y = s[i];
+ for (k = _beg; k <= _end; ++k) bi[k] /= y;
}
{ // b[0]
int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
/*** MAP ***/
for (i = 1; i <= l_query; ++i) {
double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 0, end = l_ref, x, max_k = -1;
+ int beg = 1, end = l_ref, x, max_k = -1;
x = i - bw; beg = beg > x? beg : x;
x = i + bw; end = end < x? end : x;
for (k = beg; k <= end; ++k) {
int u;
double z;
set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = k<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = k<<2 | 1; sum += z;
+ z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
+ z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
}
max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
if (state) state[i-1] = max_k;
- if (q) q[i-1] = -4.343 * log(1. - max);
+ if (q) k = -4.343 * log(1. - max), q[i-1] = k > 100? 99 : k;
#ifdef _MAIN
- fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%d)~%lg\n", pb, sum, i, max_k>>2, max_k&3, max); // DEBUG
+ fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%d)~%lg\n", pb, sum, i-1, max_k>>2, max_k&3, max); // DEBUG
#endif
}
/*** free ***/