#include "bam.h"
#include "bam_maqcns.h"
#include "ksort.h"
+#include "kaln.h"
KSORT_INIT_GENERIC(uint32_t)
-#define MAX_WINDOW 33
+#define INDEL_WINDOW_SIZE 50
typedef struct __bmc_aux_t {
int max;
bm->theta = 0.85;
bm->n_hap = 2;
bm->eta = 0.03;
+ bm->cap_mapQ = 60;
return bm;
}
rms = 0;
for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
uint32_t info = bm->aux->info[j];
+ int tmp;
if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
k = info>>16&7;
if (info>>24 > 0) {
if (w[k] < 0xff) ++w[k];
++b->c[k&3];
}
- rms += (int)(info&0x7f) * (info&0x7f);
+ tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ;
+ rms += tmp * tmp;
}
b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499);
// rescale ->c[]
if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
}
+ { // fix p[k<<2|k]
+ float max1, max2, min1, min2;
+ int max_k, min_k;
+ max_k = min_k = -1;
+ max1 = max2 = -1.0; min1 = min2 = 1e30;
+ for (k = 0; k < 4; ++k) {
+ if (b->esum[k] > max1) {
+ max2 = max1; max1 = b->esum[k]; max_k = k;
+ } else if (b->esum[k] > max2) max2 = b->esum[k];
+ }
+ for (k = 0; k < 4; ++k) {
+ if (p[k<<2|k] < min1) {
+ min2 = min1; min1 = p[k<<2|k]; min_k = k;
+ } else if (p[k<<2|k] < min2) min2 = p[k<<2|k];
+ }
+ if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2))
+ p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
+ }
+
// convert necessary information to glf1_t
g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
g->depth = n > 16777215? 16777215 : n;
bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
int _n_types, int *_types)
{
- int i, j, n_types, *types, left, right;
+ int i, j, n_types, *types, left, right, max_rd_len = 0;
bam_maqindel_ret_t *ret = 0;
// if there is no proposed indel, check if there is an indel from the alignment
if (_n_types == 0) {
const bam_pileup1_t *p = pl + i;
if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
aux[m++] = MINUS_CONST + p->indel;
+ j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
}
if (_n_types) // then also add this to aux[]
for (i = 0; i < _n_types; ++i)
free(aux);
}
{ // calculate left and right boundary
- bam_segreg_t seg;
- left = 0x7fffffff; right = 0;
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (!(p->b->core.flag&BAM_FUNMAP)) {
- bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg);
- if (seg.tbeg < left) left = seg.tbeg;
- if (seg.tend > right) right = seg.tend;
- }
- }
- if (pos - left > MAX_WINDOW) left = pos - MAX_WINDOW;
- if (right - pos> MAX_WINDOW) right = pos + MAX_WINDOW;
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
}
{ // the core part
- char *ref2, *inscns = 0;
+ char *ref2, *rs, *inscns = 0;
int k, l, *score, *pscore, max_ins = types[n_types-1];
ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+ rs = (char*)calloc(right - left + max_rd_len + types[n_types-1] + 2, 1);
if (max_ins > 0) { // get the consensus of inserted sequences
int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
// count occurrences
score = (int*)calloc(n_types * n, sizeof(int));
pscore = (int*)calloc(n_types * n, sizeof(int));
for (i = 0; i < n_types; ++i) {
+ ka_param_t ap = ka_param_blast;
+ ap.band_width = 2 * types[n_types - 1] + 2;
// write ref2
for (k = 0, j = left; j <= pos; ++j)
- ref2[k++] = bam_nt16_table[(int)ref[j]];
+ ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
if (types[i] <= 0) j += -types[i];
else for (l = 0; l < types[i]; ++l)
ref2[k++] = inscns[i*max_ins + l];
for (; j < right && ref[j]; ++j)
- ref2[k++] = bam_nt16_table[(int)ref[j]];
+ ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+ if (j < right) right = j;
// calculate score for each read
for (j = 0; j < n; ++j) {
const bam_pileup1_t *p = pl + j;
- uint32_t *cigar;
- bam1_core_t *c = &p->b->core;
- int s, ps;
- bam_segreg_t seg;
- if (c->flag&BAM_FUNMAP) continue;
- cigar = bam1_cigar(p->b);
- bam_segreg(pos, c, cigar, &seg);
- for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
- int cq = bam1_seqi(bam1_seq(p->b), l), ct;
- ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" will happen if reads are too long
- if (cq < 15 && ct < 15) {
- s += cq == ct? 1 : -mi->mm_penalty;
- if (cq != ct) ps += bam1_qual(p->b)[l];
- }
- }
- score[i*n + j] = s; pscore[i*n + j] = ps;
- if (types[i] != 0) { // then try the other way to calculate the score
- for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
- int cq = bam1_seqi(bam1_seq(p->b), l), ct;
- ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15;
- if (cq < 15 && ct < 15) {
- s += cq == ct? 1 : -mi->mm_penalty;
- if (cq != ct) ps += bam1_qual(p->b)[l];
+ int qbeg, qend, tbeg, tend;
+ if (p->b->core.flag & BAM_FUNMAP) continue;
+ qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left, &tbeg);
+ qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, &tend);
+ for (l = qbeg; l < qend; ++l)
+ rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)];
+ {
+ int x, y, n_acigar, ps;
+ uint32_t *acigar;
+ ps = 0;
+ acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar);
+ x = tbeg - left; y = 0;
+ for (l = 0; l < n_acigar; ++l) {
+ int op = acigar[l]&0xf;
+ int len = acigar[l]>>4;
+ if (op == BAM_CMATCH) {
+ int k;
+ for (k = 0; k < len; ++k)
+ if (ref2[x+k] != rs[y+k]) ps += bam1_qual(p->b)[y+k];
+ x += len; y += len;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+ if (op == BAM_CINS) ps += mi->q_indel * len;
+ y += len;
+ } else if (op == BAM_CDEL) {
+ ps += mi->q_indel * len;
+ x += len;
}
}
+ pscore[i*n+j] = ps;
+ //printf("pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, ", pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend);
+ //for (l = 0; l < n_acigar; ++l) printf("%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); printf("\n");
+ free(acigar);
}
- if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
- if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
- if (types[i] != 0) score[i*n+j] -= mi->indel_err;
- //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]);
}
}
{ // get final result
ret->gl[0] = ret->gl[1] = 0;
for (j = 0; j < n; ++j) {
int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
- //printf("%d, %d\n", s1, s2);
+ //printf("%d, %d, %d, %d, %d\n", pl[j].b->core.pos+1, max1_i, max2_i, s1, s2);
if (s1 > s2) ret->gl[0] += s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel;
else ret->gl[1] += s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
}
+ // write cnt_ref and cnt_ambi
+ if (max1_i != 0 && max2_i != 0) {
+ for (j = 0; j < n; ++j) {
+ int diff1 = score[j] - score[max1_i * n + j];
+ int diff2 = score[j] - score[max2_i * n + j];
+ if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref;
+ else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi;
+ }
+ }
}
- free(score); free(pscore); free(ref2); free(inscns);
+ free(score); free(pscore); free(ref2); free(rs); free(inscns);
}
{ // call genotype
int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);