6 #include "bcftools/bcf.h"
8 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
10 #define CALL_ETA 0.03f
12 #define CALL_DEFTHETA 0.83f
16 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
19 if (theta <= 0.) theta = CALL_DEFTHETA;
20 bca = calloc(1, sizeof(bcf_callaux_t));
25 bca->min_baseQ = min_baseQ;
26 bca->e = errmod_init(1. - theta);
30 void bcf_call_destroy(bcf_callaux_t *bca)
33 errmod_destroy(bca->e);
34 free(bca->bases); free(bca);
37 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
40 memset(r, 0, sizeof(bcf_callret1_t));
41 ref4 = bam_nt16_nt4_table[ref_base];
42 if (_n == 0) return -1;
44 // enlarge the bases array if necessary
45 if (bca->max_bases < _n) {
47 kroundup32(bca->max_bases);
48 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
50 // fill the bases array
51 memset(r, 0, sizeof(bcf_callret1_t));
52 for (i = n = 0; i < _n; ++i) {
53 const bam_pileup1_t *p = pl + i;
54 int q, b, mapQ, baseQ, is_diff, min_dist;
56 if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
57 baseQ = q = (int)bam1_qual(p->b)[p->qpos]; // base quality
58 if (q < bca->min_baseQ) continue;
59 mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ;
60 if (q > mapQ) q = mapQ;
63 b = bam1_seqi(bam1_seq(p->b), p->qpos); // base
64 b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
65 bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
66 // collect annotations
68 is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
69 ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)];
70 min_dist = p->b->core.l_qseq - 1 - p->qpos;
71 if (min_dist > p->qpos) min_dist = p->qpos;
72 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
73 r->anno[1<<2|is_diff<<1|0] += baseQ;
74 r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
75 r->anno[2<<2|is_diff<<1|0] += mapQ;
76 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
77 r->anno[3<<2|is_diff<<1|0] += min_dist;
78 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
82 errmod_cal(bca->e, n, 5, bca->bases, r->p);
86 int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t *bca, bcf_callret1_t *r)
88 int i, n, n_ins, n_del;
89 memset(r, 0, sizeof(bcf_callret1_t));
90 if (_n == 0) return -1;
92 // enlarge the bases array if necessary
93 if (bca->max_bases < _n) {
95 kroundup32(bca->max_bases);
96 bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
98 // fill the bases array
99 memset(r, 0, sizeof(bcf_callret1_t));
102 for (i = n = 0; i < _n; ++i) {
103 const bam_pileup1_t *p = pl + i;
104 int q, b, mapQ, indelQ, is_diff, min_dist;
105 if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
106 { // compute indel (base) quality
107 // this can be made more efficient, but realignment is the bottleneck anyway
108 int j, k, x, y, op, len = 0, max_left, max_rght, seqQ, indelreg;
109 bam1_core_t *c = &p->b->core;
110 uint32_t *cigar = bam1_cigar(p->b);
111 uint8_t *qual = bam1_qual(p->b);
112 for (k = y = 0, x = c->pos; k < c->n_cigar && y <= p->qpos; ++k) {
115 if (op == BAM_CMATCH) {
116 if (pos > x && pos < x + len) break;
118 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += len;
119 else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += len;
121 if (k == c->n_cigar) continue; // this actually should not happen
122 max_left = max_rght = 0; indelreg = 0;
123 if (pos == x + len - 1 && k+2 < c->n_cigar && ((cigar[k+1]&0xf) == BAM_CINS || (cigar[k+1]&0xf) == BAM_CDEL)
124 && (cigar[k+2]&0xf) == BAM_CMATCH)
126 for (j = y; j < y + len; ++j)
127 if (max_left < qual[j]) max_left = qual[j];
128 if ((cigar[k+1]&0xf) == BAM_CINS) y += cigar[k+1]>>4;
129 else x += cigar[k+1]>>4;
130 op = cigar[k+2]&0xf; len = cigar[k+2]>>4;
131 for (j = y; j < y + len; ++j) {
132 if (max_rght < qual[j]) max_rght = qual[j];
133 if (qual[j] > BAM2BCF_INDELREG_THRES && indelreg == 0)
134 indelreg = j - y + 1;
136 if (r->indelreg > indelreg) r->indelreg = indelreg;
138 for (j = y; j <= p->qpos; ++j)
139 if (max_left < qual[j]) max_left = qual[j];
140 for (j = p->qpos + 1; j < y + len; ++j)
141 if (max_rght < qual[j]) max_rght = qual[j];
144 indelQ = max_left < max_rght? max_left : max_rght;
145 // estimate the sequencing error rate
147 if (p->indel != 0) seqQ += bca->extQ * (abs(p->indel) - 1); // FIXME: better to model homopolymer
149 ++n_ins; r->ins_len += p->indel;
150 } else if (p->indel < 0) {
151 ++n_del; r->del_len += -p->indel;
153 if (p->indel != 0) { // a different model for tandem repeats
154 uint8_t *seq = bam1_seq(p->b);
155 int tandemQ, qb = bam1_seqi(seq, p->qpos), l;
156 for (j = p->qpos + 1; j < c->l_qseq; ++j)
157 if (qb != bam1_seqi(seq, j)) break;
159 for (j = (int)p->qpos - 1; j >= 0; --j)
160 if (qb != bam1_seqi(seq, j)) break;
162 tandemQ = (int)((double)(abs(p->indel)) / l * bca->tandemQ + .499);
163 if (seqQ > tandemQ) seqQ = tandemQ;
165 // fprintf(stderr, "%s\t%d\t%d\t%d\t%d\t%d\t%d\n", bam1_qname(p->b), pos+1, p->indel, indelQ, seqQ, max_left, max_rght);
166 if (indelQ > seqQ) indelQ = seqQ;
169 if (q < bca->min_baseQ) continue;
170 mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ;
171 if (q > mapQ) q = mapQ;
175 bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
176 // collect annotations
179 ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)];
180 min_dist = p->b->core.l_qseq - 1 - p->qpos;
181 if (min_dist > p->qpos) min_dist = p->qpos;
182 if (min_dist > CAP_DIST) min_dist = CAP_DIST;
183 r->anno[1<<2|is_diff<<1|0] += indelQ;
184 r->anno[1<<2|is_diff<<1|1] += indelQ * indelQ;
185 r->anno[2<<2|is_diff<<1|0] += mapQ;
186 r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
187 r->anno[3<<2|is_diff<<1|0] += min_dist;
188 r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
191 r->ins_len = n_ins? r->ins_len / n_ins : 0;
192 r->del_len = n_del? r->del_len / n_del : 0;
193 if (r->indelreg >= 10000) r->indelreg = 0;
195 errmod_cal(bca->e, n, 2, bca->bases, r->p);
199 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
201 int ref4, i, j, qsum[4];
203 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
204 call->ins_len = call->del_len = 0; call->indelreg = 0;
205 if (ref4 > 4) ref4 = 4;
207 memset(qsum, 0, 4 * sizeof(int));
208 for (i = 0; i < n; ++i)
209 for (j = 0; j < 4; ++j)
210 qsum[j] += calls[i].qsum[j];
211 for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
212 // find the top 2 alleles
213 for (i = 1; i < 4; ++i) // insertion sort
214 for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j)
215 tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
216 // set the reference allele and alternative allele(s)
217 for (i = 0; i < 5; ++i) call->a[i] = -1;
220 for (i = 3, j = 1; i >= 0; --i) {
221 if ((qsum[i]&3) != ref4) {
222 if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3;
226 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
227 call->unseen = j, call->a[j++] = qsum[i]&3;
232 call->PL = realloc(call->PL, 15 * n);
237 x = call->n_alleles * (call->n_alleles + 1) / 2;
238 // get the possible genotypes
239 for (i = z = 0; i < call->n_alleles; ++i)
240 for (j = i; j < call->n_alleles; ++j)
241 g[z++] = call->a[i] * 5 + call->a[j];
242 for (i = 0; i < n; ++i) {
243 uint8_t *PL = call->PL + x * i;
244 const bcf_callret1_t *r = calls + i;
246 for (j = 0; j < x; ++j)
247 if (min > r->p[g[j]]) min = r->p[g[j]];
249 for (j = 0; j < x; ++j) {
251 y = (int)(r->p[g[j]] - min + .499);
252 if (y > 255) y = 255;
256 call->shift = (int)(sum_min + .499);
258 // combine annotations
259 memset(call->anno, 0, 16 * sizeof(int));
260 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
261 call->depth += calls[i].depth;
262 for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
267 int bcf_call_combine_gap(int n, const bcf_callret1_t *calls, bcf_call_t *call)
269 int i, j, n_ins, n_del;
270 // combine annotations
272 memset(call->anno, 0, 16 * sizeof(int));
273 call->ins_len = call->del_len = 0; call->indelreg = 10000;
274 for (i = call->depth = 0, n_ins = n_del = 0; i < n; ++i) {
275 const bcf_callret1_t *r = calls + i;
277 call->depth += r->depth;
278 if (r->ins_len > 0) {
279 call->ins_len += r->ins_len;
282 if (r->del_len > 0) {
283 call->del_len += r->del_len;
286 if (r->indelreg > 0 && call->indelreg > r->indelreg)
287 call->indelreg = r->indelreg;
288 for (j = 0; j < 16; ++j) call->anno[j] += r->anno[j];
291 if (call->depth == 0) return 0; // no indels
292 call->ins_len = n_ins? call->ins_len / n_ins : 0;
293 call->del_len = n_del? call->del_len / n_del : 0;
295 for (i = 0; i < 5; ++i) call->a[i] = -1;
296 call->a[0] = 0; call->a[1] = 1;
302 call->PL = realloc(call->PL, 15 * n);
307 g[0] = 0; g[1] = 1; g[2] = 3;
308 for (i = 0; i < n; ++i) {
309 uint8_t *PL = call->PL + 3 * i;
310 const bcf_callret1_t *r = calls + i;
312 for (j = 0; j < 3; ++j)
313 if (min > r->p[g[j]]) min = r->p[g[j]];
315 for (j = 0; j < 3; ++j) {
317 y = (int)(r->p[g[j]] - min + .499);
318 if (y > 255) y = 255;
322 call->shift = (int)(sum_min + .499);
327 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP)
329 extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
333 b->tid = tid; b->pos = pos; b->qual = 0;
334 s.s = b->str; s.m = b->m_str; s.l = 0;
336 if (bc->ins_len > 0 || bc->del_len > 0) { // an indel
337 for (i = 0; i < bc->indelreg; ++i) kputc('N', &s);
339 if (bc->ins_len > 0 && bc->del_len > 0) kputs("<INDEL>", &s);
340 else if (bc->ins_len > 0) kputs("<INS>", &s);
341 else if (bc->del_len > 0) kputs("<DEL>", &s);
343 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
344 for (i = 1; i < 5; ++i) {
345 if (bc->a[i] < 0) break;
346 if (i > 1) kputc(',', &s);
347 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
354 for (i = 0; i < 16; ++i) {
355 if (i) kputc(',', &s);
356 kputw(bc->anno[i], &s);
363 if (is_SP) kputs(":SP", &s);
366 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
368 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);
370 uint16_t *dp = (uint16_t*)b->gi[1].data;
371 uint8_t *sp = is_SP? b->gi[2].data : 0;
372 for (i = 0; i < bc->n; ++i) {
373 bcf_callret1_t *p = bcr + i;
374 dp[i] = p->depth < 0xffff? p->depth : 0xffff;
376 if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2
377 || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2)
381 double left, right, two;
383 kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two);
384 x = (int)(-4.343 * log(two) + .499);
385 if (x > 255) x = 255;