6 #include "bcftools/bcf.h"
8 #define END_DIST_THRES 11
10 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
12 #define CALL_ETA 0.03f
14 #define CALL_DEFTHETA 0.85f
16 struct __bcf_callaux_t {
17 int max_info, capQ, min_baseQ;
22 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
26 if (theta <= 0.) theta = CALL_DEFTHETA;
27 bca = calloc(1, sizeof(bcf_callaux_t));
29 bca->min_baseQ = min_baseQ;
30 bca->fk = calloc(CALL_MAX, sizeof(double));
32 for (n = 1; n < CALL_MAX; ++n)
33 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
34 bca->fk[CALL_MAX-1] = 0.;
38 void bcf_call_destroy(bcf_callaux_t *bca)
41 free(bca->info); free(bca->fk); free(bca);
46 float esum[5], fsum[5];
52 The following code is nearly identical to bam_maqcns_glfgen() under
53 the simplified SOAPsnp model. It does the following:
55 1) Collect strand, base, quality and mapping quality information for
56 each base and combine them in an integer:
58 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
60 2) Sort the list of integers for the next step.
62 3) For each base, calculate e_b, the sum of weighted qualities. For
63 each type of base on each strand, the best quality has the highest
64 weight. Only the top 255 bases on each strand are used (different
67 4) Rescale the total read depth to 255.
69 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
71 Q(D|<aa>)=\sum_{b\not=a}e_b
72 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
74 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
76 int i, j, k, c, n, ref4;
80 memset(r, 0, sizeof(bcf_callret1_t));
81 ref4 = bam_nt16_nt4_table[ref_base];
82 if (_n == 0) return -1;
84 // enlarge the aux array if necessary
85 if (bca->max_info < _n) {
87 kroundup32(bca->max_info);
88 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
91 for (i = n = 0; i < _n; ++i) {
92 const bam_pileup1_t *p = pl + i;
93 uint32_t q, x = 0, qq;
95 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
96 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
97 if (q < bca->min_baseQ) continue;
98 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
99 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
101 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
102 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
103 if (q < 4) x |= 1 << 21 | q << 16;
104 k = (ref4 < 4 && q == ref4)? 0 : 1;
105 k = k<<1 | bam1_strand(p->b);
108 // calculate min_dist
109 min_dist = p->b->core.l_qseq - 1 - p->qpos;
110 if (min_dist > p->qpos) min_dist = p->qpos;
111 k = (k&2) | (min_dist <= END_DIST_THRES);
114 ks_introsort_uint32_t(n, bca->info);
116 // generate esum and fsum
117 memset(&aux, 0, sizeof(auxaux_t));
118 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
119 uint32_t info = bca->info[j];
121 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
124 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
125 aux.fsum[k&3] += bca->fk[aux.w[k]];
126 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
129 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
130 r->sum_Q2 += tmp * tmp;
132 memcpy(r->esum, aux.esum, 5 * sizeof(float));
134 for (j = c = 0; j != 4; ++j) c += aux.c[j];
136 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
137 for (j = c = 0; j != 4; ++j) c += aux.c[j];
139 // generate likelihood
140 for (j = 0; j != 5; ++j) {
143 for (k = 0, tmp = 0.0; k != 5; ++k)
144 if (j != k) tmp += aux.esum[k];
145 p[j*5+j] = tmp; // anything that is not j
147 for (k = j + 1; k < 5; ++k) {
148 for (i = 0, tmp = 0.0; i != 5; ++i)
149 if (i != j && i != k) tmp += aux.esum[i];
150 p[j*5+k] = p[k*5+j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
156 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
160 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
161 if (ref4 > 4) ref4 = 4;
164 memset(esum, 0, sizeof(double) * 4);
165 for (i = 0; i < n; ++i) {
166 for (j = 0; j < 4; ++j)
167 esum[j] += calls[i].esum[j];
169 for (j = 0; j < 4; ++j)
170 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
172 // find the top 2 alleles
173 for (i = 1; i < 4; ++i) // insertion sort
174 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
175 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
176 // set the reference allele and alternative allele(s)
177 for (i = 0; i < 5; ++i) call->a[i] = -1;
180 for (i = 3, j = 1; i >= 0; --i) {
181 if ((sum[i]&3) != ref4) {
182 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
186 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
187 call->unseen = j, call->a[j++] = sum[i]&3;
192 call->PL = realloc(call->PL, 15 * n);
197 x = call->n_alleles * (call->n_alleles + 1) / 2;
198 // get the possible genotypes
199 for (i = z = 0; i < call->n_alleles; ++i)
200 for (j = i; j < call->n_alleles; ++j)
201 g[z++] = call->a[i] * 5 + call->a[j];
202 for (i = 0; i < n; ++i) {
203 uint8_t *PL = call->PL + x * i;
204 const bcf_callret1_t *r = calls + i;
206 for (j = 0; j < x; ++j)
207 if (min > r->p[g[j]]) min = r->p[g[j]];
209 for (j = 0; j < x; ++j) {
211 y = (int)(r->p[g[j]] - min + .499);
212 if (y > 255) y = 255;
216 call->shift = (int)(sum_min + .499);
218 memset(call->d, 0, 4 * sizeof(int));
219 memset(call->ed, 0, 4 * sizeof(int));
220 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
221 call->depth += calls[i].depth;
222 for (j = 0; j < 4; ++j) call->d[j] += calls[i].d[j], call->ed[j] += calls[i].ed[j];
223 tmp += calls[i].sum_Q2;
225 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
229 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
233 b->tid = tid; b->pos = pos; b->qual = 0;
234 s.s = b->str; s.m = b->m_str; s.l = 0;
236 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
237 for (i = 1; i < 5; ++i) {
238 if (bc->a[i] < 0) break;
239 if (i > 1) kputc(',', &s);
240 // kputc(bc->unseen == i && i != 3? 'X' : "ACGT"[bc->a[i]], &s);
241 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
246 kputs("MQ=", &s); kputw(bc->rmsQ, &s); // kputs(";DP=", &s); kputw(bc->depth, &s);
248 for (i = 0; i < 4; ++i) {
249 if (i) kputc(',', &s);
253 for (i = 0; i < 4; ++i) {
254 if (i) kputc(',', &s);
255 kputw(bc->ed[i], &s);
259 kputs("PL", &s); kputc('\0', &s);
260 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
262 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);