6 #include "bcftools/bcf.h"
8 #define MAX_END_DIST 30
10 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
12 #define CALL_ETA 0.03f
14 #define CALL_DEFTHETA 0.85f
16 struct __bcf_callaux_t {
17 int max_info, capQ, min_baseQ;
22 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
26 if (theta <= 0.) theta = CALL_DEFTHETA;
27 bca = calloc(1, sizeof(bcf_callaux_t));
29 bca->min_baseQ = min_baseQ;
30 bca->fk = calloc(CALL_MAX, sizeof(double));
32 for (n = 1; n < CALL_MAX; ++n)
33 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
34 bca->fk[CALL_MAX-1] = 0.;
38 void bcf_call_destroy(bcf_callaux_t *bca)
41 free(bca->info); free(bca->fk); free(bca);
46 float esum[5], fsum[5];
52 The following code is nearly identical to bam_maqcns_glfgen() under
53 the simplified SOAPsnp model. It does the following:
55 1) Collect strand, base, quality and mapping quality information for
56 each base and combine them in an integer:
58 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
60 2) Sort the list of integers for the next step.
62 3) For each base, calculate e_b, the sum of weighted qualities. For
63 each type of base on each strand, the best quality has the highest
64 weight. Only the top 255 bases on each strand are used (different
67 4) Rescale the total read depth to 255.
69 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
71 Q(D|<aa>)=\sum_{b\not=a}e_b
72 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
74 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
76 int i, j, k, c, n, ref4;
80 memset(r, 0, sizeof(bcf_callret1_t));
81 ref4 = bam_nt16_nt4_table[ref_base];
82 if (_n == 0) return -1;
84 // enlarge the aux array if necessary
85 if (bca->max_info < _n) {
87 kroundup32(bca->max_info);
88 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
91 for (i = n = 0; i < _n; ++i) {
92 const bam_pileup1_t *p = pl + i;
93 uint32_t q, x = 0, qq;
95 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
96 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
97 if (q < bca->min_baseQ) continue;
98 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
99 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
101 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
102 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
103 if (q < 4) x |= 1 << 21 | q << 16;
104 k = (ref4 < 4 && q == ref4)? 0 : 1;
105 k = k<<1 | bam1_strand(p->b);
108 // calculate min_dist
109 min_dist = p->b->core.l_qseq - 1 - p->qpos;
110 if (min_dist > p->qpos) min_dist = p->qpos;
111 if (min_dist > MAX_END_DIST) min_dist = MAX_END_DIST;
113 r->dsum[k] += min_dist;
114 r->d2sum[k] += min_dist * min_dist;
116 ks_introsort_uint32_t(n, bca->info);
118 // generate esum and fsum
119 memset(&aux, 0, sizeof(auxaux_t));
120 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
121 uint32_t info = bca->info[j];
123 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
126 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
127 aux.fsum[k&3] += bca->fk[aux.w[k]];
128 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
131 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
132 r->sum_Q2 += tmp * tmp;
134 memcpy(r->esum, aux.esum, 5 * sizeof(float));
136 for (j = c = 0; j != 4; ++j) c += aux.c[j];
138 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
139 for (j = c = 0; j != 4; ++j) c += aux.c[j];
141 // generate likelihood
142 for (j = 0; j != 5; ++j) {
145 for (k = 0, tmp = 0.0; k != 5; ++k)
146 if (j != k) tmp += aux.esum[k];
147 p[j*5+j] = tmp; // anything that is not j
149 for (k = j + 1; k < 5; ++k) {
150 for (i = 0, tmp = 0.0; i != 5; ++i)
151 if (i != j && i != k) tmp += aux.esum[i];
152 p[j*5+k] = p[k*5+j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
158 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
162 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
163 if (ref4 > 4) ref4 = 4;
166 memset(esum, 0, sizeof(double) * 4);
167 for (i = 0; i < n; ++i) {
168 for (j = 0; j < 4; ++j)
169 esum[j] += calls[i].esum[j];
171 for (j = 0; j < 4; ++j)
172 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
174 // find the top 2 alleles
175 for (i = 1; i < 4; ++i) // insertion sort
176 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
177 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
178 // set the reference allele and alternative allele(s)
179 for (i = 0; i < 5; ++i) call->a[i] = -1;
182 for (i = 3, j = 1; i >= 0; --i) {
183 if ((sum[i]&3) != ref4) {
184 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
188 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
189 call->unseen = j, call->a[j++] = sum[i]&3;
194 call->PL = realloc(call->PL, 15 * n);
199 x = call->n_alleles * (call->n_alleles + 1) / 2;
200 // get the possible genotypes
201 for (i = z = 0; i < call->n_alleles; ++i)
202 for (j = i; j < call->n_alleles; ++j)
203 g[z++] = call->a[i] * 5 + call->a[j];
204 for (i = 0; i < n; ++i) {
205 uint8_t *PL = call->PL + x * i;
206 const bcf_callret1_t *r = calls + i;
208 for (j = 0; j < x; ++j)
209 if (min > r->p[g[j]]) min = r->p[g[j]];
211 for (j = 0; j < x; ++j) {
213 y = (int)(r->p[g[j]] - min + .499);
214 if (y > 255) y = 255;
218 call->shift = (int)(sum_min + .499);
220 memset(call->d, 0, 4 * sizeof(int));
221 call->davg[0] = call->davg[1] = call->dstd[0] = call->dstd[1] = 0.;
222 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
223 call->depth += calls[i].depth;
224 for (j = 0; j < 4; ++j) call->d[j] += calls[i].d[j];
225 for (j = 0; j < 2; ++j) {
226 call->davg[j] += calls[i].dsum[j];
227 call->dstd[j] += calls[i].d2sum[j];
229 tmp += calls[i].sum_Q2;
231 for (j = 0; j < 2; ++j) {
232 int x = call->d[j*2] + call->d[j*2+1];
234 call->davg[j] = call->dstd[j] = 0.;
237 call->dstd[j] = sqrt(call->dstd[j] / x - call->davg[j] * call->davg[j]);
240 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
244 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
248 b->tid = tid; b->pos = pos; b->qual = 0;
249 s.s = b->str; s.m = b->m_str; s.l = 0;
251 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
252 for (i = 1; i < 5; ++i) {
253 if (bc->a[i] < 0) break;
254 if (i > 1) kputc(',', &s);
255 // kputc(bc->unseen == i && i != 3? 'X' : "ACGT"[bc->a[i]], &s);
256 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
261 kputs("MQ=", &s); kputw(bc->rmsQ, &s); // kputs(";DP=", &s); kputw(bc->depth, &s);
263 for (i = 0; i < 4; ++i) {
264 if (i) kputc(',', &s);
267 if (bc->d[2] + bc->d[3] > 1)
268 ksprintf(&s, ";MED=%.1lf,%.1lf,%.1lf,%.1lf", bc->davg[0], bc->dstd[0], bc->davg[1], bc->dstd[1]);
271 kputs("PL", &s); kputc('\0', &s);
272 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
274 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);