8 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
10 #define CALL_ETA 0.03f
12 #define CALL_DEFTHETA 0.85f
14 struct __bcf_callaux_t {
20 bcf_callaux_t *bcf_call_init(double theta)
24 if (theta <= 0.) theta = CALL_DEFTHETA;
25 bca = calloc(1, sizeof(bcf_callaux_t));
27 bca->fk = calloc(CALL_MAX, sizeof(double));
29 for (n = 1; n < CALL_MAX; ++n)
30 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
31 bca->fk[CALL_MAX-1] = 0.;
35 void bcf_call_destroy(bcf_callaux_t *bca)
38 free(bca->info); free(bca->fk); free(bca);
43 float esum[4], fsum[4];
49 The following code is nearly identical to bam_maqcns_glfgen() under
50 the simplified SOAPsnp model. It does the following:
52 1) Collect strand, base, quality and mapping quality information for
53 each base and combine them in an integer:
55 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
57 2) Sort the list of integers for the next step.
59 3) For each base, calculate e_b, the sum of weighted qualities. For
60 each type of base on each strand, the best quality has the highest
61 weight. Only the top 255 bases on each strand are used (different
64 4) Rescale the total read depth to 255.
66 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
68 Q(D|<aa>)=\sum_{b\not=a}e_b
69 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
71 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
77 memset(r, 0, sizeof(bcf_callret1_t));
78 if (_n == 0) return -1;
80 // enlarge the aux array if necessary
81 if (bca->max_info < _n) {
83 kroundup32(bca->max_info);
84 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
87 for (i = n = 0; i < _n; ++i) {
88 const bam_pileup1_t *p = pl + i;
89 uint32_t q, x = 0, qq;
90 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
91 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
92 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
93 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
95 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
96 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
97 if (q < 4) x |= 1 << 21 | q << 16;
101 ks_introsort_uint32_t(n, bca->info);
103 // generate esum and fsum
104 memset(&aux, 0, sizeof(auxaux_t));
105 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
106 uint32_t info = bca->info[j];
108 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
111 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
112 aux.fsum[k&3] += bca->fk[aux.w[k]];
113 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
116 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
117 r->sum_Q2 += tmp * tmp;
119 memcpy(r->esum, aux.esum, 4 * sizeof(float));
121 for (j = c = 0; j != 4; ++j) c += aux.c[j];
123 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
124 for (j = c = 0; j != 4; ++j) c += aux.c[j];
126 // generate likelihood
127 for (j = 0; j != 4; ++j) {
130 for (k = 0, tmp = 0.0; k != 4; ++k)
131 if (j != k) tmp += aux.esum[k];
132 p[j<<2|j] = tmp; // anything that is not j
134 for (k = j + 1; k < 4; ++k) {
135 for (i = 0, tmp = 0.0; i != 4; ++i)
136 if (i != j && i != k) tmp += aux.esum[i];
137 p[j<<2|k] = p[k<<2|j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
144 1) Find the top 2 bases (from esum[4]).
146 2) If the reference base is among the top 2, consider the locus is
147 potentially biallelic and set call->a[2] as -1; otherwise, the
148 locus is potentially triallelic. If the reference is ambiguous,
149 take the weakest call as the pseudo-reference.
151 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
155 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
156 if (ref4 > 4) ref4 = 4;
159 memset(esum, 0, sizeof(double) * 4);
160 for (i = 0; i < n; ++i) {
161 for (j = 0; j < 4; ++j)
162 esum[j] += calls[i].esum[j];
164 for (j = 0; j < 4; ++j)
165 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
167 // find the top 2 alleles
168 for (i = 1; i < 4; ++i) // insertion sort
169 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
170 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
171 // set the reference allele and alternative allele(s)
172 for (i = 0; i < 4; ++i) call->a[i] = -1;
176 for (i = 3, j = 1; i >= 0; --i) {
177 if ((sum[i]&3) != ref4) {
178 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
182 if (j < 4 && i >= 0) call->unseen = j, call->a[j++] = sum[i]&3;
185 for (i = 3, j = 0; i >= 0; --i)
186 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
188 if (j < 4 && i >= 0) call->unseen = j, call->a[j++] = sum[i]&3;
194 call->PL = realloc(call->PL, 10 * n);
199 x = call->n_alleles * (call->n_alleles + 1) / 2;
200 // get the possible genotypes
201 for (i = z = 0; i < call->n_alleles; ++i)
202 for (j = i; j < call->n_alleles; ++j)
203 g[z++] = call->a[i]<<2 | call->a[j];
204 for (i = 0; i < n; ++i) {
205 uint8_t *PL = call->PL + x * i;
206 const bcf_callret1_t *r = calls + i;
208 for (j = 0; j < x; ++j)
209 if (min > r->p[g[j]]) min = r->p[g[j]];
211 for (j = 0; j < x; ++j) {
213 y = (int)(r->p[g[j]] - min + .499);
214 if (y > 255) y = 255;
218 call->shift = (int)(sum_min + .499);
220 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
221 call->depth += calls[i].depth;
222 tmp += calls[i].sum_Q2;
224 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
228 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
232 b->tid = tid; b->pos = pos; b->qual = 0;
233 s.s = b->str; s.m = b->m_str; s.l = 0;
235 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
236 beg = bc->ori_ref > 3? 0 : 1;
237 for (i = beg; i < 4; ++i) {
238 if (bc->a[i] < 0) break;
239 if (i > beg) kputc(',', &s);
240 // kputc(bc->unseen == i && i != 3? 'X' : "ACGT"[bc->a[i]], &s);
241 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
245 kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputs(";DP=", &s); kputw(bc->depth, &s); kputc('\0', &s);
246 kputs("PL", &s); kputc('\0', &s);
247 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
249 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);