8 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
10 #define CALL_ETA 0.03f
12 #define CALL_DEFTHETA 0.85f
14 struct __bcf_callaux_t {
20 bcf_callaux_t *bcf_call_init(double theta)
24 if (theta <= 0.) theta = CALL_DEFTHETA;
25 bca = calloc(1, sizeof(bcf_callaux_t));
27 bca->fk = calloc(CALL_MAX, sizeof(double));
29 for (n = 1; n < CALL_MAX; ++n)
30 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
31 bca->fk[CALL_MAX-1] = 0.;
35 void bcf_call_destroy(bcf_callaux_t *bca)
38 free(bca->info); free(bca->fk); free(bca);
43 float esum[4], fsum[4];
49 The following code is nearly identical to bam_maqcns_glfgen() under
50 the simplified SOAPsnp model. It does the following:
52 1) Collect strand, base, quality and mapping quality information for
53 each base and combine them in an integer:
55 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
57 2) Sort the list of integers for the next step.
59 3) For each base, calculate e_b, the sum of weighted qualities. For
60 each type of base on each strand, the best quality has the highest
61 weight. Only the top 255 bases on each strand are used (different
64 4) Rescale the total read depth to 255.
66 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
68 Q(D|<aa>)=\sum_{b\not=a}e_b
69 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
71 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
77 memset(r, 0, sizeof(bcf_callret1_t));
78 if (_n == 0) return -1;
80 // enlarge the aux array if necessary
81 if (bca->max_info < _n) {
83 kroundup32(bca->max_info);
84 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
87 for (i = n = 0; i < _n; ++i) {
88 const bam_pileup1_t *p = pl + i;
89 uint32_t q, x = 0, qq;
90 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
91 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
92 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
93 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
95 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
96 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
97 if (q < 4) x |= 1 << 21 | q << 16;
101 ks_introsort_uint32_t(n, bca->info);
103 // generate esum and fsum
104 memset(&aux, 0, sizeof(auxaux_t));
105 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
106 uint32_t info = bca->info[j];
108 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
111 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
112 aux.fsum[k&3] += bca->fk[aux.w[k]];
113 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
116 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
117 r->sum_Q2 += tmp * tmp;
119 memcpy(r->esum, aux.esum, 4 * sizeof(float));
121 for (j = c = 0; j != 4; ++j) c += aux.c[j];
123 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
124 for (j = c = 0; j != 4; ++j) c += aux.c[j];
126 // generate likelihood
127 for (j = 0; j != 4; ++j) {
130 for (k = 0, tmp = 0.0; k != 4; ++k)
131 if (j != k) tmp += aux.esum[k];
132 p[j<<2|j] = tmp; // anything that is not j
134 for (k = j + 1; k < 4; ++k) {
135 for (i = 0, tmp = 0.0; i != 4; ++i)
136 if (i != j && i != k) tmp += aux.esum[i];
137 p[j<<2|k] = p[k<<2|j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
144 1) Find the top 2 bases (from esum[4]).
146 2) If the reference base is among the top 2, consider the locus is
147 potentially biallelic and set call->a[2] as -1; otherwise, the
148 locus is potentially triallelic. If the reference is ambiguous,
149 take the weakest call as the pseudo-reference.
151 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
155 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
156 if (ref4 > 4) ref4 = 4;
159 memset(esum, 0, sizeof(double) * 4);
160 for (i = 0; i < n; ++i) {
161 for (j = 0; j < 4; ++j)
162 esum[j] += calls[i].esum[j];
164 for (j = 0; j < 4; ++j)
165 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
167 // find the top 2 alleles
168 for (i = 1; i < 4; ++i) // insertion sort
169 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
170 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
171 call->a[0] = sum[3]&3; call->a[1] = sum[2]&3; call->a[2] = -1;
172 // set the reference allele and alternative allele(s)
173 if (ref4 != call->a[0]) { // ref is not the best
174 if (ref4 < 4) { // not ambiguous
175 if (ref4 == call->a[1]) tmp = call->a[0], call->a[0] = call->a[1], call->a[1] = tmp; // switch 0 and 1
176 else call->a[2] = call->a[1], call->a[1] = call->a[0], call->a[0] = ref4; // triallele
182 call->PL = realloc(call->PL, 6 * n);
187 call->n_alleles = call->a[2] < 0? 2 : 3;
188 x = call->n_alleles * (call->n_alleles + 1) / 2;
189 // get the possible genotypes
190 for (i = z = 0; i < call->n_alleles; ++i)
191 for (j = i; j < call->n_alleles; ++j)
192 g[z++] = call->a[i]<<2 | call->a[j];
193 for (i = 0; i < n; ++i) {
194 uint8_t *PL = call->PL + x * i;
195 const bcf_callret1_t *r = calls + i;
197 for (j = 0; j < x; ++j)
198 if (min > r->p[g[j]]) min = r->p[g[j]];
200 for (j = 0; j < x; ++j) {
202 y = (int)(r->p[g[j]] - min + .499);
203 if (y > 255) y = 255;
207 call->shift = (int)(sum_min + .499);
209 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
210 call->depth += calls[i].depth;
211 tmp += calls[i].sum_Q2;
213 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
217 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
221 b->tid = tid; b->pos = pos; b->qual = 0;
222 s.s = b->str; s.m = b->m_str; s.l = 0;
224 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
225 beg = bc->ori_ref > 3? 0 : 1;
226 for (i = beg; i < 4; ++i) {
227 if (bc->a[i] < 0) break;
228 if (i > beg) kputc(',', &s);
229 kputc("ACGT"[bc->a[i]], &s);
233 kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputc('\0', &s);
234 kputs("PL", &s); kputc('\0', &s);
235 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
237 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);