8 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
10 #define CALL_ETA 0.03f
12 #define CALL_DEFTHETA 0.85f
14 struct __bcf_callaux_t {
20 bcf_callaux_t *bcf_call_init(double theta)
24 if (theta <= 0.) theta = CALL_DEFTHETA;
25 bca = calloc(1, sizeof(bcf_callaux_t));
27 bca->fk = calloc(CALL_MAX, sizeof(double));
29 for (n = 1; n < CALL_MAX; ++n)
30 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
31 bca->fk[CALL_MAX-1] = 0.;
35 void bcf_call_destroy(bcf_callaux_t *bca)
38 free(bca->info); free(bca->fk); free(bca);
43 float esum[5], fsum[5];
49 The following code is nearly identical to bam_maqcns_glfgen() under
50 the simplified SOAPsnp model. It does the following:
52 1) Collect strand, base, quality and mapping quality information for
53 each base and combine them in an integer:
55 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
57 2) Sort the list of integers for the next step.
59 3) For each base, calculate e_b, the sum of weighted qualities. For
60 each type of base on each strand, the best quality has the highest
61 weight. Only the top 255 bases on each strand are used (different
64 4) Rescale the total read depth to 255.
66 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
68 Q(D|<aa>)=\sum_{b\not=a}e_b
69 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
71 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
77 memset(r, 0, sizeof(bcf_callret1_t));
78 if (_n == 0) return -1;
80 // enlarge the aux array if necessary
81 if (bca->max_info < _n) {
83 kroundup32(bca->max_info);
84 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
87 for (i = n = 0; i < _n; ++i) {
88 const bam_pileup1_t *p = pl + i;
89 uint32_t q, x = 0, qq;
90 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
91 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
92 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
93 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
95 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
96 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
97 if (q < 4) x |= 1 << 21 | q << 16;
101 ks_introsort_uint32_t(n, bca->info);
103 // generate esum and fsum
104 memset(&aux, 0, sizeof(auxaux_t));
105 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
106 uint32_t info = bca->info[j];
108 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
111 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
112 aux.fsum[k&3] += bca->fk[aux.w[k]];
113 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
116 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
117 r->sum_Q2 += tmp * tmp;
119 memcpy(r->esum, aux.esum, 5 * sizeof(float));
121 for (j = c = 0; j != 4; ++j) c += aux.c[j];
123 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
124 for (j = c = 0; j != 4; ++j) c += aux.c[j];
126 // generate likelihood
127 for (j = 0; j != 5; ++j) {
130 for (k = 0, tmp = 0.0; k != 5; ++k)
131 if (j != k) tmp += aux.esum[k];
132 p[j*5+j] = tmp; // anything that is not j
134 for (k = j + 1; k < 5; ++k) {
135 for (i = 0, tmp = 0.0; i != 5; ++i)
136 if (i != j && i != k) tmp += aux.esum[i];
137 p[j*5+k] = p[k*5+j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
144 1) Find the top 2 bases (from esum[4]).
146 2) If the reference base is among the top 2, consider the locus is
147 potentially biallelic and set call->a[2] as -1; otherwise, the
148 locus is potentially triallelic. If the reference is ambiguous,
149 take the weakest call as the pseudo-reference.
151 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
155 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
156 if (ref4 > 4) ref4 = 4;
159 memset(esum, 0, sizeof(double) * 4);
160 for (i = 0; i < n; ++i) {
161 for (j = 0; j < 4; ++j)
162 esum[j] += calls[i].esum[j];
164 for (j = 0; j < 4; ++j)
165 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
167 // find the top 2 alleles
168 for (i = 1; i < 4; ++i) // insertion sort
169 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
170 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
171 // set the reference allele and alternative allele(s)
172 for (i = 0; i < 4; ++i) call->a[i] = -1;
175 for (i = 3, j = 1; i >= 0; --i) {
176 if ((sum[i]&3) != ref4) {
177 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
181 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
182 call->unseen = j, call->a[j++] = sum[i]&3;
187 call->PL = realloc(call->PL, 15 * n);
192 x = call->n_alleles * (call->n_alleles + 1) / 2;
193 // get the possible genotypes
194 for (i = z = 0; i < call->n_alleles; ++i)
195 for (j = i; j < call->n_alleles; ++j)
196 g[z++] = call->a[i] * 5 + call->a[j];
197 for (i = 0; i < n; ++i) {
198 uint8_t *PL = call->PL + x * i;
199 const bcf_callret1_t *r = calls + i;
201 for (j = 0; j < x; ++j)
202 if (min > r->p[g[j]]) min = r->p[g[j]];
204 for (j = 0; j < x; ++j) {
206 y = (int)(r->p[g[j]] - min + .499);
207 if (y > 255) y = 255;
211 call->shift = (int)(sum_min + .499);
213 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
214 call->depth += calls[i].depth;
215 tmp += calls[i].sum_Q2;
217 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
221 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
225 b->tid = tid; b->pos = pos; b->qual = 0;
226 s.s = b->str; s.m = b->m_str; s.l = 0;
228 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
229 for (i = 1; i < 5; ++i) {
230 if (bc->a[i] < 0) break;
231 if (i > 1) kputc(',', &s);
232 // kputc(bc->unseen == i && i != 3? 'X' : "ACGT"[bc->a[i]], &s);
233 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
237 kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputs(";DP=", &s); kputw(bc->depth, &s); kputc('\0', &s);
238 kputs("PL", &s); kputc('\0', &s);
239 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
241 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);