6 #include "bcftools/bcf.h"
8 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
10 #define CALL_ETA 0.03f
12 #define CALL_DEFTHETA 0.85f
14 struct __bcf_callaux_t {
15 int max_info, capQ, min_baseQ;
20 bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
24 if (theta <= 0.) theta = CALL_DEFTHETA;
25 bca = calloc(1, sizeof(bcf_callaux_t));
27 bca->min_baseQ = min_baseQ;
28 bca->fk = calloc(CALL_MAX, sizeof(double));
30 for (n = 1; n < CALL_MAX; ++n)
31 bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
32 bca->fk[CALL_MAX-1] = 0.;
36 void bcf_call_destroy(bcf_callaux_t *bca)
39 free(bca->info); free(bca->fk); free(bca);
44 float esum[5], fsum[5];
50 The following code is nearly identical to bam_maqcns_glfgen() under
51 the simplified SOAPsnp model. It does the following:
53 1) Collect strand, base, quality and mapping quality information for
54 each base and combine them in an integer:
56 x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
58 2) Sort the list of integers for the next step.
60 3) For each base, calculate e_b, the sum of weighted qualities. For
61 each type of base on each strand, the best quality has the highest
62 weight. Only the top 255 bases on each strand are used (different
65 4) Rescale the total read depth to 255.
67 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
69 Q(D|<aa>)=\sum_{b\not=a}e_b
70 Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
72 int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
78 memset(r, 0, sizeof(bcf_callret1_t));
79 if (_n == 0) return -1;
81 // enlarge the aux array if necessary
82 if (bca->max_info < _n) {
84 kroundup32(bca->max_info);
85 bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
88 for (i = n = 0; i < _n; ++i) {
89 const bam_pileup1_t *p = pl + i;
90 uint32_t q, x = 0, qq;
91 if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
92 q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
93 if (q < bca->min_baseQ) continue;
94 x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
95 if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
97 qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
98 q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
99 if (q < 4) x |= 1 << 21 | q << 16;
103 ks_introsort_uint32_t(n, bca->info);
105 // generate esum and fsum
106 memset(&aux, 0, sizeof(auxaux_t));
107 for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
108 uint32_t info = bca->info[j];
110 if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
113 aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
114 aux.fsum[k&3] += bca->fk[aux.w[k]];
115 if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
118 tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
119 r->sum_Q2 += tmp * tmp;
121 memcpy(r->esum, aux.esum, 5 * sizeof(float));
123 for (j = c = 0; j != 4; ++j) c += aux.c[j];
125 for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
126 for (j = c = 0; j != 4; ++j) c += aux.c[j];
128 // generate likelihood
129 for (j = 0; j != 5; ++j) {
132 for (k = 0, tmp = 0.0; k != 5; ++k)
133 if (j != k) tmp += aux.esum[k];
134 p[j*5+j] = tmp; // anything that is not j
136 for (k = j + 1; k < 5; ++k) {
137 for (i = 0, tmp = 0.0; i != 5; ++i)
138 if (i != j && i != k) tmp += aux.esum[i];
139 p[j*5+k] = p[k*5+j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
145 int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
149 call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
150 if (ref4 > 4) ref4 = 4;
153 memset(esum, 0, sizeof(double) * 4);
154 for (i = 0; i < n; ++i) {
155 for (j = 0; j < 4; ++j)
156 esum[j] += calls[i].esum[j];
158 for (j = 0; j < 4; ++j)
159 sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
161 // find the top 2 alleles
162 for (i = 1; i < 4; ++i) // insertion sort
163 for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
164 tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
165 // set the reference allele and alternative allele(s)
166 for (i = 0; i < 5; ++i) call->a[i] = -1;
169 for (i = 3, j = 1; i >= 0; --i) {
170 if ((sum[i]&3) != ref4) {
171 if (sum[i]>>2 != 0) call->a[j++] = sum[i]&3;
175 if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
176 call->unseen = j, call->a[j++] = sum[i]&3;
181 call->PL = realloc(call->PL, 15 * n);
186 x = call->n_alleles * (call->n_alleles + 1) / 2;
187 // get the possible genotypes
188 for (i = z = 0; i < call->n_alleles; ++i)
189 for (j = i; j < call->n_alleles; ++j)
190 g[z++] = call->a[i] * 5 + call->a[j];
191 for (i = 0; i < n; ++i) {
192 uint8_t *PL = call->PL + x * i;
193 const bcf_callret1_t *r = calls + i;
195 for (j = 0; j < x; ++j)
196 if (min > r->p[g[j]]) min = r->p[g[j]];
198 for (j = 0; j < x; ++j) {
200 y = (int)(r->p[g[j]] - min + .499);
201 if (y > 255) y = 255;
205 call->shift = (int)(sum_min + .499);
207 for (i = call->depth = 0, tmp = 0; i < n; ++i) {
208 call->depth += calls[i].depth;
209 tmp += calls[i].sum_Q2;
211 call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
215 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
219 b->tid = tid; b->pos = pos; b->qual = 0;
220 s.s = b->str; s.m = b->m_str; s.l = 0;
222 kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
223 for (i = 1; i < 5; ++i) {
224 if (bc->a[i] < 0) break;
225 if (i > 1) kputc(',', &s);
226 // kputc(bc->unseen == i && i != 3? 'X' : "ACGT"[bc->a[i]], &s);
227 kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
231 kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputs(";DP=", &s); kputw(bc->depth, &s); kputc('\0', &s);
232 kputs("PL", &s); kputc('\0', &s);
233 b->m_str = s.m; b->str = s.s; b->l_str = s.l;
235 memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);