7 KHASH_MAP_INIT_STR(str2id, int)
10 #define srand48(x) srand(x)
11 #define drand48() ((double)rand() / RAND_MAX)
14 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
15 void *bcf_build_refhash(bcf_hdr_t *h)
17 khash_t(str2id) *hash;
19 hash = kh_init(str2id);
20 for (i = 0; i < h->n_ref; ++i) {
22 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
28 void *bcf_str2id_init()
30 return kh_init(str2id);
33 void bcf_str2id_destroy(void *_hash)
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
36 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
39 void bcf_str2id_thorough_destroy(void *_hash)
41 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
43 if (hash == 0) return;
44 for (k = 0; k < kh_end(hash); ++k)
45 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
46 kh_destroy(str2id, hash);
49 int bcf_str2id(void *_hash, const char *str)
51 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
54 k = kh_get(str2id, hash, str);
55 return k == kh_end(hash)? -1 : kh_val(hash, k);
58 int bcf_str2id_add(void *_hash, const char *str)
62 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
64 k = kh_put(str2id, hash, str, &ret);
65 if (ret == 0) return kh_val(hash, k);
66 kh_val(hash, k) = kh_size(hash) - 1;
67 return kh_val(hash, k);
70 void bcf_fit_alt(bcf1_t *b, int mask)
72 mask |= 1; // REF must be always present
75 for (i=0; i<sizeof(int); i++)
76 if ( mask&1<<i) nals++;
78 if ( b->n_alleles <= nals ) return;
80 // update ALT, in principle any of the alleles can be removed
85 int n=0, nalts=nals-1;
86 for (src=dst=p=b->alt, i=1; *p; p++)
88 if ( *p!=',' ) continue;
95 memmove(dst,src,p-src);
99 if ( n<nalts ) { *dst=','; dst++; }
103 if ( n>=nalts ) { *dst=0; break; }
108 memmove(dst,src,p-src);
114 else p = b->alt, *p = '\0';
116 memmove(p, b->flt, b->str + b->l_str - b->flt);
117 b->l_str -= b->flt - p;
121 for (i = 0; i < b->n_gi; ++i)
123 bcf_ginfo_t *g = b->gi + i;
124 if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
125 if (g->fmt == bcf_str2int("GT", 2)) igt = i;
128 // .. create mapping between old and new indexes
129 int npl = nals * (nals+1) / 2;
130 int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
132 for (i=0; i<b->n_alleles; i++)
137 if ( i && !(mask&1<<i) ) skip=1;
138 if ( j && !(mask&1<<j) ) skip=1;
139 if ( !skip ) { map[knew++] = kori; }
143 // .. apply to all samples
144 int n_smpl = b->n_smpl;
145 for (i = 0; i < b->n_gi; ++i)
147 bcf_ginfo_t *g = b->gi + i;
148 if (g->fmt == bcf_str2int("PL", 2))
151 uint8_t *d = (uint8_t*)g->data;
152 int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
153 for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
155 uint8_t *dl = d + ismpl * npl_ori;
156 for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
158 } // FIXME: to add GL
162 for (i=1, knew=0; i<b->n_alleles; i++)
163 map[i] = mask&1<<i ? ++knew : -1;
164 for (i=0; i<n_smpl; i++)
166 uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
169 assert( map[a1]>=0 && map[a2]>=0 );
170 ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2];
177 int bcf_shrink_alt(bcf1_t *b, int n)
180 int i, j, k, n_smpl = b->n_smpl;
181 if (b->n_alleles <= n) return -1;
184 for (p = b->alt, k = 1; *p; ++p)
185 if (*p == ',' && ++k == n) break;
187 } else p = b->alt, *p = '\0';
189 memmove(p, b->flt, b->str + b->l_str - b->flt);
190 b->l_str -= b->flt - p;
192 for (i = 0; i < b->n_gi; ++i) {
193 bcf_ginfo_t *g = b->gi + i;
194 if (g->fmt == bcf_str2int("PL", 2)) {
195 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
196 uint8_t *d = (uint8_t*)g->data;
197 g->len = n * (n + 1) / 2;
198 for (l = k = 0; l < n_smpl; ++l) {
199 uint8_t *dl = d + l * x;
200 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
202 } // FIXME: to add GL
209 int bcf_gl2pl(bcf1_t *b)
212 int i, n_smpl = b->n_smpl;
216 if (strstr(b->fmt, "PL")) return -1;
217 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
219 for (i = 0; i < b->n_gi; ++i)
220 if (b->gi[i].fmt == bcf_str2int("GL", 2))
223 g->fmt = bcf_str2int("PL", 2);
224 g->len /= 4; // 4 == sizeof(float)
225 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
226 for (i = 0; i < n_smpl * g->len; ++i) {
227 int x = (int)(-10. * d0[i] + .499);
228 if (x > 255) x = 255;
234 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
235 * produces such FMT, but others may do. */
236 int bcf_fix_gt(bcf1_t *b)
242 // check the presence of the GT FMT
243 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
244 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
245 tmp = bcf_str2int("GT", 2);
246 for (i = 0; i < b->n_gi; ++i)
247 if (b->gi[i].fmt == tmp) break;
248 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
250 // move GT to the first
251 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
253 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
254 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
258 int bcf_fix_pl(bcf1_t *b)
265 tmp = bcf_str2int("PL", 2);
266 for (i = 0; i < b->n_gi; ++i)
267 if (b->gi[i].fmt == tmp) break;
268 if (i == b->n_gi) return 0;
271 PL = (uint8_t*)gi->data;
272 swap = alloca(gi->len);
273 // loop through individuals
274 for (i = 0; i < b->n_smpl; ++i) {
276 uint8_t *PLi = PL + i * gi->len;
277 memcpy(swap, PLi, gi->len);
278 for (k = x = 0; k < b->n_alleles; ++k)
279 for (l = k; l < b->n_alleles; ++l)
280 PLi[l*(l+1)/2 + k] = swap[x++];
285 int bcf_smpl_covered(const bcf1_t *b)
291 tmp = bcf_str2int("PL", 2);
292 for (i = 0; i < b->n_gi; ++i)
293 if (b->gi[i].fmt == tmp) break;
294 if (i == b->n_gi) return 0;
295 // count how many samples having PL!=[0..0]
297 for (i = 0; i < b->n_smpl; ++i) {
298 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
299 for (j = 0; j < gi->len; ++j)
301 if (j < gi->len) ++n;
306 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
310 tmp = bcf_str2int(fmt, l);
311 for (i = 0; i < b->n_gi; ++i)
312 if (b->gi[i].fmt == tmp) break;
313 return i == b->n_gi? 0 : b->gi[i].data;
316 int bcf_anno_max(bcf1_t *b)
318 int k, max_gq, max_sp, n_het;
322 max_gq = max_sp = n_het = 0;
323 gt = locate_field(b, "GT", 2);
324 if (gt == 0) return -1;
325 gq = locate_field(b, "GQ", 2);
326 sp = locate_field(b, "SP", 2);
328 for (k = 0; k < b->n_smpl; ++k)
330 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
332 for (k = 0; k < b->n_smpl; ++k)
334 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
335 for (k = 0; k < b->n_smpl; ++k) {
337 a1 = gt[k]&7; a2 = gt[k]>>3&7;
338 if ((!a1 && a2) || (!a2 && a1)) { // a het
339 if (gq == 0) ++n_het;
340 else if (gq[k] >= 20) ++n_het;
343 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
344 if (max_sp < 0) max_sp = 0;
345 memset(&str, 0, sizeof(kstring_t));
346 if (*b->info) kputc(';', &str);
347 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
348 bcf_append_info(b, str.s, str.l);
353 // FIXME: only data are shuffled; the header is NOT
354 int bcf_shuffle(bcf1_t *b, int seed)
357 if (seed > 0) srand48(seed);
358 a = malloc(b->n_smpl * sizeof(int));
359 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
360 for (i = b->n_smpl; i > 1; --i) {
362 j = (int)(drand48() * i);
363 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
365 for (j = 0; j < b->n_gi; ++j) {
366 bcf_ginfo_t *gi = b->gi + j;
367 uint8_t *swap, *data = (uint8_t*)gi->data;
368 swap = malloc(gi->len * b->n_smpl);
369 for (i = 0; i < b->n_smpl; ++i)
370 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
378 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
383 khash_t(str2id) *hash;
385 s.l = s.m = 0; s.s = 0;
386 hash = kh_init(str2id);
387 for (i = 0; i < h0->n_smpl; ++i) {
388 k = kh_put(str2id, hash, h0->sns[i], &ret);
391 for (i = j = 0; i < n; ++i) {
392 k = kh_get(str2id, hash, samples[i]);
393 if (k != kh_end(hash)) {
394 list[j++] = kh_val(hash, k);
395 kputs(samples[i], &s); kputc('\0', &s);
398 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
399 kh_destroy(str2id, hash);
400 h = calloc(1, sizeof(bcf_hdr_t));
402 h->ns = 0; h->sns = 0;
403 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
404 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
405 h->l_smpl = s.l; h->sname = s.s;
410 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
413 for (j = 0; j < b->n_gi; ++j) {
414 bcf_ginfo_t *gi = b->gi + j;
416 swap = malloc(gi->len * b->n_smpl);
417 for (i = 0; i < n_smpl; ++i)
418 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
426 static int8_t nt4_table[128] = {
427 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
428 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
429 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
430 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
431 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
432 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
433 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
434 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
437 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
439 int a[4], k, l, map[4], k1, j, i;
440 const bcf_ginfo_t *PL;
442 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
443 for (i = 0; i < b->n_gi; ++i)
444 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
445 if (i == b->n_gi) return -1; // no PL
447 a[0] = nt4_table[(int)b->ref[0]];
448 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
449 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
450 if (b->alt[0] == 0) return -1; // no alternate allele
451 map[0] = map[1] = map[2] = map[3] = -2;
453 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
454 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
455 a[k+1] = nt4_table[(int)*s];
456 if (a[k+1] >= 0) map[a[k+1]] = k+1;
458 if (s[1] == 0) break; // the end of the ALT string
460 for (k = 0; k < 4; ++k)
461 if (map[k] < 0) map[k] = k1;
462 for (i = 0; i < b->n_smpl; ++i) {
463 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
464 uint8_t *g = gl + 10 * i;
465 for (k = j = 0; k < 4; ++k) {
466 for (l = k; l < 4; ++l) {
467 int t, x = map[k], y = map[l];
468 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
469 g[j++] = p[y * (y+1) / 2 + x];
476 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
479 const bcf_ginfo_t *PL;
480 if (b->alt[0] == 0) return -1; // no alternate allele
481 for (i = 0; i < b->n_gi; ++i)
482 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
483 if (i == b->n_gi) return -1; // no PL
485 for (i = 0; i < b->n_smpl; ++i) {
486 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
487 uint8_t *g = gl + 10 * i;
488 for (k = j = 0; k < 4; ++k) {
489 for (l = k; l < 4; ++l) {
491 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
492 x = y * (y+1) / 2 + x;
493 g[j++] = x < PL->len? p[x] : 255;