+static char **read_samples(const char *fn, int *_n)
+{
+ gzFile fp;
+ kstream_t *ks;
+ kstring_t s;
+ int dret, n = 0, max = 0;
+ char **sam = 0;
+ *_n = 0;
+ s.l = s.m = 0; s.s = 0;
+ fp = gzopen(fn, "r");
+ if (fp == 0) return 0; // fail to open file
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ int l;
+ if (max == n) {
+ max = max? max<<1 : 4;
+ sam = realloc(sam, sizeof(void*)*max);
+ }
+ l = s.l;
+ sam[n] = malloc(s.l + 2);
+ strcpy(sam[n], s.s);
+ sam[n][l+1] = 2; // by default, diploid
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2
+ int x = (int)s.s[0] - '0';
+ if (x == 1 || x == 2) sam[n][l+1] = x;
+ else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__);
+ }
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ++n;
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(s.s);
+ *_n = n;
+ return sam;
+}
+
+static void write_header(bcf_hdr_t *h)
+{
+ kstring_t str;
+ str.l = h->l_txt? h->l_txt - 1 : 0;
+ str.m = str.l + 1; str.s = h->txt;
+ if (!strstr(str.s, "##INFO=<ID=DP,"))
+ kputs("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=DP4,"))
+ kputs("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=MQ,"))
+ kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=FQ,"))
+ kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AF1,"))
+ kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC1,"))
+ kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=G3,"))
+ kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=HWE,"))
+ kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=CLR,"))
+ kputs("##INFO=<ID=CLR,Number=1,Type=Integer,Description=\"Log ratio of genotype likelihoods with and without the constraint\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=UGT,"))
+ kputs("##INFO=<ID=UGT,Number=1,Type=String,Description=\"The most probable unconstrained genotype configuration in the trio\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=CGT,"))
+ kputs("##INFO=<ID=CGT,Number=1,Type=String,Description=\"The most probable constrained genotype configuration in the trio\">\n", &str);
+// if (!strstr(str.s, "##INFO=<ID=CI95,"))
+// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PV4,"))
+ kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=INDEL,"))
+ kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PC2,"))
+ kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=PCHI2,"))
+ kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=QCHI2,"))
+ kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=RP,"))
+ kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=VDB,"))
+ kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GT,"))
+ kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
+ kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=GL,"))
+ kputs("##FORMAT=<ID=GL,Number=3,Type=Float,Description=\"Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=DP,"))
+ kputs("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"# high-quality bases\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=DV,"))
+ kputs("##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality non-reference bases\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=SP,"))
+ kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
+ if (!strstr(str.s, "##FORMAT=<ID=PL,"))
+ kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
+ h->l_txt = str.l + 1; h->txt = str.s;
+}
+
+double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+