]> git.donarmstrong.com Git - samtools.git/blob - bcftools/vcf.c
d441d7f0b278bf0d6d72eb05d0e641a0926b04ef
[samtools.git] / bcftools / vcf.c
1 #include <zlib.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include "bcf.h"
6 #include "kstring.h"
7 #include "kseq.h"
8 KSTREAM_INIT(gzFile, gzread, 4096)
9
10 typedef struct {
11         gzFile fp;
12         FILE *fpout;
13         kstream_t *ks;
14         void *refhash;
15         kstring_t line;
16         int max_ref;
17 } vcf_t;
18
19 bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
20 {
21         kstring_t meta, smpl;
22         int dret;
23         vcf_t *v;
24         bcf_hdr_t *h;
25         if (!bp->is_vcf) return bcf_hdr_read(bp);
26         h = calloc(1, sizeof(bcf_hdr_t));
27         v = (vcf_t*)bp->v;
28         v->line.l = 0;
29         memset(&meta, 0, sizeof(kstring_t));
30         memset(&smpl, 0, sizeof(kstring_t));
31         while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
32                 if (v->line.l < 2) continue;
33                 if (v->line.s[0] != '#') return 0; // no sample line
34                 if (v->line.s[0] == '#' && v->line.s[1] == '#') {
35                         kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
36                 } else if (v->line.s[0] == '#') {
37                         int k;
38                         ks_tokaux_t aux;
39                         char *p;
40                         for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
41                                 if (k >= 9) {
42                                         kputsn(p, aux.p - p, &smpl);
43                                         kputc('\0', &smpl);
44                                 }
45                         }
46                         break;
47                 }
48         }
49         kputc('\0', &meta);
50         h->name = 0;
51         h->sname = smpl.s; h->l_smpl = smpl.l;
52         h->txt = meta.s; h->l_txt = meta.l;
53         bcf_hdr_sync(h);
54         return h;
55 }
56
57 bcf_t *vcf_open(const char *fn, const char *mode)
58 {
59         bcf_t *bp;
60         vcf_t *v;
61         if (strchr(mode, 'b')) return bcf_open(fn, mode);
62         bp = calloc(1, sizeof(bcf_t));
63         v = calloc(1, sizeof(vcf_t));
64         bp->is_vcf = 1;
65         bp->v = v;
66         v->refhash = bcf_str2id_init();
67         if (strchr(mode, 'r')) {
68                 v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
69                 v->ks = ks_init(v->fp);
70         } else if (strchr(mode, 'w'))
71                 v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
72         return bp;
73 }
74
75 int vcf_close(bcf_t *bp)
76 {
77         vcf_t *v;
78         if (bp == 0) return -1;
79         if (!bp->is_vcf) return bcf_close(bp);
80         v = (vcf_t*)bp->v;
81         if (v->fp) {
82                 ks_destroy(v->ks);
83                 gzclose(v->fp);
84         }
85         if (v->fpout) fclose(v->fpout);
86         free(v->line.s);
87         bcf_str2id_destroy(v->refhash);
88         free(v);
89         free(bp);
90         return 0;
91 }
92
93 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
94 {
95         vcf_t *v = (vcf_t*)bp->v;
96         int i, has_ref = 0, has_ver = 0;
97         if (!bp->is_vcf) return bcf_hdr_write(bp, h);
98         if (h->l_txt > 0) {
99                 if (strstr(h->txt, "##fileformat=")) has_ver = 1;
100                 if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
101                 fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
102                 if (strstr(h->txt, "##SQ=")) has_ref = 1;
103         }
104         if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
105         if (!has_ref) {
106                 fprintf(v->fpout, "##SQ=");
107                 for (i = 0; i < h->n_ref; ++i) {
108                         fprintf(v->fpout, "%s", h->ns[i]);
109                         fputc(i == h->n_ref - 1? '\n' : ',', v->fpout);
110                 }
111         }
112         fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
113         for (i = 0; i < h->n_smpl; ++i)
114                 fprintf(v->fpout, "\t%s", h->sns[i]);
115         fputc('\n', v->fpout);
116         return 0;
117 }
118
119 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
120 {
121         vcf_t *v = (vcf_t*)bp->v;
122         extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
123         if (!bp->is_vcf) return bcf_write(bp, h, b);
124         bcf_fmt_core(h, b, &v->line);
125         fwrite(v->line.s, 1, v->line.l, v->fpout);
126         fputc('\n', v->fpout);
127         return v->line.l + 1;
128 }
129
130 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
131 {
132         int dret, k, i, sync = 0;
133         vcf_t *v = (vcf_t*)bp->v;
134         char *p, *q;
135         kstring_t str, rn;
136         ks_tokaux_t aux, a2;
137         if (!bp->is_vcf) return bcf_read(bp, h, b);
138         v->line.l = 0;
139         str.l = 0; str.m = b->m_str; str.s = b->str;
140         rn.l = rn.m = h->l_nm; rn.s = h->name;
141         if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
142         for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
143                 *(char*)aux.p = 0;
144                 if (k == 0) { // ref
145                         int tid = bcf_str2id(v->refhash, p);
146                         if (tid < 0) {
147                                 tid = bcf_str2id_add(v->refhash, p);
148                                 kputs(p, &rn); kputc('\0', &rn);
149                                 sync = 1;
150                         }
151                         b->tid = tid;
152                 } else if (k == 1) { // pos
153                         b->pos = atoi(p) - 1;
154                 } else if (k == 5) { // qual
155                         b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
156                 } else if (k <= 8) { // variable length strings
157                         kputs(p, &str); kputc('\0', &str);
158                         b->l_str = str.l; b->m_str = str.m; b->str = str.s;
159                         if (k == 8) bcf_sync(h->n_smpl, b);
160                 } else { // k > 9
161                         if (strncmp(p, "./.", 3) == 0) {
162                                 for (i = 0; i < b->n_gi; ++i) {
163                                         if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
164                                                 ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
165                                         } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
166                                                 ((uint8_t*)b->gi[i].data)[k-9] = 0;
167                                         } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
168                                                 ((uint16_t*)b->gi[i].data)[k-9] = 0;
169                                         } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
170                                                 int y = b->n_alleles * (b->n_alleles + 1) / 2;
171                                                 memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
172                                         } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
173                                                 int y = b->n_alleles * (b->n_alleles + 1) / 2;
174                                                 memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
175                                         }
176                                 }
177                                 goto endblock;
178                         }
179                         for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
180                                 if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
181                                         ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
182                                 } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
183                                         double _x = strtod(q, &q);
184                                         int x = (int)(_x + .499);
185                                         if (x > 255) x = 255;
186                                         ((uint8_t*)b->gi[i].data)[k-9] = x;
187                                 } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
188                                         int x = strtol(q, &q, 10);
189                                         if (x > 0xffff) x = 0xffff;
190                                         ((uint16_t*)b->gi[i].data)[k-9] = x;
191                                 } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
192                                         int x, y, j;
193                                         uint8_t *data = (uint8_t*)b->gi[i].data;
194                                         y = b->n_alleles * (b->n_alleles + 1) / 2;
195                                         for (j = 0; j < y; ++j) {
196                                                 x = strtol(q, &q, 10);
197                                                 if (x > 255) x = 255;
198                                                 data[(k-9) * y + j] = x;
199                                                 ++q;
200                                         }
201                                 } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
202                                         int j, y;
203                                         float x, *data = (float*)b->gi[i].data;
204                                         y = b->n_alleles * (b->n_alleles + 1) / 2;
205                                         for (j = 0; j < y; ++j) {
206                                                 x = strtod(q, &q);
207                                                 data[(k-9) * y + j] = x;
208                                                 ++q;
209                                         }
210                                 }
211                         }
212                 endblock: i = i;
213                 }
214         }
215         h->l_nm = rn.l; h->name = rn.s;
216         if (sync) bcf_hdr_sync(h);
217         return v->line.l + 1;
218 }