]> git.donarmstrong.com Git - samtools.git/blob - misc/ace2sam.c
added the ace2sam converter
[samtools.git] / misc / ace2sam.c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <zlib.h>
5 #include "kstring.h"
6 #include "kseq.h"
7 KSTREAM_INIT(gzFile, gzread, 16384)
8
9 #define N_TMPSTR 5
10
11 #define write_cigar(_c, _n, _m, _v) do { \
12                 if (_n == _m) { \
13                         _m = _m? _m<<1 : 4; \
14                         _c = realloc(_c, _m * sizeof(unsigned)); \
15                 } \
16                 _c[_n++] = (_v); \
17         } while (0)
18
19 int main(int argc, char *argv[])
20 {
21         gzFile fp;
22         kstream_t *ks;
23         kstring_t s, t[N_TMPSTR];
24         int dret, i, af_n, af_max, af_i;
25         long n_ctgs = 0, n_reads = 0, m_cigar = 0, n_cigar = 0;
26         unsigned *af, *cigar = 0;
27
28         if (argc == 1) {
29                 fprintf(stderr, "\nUsage: ace2sam <in.ace>\n\n");
30                 fprintf(stderr, "  ace2sam in.ace > a2s.out 2> a2s.err\n");
31                 fprintf(stderr, "  (grep ^H a2s.err | sed s,^..,,; cat a2s.out) > a2s.sam\n");
32                 fprintf(stderr, "  grep ^S a2s.err | sed s,^..,, > a2s.fa\n\n");
33                 return 1;
34         }
35         s.l = s.m = 0; s.s = 0;
36         af_n = af_max = af_i = 0; af = 0;
37         for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
38         fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
39         ks = ks_init(fp);
40         while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
41                 if (strcmp(s.s, "AS") == 0) {
42                         ks_getuntil(ks, 0, &s, &dret); n_ctgs = atol(s.s);
43                         ks_getuntil(ks, 0, &s, &dret); n_reads = atol(s.s);
44                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
45                 } else if (strcmp(s.s, "CO") == 0) {
46                         t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded; 2: unpadded; 3: CIGAR/flat-cigar; 4: SAM line
47                         af_n = af_i = 0;
48                         ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]);
49                         ks_getuntil(ks, 0, &s, &dret);
50                         ks_getuntil(ks, 0, &s, &dret); n_reads = atoi(s.s);
51                         ks_getuntil(ks, '\n', &s, &dret);
52                         fprintf(stderr, "S >%s\n", t[0].s);
53                         while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) {
54                                 kputs(s.s, &t[1]);
55                                 fputs("S ", stderr); fputs(s.s, stderr); fputc('\n', stderr);
56                         }
57
58 #define __padded2cigar(sp, su) do { \
59                 int i, l_M = 0, l_D = 0; \
60                 kputsn(sp.s, sp.l, &su); su.l = 0; \
61                 for (i = 0; i < sp.l; ++i) { \
62                         if (sp.s[i] == '*') { \
63                                 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
64                                 ++l_D; l_M = 0; \
65                         } else { \
66                                 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
67                                 ++l_M; l_D = 0; \
68                                 su.s[su.l++] = sp.s[i]; \
69                         } \
70                 } \
71                 su.s[su.l] = 0; \
72                 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
73                 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
74         } while (0)
75
76                         n_cigar = 0;
77                         __padded2cigar(t[1], t[2]);
78                         for (i = 0; i < n_cigar; ++i) {
79                                 kputw(cigar[i]>>4, &t[3]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[3]);
80                         }
81                         kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
82                         kputsn(t[3].s, t[3].l, &t[4]); kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
83                         fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[1].l);
84                 } else if (strcmp(s.s, "BQ") == 0) {
85                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
86                         t[4].s[--t[4].l] = 0;
87                         for (i = 0; i < t[2].l; ++i) {
88                                 int q;
89                                 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
90                                 q = atoi(s.s) + 33;
91                                 if (q > 126) q = 126;
92                                 kputc(q, &t[4]);
93                         }
94                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
95                         ks_getuntil(ks, '\n', &s, &dret);
96                         puts(t[4].s); t[4].l = 0;
97                 } else if (strcmp(s.s, "AF") == 0) { // read coordinate
98                         int reversed, neg, pos;
99                         if (t[4].l) puts(t[4].s);
100                         t[4].l = 0;
101                         ks_getuntil(ks, 0, &s, &dret);
102                         ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0;
103                         ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos;
104                         if (af_n == af_max) {
105                                 af_max = af_max? af_max<<1 : 4;
106                                 af = realloc(af, af_max * sizeof(unsigned));
107                         }
108                         af[af_n++] = pos << 2 | neg << 1 | reversed;
109                 } else if (strcmp(s.s, "RD") == 0) { // read sequence
110                         t[2].l = t[3].l = t[4].l = 0;
111                         ks_getuntil(ks, 0, &t[4], &dret); kputc('\t', &t[4]); // read name
112                         kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]);
113                         kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // the SAM line stops at RNAME
114                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
115                         while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]);
116                 } else if (strcmp(s.s, "QA") == 0) { // clipping
117                         int beg, end, pos;
118                         ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
119                         ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
120                         ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s);
121                         pos = af[af_i]>>2;
122                         if (af[af_i]>>1&1) pos = -pos;
123                         pos += beg;
124                         kputw(pos, &t[4]); kputs("\t60\t", &t[4]);
125                         n_cigar = 0; // start to generate CIGAR
126                         if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
127                         __padded2cigar(t[2], t[3]);
128                         if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l-end)<<4|4);
129                         if (n_cigar > 1 && (cigar[0]&0xf) == 4) cigar[1] -= beg<<4;
130                         if (n_cigar > 1 && (cigar[n_cigar-1]&0xf) == 4) cigar[n_cigar-2] -= cigar[n_cigar-1]>>4<<4;
131                         for (i = 0; i < n_cigar; ++i) {
132                                 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
133                         }
134                         kputs("\t*\t0\t0\t", &t[4]);
135                         kputsn(t[3].s, t[3].l, &t[4]); kputs("\t*", &t[4]);
136                         puts(t[4].s);
137                         ++af_i;
138                 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
139         }
140         ks_destroy(ks);
141         gzclose(fp);
142         free(af); free(s.s); free(cigar);
143         for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
144         return 0;
145 }