3 Copyright (c) 2011 Heng Li <lh3@live.co.uk>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 KSTREAM_INIT(gzFile, gzread, 16384)
38 // append a CIGAR operation plus length
39 #define write_cigar(_c, _n, _m, _v) do { \
42 _c = realloc(_c, _m * sizeof(unsigned)); \
48 static void fatal(const char *msg)
50 fprintf(stderr, "E %s\n", msg);
54 static void remove_pads(const kstring_t *src, kstring_t *dst)
58 kputsn(src->s, src->l, dst);
59 for (i = j = 0; i < dst->l; ++i)
60 if (dst->s[i] != '*') dst->s[j++] = dst->s[i];
65 int main(int argc, char *argv[])
69 kstring_t s, t[N_TMPSTR];
70 int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
71 long m_cigar = 0, n_cigar = 0;
72 unsigned *af, *cigar = 0;
74 while ((c = getopt(argc, argv, "pc")) >= 0) {
76 case 'p': is_padded = 1; break;
77 case 'c': write_cns = 1; break;
81 fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n");
82 fprintf(stderr, "Options: -p output padded SAM\n");
83 fprintf(stderr, " -c write the contig sequence in SAM\n\n");
84 fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
85 fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n");
86 fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
87 fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n");
91 s.l = s.m = 0; s.s = 0;
92 af_n = af_max = af_i = 0; af = 0;
93 for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
94 fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
96 while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
97 if (strcmp(s.s, "CO") == 0) { // contig sequence
99 t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
100 af_n = af_i = 0; // reset the af array
101 ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
102 ks_getuntil(ks, '\n', &s, &dret); // read the whole line
103 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
104 remove_pads(&t[1], &t[2]); // construct the unpadded sequence
105 // compute the array for mapping padded positions to unpadded positions
106 p2u = realloc(p2u, t[1].m * sizeof(int));
107 for (i = k = 0; i < t[1].l; ++i) {
109 if (t[1].s[i] != '*') ++k;
111 // write out the SAM header and contig sequences
112 fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
113 cns = &t[is_padded?1:2];
114 fprintf(stderr, "S >%s\n", t[0].s);
115 for (i = 0; i < cns->l; i += LINE_LEN) {
117 for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
118 fputc(cns->s[i + k], stderr);
122 #define __padded2cigar(sp) do { \
123 int i, l_M = 0, l_D = 0; \
124 for (i = 0; i < sp.l; ++i) { \
125 if (sp.s[i] == '*') { \
126 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
129 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
133 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
134 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
137 if (write_cns) { // write the consensus SAM line (dummy read)
139 if (is_padded) __padded2cigar(t[1]);
140 else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
141 kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
142 for (i = 0; i < n_cigar; ++i) {
143 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
145 kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
147 } else if (strcmp(s.s, "BQ") == 0) { // contig quality
148 if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
149 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
150 if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
151 for (i = 0; i < t[2].l; ++i) { // read the consensus quality
153 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
156 if (q > 126) q = 126;
157 if (write_cns) kputc(q, &t[4]);
160 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
161 ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
162 if (write_cns) puts(t[4].s); t[4].l = 0;
163 } else if (strcmp(s.s, "AF") == 0) { // padded read position
164 int reversed, neg, pos;
165 if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
167 if (t[4].l) puts(t[4].s);
170 ks_getuntil(ks, 0, &s, &dret); // read name
171 ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
172 ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
173 if (af_n == af_max) { // double the af array
174 af_max = af_max? af_max<<1 : 4;
175 af = realloc(af, af_max * sizeof(unsigned));
177 af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
178 } else if (strcmp(s.s, "RD") == 0) { // read sequence
179 if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
180 t[2].l = t[3].l = t[4].l = 0;
181 ks_getuntil(ks, 0, &t[4], &dret); // QNAME
182 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
183 while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
184 } else if (strcmp(s.s, "QA") == 0) { // clipping
185 if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
186 int beg, end, pos, op;
187 ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
188 ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
189 ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
190 if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
191 // compute 1-based POS
192 pos = af[af_i]>>2; // retrieve the position information
193 if (af[af_i]>>1&1) pos = -pos;
194 pos += beg; // now pos is the true padded position
196 remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
198 if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
200 __padded2cigar(t[2]);
201 if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR
202 if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
204 // generate flattened CIGAR string
205 for (i = beg, k = pos - 1; i < end; ++i, ++k)
206 t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
207 // generate the proper CIGAR
208 for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
209 if (op != t[2].s[i]) {
210 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
211 op = t[2].s[i]; k = 1;
214 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
215 // remove unnecessary "P" and possibly merge adjacent operations
216 for (i = 2; i < n_cigar; ++i) {
217 if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
219 if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
220 cigar[i] += cigar[i-2], cigar[i-2] = 0;
223 for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
224 if (cigar[i]) cigar[k++] = cigar[i];
227 if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
228 // write the SAM line for the read
229 kputc('\t', &t[4]); // QNAME has already been written
230 kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
231 kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
232 kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
233 kputs("\t60\t", &t[4]); // MAPQ
234 for (i = 0; i < n_cigar; ++i) { // CIGAR
235 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
237 kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
238 kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
239 kputs("\t*", &t[4]); // QUAL
240 puts(t[4].s); // print to stdout
242 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
246 free(af); free(s.s); free(cigar); free(p2u);
247 for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);