]> git.donarmstrong.com Git - rsem.git/blob - sam/misc/ace2sam.c
Updated samtools to 0.1.19
[rsem.git] / sam / misc / ace2sam.c
1 /* The MIT License
2
3    Copyright (c) 2011  Heng Li <lh3@live.co.uk>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <zlib.h>
31 #include "kstring.h"
32 #include "kseq.h"
33 KSTREAM_INIT(gzFile, gzread, 16384)
34
35 #define N_TMPSTR 5
36 #define LINE_LEN 60
37
38 // append a CIGAR operation plus length
39 #define write_cigar(_c, _n, _m, _v) do { \
40                 if (_n == _m) { \
41                         _m = _m? _m<<1 : 4; \
42                         _c = realloc(_c, _m * sizeof(unsigned)); \
43                 } \
44                 _c[_n++] = (_v); \
45         } while (0)
46
47 // a fatal error
48 static void fatal(const char *msg)
49 {
50         fprintf(stderr, "E %s\n", msg);
51         exit(1);
52 }
53 // remove pads
54 static void remove_pads(const kstring_t *src, kstring_t *dst)
55 {
56         int i, j;
57         dst->l = 0;
58         kputsn(src->s, src->l, dst);
59         for (i = j = 0; i < dst->l; ++i)
60                 if (dst->s[i] != '*') dst->s[j++] = dst->s[i];
61         dst->s[j] = 0;
62         dst->l = j;
63 }
64
65 int main(int argc, char *argv[])
66 {
67         gzFile fp;
68         kstream_t *ks;
69         kstring_t s, t[N_TMPSTR];
70         int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
71         long m_cigar = 0, n_cigar = 0;
72         unsigned *af, *cigar = 0;
73
74         while ((c = getopt(argc, argv, "pc")) >= 0) {
75                 switch (c) {
76                         case 'p': is_padded = 1; break;
77                         case 'c': write_cns = 1; break;
78                 }
79         }
80         if (argc == optind) {
81                 fprintf(stderr, "\nUsage:   ace2sam [-pc] <in.ace>\n\n");
82                 fprintf(stderr, "Options: -p     output padded SAM\n");
83                 fprintf(stderr, "         -c     write the contig sequence in SAM\n\n");
84                 fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
85                 fprintf(stderr, "       2. The order of reads in AF and in RD must be identical\n");
86                 fprintf(stderr, "       3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
87                 fprintf(stderr, "       4. This program writes the headerless SAM to stdout and header to stderr\n\n");
88                 return 1;
89         }
90
91         s.l = s.m = 0; s.s = 0;
92         af_n = af_max = af_i = 0; af = 0;
93         for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
94         fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
95         ks = ks_init(fp);
96         while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
97                 if (strcmp(s.s, "CO") == 0) { // contig sequence
98                         kstring_t *cns;
99                         t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
100                         af_n = af_i = 0; // reset the af array
101                         ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
102                         ks_getuntil(ks, '\n', &s, &dret); // read the whole line
103                         while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
104                         remove_pads(&t[1], &t[2]); // construct the unpadded sequence
105                         // compute the array for mapping padded positions to unpadded positions
106                         p2u = realloc(p2u, t[1].m * sizeof(int));
107                         for (i = k = 0; i < t[1].l; ++i) {
108                                 p2u[i] = k;
109                                 if (t[1].s[i] != '*') ++k;
110                         }
111                         // write out the SAM header and contig sequences
112                         fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line
113                         cns = &t[is_padded?1:2];
114                         fprintf(stderr, "S >%s\n", t[0].s);
115                         for (i = 0; i < cns->l; i += LINE_LEN) {
116                                 fputs("S ", stderr);
117                                 for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
118                                         fputc(cns->s[i + k], stderr);
119                                 fputc('\n', stderr);
120                         }
121
122 #define __padded2cigar(sp) do { \
123                 int i, l_M = 0, l_D = 0; \
124                 for (i = 0; i < sp.l; ++i) { \
125                         if (sp.s[i] == '*') { \
126                                 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
127                                 ++l_D; l_M = 0; \
128                         } else { \
129                                 if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
130                                 ++l_M; l_D = 0; \
131                         } \
132                 } \
133                 if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
134                 else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
135         } while (0)
136
137                         if (write_cns) { // write the consensus SAM line (dummy read)
138                                 n_cigar = 0;
139                                 if (is_padded) __padded2cigar(t[1]);
140                                 else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
141                                 kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
142                                 for (i = 0; i < n_cigar; ++i) {
143                                         kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
144                                 }
145                                 kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
146                         }
147                 } else if (strcmp(s.s, "BQ") == 0) { // contig quality
148                         if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'");
149                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line
150                         if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*"
151                         for (i = 0; i < t[2].l; ++i) { // read the consensus quality
152                                 int q;
153                                 if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n");
154                                 if (s.l) {
155                                         q = atoi(s.s) + 33;
156                                         if (q > 126) q = 126;
157                                         if (write_cns) kputc(q, &t[4]);
158                                 } else --i;
159                         }
160                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
161                         ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
162                         if (write_cns) puts(t[4].s); t[4].l = 0;
163                 } else if (strcmp(s.s, "AF") == 0) { // padded read position
164                         int reversed, neg, pos;
165                         if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
166                         if (write_cns) {
167                                 if (t[4].l) puts(t[4].s);
168                                 t[4].l = 0;
169                         }
170                         ks_getuntil(ks, 0, &s, &dret); // read name
171                         ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand
172                         ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position
173                         if (af_n == af_max) { // double the af array
174                                 af_max = af_max? af_max<<1 : 4;
175                                 af = realloc(af, af_max * sizeof(unsigned));
176                         }
177                         af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information
178                 } else if (strcmp(s.s, "RD") == 0) { // read sequence
179                         if (af_i >= af_n) fatal("more 'RD' records than 'AF'");
180                         t[2].l = t[3].l = t[4].l = 0;
181                         ks_getuntil(ks, 0, &t[4], &dret); // QNAME
182                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line
183                         while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence
184                 } else if (strcmp(s.s, "QA") == 0) { // clipping
185                         if (af_i >= af_n) fatal("more 'QA' records than 'AF'");
186                         int beg, end, pos, op;
187                         ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping
188                         ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start
189                         ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end
190                         if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
191                         // compute 1-based POS
192                         pos = af[af_i]>>2; // retrieve the position information
193                         if (af[af_i]>>1&1) pos = -pos;
194                         pos += beg; // now pos is the true padded position
195                         // generate CIGAR
196                         remove_pads(&t[2], &t[3]); // backup the unpadded read sequence
197                         n_cigar = 0;
198                         if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4);
199                         if (is_padded) {
200                                 __padded2cigar(t[2]);
201                                 if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR 
202                                 if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR
203                         } else {
204                                 // generate flattened CIGAR string
205                                 for (i = beg, k = pos - 1; i < end; ++i, ++k)
206                                         t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 2 : 6);
207                                 // generate the proper CIGAR
208                                 for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) {
209                                         if (op != t[2].s[i]) {
210                                                 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
211                                                 op = t[2].s[i]; k = 1;
212                                         } else ++k;
213                                 }
214                                 write_cigar(cigar, n_cigar, m_cigar, k<<4|op);
215                                 // remove unnecessary "P" and possibly merge adjacent operations
216                                 for (i = 2; i < n_cigar; ++i) {
217                                         if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) {
218                                                 cigar[i-1] = 0;
219                                                 if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations
220                                                         cigar[i] += cigar[i-2], cigar[i-2] = 0;
221                                         }
222                                 }
223                                 for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations
224                                         if (cigar[i]) cigar[k++] = cigar[i];
225                                 n_cigar = k;
226                         }
227                         if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4);
228                         // write the SAM line for the read
229                         kputc('\t', &t[4]); // QNAME has already been written
230                         kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG
231                         kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME
232                         kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS
233                         kputs("\t60\t", &t[4]); // MAPQ
234                         for (i = 0; i < n_cigar; ++i) { // CIGAR
235                                 kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
236                         }
237                         kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN
238                         kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ
239                         kputs("\t*", &t[4]); // QUAL
240                         puts(t[4].s); // print to stdout
241                         ++af_i;
242                 } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
243         }
244         ks_destroy(ks);
245         gzclose(fp);
246         free(af); free(s.s); free(cigar); free(p2u);
247         for (i = 0; i < N_TMPSTR; ++i) free(t[i].s);
248         return 0;
249 }