git.donarmstrong.com Git - samtools.git/blob - kseq.h
Refine logic for removing superfluous P ops (still testing ...)
[samtools.git] / kseq.h
1 /* The MIT License
2
3    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 /* Last Modified: 05MAR2012 */
27
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <stdlib.h>
34
/* Delimiter classes understood by ks_getuntil()/ks_getuntil2(). A delimiter
 * value greater than KS_SEP_MAX is compared against input bytes literally. */
#define KS_SEP_SPACE 0 /* any isspace() byte: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* any isspace() byte except ' ' */
#define KS_SEP_LINE  2 /* end of line: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2 /* largest reserved delimiter class */
39
/*
 * __KS_TYPE(type_t): declare kstream_t, the buffered-reader state kept for an
 * input handle of type `type_t`.
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; /* read buffer */ \
		int begin;  /* index of the next unread byte in buf */ \
		int end;    /* one past the last valid byte in buf */ \
		int is_eof; /* set once a read returned fewer bytes than requested */ \
		type_t f;   /* handle passed to the read function */ \
	} kstream_t;
46
/* Non-zero once the EOF flag is set and every buffered byte has been consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->end <= (ks)->begin)
/* Drop all buffered data and clear the EOF flag; the expression evaluates to 0.
 * Only the three bookkeeping fields are touched; the handle itself is not. */
#define ks_rewind(ks) ((ks)->is_eof = ((ks)->begin = ((ks)->end = 0)))
49
/*
 * __KS_BASIC(type_t, __bufsize): define the stream constructor and destructor.
 *
 * ks_init(f)      allocates a zero-initialised kstream_t wrapping handle `f`
 *                 plus a read buffer of __bufsize bytes. Returns NULL if
 *                 either allocation fails; nothing is leaked in that case.
 * ks_destroy(ks)  frees the buffer and the stream object; NULL is accepted.
 *                 The wrapped handle `f` is left untouched.
 */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->f = f; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* never hand out a stream without a buffer */ \
			free(ks); \
			return NULL; \
		} \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
65
/*
 * __KS_GETC(__read, __bufsize): define ks_getc().
 *
 * ks_getc(ks) returns the next byte of the stream as a value in 0..255, or -1
 * when no more data is available. Whenever the buffer has been fully consumed
 * it is refilled with __read(ks->f, ks->buf, __bufsize).
 *
 * A read that delivers fewer than __bufsize bytes is taken to be the last one.
 * A zero or negative return from __read (end of input or read error) yields
 * -1 and leaves the buffer empty, so stale buffer contents are never returned.
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* EOF or read error: buf holds nothing valid */ \
				ks->end = 0; \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
78
/* The KSTRING_T guard skips this definition when kstring_t has already been
 * declared by another header. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l; /* bytes in use, not counting the terminating NUL */
	size_t m; /* bytes allocated for s */
	char *s;  /* heap buffer; may be NULL before the first write */
} kstring_t;
#endif
86
#ifndef kroundup32
/* Round the lvalue x up, in place, to the next power of two (a value that is
 * already a power of two is kept). Only the low 32 bits are smeared, and x is
 * evaluated several times, so it must be free of side effects. */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif
90
/*
 * __KS_GETUNTIL(__read, __bufsize): define ks_getuntil2() and ks_getuntil().
 *
 * ks_getuntil2(ks, delimiter, str, dret, append) copies bytes from the stream
 * into `str` until a delimiter is met or the input ends.
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB or KS_SEP_LINE, or, when greater than
 *              KS_SEP_MAX, a literal byte value to stop at
 *   str        destination; grown with realloc() as needed and NUL-terminated
 *              before a non-negative return
 *   dret       if non-NULL, receives the delimiter byte that ended the token,
 *              or 0 when the input ended first
 *   append     non-zero to extend the current contents of str, zero to
 *              overwrite them
 * The delimiter byte is consumed but not stored. With KS_SEP_LINE a trailing
 * '\r' is dropped too, provided the string holds more than one byte.
 * Returns the resulting length of str, or -1 if the stream was already
 * exhausted on entry.
 *
 * ks_getuntil() is ks_getuntil2() with append == 0.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		if (dret) *dret = 0; \
		if (!append) str->l = 0; \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		for (;;) { \
			int stop, span; \
			if (ks->begin >= ks->end) { /* buffer drained: refill or give up */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end < __bufsize) ks->is_eof = 1; \
				if (ks->end == 0) break; \
			} \
			stop = ks->begin; /* advance to the first delimiter or to the buffer end */ \
			if (delimiter == KS_SEP_LINE) { \
				while (stop < ks->end && ks->buf[stop] != '\n') ++stop; \
			} else if (delimiter > KS_SEP_MAX) { \
				while (stop < ks->end && ks->buf[stop] != delimiter) ++stop; \
			} else if (delimiter == KS_SEP_SPACE) { \
				while (stop < ks->end && !isspace(ks->buf[stop])) ++stop; \
			} else if (delimiter == KS_SEP_TAB) { \
				while (stop < ks->end && !(isspace(ks->buf[stop]) && ks->buf[stop] != ' ')) ++stop; \
			} else stop = 0; /* not reached for the delimiter values listed above */ \
			span = stop - ks->begin; \
			if (str->m - str->l < (size_t)(span + 1)) { /* keep one spare byte for the NUL */ \
				str->m = str->l + span + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, span); \
			str->l += span; \
			ks->begin = stop + 1; /* step over the delimiter */ \
			if (stop < ks->end) { /* stopped on a delimiter, not on the buffer end */ \
				if (dret) *dret = ks->buf[stop]; \
				break; \
			} \
		} \
		if (str->s == 0) { /* nothing was ever stored: hand back an empty string */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
142
/*
 * KSTREAM_INIT(type_t, __read, __bufsize): emit the complete stream layer for
 * one handle type: the kstream_t type, ks_init()/ks_destroy(), ks_getc() and
 * ks_getuntil2()/ks_getuntil(). `__read` is invoked as
 * __read(handle, buffer, __bufsize).
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
148
/* Reset the parser state of a kseq_t: clears last_char and the begin/end/is_eof
 * fields of its stream. Evaluates to 0. The input handle itself is not touched. */
#define kseq_rewind(ks) \
	((ks)->last_char = ((ks)->f->is_eof = ((ks)->f->begin = ((ks)->f->end = 0))))
150
/*
 * __KSEQ_BASIC(SCOPE, type_t): define the record-reader constructor and
 * destructor with storage class SCOPE.
 *
 * kseq_init(fd)     allocates a zero-initialised kseq_t reading from `fd`.
 *                   Returns NULL if the reader or its stream cannot be
 *                   allocated; nothing is leaked in that case.
 * kseq_destroy(ks)  frees the four string buffers, the stream and the reader;
 *                   NULL is accepted. `fd` is not released here.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* a reader without a stream is unusable */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
165
/*
 * __KSEQ_READ(SCOPE): define kseq_read(), which parses the next FASTA or FASTQ
 * record from seq->f into seq->name, seq->comment, seq->seq and seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 *   -3   out of memory while sizing the sequence or quality buffer; the
 *        buffers already owned by `seq` stay valid and are still released by
 *        kseq_destroy()
 *
 * comment.l, seq.l and qual.l are reset to 0 for every record. When a record
 * has no comment or no quality line the corresponding buffer is not rewritten,
 * so callers should test the length fields rather than the buffer contents.
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		char *tmp; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			tmp = (char*)malloc(256); \
			if (tmp == 0) { \
				seq->last_char = 0; /* header line already consumed: resync on the next call */ \
				return -3; \
			} \
			seq->seq.s = tmp; \
			seq->seq.m = 256; \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			size_t need = seq->seq.l + 2; \
			kroundup32(need); /* rounded to the next closest 2^k */ \
			tmp = (char*)realloc(seq->seq.s, need); \
			if (tmp == 0) { /* the old buffer is still owned by seq */ \
				if (c == '+') seq->last_char = 0; /* quality part is skipped: resync on the next call */ \
				return -3; \
			} \
			seq->seq.s = tmp; \
			seq->seq.m = need; \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
			tmp = (char*)realloc(seq->qual.s, seq->seq.m); \
			if (tmp == 0) { /* the old quality buffer is still owned by seq */ \
				seq->last_char = 0; /* resync on the next call */ \
				return -3; \
			} \
			seq->qual.s = tmp; \
			seq->qual.m = seq->seq.m; \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
212
/*
 * __KSEQ_TYPE(type_t): declare kseq_t, the state of one FASTA/FASTQ reader.
 * `type_t` is accepted for symmetry with the other macros; the struct itself
 * only refers to kstream_t.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name;    /* first token of the header line */ \
		kstring_t comment; /* rest of the header line, if any */ \
		kstring_t seq;     /* sequence with line breaks removed */ \
		kstring_t qual;    /* quality string (FASTQ records only) */ \
		int last_char;     /* header character consumed by the previous read, or 0 */ \
		kstream_t *f;      /* buffered input stream */ \
	} kseq_t;
219
/*
 * KSEQ_INIT2(SCOPE, type_t, __read): instantiate the stream layer with a
 * 16384-byte buffer plus kseq_t, kseq_init(), kseq_destroy() and kseq_read().
 * SCOPE is the storage class given to the three kseq_* functions.
 */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)
225
/* KSEQ_INIT(type_t, __read): KSEQ_INIT2 with `static` kseq_* functions, i.e. a
 * reader private to the including translation unit. */
#define KSEQ_INIT(type_t, __read) \
	KSEQ_INIT2(static, type_t, __read)
227
/*
 * KSEQ_DECLARE(type_t): declare kstream_t, kseq_t and the prototypes of the
 * three kseq_* functions without defining them.
 */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);
234
235 #endif