]> git.donarmstrong.com Git - samtools.git/blob - kseq.h
* samtools-0.1.2-22
[samtools.git] / kseq.h
1 /* The MIT License
2
3    Copyright (c) 2008 Genome Research Ltd (GRL).
4
5    Permission is hereby granted, free of charge, to any person obtaining
6    a copy of this software and associated documentation files (the
7    "Software"), to deal in the Software without restriction, including
8    without limitation the rights to use, copy, modify, merge, publish,
9    distribute, sublicense, and/or sell copies of the Software, and to
10    permit persons to whom the Software is furnished to do so, subject to
11    the following conditions:
12
13    The above copyright notice and this permission notice shall be
14    included in all copies or substantial portions of the Software.
15
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23    SOFTWARE.
24 */
25
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
27
28 /* Last Modified: 12APR2009 */
29
30 #ifndef AC_KSEQ_H
31 #define AC_KSEQ_H
32
33 #include <ctype.h>
34 #include <string.h>
35 #include <stdlib.h>
36
/* Delimiter classes accepted by ks_getuntil(); any delimiter value above
 * KS_SEP_MAX is compared against the input as a literal character. */
#define KS_SEP_SPACE 0 /* stop at any isspace() character: ' ', \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* stop at any isspace() character other than ' ' */
#define KS_SEP_MAX   1 /* largest delimiter-class code */
40
/*
 * __KS_TYPE(type_t): declares kstream_t, a read buffer layered over an
 * input handle of type type_t.
 */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		char *buf;  /* read buffer */ \
		int begin;  /* index of the next unread byte in buf */ \
		int end;    /* value returned by the most recent read into buf */ \
		int is_eof; /* set once a read returns fewer bytes than requested */ \
		type_t f;   /* handle passed to the read function */ \
	} kstream_t;
47
/* Non-zero once the EOF flag is set and every buffered byte has been consumed. */
#define ks_eof(ks) \
	((ks)->is_eof && ((ks)->begin >= (ks)->end))

/* Zero the EOF flag and both buffer indices; only these three fields are
 * touched. The expression evaluates to 0. */
#define ks_rewind(ks) \
	((ks)->is_eof = ((ks)->begin = ((ks)->end = 0)))
50
/*
 * __KS_BASIC(type_t, __bufsize): emits ks_init() and ks_destroy() for
 * kstream_t.
 *
 * ks_init(f)      allocates a zeroed kstream_t and a __bufsize-byte buffer and
 *                 stores f in the stream. Returns 0 (null) if either
 *                 allocation fails, so callers must check the result.
 * ks_destroy(ks)  frees the buffer and the stream object; a null ks is
 *                 accepted. The handle f is not touched.
 */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == 0) return 0; \
		ks->buf = (char*)malloc(__bufsize); \
		if (ks->buf == 0) { /* never hand back a stream without a buffer */ \
			free(ks); \
			return 0; \
		} \
		ks->f = f; \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
66
/*
 * __KS_GETC(__read, __bufsize): emits ks_getc().
 *
 * ks_getc(ks) returns the next byte of the stream as a value in 0..255, or -1
 * when no more data is available. The buffer is refilled through
 * __read(ks->f, ks->buf, __bufsize) whenever it has been fully consumed.
 *
 * The byte is converted through unsigned char so that bytes >= 0x80 are not
 * sign-extended (0xFF would otherwise collide with the -1 sentinel). A zero
 * or negative result from __read ends the stream instead of being used as a
 * buffer length.
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* nothing read, or __read reported an error */ \
				ks->end = 0; \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		return (int)(unsigned char)ks->buf[ks->begin++]; \
	}
79
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable, heap-backed character buffer. */
typedef struct __kstring_t {
	size_t l; /* number of bytes in use, excluding the terminator */
	size_t m; /* number of bytes allocated for s */
	char *s;  /* buffer managed with realloc()/free(); may be null while m is 0 */
} kstring_t;
#endif
87
#ifndef kroundup32
/*
 * Round the integer lvalue x up, in place, to a power of two: decrement,
 * smear the highest set bit into every lower position (shifts cover 32 bits),
 * then increment. A value that is already a power of two is left unchanged.
 * x is evaluated several times, so it must be free of side effects.
 * The expression evaluates to the new value of x.
 */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, \
	 (x) |= (x) >> 2, \
	 (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, \
	 (x) |= (x) >> 16, \
	 ++(x))
#endif
91
/*
 * __KS_GETUNTIL(__read, __bufsize): emits ks_getuntil().
 *
 * ks_getuntil(ks, delimiter, str, dret) copies bytes from the stream into str
 * (replacing its previous contents) until a delimiter or the end of the
 * stream is reached. The delimiter itself is consumed but not stored.
 *
 *   delimiter  KS_SEP_SPACE, KS_SEP_TAB, or a literal character code greater
 *              than KS_SEP_MAX. Any other value matches nothing, so the rest
 *              of the stream is read.
 *   str        destination; grown with realloc() as needed and always
 *              NUL-terminated on a non-negative return.
 *   dret       if non-null, receives the delimiter that ended the token, or 0
 *              when the stream ended first.
 *
 * Returns the number of bytes stored in str, or -1 if the stream was already
 * exhausted on entry or memory could not be allocated.
 *
 * Bytes are converted through unsigned char before being classified or
 * compared, because isspace() is undefined for negative arguments. A zero or
 * negative result from __read ends the stream.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			size_t chunk; \
			if (ks->begin >= ks->end) { /* buffer consumed: refill it */ \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end < __bufsize) ks->is_eof = 1; \
				if (ks->end <= 0) { /* nothing read, or __read reported an error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if ((unsigned char)ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace((unsigned char)ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = ks->end; /* invalid delimiter: nothing matches */ \
			chunk = (size_t)(i - ks->begin); \
			if (str->m - str->l < chunk + 1) { /* need room for chunk plus the terminator */ \
				size_t cap = str->l + chunk + 1; \
				char *grown; \
				kroundup32(cap); \
				grown = (char*)realloc(str->s, cap); \
				if (grown == 0) return -1; /* str keeps its old buffer */ \
				str->s = grown; \
				str->m = cap; \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, chunk); \
			str->l += chunk; \
			ks->begin = i + 1; /* step over the delimiter, if one was found */ \
			if (i < ks->end) { \
				if (dret) *dret = (unsigned char)ks->buf[i]; \
				break; \
			} \
		} \
		if (str->s == 0) { /* stream ended before anything was buffered */ \
			str->s = (char*)calloc(1, 1); \
			if (str->s == 0) return -1; \
			str->m = 1; \
		} \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	}
134
/*
 * KSTREAM_INIT(type_t, __read, __bufsize): instantiates the buffered stream
 * layer for one handle type by expanding, in order, the kstream_t
 * declaration, ks_init()/ks_destroy(), ks_getc() and ks_getuntil().
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
140
/*
 * __KSEQ_BASIC(type_t): emits the lifecycle helpers for kseq_t.
 *
 * kseq_init(fd)     allocates a zeroed kseq_t and a stream over fd. Returns 0
 *                   (null) if the record or its stream cannot be created, so
 *                   callers must check the result.
 * kseq_rewind(ks)   clears last_char and the stream's buffer indices and EOF
 *                   flag; it does not reposition the handle itself.
 * kseq_destroy(ks)  frees the four string buffers, the stream and the record;
 *                   a null ks is accepted.
 */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == 0) return 0; \
		s->f = ks_init(fd); \
		if (s->f == 0) { /* no stream: release the half-built record */ \
			free(s); \
			return 0; \
		} \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
160
/* kseq_read(seq): read the next FASTA/FASTQ record into seq.

   The name, comment, sequence and (for FASTQ) quality strings are stored in
   seq->name, seq->comment, seq->seq and seq->qual. seq->seq.s is always
   allocated and NUL-terminated on a non-negative return, including for a
   record that has a header but no sequence characters.

   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
   -3   out of memory while growing the sequence or quality buffer
 */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		char *grown; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			/* keep printable non-space characters only; the cast keeps isgraph() defined */ \
			if (!isgraph((unsigned char)c)) continue; \
			if (seq->seq.l + 1 >= seq->seq.m) { /* room for this char plus the terminator */ \
				size_t cap = seq->seq.l + 2; \
				kroundup32(cap); /* rounded to next closest 2^k */ \
				grown = (char*)realloc(seq->seq.s, cap); \
				if (grown == 0) return -3; /* seq->seq keeps its old buffer */ \
				seq->seq.s = grown; \
				seq->seq.m = cap; \
			} \
			seq->seq.s[seq->seq.l++] = (char)c; \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.s == 0) { /* header-only record: nothing was appended, so nothing was allocated */ \
			seq->seq.s = (char*)malloc(1); \
			if (seq->seq.s == 0) return -3; \
			seq->seq.m = 1; \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return (int)seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
			grown = (char*)realloc(seq->qual.s, seq->seq.m); \
			if (grown == 0) return -3; \
			seq->qual.s = grown; \
			seq->qual.m = seq->seq.m; \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return (int)seq->seq.l; \
	}
205
/*
 * __KSEQ_TYPE(type_t): declares kseq_t, the record filled in by kseq_read().
 * The type_t argument is accepted but not used in the expansion.
 */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name;    /* text read after the header marker, up to the first whitespace */ \
		kstring_t comment; /* rest of the header line */ \
		kstring_t seq;     /* sequence characters */ \
		kstring_t qual;    /* quality characters (FASTQ only) */ \
		int last_char;     /* header marker already consumed for the next record, or 0 */ \
		kstream_t *f;      /* buffered input stream */ \
	} kseq_t;
212
/*
 * KSEQ_INIT(type_t, __read): instantiates the complete reader for one handle
 * type: the buffered stream layer with a 4096-byte buffer, then kseq_t, its
 * init/rewind/destroy helpers, and kseq_read().
 */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
218
219 #endif