]> git.donarmstrong.com Git - samtools.git/blobdiff - kseq.h
works
[samtools.git] / kseq.h
diff --git a/kseq.h b/kseq.h
index e99a8abb2abf7c70ae83d4d3162d994fdbbdec30..a5cec7c289aeda5fe36edb2a828a9f669741d3a3 100644 (file)
--- a/kseq.h
+++ b/kseq.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -23,9 +23,7 @@
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
-/* Last Modified: 12APR2009 */
+/* Last Modified: 05MAR2012 */
 
 #ifndef AC_KSEQ_H
 #define AC_KSEQ_H
 
 #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 #define KS_SEP_TAB   1 // isspace() && !' '
-#define KS_SEP_MAX   1
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
 
 #define __KS_TYPE(type_t)                                              \
        typedef struct __kstream_t {                            \
-               char *buf;                                                              \
+               unsigned char *buf;                                             \
                int begin, end, is_eof;                                 \
                type_t f;                                                               \
        } kstream_t;
@@ -53,7 +52,7 @@
        {                                                                                                                               \
                kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
                ks->f = f;                                                                                                      \
-               ks->buf = (char*)malloc(__bufsize);                                                     \
+               ks->buf = (unsigned char*)malloc(__bufsize);                            \
                return ks;                                                                                                      \
        }                                                                                                                               \
        static inline void ks_destroy(kstream_t *ks)                                    \
@@ -90,10 +89,10 @@ typedef struct __kstring_t {
 #endif
 
 #define __KS_GETUNTIL(__read, __bufsize)                                                               \
-       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
        {                                                                                                                                       \
                if (dret) *dret = 0;                                                                                    \
-               str->l = 0;                                                                                                             \
+               str->l = append? str->l : 0;                                                                    \
                if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
                for (;;) {                                                                                                              \
                        int i;                                                                                                          \
@@ -105,7 +104,10 @@ typedef struct __kstring_t {
                                        if (ks->end == 0) break;                                                        \
                                } else break;                                                                                   \
                        }                                                                                                                       \
-                       if (delimiter > KS_SEP_MAX) {                                                           \
+                       if (delimiter == KS_SEP_LINE) { \
+                               for (i = ks->begin; i < ks->end; ++i) \
+                                       if (ks->buf[i] == '\n') break; \
+                       } else if (delimiter > KS_SEP_MAX) {                                            \
                                for (i = ks->begin; i < ks->end; ++i)                                   \
                                        if (ks->buf[i] == delimiter) break;                                     \
                        } else if (delimiter == KS_SEP_SPACE) {                                         \
@@ -115,7 +117,7 @@ typedef struct __kstring_t {
                                for (i = ks->begin; i < ks->end; ++i)                                   \
                                        if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
                        } else i = 0; /* never come to here! */                                         \
-                       if (str->m - str->l < i - ks->begin + 1) {                                      \
+                       if (str->m - str->l < (size_t)(i - ks->begin + 1)) {            \
                                str->m = str->l + (i - ks->begin) + 1;                                  \
                                kroundup32(str->m);                                                                             \
                                str->s = (char*)realloc(str->s, str->m);                                \
@@ -128,9 +130,15 @@ typedef struct __kstring_t {
                                break;                                                                                                  \
                        }                                                                                                                       \
                }                                                                                                                               \
+               if (str->s == 0) {                                                                                              \
+                       str->m = 1;                                                                                                     \
+                       str->s = (char*)calloc(1, 1);                                                           \
+               } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
                str->s[str->l] = '\0';                                                                                  \
                return str->l;                                                                                                  \
-       }
+       } \
+       static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       { return ks_getuntil2(ks, delimiter, str, dret, 0); }
 
 #define KSTREAM_INIT(type_t, __read, __bufsize) \
        __KS_TYPE(type_t)                                                       \
@@ -138,19 +146,16 @@ typedef struct __kstring_t {
        __KS_GETC(__read, __bufsize)                            \
        __KS_GETUNTIL(__read, __bufsize)
 
-#define __KSEQ_BASIC(type_t)                                                                                   \
-       static inline kseq_t *kseq_init(type_t fd)                                                      \
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)                                                                            \
+       SCOPE kseq_t *kseq_init(type_t fd)                                                                      \
        {                                                                                                                                       \
                kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
                s->f = ks_init(fd);                                                                                             \
                return s;                                                                                                               \
        }                                                                                                                                       \
-       static inline void kseq_rewind(kseq_t *ks)                                                      \
-       {                                                                                                                                       \
-               ks->last_char = 0;                                                                                              \
-               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
-       }                                                                                                                                       \
-       static inline void kseq_destroy(kseq_t *ks)                                                     \
+       SCOPE void kseq_destroy(kseq_t *ks)                                                                     \
        {                                                                                                                                       \
                if (!ks) return;                                                                                                \
                free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
@@ -163,44 +168,46 @@ typedef struct __kstring_t {
    -1   end-of-file
    -2   truncated quality string
  */
-#define __KSEQ_READ                                                                                                            \
-       static int kseq_read(kseq_t *seq)                                                                       \
-       {                                                                                                                                       \
-               int c;                                                                                                                  \
-               kstream_t *ks = seq->f;                                                                                 \
+#define __KSEQ_READ(SCOPE) \
+       SCOPE int kseq_read(kseq_t *seq) \
+       { \
+               int c; \
+               kstream_t *ks = seq->f; \
                if (seq->last_char == 0) { /* then jump to the next header line */ \
-                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
-                       if (c == -1) return -1; /* end of file */                                       \
-                       seq->last_char = c;                                                                                     \
-               } /* the first header char has been read */                                             \
-               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
-               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
-               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
+                       if (c == -1) return -1; /* end of file */ \
+                       seq->last_char = c; \
+               } /* else: the first header char has been read in the previous call */ \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
+               if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+               if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
+                       seq->seq.m = 256; \
+                       seq->seq.s = (char*)malloc(seq->seq.m); \
+               } \
                while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
-                       if (isgraph(c)) { /* printable non-space character */           \
-                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
-                                       seq->seq.m = seq->seq.l + 2;                                            \
-                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
-                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
-                               }                                                                                                               \
-                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
-                       }                                                                                                                       \
-               }                                                                                                                               \
+                       if (c == '\n') continue; /* skip empty lines */ \
+                       seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
+                       ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
+               } \
                if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
-               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
-               if (c != '+') return seq->seq.l; /* FASTA */                                    \
-               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
-                       seq->qual.m = seq->seq.m;                                                                       \
-                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
-               }                                                                                                                               \
+               if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
+                       seq->seq.m = seq->seq.l + 2; \
+                       kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
+                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+               } \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */ \
+               if (c != '+') return seq->seq.l; /* FASTA */ \
+               if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
+                       seq->qual.m = seq->seq.m; \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
+               } \
                while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
-               if (c == -1) return -2; /* we should not stop here */                   \
-               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
-                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
-               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
+               if (c == -1) return -2; /* error: no quality string */ \
+               while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
                seq->last_char = 0;     /* we have not come to the next header line */ \
-               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
-               return seq->seq.l;                                                                                              \
+               if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
+               return seq->seq.l; \
        }
 
 #define __KSEQ_TYPE(type_t)                                            \
@@ -210,10 +217,19 @@ typedef struct __kstring_t {
                kstream_t *f;                                                   \
        } kseq_t;
 
-#define KSEQ_INIT(type_t, __read)                              \
-       KSTREAM_INIT(type_t, __read, 4096)                      \
+#define KSEQ_INIT2(SCOPE, type_t, __read)              \
+       KSTREAM_INIT(type_t, __read, 16384)                     \
        __KSEQ_TYPE(type_t)                                                     \
-       __KSEQ_BASIC(type_t)                                            \
-       __KSEQ_READ
+       __KSEQ_BASIC(SCOPE, type_t)                                     \
+       __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+       __KS_TYPE(type_t) \
+       __KSEQ_TYPE(type_t) \
+       extern kseq_t *kseq_init(type_t fd); \
+       void kseq_destroy(kseq_t *ks); \
+       int kseq_read(kseq_t *seq);
 
 #endif