]> git.donarmstrong.com Git - fastq-tools.git/commitdiff
a new and improved parser
authorDaniel Caleb Jones <dcjones@cs.washington.edu>
Thu, 3 Mar 2011 03:47:33 +0000 (19:47 -0800)
committerDaniel Caleb Jones <dcjones@cs.washington.edu>
Thu, 3 Mar 2011 03:47:33 +0000 (19:47 -0800)
src/Makefile.am
src/fastq-common.c [new file with mode: 0644]
src/fastq-common.h [new file with mode: 0644]
src/fastq-grep.c
src/fastq-kmers.c
src/fastq-match.c
src/fastq-parse.c [new file with mode: 0644]
src/fastq-parse.h [new file with mode: 0644]
src/kseq.h [deleted file]

index 4d47d16df5cfb8e33677f93be09437d895eda756..c50f420bcd01e4396244e757fb4beec46d82b109 100644 (file)
@@ -3,13 +3,13 @@ SUBDIRS = swsse2
 
 bin_PROGRAMS = fastq-grep fastq-kmers fastq-match
 
-fastq_grep_SOURCES = kseq.h fastq-grep.c
+fastq_grep_SOURCES = kseq.h fastq-grep.c fastq-parse.c fastq-common.c
 fastq_grep_LDADD = $(PCRE_LIBS) -lz
 
-fastq_kmers_SOURCES = kseq.h fastq-kmers.c
+fastq_kmers_SOURCES = kseq.h fastq-kmers.c fastq-parse.c fastq-common.c
 fastq_kmers_LDADD = -lz
 
-fastq_match_SOURCES = kseq.h fastq-match.c
+fastq_match_SOURCES = kseq.h fastq-match.c fastq-parse.c fastq-common.c
 fastq_match_LDADD   = swsse2/libswsse2.la -lz
 fastq_match_CFLAGS  = -msse2
 
diff --git a/src/fastq-common.c b/src/fastq-common.c
new file mode 100644 (file)
index 0000000..ae1ecac
--- /dev/null
@@ -0,0 +1,56 @@
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+
+#include "fastq-common.h"
+#include <stdlib.h>
+
+
+void or_die(int b, const char* msg)
+{
+    if (b == 0) {
+        fputs(msg, stderr);
+        exit(1);
+    }
+}
+
+
+void* malloc_or_die(size_t n)
+{
+    void* p = malloc(n);
+    if (p == NULL) {
+        fprintf(stderr, "Can not allocate %zu bytes.\n", n);
+        exit(1);
+    }
+    return p;
+}
+
+
+void* realloc_or_die(void* ptr, size_t n)
+{
+    void* p = realloc(ptr, n);
+    if (p == NULL) {
+        fprintf(stderr, "Can not allocate %zu bytes.\n", n);
+        exit(1);
+    }
+    return p;
+}
+
+
+FILE* fopen_or_die(const char* path, const char* mode)
+{
+    FILE* f = fopen(path, mode);
+    if (f == NULL) {
+        fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode);
+        exit(1);
+    }
+    return f;
+}
+
+
+
diff --git a/src/fastq-common.h b/src/fastq-common.h
new file mode 100644 (file)
index 0000000..cdc89e8
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * common :
+ * A few common functions, primarily for crashing whilst retaining our dignity.
+ *
+ */
+
+#ifndef FASTQ_TOOLS_COMMON_H
+#define FASTQ_TOOLS_COMMON_H
+
+#include <stdio.h>
+
+void or_die(int b, const char* msg);
+
+void* malloc_or_die(size_t);
+void* realloc_or_die(void*, size_t);
+FILE* fopen_or_die(const char*, const char*);
+
+#endif
+
index b1773576bf1c55d3f4002e750eafbee8a22cacb6..f7b3913beb4b2869a5fd3d4298c24aa1fd0387f7 100644 (file)
@@ -1,22 +1,23 @@
 
-
-/* 
- * fastq-grep: regular expression searches of the sequences within a fastq file
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
  *
- * Febuary 2011 / Daniel Jones <dcjones@cs.washington.edu> 
+ * fastq-grep :
+ * Regular expression searches of the sequences within a FASTQ file.
  *
  */
 
 
+#include "fastq-common.h"
+#include "fastq-parse.h"
 #include <stdio.h>
 #include <string.h>
 #include <getopt.h>
 #include <zlib.h>
 #include <pcre.h>
 
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
 
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>
@@ -46,30 +47,30 @@ static int count_flag;
 
 
 
-void print_fastq_entry( FILE* fout, kseq_t* seq )
+void print_fastq_entry(FILE* fout, seq_t* seq)
 {
     fprintf(fout, "@%s\n%s\n+%s\n%s\n",
-                  seq->name.s,
+                  seq->id1.s,
                   seq->seq.s,
-                  seq->comment.s,
+                  seq->id2.s,
                   seq->qual.s );
 }
 
 
-void fastq_grep( gzFile fin, FILE* fout, pcre* re )
+void fastq_grep(FILE* fin, FILE* fout, pcre* re)
 {
     int rc;
     int ovector[3];
     size_t count = 0;
 
-    kseq_t* seq;
-    seq = kseq_init(fin);
+    fastq_t* fqf = fastq_open(fin);
+    seq_t* seq = fastq_alloc_seq();
 
-    while (kseq_read(seq) >= 0) {
+    while (fastq_next(fqf, seq)) {
         rc = pcre_exec(re,          /* pattern */
                        NULL,        /* extre data */
                        seq->seq.s,  /* subject */
-                       seq->seq.l,  /* subject length */
+                       seq->seq.n,  /* subject length */
                        0,           /* subject offset */
                        0,           /* options */
                        ovector,     /* output vector */
@@ -81,7 +82,8 @@ void fastq_grep( gzFile fin, FILE* fout, pcre* re )
         }
     }
 
-    kseq_destroy(seq);
+    fastq_free_seq(seq);
+    fastq_close(fqf);
 
     if (count_flag) fprintf(fout, "%zu\n", count);
 }
@@ -99,7 +101,6 @@ int main(int argc, char* argv[])
     int pat_error_offset;
 
     FILE*  fin;
-    gzFile gzfin;
 
 
     invert_flag  = 0;
@@ -172,15 +173,7 @@ int main(int argc, char* argv[])
 
 
     if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
-        gzfin = gzdopen( fileno(stdin), "rb" );
-        if (gzfin == NULL) {
-            fprintf(stderr, "Malformed file 'stdin'.\n");
-            return 1;
-        }
-
-        fastq_grep(gzfin, stdout, re);
-
-        gzclose(gzfin);
+        fastq_grep(stdin, stdout, re);
     }
     else {
         for (; optind < argc; optind++) {
@@ -190,15 +183,9 @@ int main(int argc, char* argv[])
                 continue;
             }
 
-            gzfin = gzdopen(fileno(fin), "rb");
-            if (gzfin == NULL) {
-                fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
-                continue;
-            }
-
-            fastq_grep(gzfin, stdout, re);
+            fastq_grep(fin, stdout, re);
 
-            gzclose(gzfin);
+            fclose(fin);
         }
     }
 
index 43fa3026b8bf02919cced34663fc14fc4999d066..d78a453b58d75757910012c4ab669f38afe209f0 100644 (file)
@@ -1,22 +1,23 @@
 
-/* 
- * fastq-kmers: kmer frequences within fastq files
+/*
+ * This file is part of fastq-tools.
  *
- * Febuary 2011 / Daniel Jones <dcjones@cs.washington.edu> 
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-kmers :
+ * Tabulate k-mer frequencies with FASTQ files.
  *
  */
 
-
-
+#include "fastq-common.h"
+#include "fastq-parse.h"
+#include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
 #include <getopt.h>
 #include <zlib.h>
 
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
 
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>
@@ -105,15 +106,16 @@ void unpackkmer( uint32_t kmer, char* s, int k )
 }
 
 
-void count_fastq_kmers(gzFile* fin, uint32_t* cs)
+void count_fastq_kmers(FILE* fin, uint32_t* cs)
 {
-    kseq_t* seq = kseq_init(fin);
+    seq_t* seq = fastq_alloc_seq();
+    fastq_t* fqf = fastq_open(fin);
     int i;
     int n;
     uint32_t kmer;
 
-    while (kseq_read(seq) >= 0) {
-        n = (int)seq->seq.l - k + 1;
+    while (fastq_next(fqf, seq)) {
+        n = (int)seq->seq.n - k + 1;
         for (i = 0; i < n; i++) {
             if( packkmer(seq->seq.s + i, &kmer, k) ) {
                 cs[kmer]++;
@@ -121,7 +123,8 @@ void count_fastq_kmers(gzFile* fin, uint32_t* cs)
         }
     }
 
-    kseq_destroy(seq);
+    fastq_free_seq(seq);
+    fastq_close(fqf);
 }
 
 
@@ -154,7 +157,6 @@ int main(int argc, char* argv[])
     uint32_t* cs; /* counts */
 
     FILE* fin;
-    gzFile gzfin;
 
     int opt;
     int opt_idx;
@@ -222,15 +224,7 @@ int main(int argc, char* argv[])
     }
 
     if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
-        gzfin = gzdopen( fileno(stdin), "rb" );
-        if (gzfin == NULL) {
-            fprintf(stderr, "Malformed file 'stdin'.\n");
-            return 1;
-        }
-
-        count_fastq_kmers(gzfin, cs);
-
-        gzclose(gzfin);
+        count_fastq_kmers(stdin, cs);
     }
     else {
         for (; optind < argc; optind++) {
@@ -240,15 +234,7 @@ int main(int argc, char* argv[])
                 continue;
             }
 
-            gzfin = gzdopen(fileno(fin), "rb");
-            if (gzfin == NULL) {
-                fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
-                continue;
-            }
-
-            count_fastq_kmers(gzfin, cs);
-
-            gzclose(gzfin);
+            count_fastq_kmers(fin, cs);
         }
     }
 
index ccfca73860501d6b1cf32915ff25632ca7230a46..69eff70b079a22d455ea4b0d9884481284c8fab0 100644 (file)
@@ -1,20 +1,25 @@
 
-/* Smith-Waterman alignments against sequences within a fastq file.
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-match :
+ * Smith-Waterman alignments against sequences within a fastq file.
  *
- * Daniel Jones <dcjones@cs.washington.edu>
  */
 
-#include <stdlib.h>
-#include <zlib.h>
-#include <getopt.h>
 
+#include "fastq-common.h"
+#include "fastq-parse.h"
 #include "swsse2/blosum62.h"
 #include "swsse2/swsse2.h"
 #include "swsse2/matrix.h"
 #include "swsse2/swstriped.h"
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+#include <getopt.h>
 
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>
@@ -49,7 +54,7 @@ void convert_sequence(unsigned char* s, int n)
 }
 
 
-void fastq_match(gzFile fin, FILE* fout,
+void fastq_match(FILE* fin, FILE* fout,
                  SwStripedData* sw_data,
                  unsigned char* query, int n,
                  SEARCH_OPTIONS* options)
@@ -60,16 +65,16 @@ void fastq_match(gzFile fin, FILE* fout,
     int gap_ext   = -options->gapExt;
     int threshold = options->threshold;
 
-    kseq_t* seq;
-    seq = kseq_init(fin);
+    fastq_t* fqf = fastq_open(fin);
+    seq_t* seq = fastq_alloc_seq();
 
-    while (kseq_read(seq) >= 0) {
+    while (fastq_next(fqf, seq)) {
         fprintf(fout, "%s\t", seq->seq.s);
 
-        convert_sequence((unsigned char*)seq->seq.s, seq->seq.l);
+        convert_sequence((unsigned char*)seq->seq.s, seq->seq.n);
 
         score = swStripedByte(query, n,
-                              (unsigned char*)seq->seq.s, seq->seq.l,
+                              (unsigned char*)seq->seq.s, seq->seq.n,
                               gap_init, gap_ext,
                               sw_data->pvbQueryProf,
                               sw_data->pvH1,
@@ -78,7 +83,7 @@ void fastq_match(gzFile fin, FILE* fout,
                               sw_data->bias);
         if (score >= 255) {
             score = swStripedWord(query, n,
-                                  (unsigned char*)seq->seq.s, seq->seq.l,
+                                  (unsigned char*)seq->seq.s, seq->seq.n,
                                   gap_init, gap_ext,
                                   sw_data->pvbQueryProf,
                                   sw_data->pvH1,
@@ -89,7 +94,8 @@ void fastq_match(gzFile fin, FILE* fout,
         fprintf(fout, "%d\n", score);
     }
 
-    kseq_destroy(seq);
+    fastq_free_seq(seq);
+    fastq_close(fqf);
 }
 
 
@@ -110,7 +116,6 @@ int main(int argc, char* argv[])
     options.threshold = -1;
 
     FILE*  fin;
-    gzFile gzfin;
 
     help_flag = 0;
 
@@ -165,15 +170,7 @@ int main(int argc, char* argv[])
     sw_data = swStripedInit(query, query_len, mat);
 
     if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
-        gzfin = gzdopen( fileno(stdin), "rb" );
-        if (gzfin == NULL) {
-            fprintf(stderr, "Malformed file 'stdin'.\n");
-            return 1;
-        }
-
-        fastq_match(gzfin, stdout, sw_data, query, query_len, &options);
-
-        gzclose(gzfin);
+        fastq_match(stdin, stdout, sw_data, query, query_len, &options);
     }
     else {
         for (; optind < argc; optind++) {
@@ -183,15 +180,7 @@ int main(int argc, char* argv[])
                 continue;
             }
 
-            gzfin = gzdopen(fileno(fin), "rb");
-            if (gzfin == NULL) {
-                fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
-                continue;
-            }
-
-            fastq_match(gzfin, stdout, sw_data, query, query_len, &options);
-
-            gzclose(gzfin);
+            fastq_match(fin, stdout, sw_data, query, query_len, &options);
         }
     }
 
diff --git a/src/fastq-parse.c b/src/fastq-parse.c
new file mode 100644 (file)
index 0000000..ec5b478
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+#include "fastq-parse.h"
+#include "fastq-common.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+
+static const size_t init_str_size  = 128;
+static const size_t fastq_buf_size = 4096;
+
+static void fastq_alloc_str(str_t* s)
+{
+    s->s = malloc_or_die(init_str_size);
+    s->s[0] = '\0';
+    s->n = 0;
+    s->size = init_str_size;
+}
+
+
+static void fastq_expand_str(str_t* s)
+{
+    s->size *= 2;
+    realloc_or_die(s->s, s->size);
+}
+
+
+seq_t* fastq_alloc_seq()
+{
+    seq_t* seq = malloc_or_die(sizeof(seq_t));
+    fastq_alloc_str(&seq->id1);
+    fastq_alloc_str(&seq->seq);
+    fastq_alloc_str(&seq->id2);
+    fastq_alloc_str(&seq->qual);
+
+    return seq;
+}
+
+
+void fastq_free_seq(seq_t* seq)
+{
+    free(seq->id1.s);
+    free(seq->seq.s);
+    free(seq->id2.s);
+    free(seq->qual.s);
+    free(seq);
+}
+
+
+typedef enum
+{
+    STATE_EOF,
+    STATE_ERROR,
+    STATE_ID1,
+    STATE_SEQ,
+    STATE_ID2,
+    STATE_QUAL
+
+} fastq_state;
+
+
+fastq_t* fastq_open(FILE* f)
+{
+    fastq_t* fqf = malloc_or_die(sizeof(fastq_t));
+    or_die((int)(fqf->file = gzdopen(fileno(f), "rb")),
+           "Can not open gzip file.");
+    
+    fqf->state = STATE_ID1;
+    fqf->buf = malloc_or_die(fastq_buf_size);
+    fqf->buf[0] = '\0';
+    fqf->c = fqf->buf;
+
+    return fqf;
+}
+
+
+void fastq_close(fastq_t* fqf)
+{
+    gzclose(fqf->file);
+    free(fqf->buf);
+    free(fqf);
+}
+
+
+void fastq_refill(fastq_t* f)
+{
+    int errnum;
+    const char* errmsg;
+
+    int n = gzread(f->file, f->buf, fastq_buf_size - 1);
+
+    if (n <= 0) {
+        if (gzeof(f->file)) {
+            f->state = STATE_EOF;
+            n = 0;
+        }
+        else {
+            errmsg = gzerror(f->file, &errnum);
+            fprintf(stderr, "I/O error: %s\n", errmsg);
+            exit(1);
+        }
+    }
+
+    f->buf[n] = '\0';
+    f->c = f->buf;
+}
+
+
+void fastq_get_line(fastq_t* f, str_t* s)
+{
+    int i = 0;
+
+    if (f->state == STATE_EOF) goto fastq_get_line_done;
+
+    while (1) {
+        switch (*f->c) {
+            case '\0':
+                fastq_refill(f);
+                if (f->state == STATE_EOF) goto fastq_get_line_done;
+                break;
+
+            case '\r':
+                f->c++;
+                break;
+
+            case '\n':
+                goto fastq_get_line_done;
+
+            default:
+                while (s->size < i + 2) {
+                    fastq_expand_str(s);
+                }
+                if (s) s->s[i++] = *f->c;
+                f->c++;
+        }
+
+    }
+
+fastq_get_line_done:
+    if (s) {
+        s->s[i] = '\0';
+        s->n = i;
+    }
+}
+
+
+
+int fastq_next(fastq_t* f, seq_t* seq)
+{
+    if (f->state == STATE_EOF) return 0;
+
+    while (1) {
+
+        /* read more, if needed */
+        if (*f->c == '\0' ) {
+            fastq_refill(f);
+            if (f->state == STATE_EOF) return 0;
+            continue;
+        }
+
+        /* skip over leading whitespace */
+        else if (isspace(*f->c)) {
+            /* do nothing */
+        }
+
+        /* skip comments */
+        else if (*f->c == ';') {
+            fastq_get_line(f, NULL);
+            if (f->state == STATE_EOF) return 0;
+        }
+
+        /* read id1 */
+        else if (f->state == STATE_ID1) {
+            if (*f->c == '@') {
+                f->c++;
+                fastq_get_line(f, &seq->id1);
+                if (f->state == STATE_EOF) return 0;
+
+                f->state = STATE_SEQ;
+            }
+            else {
+                fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c);
+                exit(1);
+            }
+        }
+
+        /* read sequence */
+        else if (f->state == STATE_SEQ) {
+            fastq_get_line(f, &seq->seq);
+            if (f->state == STATE_EOF) return 0;
+
+            f->state = STATE_ID2;
+        }
+
+        /* read id2 */
+        else if (f->state == STATE_ID2) {
+            if (*f->c == '+') {
+                f->c++;
+                fastq_get_line(f, &seq->id2);
+                if (f->state == STATE_EOF) return 0;
+
+                f->state = STATE_QUAL;
+            }
+            else {
+                fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c);
+                exit(1);
+            }
+        }
+
+        /* read quality string */
+        else if (f->state == STATE_QUAL) {
+            fastq_get_line(f, &seq->qual);
+            if (f->state == STATE_EOF) return 1;
+
+            f->state = STATE_ID1;
+            break;
+        }
+
+        else {
+            fputs("Inexplicable error in fastq parser.\n", stderr);
+            exit(1);
+        }
+
+        f->c++;
+    }
+
+    return 1;
+}
+
+
diff --git a/src/fastq-parse.h b/src/fastq-parse.h
new file mode 100644 (file)
index 0000000..56a074a
--- /dev/null
@@ -0,0 +1,57 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-parse :
+ * A parser for FASTQ files.
+ *
+ * This parser is mostly derivative of Heng Li's.
+ * See: http://lh3lh3.users.sourceforge.net/kseq.shtml
+ *
+ */
+
+#ifndef FASTQ_TOOLS_PARSE_H
+#define FASTQ_TOOLS_PARSE_H
+
+#include <stdio.h>
+#include <zlib.h>
+
+
+typedef struct
+{
+    char*  s;    /* null-terminated string */
+    size_t n;    /* length of s */
+    size_t size; /* bytes allocated for s */
+} str_t;
+
+
+
+typedef struct
+{
+    str_t id1;
+    str_t seq;
+    str_t id2;
+    str_t qual;
+} seq_t;
+
+
+seq_t* fastq_alloc_seq();
+void fastq_free_seq(seq_t*);
+
+
+typedef struct
+{
+    gzFile file;
+    int    state;
+    char*  buf;
+    char*  c;
+} fastq_t;
+
+
+fastq_t* fastq_open(FILE*);
+void fastq_close(fastq_t*);
+int  fastq_next(fastq_t*, seq_t*);
+
+#endif
+
diff --git a/src/kseq.h b/src/kseq.h
deleted file mode 100644 (file)
index 4fafd85..0000000
+++ /dev/null
@@ -1,223 +0,0 @@
-/* The MIT License
-
-   Copyright (c) 2008 Genome Research Ltd (GRL).
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   "Software"), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be
-   included in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-   SOFTWARE.
-*/
-
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
-/* Last Modified: 12APR2009 */
-
-#ifndef AC_KSEQ_H
-#define AC_KSEQ_H
-
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-
-#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
-#define KS_SEP_TAB   1 // isspace() && !' '
-#define KS_SEP_MAX   1
-
-#define __KS_TYPE(type_t)                                              \
-       typedef struct __kstream_t {                            \
-               char *buf;                                                              \
-               int begin, end, is_eof;                                 \
-               type_t f;                                                               \
-       } kstream_t;
-
-#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
-#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
-
-#define __KS_BASIC(type_t, __bufsize)                                                          \
-       static inline kstream_t *ks_init(type_t f)                                              \
-       {                                                                                                                               \
-               kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
-               ks->f = f;                                                                                                      \
-               ks->buf = (char*)malloc(__bufsize);                                                     \
-               return ks;                                                                                                      \
-       }                                                                                                                               \
-       static inline void ks_destroy(kstream_t *ks)                                    \
-       {                                                                                                                               \
-               if (ks) {                                                                                                       \
-                       free(ks->buf);                                                                                  \
-                       free(ks);                                                                                               \
-               }                                                                                                                       \
-       }
-
-#define __KS_GETC(__read, __bufsize)                                           \
-       static inline int ks_getc(kstream_t *ks)                                \
-       {                                                                                                               \
-               if (ks->is_eof && ks->begin >= ks->end) return -1;      \
-               if (ks->begin >= ks->end) {                                                     \
-                       ks->begin = 0;                                                                  \
-                       ks->end = __read(ks->f, ks->buf, __bufsize);    \
-                       if (ks->end < __bufsize) ks->is_eof = 1;                \
-                       if (ks->end == 0) return -1;                                    \
-               }                                                                                                       \
-               return (int)ks->buf[ks->begin++];                                       \
-       }
-
-#ifndef KSTRING_T
-#define KSTRING_T kstring_t
-typedef struct __kstring_t {
-       size_t l, m;
-       char *s;
-} kstring_t;
-#endif
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#define __KS_GETUNTIL(__read, __bufsize)                                                               \
-       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
-       {                                                                                                                                       \
-               if (dret) *dret = 0;                                                                                    \
-               str->l = 0;                                                                                                             \
-               if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
-               for (;;) {                                                                                                              \
-                       int i;                                                                                                          \
-                       if (ks->begin >= ks->end) {                                                                     \
-                               if (!ks->is_eof) {                                                                              \
-                                       ks->begin = 0;                                                                          \
-                                       ks->end = __read(ks->f, ks->buf, __bufsize);            \
-                                       if (ks->end < __bufsize) ks->is_eof = 1;                        \
-                                       if (ks->end == 0) break;                                                        \
-                               } else break;                                                                                   \
-                       }                                                                                                                       \
-                       if (delimiter > KS_SEP_MAX) {                                                           \
-                               for (i = ks->begin; i < ks->end; ++i)                                   \
-                                       if (ks->buf[i] == delimiter) break;                                     \
-                       } else if (delimiter == KS_SEP_SPACE) {                                         \
-                               for (i = ks->begin; i < ks->end; ++i)                                   \
-                                       if (isspace(ks->buf[i])) break;                                         \
-                       } else if (delimiter == KS_SEP_TAB) {                                           \
-                               for (i = ks->begin; i < ks->end; ++i)                                   \
-                                       if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
-                       } else i = 0; /* never come to here! */                                         \
-                       if (str->m - str->l < i - ks->begin + 1) {                                      \
-                               str->m = str->l + (i - ks->begin) + 1;                                  \
-                               kroundup32(str->m);                                                                             \
-                               str->s = (char*)realloc(str->s, str->m);                                \
-                       }                                                                                                                       \
-                       memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
-                       str->l = str->l + (i - ks->begin);                                                      \
-                       ks->begin = i + 1;                                                                                      \
-                       if (i < ks->end) {                                                                                      \
-                               if (dret) *dret = ks->buf[i];                                                   \
-                               break;                                                                                                  \
-                       }                                                                                                                       \
-               }                                                                                                                               \
-               if (str->l == 0) {                                                                                              \
-                       str->m = 1;                                                                                                     \
-                       str->s = (char*)calloc(1, 1);                                                           \
-               }                                                                                                                               \
-               str->s[str->l] = '\0';                                                                                  \
-               return str->l;                                                                                                  \
-       }
-
-#define KSTREAM_INIT(type_t, __read, __bufsize) \
-       __KS_TYPE(type_t)                                                       \
-       __KS_BASIC(type_t, __bufsize)                           \
-       __KS_GETC(__read, __bufsize)                            \
-       __KS_GETUNTIL(__read, __bufsize)
-
-#define __KSEQ_BASIC(type_t)                                                                                   \
-       static inline kseq_t *kseq_init(type_t fd)                                                      \
-       {                                                                                                                                       \
-               kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
-               s->f = ks_init(fd);                                                                                             \
-               return s;                                                                                                               \
-       }                                                                                                                                       \
-       static inline void kseq_rewind(kseq_t *ks)                                                      \
-       {                                                                                                                                       \
-               ks->last_char = 0;                                                                                              \
-               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
-       }                                                                                                                                       \
-       static inline void kseq_destroy(kseq_t *ks)                                                     \
-       {                                                                                                                                       \
-               if (!ks) return;                                                                                                \
-               free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
-               ks_destroy(ks->f);                                                                                              \
-               free(ks);                                                                                                               \
-       }
-
-/* Return value:
-   >=0  length of the sequence (normal)
-   -1   end-of-file
-   -2   truncated quality string
- */
-#define __KSEQ_READ                                                                                                            \
-       static int kseq_read(kseq_t *seq)                                                                       \
-       {                                                                                                                                       \
-               int c;                                                                                                                  \
-               kstream_t *ks = seq->f;                                                                                 \
-               if (seq->last_char == 0) { /* then jump to the next header line */ \
-                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
-                       if (c == -1) return -1; /* end of file */                                       \
-                       seq->last_char = c;                                                                                     \
-               } /* the first header char has been read */                                             \
-               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
-               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
-               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
-               while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
-                       if (isgraph(c)) { /* printable non-space character */           \
-                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
-                                       seq->seq.m = seq->seq.l + 2;                                            \
-                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
-                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
-                               }                                                                                                               \
-                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
-                       }                                                                                                                       \
-               }                                                                                                                               \
-               if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
-               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
-               if (c != '+') return seq->seq.l; /* FASTA */                                    \
-               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
-                       seq->qual.m = seq->seq.m;                                                                       \
-                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
-               }                                                                                                                               \
-        ks_getuntil(ks, '\n', &seq->comment, &c);                        \
-               if (c == -1) return -2; /* we should not stop here */                   \
-               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
-                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
-               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
-               seq->last_char = 0;     /* we have not come to the next header line */ \
-               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
-               return seq->seq.l;                                                                                              \
-       }
-
-#define __KSEQ_TYPE(type_t)                                            \
-       typedef struct {                                                        \
-               kstring_t name, comment, seq, qual;             \
-               int last_char;                                                  \
-               kstream_t *f;                                                   \
-       } kseq_t;
-
-#define KSEQ_INIT(type_t, __read)                              \
-       KSTREAM_INIT(type_t, __read, 4096)                      \
-       __KSEQ_TYPE(type_t)                                                     \
-       __KSEQ_BASIC(type_t)                                            \
-       __KSEQ_READ
-
-#endif