git.donarmstrong.com Git - fastq-tools.git/blobdiff - src/parse.c
a program to output a nonredundant list of reads
[fastq-tools.git] / src / parse.c
diff --git a/src/parse.c b/src/parse.c
new file mode 100644 (file)
index 0000000..bec5ac3
--- /dev/null
@@ -0,0 +1,235 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+#include "parse.h"
+#include "common.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+
+/* Capacity, in bytes, that fastq_alloc_str gives every new str_t buffer. */
+static const size_t init_str_size  = 128;
+/* Size, in bytes, of the read buffer allocated by fastq_open. fastq_refill
+ * reads at most one byte less than this so a terminating '\0' always fits. */
+static const size_t fastq_buf_size = 4096;
+
+/* Initialise `s` as an empty string backed by a newly allocated buffer of
+ * init_str_size bytes. */
+static void fastq_alloc_str(str_t* s)
+{
+    s->size = init_str_size;
+    s->n    = 0;
+
+    s->s  = malloc_or_die(init_str_size);
+    *s->s = '\0';
+}
+
+
+/* Double the capacity of `s`, keeping its current contents.
+ *
+ * Reallocation may move the block, so the pointer handed back by
+ * realloc_or_die has to be stored in s->s; otherwise s->s could be left
+ * pointing at freed memory.
+ * NOTE(review): assumes realloc_or_die returns the resized block the way
+ * realloc does (malloc_or_die is used that way above) -- confirm in common.h.
+ */
+static void fastq_expand_str(str_t* s)
+{
+    s->size *= 2;
+    s->s = realloc_or_die(s->s, s->size);
+}
+
+
+/* Allocate a seq_t and give each of its four strings (id1, seq, id2, qual)
+ * an empty buffer. The result is meant to be released with fastq_free_seq. */
+seq_t* fastq_alloc_seq()
+{
+    seq_t* rec = malloc_or_die(sizeof *rec);
+
+    /* header line, bases, separator line, qualities -- in file order */
+    fastq_alloc_str(&rec->id1);
+    fastq_alloc_str(&rec->seq);
+    fastq_alloc_str(&rec->id2);
+    fastq_alloc_str(&rec->qual);
+
+    return rec;
+}
+
+
+/* Release a seq_t: first the four string buffers it owns, then the struct
+ * itself. `seq` must not be used afterwards. */
+void fastq_free_seq(seq_t* seq)
+{
+    /* The buffers are independent of one another, so the order among them
+     * does not matter; only the struct has to go last. */
+    free(seq->qual.s);
+    free(seq->id2.s);
+    free(seq->seq.s);
+    free(seq->id1.s);
+
+    free(seq);
+}
+
+
+/* Parser states kept in fastq_t.state. Apart from the two terminal states,
+ * each one names the line of a FASTQ record the parser expects next. */
+typedef enum
+{
+    STATE_EOF   = 0,  /* input exhausted; set by fastq_refill */
+    STATE_ERROR = 1,  /* not assigned anywhere in the code shown here */
+    STATE_ID1   = 2,  /* expecting the '@' header line */
+    STATE_SEQ   = 3,  /* expecting the sequence line */
+    STATE_ID2   = 4,  /* expecting the '+' separator line */
+    STATE_QUAL  = 5   /* expecting the quality line */
+
+} fastq_state;
+
+
+/* Create a FASTQ reader on top of an already opened stdio stream.
+ *
+ * The stream's file descriptor is handed to zlib via gzdopen. The returned
+ * reader starts in STATE_ID1 with an empty buffer, so the first read triggers
+ * a refill. Release it with fastq_close.
+ *
+ * Allocation failure and gzdopen failure are both reported through the
+ * *_or_die helpers rather than through the return value.
+ */
+fastq_t* fastq_open(FILE* f)
+{
+    fastq_t* fqf = malloc_or_die(sizeof(fastq_t));
+
+    /* Compare the handle with NULL instead of casting it to int: where
+     * pointers are wider than int the cast can truncate, so a valid handle
+     * could be mistaken for a failure.
+     * NOTE(review): assumes or_die aborts when its first argument is zero,
+     * as the original pointer-to-int usage implies -- confirm in common.h. */
+    fqf->file = gzdopen(fileno(f), "rb");
+    or_die(fqf->file != NULL, "Can not open gzip file.");
+
+    fqf->state = STATE_ID1;
+    fqf->buf = malloc_or_die(fastq_buf_size);
+    fqf->buf[0] = '\0';   /* empty buffer: first access refills */
+    fqf->c = fqf->buf;
+
+    return fqf;
+}
+
+
+/* Tear down a reader created by fastq_open: free the read buffer, close the
+ * zlib handle, then free the reader struct itself. */
+void fastq_close(fastq_t* fqf)
+{
+    free(fqf->buf);
+    gzclose(fqf->file);
+
+    free(fqf);
+}
+
+
+/* Replace the contents of f->buf with the next chunk of input.
+ *
+ * On success the buffer is '\0'-terminated and f->c points at its start.
+ * At end of input f->state becomes STATE_EOF and the buffer is left empty.
+ * A read failure that is not end of input prints the zlib error message to
+ * stderr and exits with status 1.
+ */
+void fastq_refill(fastq_t* f)
+{
+    /* one byte is held back for the terminator written below */
+    int got = gzread(f->file, f->buf, fastq_buf_size - 1);
+
+    if (got <= 0) {
+        if (!gzeof(f->file)) {
+            int errnum;
+            const char* errmsg = gzerror(f->file, &errnum);
+
+            fprintf(stderr, "I/O error: %s\n", errmsg);
+            exit(1);
+        }
+
+        f->state = STATE_EOF;
+        got = 0;
+    }
+
+    f->buf[got] = '\0';
+    f->c = f->buf;
+}
+
+
+/* Consume input up to, but not including, the next '\n'.
+ *
+ * f: reader to consume from; its buffer is refilled as needed.
+ * s: destination string, grown on demand, or NULL to discard the line
+ *    (fastq_next does this to skip comments).
+ *
+ * '\r' characters are dropped. On return f->c points at the terminating
+ * '\n', or f->state is STATE_EOF if the input ended first. When `s` is
+ * given it is always left '\0'-terminated with s->n set to its length.
+ */
+void fastq_get_line(fastq_t* f, str_t* s)
+{
+    size_t i = 0;
+
+    if (f->state == STATE_EOF) goto fastq_get_line_done;
+
+    while (1) {
+        switch (*f->c) {
+            case '\0':
+                /* end of the buffered chunk: fetch the next one */
+                fastq_refill(f);
+                if (f->state == STATE_EOF) goto fastq_get_line_done;
+                break;
+
+            case '\r':
+                f->c++;
+                break;
+
+            case '\n':
+                goto fastq_get_line_done;
+
+            default:
+                /* Only touch `s` when the caller supplied one; the capacity
+                 * check must sit inside the guard or a NULL `s` is
+                 * dereferenced. Room is kept for this character plus the
+                 * terminator. */
+                if (s) {
+                    while (s->size < i + 2) {
+                        fastq_expand_str(s);
+                    }
+                    s->s[i++] = *f->c;
+                }
+                f->c++;
+        }
+
+    }
+
+fastq_get_line_done:
+    if (s) {
+        s->s[i] = '\0';
+        s->n = i;
+    }
+}
+
+
+
+/* Read the next FASTQ record from `f` into `seq`.
+ *
+ * A record is four lines: '@' + id1, the sequence, '+' + id2, and the
+ * quality string. Whitespace between lines is skipped, and lines starting
+ * with ';' are skipped as comments except where a quality line is expected.
+ *
+ * Returns 1 when a record was read, including the case where the input ends
+ * in the middle of the quality line. Returns 0 when the input is exhausted
+ * before a record is complete. A line that does not start with the expected
+ * '@' or '+' prints a message to stderr and exits with status 1.
+ */
+int fastq_next(fastq_t* f, seq_t* seq)
+{
+    if (f->state == STATE_EOF) return 0;
+
+    while (1) {
+
+        /* read more, if needed */
+        if (*f->c == '\0' ) {
+            fastq_refill(f);
+            if (f->state == STATE_EOF) return 0;
+            continue;
+        }
+
+        /* skip over leading whitespace; the cast keeps a negative char value
+         * (bytes >= 0x80 where char is signed) from reaching isspace, which
+         * would be undefined behaviour */
+        else if (isspace((unsigned char) *f->c)) {
+            /* do nothing */
+        }
+
+        /* skip comments -- but not where a quality line is expected: ';' is
+         * a legitimate quality character, and treating such a line as a
+         * comment would desynchronise the parser */
+        else if (*f->c == ';' && f->state != STATE_QUAL) {
+            fastq_get_line(f, NULL);
+            if (f->state == STATE_EOF) return 0;
+        }
+
+        /* read id1 */
+        else if (f->state == STATE_ID1) {
+            if (*f->c == '@') {
+                f->c++;
+                fastq_get_line(f, &seq->id1);
+                if (f->state == STATE_EOF) return 0;
+
+                f->state = STATE_SEQ;
+            }
+            else {
+                fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c);
+                exit(1);
+            }
+        }
+
+        /* read sequence */
+        else if (f->state == STATE_SEQ) {
+            fastq_get_line(f, &seq->seq);
+            if (f->state == STATE_EOF) return 0;
+
+            f->state = STATE_ID2;
+        }
+
+        /* read id2 */
+        else if (f->state == STATE_ID2) {
+            if (*f->c == '+') {
+                f->c++;
+                fastq_get_line(f, &seq->id2);
+                if (f->state == STATE_EOF) return 0;
+
+                f->state = STATE_QUAL;
+            }
+            else {
+                fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c);
+                exit(1);
+            }
+        }
+
+        /* read quality string */
+        else if (f->state == STATE_QUAL) {
+            fastq_get_line(f, &seq->qual);
+            /* the record is complete even if the file ends without a
+             * trailing newline */
+            if (f->state == STATE_EOF) return 1;
+
+            f->state = STATE_ID1;
+            break;
+        }
+
+        else {
+            fputs("Inexplicable error in fastq parser.\n", stderr);
+            exit(1);
+        }
+
+        /* step past the whitespace character, or past the '\n' that
+         * fastq_get_line stopped on */
+        f->c++;
+    }
+
+    return 1;
+}
+
+