X-Git-Url: https://git.donarmstrong.com/?p=fastq-tools.git;a=blobdiff_plain;f=src%2Fparse.c;h=e27a3859ebdb24a6a9b7406fa0f9218d64b40e99;hp=61e3efa3cab29160ce59e4823d9aaa726e335059;hb=d6b81119e55cf77c1e4c17b9e9026abc01e22b96;hpb=07ffc4895d705c5207492a002ffce4576dac5fc5 diff --git a/src/parse.c b/src/parse.c index 61e3efa..e27a385 100644 --- a/src/parse.c +++ b/src/parse.c @@ -1,272 +1,178 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - */ + +#include +#include +#include #include "parse.h" #include "common.h" -#include -#include - -static const size_t init_str_size = 128; -static const size_t fastq_buf_size = 4096; -static void fastq_alloc_str(str_t* s) +static void str_init(str_t* str) { - s->s = malloc_or_die(init_str_size); - s->s[0] = '\0'; - s->n = 0; - s->size = init_str_size; + str->n = 0; + str->size = 128; + str->s = malloc_or_die(str->size = 0); + str->s[0] = '\0'; } -static void fastq_expand_str(str_t* s) +static void str_free(str_t* str) { - s->size *= 2; - realloc_or_die(s->s, s->size); + free(str->s); } -seq_t* fastq_alloc_seq() +/* Reserve space for `size` more characters. */ +static void str_reserve_extra(str_t* str, size_t size) { - seq_t* seq = malloc_or_die(sizeof(seq_t)); - fastq_alloc_str(&seq->id1); - fastq_alloc_str(&seq->seq); - fastq_alloc_str(&seq->id2); - fastq_alloc_str(&seq->qual); - - return seq; + if (str->n + size > str->size) { + if (str->n + size > 2 * str->size) { + str->size = str->n + size; + } + else str->size *= 2; + str->s = realloc_or_die(str->s, str->size * sizeof(char)); + } } -void fastq_free_seq(seq_t* seq) +/* Copy n characters from c to the end of str. */ +static void str_append(str_t* str, char* c, size_t n) { - free(seq->id1.s); - free(seq->seq.s); - free(seq->id2.s); - free(seq->qual.s); - free(seq); + str_reserve_extra(str, n); + memcpy(str->s + str->n, c, n); + str->n += n; + str->s[str->n] = '\0'; } -typedef enum +seq_t* seq_create() { - STATE_EOF, - STATE_ERROR, - STATE_ID1, - STATE_SEQ, - STATE_ID2, - STATE_QUAL - -} fastq_state; - - -fastq_t* fastq_open(FILE* f) -{ - fastq_t* fqf = malloc_or_die(sizeof(fastq_t)); - or_die((int)((fqf->file = gzdopen(fileno(f), "rb")) != NULL), - "Can not open gzip file."); - - fqf->state = STATE_ID1; - fqf->buf = malloc_or_die(fastq_buf_size); - fqf->buf[0] = '\0'; - fqf->c = fqf->buf; - - return fqf; + seq_t* seq = malloc_or_die(sizeof(seq_t)); + str_init(&seq->id1); + str_init(&seq->seq); + str_init(&seq->id2); + str_init(&seq->qual); + return seq; } -void fastq_close(fastq_t* fqf) +void seq_free(seq_t* seq) { - gzclose(fqf->file); - free(fqf->buf); - free(fqf); + str_free(&seq->id1); + str_free(&seq->seq); + str_free(&seq->id2); + str_free(&seq->qual); + free(seq); } -void fastq_refill(fastq_t* f) -{ - int errnum; - const char* errmsg; +static const size_t parser_buf_size = 1000000; - int n = gzread(f->file, f->buf, fastq_buf_size - 1); - if (n <= 0) { - if (gzeof(f->file)) { - f->state = STATE_EOF; - n = 0; - } - else { - errmsg = gzerror(f->file, &errnum); - fprintf(stderr, "I/O error: %s\n", errmsg); - exit(1); - } - } +struct fastq_t_ +{ + FILE* file; + size_t readlen; + char* buf; + char* next; + bool linestart; +}; - f->buf[n] = '\0'; - f->c = f->buf; -} -void fastq_get_line(fastq_t* f, str_t* s) +fastq_t* fastq_create(FILE* file) { - size_t i = 0; - - if (f->state == STATE_EOF) goto fastq_get_line_done; - - while (1) { - switch (*f->c) { - case '\0': - fastq_refill(f); - if (f->state == STATE_EOF) goto fastq_get_line_done; - break; - - case '\r': - f->c++; - break; - - case '\n': - goto fastq_get_line_done; - - default: - while (s->size < i + 2) { - fastq_expand_str(s); - } - if (s) s->s[i++] = *f->c; - f->c++; - } - - } - -fastq_get_line_done: - if (s) { - s->s[i] = '\0'; - s->n = i; - } + fastq_t* f = malloc_or_die(sizeof(fastq_t)); + f->file = file; + f->next = f->buf = malloc_or_die(parser_buf_size); + f->readlen = 0; + f->linestart = true; + return f; } - -int fastq_next(fastq_t* f, seq_t* seq) +void fastq_free(fastq_t* f) { - if (f->state == STATE_EOF) return 0; + free(f->buf); + free(f); +} - while (1) { - /* read more, if needed */ - if (*f->c == '\0' ) { - fastq_refill(f); - if (f->state == STATE_EOF) return 0; - continue; - } +typedef enum { + FASTQ_STATE_ID1, /* Reading ID1. */ + FASTQ_STATE_SEQ, /* Reading the sequence. */ + FASTQ_STATE_ID2, /* Reading ID2. */ + FASTQ_STATE_QUAL, /* Reading quality scores. */ +} fastq_parser_state_t; - /* skip over leading whitespace */ - else if (isspace(*f->c)) { - /* do nothing */ - } - /* skip comments */ - /* - else if (*f->c == ';') { - fastq_get_line(f, NULL); - if (f->state == STATE_EOF) return 0; - } - */ - - /* read id1 */ - else if (f->state == STATE_ID1) { - if (*f->c == '@' || *f->c == '>') { - f->c++; - fastq_get_line(f, &seq->id1); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_SEQ; - } - else { - fprintf(stderr, - "Malformed FASTQ file: expecting an '@' or '>', saw a '%c'\n", - *f->c); - exit(1); +bool fastq_read(fastq_t* f, seq_t* seq) +{ + seq->id1.n = seq->seq.n = seq->id2.n = seq->qual.n = 0; + fastq_parser_state_t state = FASTQ_STATE_ID1; + char* end = f->buf + f->readlen; + do { + while (f->next < end) { + /* Consume pointless special characters prefixing IDs */ + if ((state == FASTQ_STATE_ID1 && f->linestart && f->next[0] == '@') || + (state == FASTQ_STATE_ID2 && f->linestart && f->next[0] == '+')) { + f->linestart = false; + ++f->next; + continue; } - } - - /* read sequence */ - else if (f->state == STATE_SEQ) { - fastq_get_line(f, &seq->seq); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_ID2; - } - - /* read id2 */ - else if (f->state == STATE_ID2) { - if (*f->c == '+') { - f->c++; - fastq_get_line(f, &seq->id2); - if (f->state == STATE_EOF) return 0; - f->state = STATE_QUAL; + char* u = memchr(f->next, '\n', end - f->next); + if (u == NULL) { + f->linestart = false; + u = end; } - else { - /* fasta style entry */ - seq->id2.s[0] = '\0'; - seq->qual.s[0] = '\0'; - - f->state = STATE_ID1; - break; + else f->linestart = true; + + switch (state) { + case FASTQ_STATE_ID1: + str_append(&seq->id1, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_SEQ; + break; + + case FASTQ_STATE_SEQ: + str_append(&seq->seq, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_ID2; + break; + + case FASTQ_STATE_ID2: + str_append(&seq->id2, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_QUAL; + break; + + case FASTQ_STATE_QUAL: + str_append(&seq->qual, f->next, u - f->next); + if (f->linestart) { + f->next = u + 1; + return true; + } + break; } - } - /* read quality string */ - else if (f->state == STATE_QUAL) { - fastq_get_line(f, &seq->qual); - if (f->state == STATE_EOF) return 1; - - f->state = STATE_ID1; - break; + f->next = u + 1; } - else { - fputs("Inexplicable error in fastq parser.\n", stderr); - exit(1); - } + /* Try to read more. */ + f->readlen = fread(f->buf, 1, parser_buf_size, f->file); + f->next = f->buf; + end = f->buf + f->readlen; + } while (f->readlen); - f->c++; - } - - return 1; + return false; } -void fastq_rewind(fastq_t* fqf) +void fastq_print(FILE* fout, const seq_t* seq) { - gzrewind(fqf->file); - fqf->state = STATE_ID1; - fqf->buf[0] = '\0'; - fqf->c = fqf->buf; -} - - -void fastq_print(FILE* fout, seq_t* seq) -{ - /* FASTQ */ - if (seq->qual.n > 0) { - fprintf(fout, "@%s\n%s\n+%s\n%s\n", - seq->id1.s, - seq->seq.s, - seq->id2.s, - seq->qual.s ); - } - - /* FASTA */ - else { - fprintf(fout, ">%s\n%s\n", - seq->id1.s, - seq->seq.s ); - } + fprintf(fout, "@%s\n%s\n+%s\n%s\n", + seq->id1.s, + seq->seq.s, + seq->id2.s, + seq->qual.s ); }