From d6b81119e55cf77c1e4c17b9e9026abc01e22b96 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Mon, 8 Oct 2012 23:20:25 -0700 Subject: [PATCH] Much simpler faster code for parsing fastq files. --- Makefile.am | 2 +- configure.ac | 4 +- src/fastq-grep.c | 10 +- src/fastq-kmers.c | 10 +- src/fastq-match.c | 10 +- src/fastq-qual.c | 10 +- src/fastq-qualadj.c | 10 +- src/fastq-sample.c | 28 ++-- src/fastq-uniq.c | 10 +- src/parse.c | 334 ++++++++++++++++--------------------------- src/parse.h | 55 ++++--- tests/Makefile.am | 6 + tests/cat_fastq.c | 21 +++ tests/parse_fastq | 5 + tests/random_fastq.c | 146 +++++++++++++++++++ 15 files changed, 381 insertions(+), 280 deletions(-) create mode 100644 tests/Makefile.am create mode 100644 tests/cat_fastq.c create mode 100755 tests/parse_fastq create mode 100644 tests/random_fastq.c diff --git a/Makefile.am b/Makefile.am index b91f90b..164fbcb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ ACLOCAL_AMFLAGS = -I m4 -SUBDIRS = src doc +SUBDIRS = src tests doc diff --git a/configure.ac b/configure.ac index f106f53..4e6c266 100644 --- a/configure.ac +++ b/configure.ac @@ -28,9 +28,6 @@ AC_PROG_LIBTOOL AC_CHECK_FUNC(fileno, , AC_MSG_ERROR([The 'fileno' function is missing.])) -# check zlib -AX_CHECK_ZLIB - # check pcre AC_CHECK_PROG(HAVE_PCRE, pcre-config, yes, no) if test "x$HAVE_PCRE" = "xno" @@ -50,6 +47,7 @@ CXXFLAGS=$CFLAGS AC_CONFIG_FILES([Makefile src/Makefile + tests/Makefile doc/Makefile src/version.h]) diff --git a/src/fastq-grep.c b/src/fastq-grep.c index 8eb6f7f..4d1603f 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -59,10 +59,10 @@ void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re) int ovector[3]; size_t count = 0; - fastq_t* fqf = fastq_open(fin); - seq_t* seq = fastq_alloc_seq(); + fastq_t* fqf = fastq_create(fin); + seq_t* seq = seq_create(); - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { rc = pcre_exec(re, /* pattern */ NULL, /* extra data */ @@ -82,8 +82,8 @@ void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re) } } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); if (count_flag) fprintf(fout, "%zu\n", count); } diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c index 26792de..92f4314 100644 --- a/src/fastq-kmers.c +++ b/src/fastq-kmers.c @@ -109,13 +109,13 @@ void unpackkmer( uint32_t kmer, char* s, int k ) void count_fastq_kmers(FILE* fin, uint32_t* cs) { - seq_t* seq = fastq_alloc_seq(); - fastq_t* fqf = fastq_open(fin); + seq_t* seq = seq_create(); + fastq_t* fqf = fastq_create(fin); int i; int n; uint32_t kmer; - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { n = (int)seq->seq.n - k + 1; for (i = 0; i < n; i++) { if( packkmer(seq->seq.s + i, &kmer, k) ) { @@ -124,8 +124,8 @@ void count_fastq_kmers(FILE* fin, uint32_t* cs) } } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); } diff --git a/src/fastq-match.c b/src/fastq-match.c index bd040a6..1e4d9c7 100644 --- a/src/fastq-match.c +++ b/src/fastq-match.c @@ -47,10 +47,10 @@ void fastq_match(FILE* fin, FILE* fout, sw_t* sw) { int score; - fastq_t* fqf = fastq_open(fin); - seq_t* seq = fastq_alloc_seq(); + fastq_t* fqf = fastq_create(fin); + seq_t* seq = seq_create(); - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { fprintf(fout, "%s\t", seq->seq.s); fastq_sw_conv_seq((unsigned char*)seq->seq.s, seq->seq.n); @@ -59,8 +59,8 @@ void fastq_match(FILE* fin, FILE* fout, sw_t* sw) fprintf(fout, "%d\n", score); } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); } diff --git a/src/fastq-qual.c b/src/fastq-qual.c index 5201376..5d8e4da 100644 --- a/src/fastq-qual.c +++ b/src/fastq-qual.c @@ -42,12 +42,12 @@ void print_help() void tally_quals(FILE* fin, unsigned int** xs, size_t* n) { - seq_t* seq = fastq_alloc_seq(); - fastq_t* fqf = fastq_open(fin); + seq_t* seq = seq_create(); + fastq_t* fqf = fastq_create(fin); size_t i; - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { if (seq->qual.n > *n) { *xs = realloc_or_die(*xs, 255 * seq->qual.n * sizeof(unsigned int)); memset(*xs + *n, 0, 255 * (seq->qual.n - *n) * sizeof(unsigned int)); @@ -60,8 +60,8 @@ void tally_quals(FILE* fin, unsigned int** xs, size_t* n) } } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); } diff --git a/src/fastq-qualadj.c b/src/fastq-qualadj.c index 488fdad..51919b0 100644 --- a/src/fastq-qualadj.c +++ b/src/fastq-qualadj.c @@ -44,12 +44,12 @@ void print_help() void fastq_qualadj(FILE* fin, FILE* fout, int offset) { - fastq_t* fqf = fastq_open(fin); - seq_t* seq = fastq_alloc_seq(); + fastq_t* fqf = fastq_create(fin); + seq_t* seq = seq_create(); size_t i; int c; - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { for (i = 0; i < seq->qual.n; ++i) { c = (int) seq->qual.s[i] + offset; c = c < 0 ? 0 : (c > 126 ? 126: c); @@ -59,8 +59,8 @@ void fastq_qualadj(FILE* fin, FILE* fout, int offset) fastq_print(fout, seq); } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); } diff --git a/src/fastq-sample.c b/src/fastq-sample.c index 4074df9..d3c2b55 100644 --- a/src/fastq-sample.c +++ b/src/fastq-sample.c @@ -39,7 +39,7 @@ void print_help() "Options:\n" " -n N the number of reads to sample (default: 10000)\n" " -p N the proportion of the total reads to sample\n" -" -o, --output=PREFIX output file prefix\n" +" -o, --output=PREFIX output file prefix\n" " -c, --complement-output=PREFIX\n" " output reads not included in the random sample to\n" " a file (or files) with the given prefix (by default,\n" @@ -55,10 +55,10 @@ void print_help() /* count the number of entries in a fastq file */ unsigned long count_entries(fastq_t* fqf) { - seq_t* seq = fastq_alloc_seq(); + seq_t* seq = seq_create(); unsigned long n = 0; - while (fastq_next(fqf, seq)) ++n; - fastq_free_seq(seq); + while (fastq_read(fqf, seq)) ++n; + seq_free(seq); return n; } @@ -126,8 +126,8 @@ void fastq_sample(unsigned long rng_seed, unsigned long n, n2; - fastq_t* f1 = fastq_open(file1); - fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2); + fastq_t* f1 = fastq_create(file1); + fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2); n = count_entries(f1); if (f2 != NULL) { @@ -245,12 +245,12 @@ void fastq_sample(unsigned long rng_seed, unsigned long j = 0; // index into xs int ret; - seq_t* seq1 = fastq_alloc_seq(); - seq_t* seq2 = fastq_alloc_seq(); + seq_t* seq1 = seq_create(); + seq_t* seq2 = seq_create(); - while (j < k && fastq_next(f1, seq1)) { + while (j < k && fastq_read(f1, seq1)) { if (f2 != NULL){ - ret = fastq_next(f2, seq2); + ret = fastq_read(f2, seq2); if (ret == 0) { fputs("Input files have differing numbers of entries.\n", stderr); exit(1); @@ -272,10 +272,10 @@ void fastq_sample(unsigned long rng_seed, ++i; } - fastq_free_seq(seq1); - fastq_free_seq(seq2); - fastq_close(f1); - if (f2 != NULL) fastq_close(f2); + seq_free(seq1); + seq_free(seq2); + fastq_free(f1); + if (f2 != NULL) fastq_free(f2); fclose(fout1); if (fout2 != NULL) fclose(fout2); diff --git a/src/fastq-uniq.c b/src/fastq-uniq.c index 2e50762..3a92565 100644 --- a/src/fastq-uniq.c +++ b/src/fastq-uniq.c @@ -49,10 +49,10 @@ static size_t total_reads; void fastq_hash(FILE* fin, hash_table* T) { - fastq_t* fqf = fastq_open(fin); - seq_t* seq = fastq_alloc_seq(); + fastq_t* fqf = fastq_create(fin); + seq_t* seq = seq_create(); - while (fastq_next(fqf, seq)) { + while (fastq_read(fqf, seq)) { inc_hash_table(T, seq->seq.s, seq->seq.n); total_reads++; @@ -61,8 +61,8 @@ void fastq_hash(FILE* fin, hash_table* T) } } - fastq_free_seq(seq); - fastq_close(fqf); + seq_free(seq); + fastq_free(fqf); } diff --git a/src/parse.c b/src/parse.c index 61e3efa..e27a385 100644 --- a/src/parse.c +++ b/src/parse.c @@ -1,272 +1,178 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - */ + +#include +#include +#include #include "parse.h" #include "common.h" -#include -#include - -static const size_t init_str_size = 128; -static const size_t fastq_buf_size = 4096; -static void fastq_alloc_str(str_t* s) +static void str_init(str_t* str) { - s->s = malloc_or_die(init_str_size); - s->s[0] = '\0'; - s->n = 0; - s->size = init_str_size; + str->n = 0; + str->size = 128; + str->s = malloc_or_die(str->size = 0); + str->s[0] = '\0'; } -static void fastq_expand_str(str_t* s) +static void str_free(str_t* str) { - s->size *= 2; - realloc_or_die(s->s, s->size); + free(str->s); } -seq_t* fastq_alloc_seq() +/* Reserve space for `size` more characters. */ +static void str_reserve_extra(str_t* str, size_t size) { - seq_t* seq = malloc_or_die(sizeof(seq_t)); - fastq_alloc_str(&seq->id1); - fastq_alloc_str(&seq->seq); - fastq_alloc_str(&seq->id2); - fastq_alloc_str(&seq->qual); - - return seq; + if (str->n + size > str->size) { + if (str->n + size > 2 * str->size) { + str->size = str->n + size; + } + else str->size *= 2; + str->s = realloc_or_die(str->s, str->size * sizeof(char)); + } } -void fastq_free_seq(seq_t* seq) +/* Copy n characters from c to the end of str. */ +static void str_append(str_t* str, char* c, size_t n) { - free(seq->id1.s); - free(seq->seq.s); - free(seq->id2.s); - free(seq->qual.s); - free(seq); + str_reserve_extra(str, n); + memcpy(str->s + str->n, c, n); + str->n += n; + str->s[str->n] = '\0'; } -typedef enum +seq_t* seq_create() { - STATE_EOF, - STATE_ERROR, - STATE_ID1, - STATE_SEQ, - STATE_ID2, - STATE_QUAL - -} fastq_state; - - -fastq_t* fastq_open(FILE* f) -{ - fastq_t* fqf = malloc_or_die(sizeof(fastq_t)); - or_die((int)((fqf->file = gzdopen(fileno(f), "rb")) != NULL), - "Can not open gzip file."); - - fqf->state = STATE_ID1; - fqf->buf = malloc_or_die(fastq_buf_size); - fqf->buf[0] = '\0'; - fqf->c = fqf->buf; - - return fqf; + seq_t* seq = malloc_or_die(sizeof(seq_t)); + str_init(&seq->id1); + str_init(&seq->seq); + str_init(&seq->id2); + str_init(&seq->qual); + return seq; } -void fastq_close(fastq_t* fqf) +void seq_free(seq_t* seq) { - gzclose(fqf->file); - free(fqf->buf); - free(fqf); + str_free(&seq->id1); + str_free(&seq->seq); + str_free(&seq->id2); + str_free(&seq->qual); + free(seq); } -void fastq_refill(fastq_t* f) -{ - int errnum; - const char* errmsg; +static const size_t parser_buf_size = 1000000; - int n = gzread(f->file, f->buf, fastq_buf_size - 1); - if (n <= 0) { - if (gzeof(f->file)) { - f->state = STATE_EOF; - n = 0; - } - else { - errmsg = gzerror(f->file, &errnum); - fprintf(stderr, "I/O error: %s\n", errmsg); - exit(1); - } - } +struct fastq_t_ +{ + FILE* file; + size_t readlen; + char* buf; + char* next; + bool linestart; +}; - f->buf[n] = '\0'; - f->c = f->buf; -} -void fastq_get_line(fastq_t* f, str_t* s) +fastq_t* fastq_create(FILE* file) { - size_t i = 0; - - if (f->state == STATE_EOF) goto fastq_get_line_done; - - while (1) { - switch (*f->c) { - case '\0': - fastq_refill(f); - if (f->state == STATE_EOF) goto fastq_get_line_done; - break; - - case '\r': - f->c++; - break; - - case '\n': - goto fastq_get_line_done; - - default: - while (s->size < i + 2) { - fastq_expand_str(s); - } - if (s) s->s[i++] = *f->c; - f->c++; - } - - } - -fastq_get_line_done: - if (s) { - s->s[i] = '\0'; - s->n = i; - } + fastq_t* f = malloc_or_die(sizeof(fastq_t)); + f->file = file; + f->next = f->buf = malloc_or_die(parser_buf_size); + f->readlen = 0; + f->linestart = true; + return f; } - -int fastq_next(fastq_t* f, seq_t* seq) +void fastq_free(fastq_t* f) { - if (f->state == STATE_EOF) return 0; + free(f->buf); + free(f); +} - while (1) { - /* read more, if needed */ - if (*f->c == '\0' ) { - fastq_refill(f); - if (f->state == STATE_EOF) return 0; - continue; - } +typedef enum { + FASTQ_STATE_ID1, /* Reading ID1. */ + FASTQ_STATE_SEQ, /* Reading the sequence. */ + FASTQ_STATE_ID2, /* Reading ID2. */ + FASTQ_STATE_QUAL, /* Reading quality scores. */ +} fastq_parser_state_t; - /* skip over leading whitespace */ - else if (isspace(*f->c)) { - /* do nothing */ - } - /* skip comments */ - /* - else if (*f->c == ';') { - fastq_get_line(f, NULL); - if (f->state == STATE_EOF) return 0; - } - */ - - /* read id1 */ - else if (f->state == STATE_ID1) { - if (*f->c == '@' || *f->c == '>') { - f->c++; - fastq_get_line(f, &seq->id1); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_SEQ; - } - else { - fprintf(stderr, - "Malformed FASTQ file: expecting an '@' or '>', saw a '%c'\n", - *f->c); - exit(1); +bool fastq_read(fastq_t* f, seq_t* seq) +{ + seq->id1.n = seq->seq.n = seq->id2.n = seq->qual.n = 0; + fastq_parser_state_t state = FASTQ_STATE_ID1; + char* end = f->buf + f->readlen; + do { + while (f->next < end) { + /* Consume pointless special characters prefixing IDs */ + if ((state == FASTQ_STATE_ID1 && f->linestart && f->next[0] == '@') || + (state == FASTQ_STATE_ID2 && f->linestart && f->next[0] == '+')) { + f->linestart = false; + ++f->next; + continue; } - } - - /* read sequence */ - else if (f->state == STATE_SEQ) { - fastq_get_line(f, &seq->seq); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_ID2; - } - - /* read id2 */ - else if (f->state == STATE_ID2) { - if (*f->c == '+') { - f->c++; - fastq_get_line(f, &seq->id2); - if (f->state == STATE_EOF) return 0; - f->state = STATE_QUAL; + char* u = memchr(f->next, '\n', end - f->next); + if (u == NULL) { + f->linestart = false; + u = end; } - else { - /* fasta style entry */ - seq->id2.s[0] = '\0'; - seq->qual.s[0] = '\0'; - - f->state = STATE_ID1; - break; + else f->linestart = true; + + switch (state) { + case FASTQ_STATE_ID1: + str_append(&seq->id1, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_SEQ; + break; + + case FASTQ_STATE_SEQ: + str_append(&seq->seq, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_ID2; + break; + + case FASTQ_STATE_ID2: + str_append(&seq->id2, f->next, u - f->next); + if (f->linestart) state = FASTQ_STATE_QUAL; + break; + + case FASTQ_STATE_QUAL: + str_append(&seq->qual, f->next, u - f->next); + if (f->linestart) { + f->next = u + 1; + return true; + } + break; } - } - /* read quality string */ - else if (f->state == STATE_QUAL) { - fastq_get_line(f, &seq->qual); - if (f->state == STATE_EOF) return 1; - - f->state = STATE_ID1; - break; + f->next = u + 1; } - else { - fputs("Inexplicable error in fastq parser.\n", stderr); - exit(1); - } + /* Try to read more. */ + f->readlen = fread(f->buf, 1, parser_buf_size, f->file); + f->next = f->buf; + end = f->buf + f->readlen; + } while (f->readlen); - f->c++; - } - - return 1; + return false; } -void fastq_rewind(fastq_t* fqf) +void fastq_print(FILE* fout, const seq_t* seq) { - gzrewind(fqf->file); - fqf->state = STATE_ID1; - fqf->buf[0] = '\0'; - fqf->c = fqf->buf; -} - - -void fastq_print(FILE* fout, seq_t* seq) -{ - /* FASTQ */ - if (seq->qual.n > 0) { - fprintf(fout, "@%s\n%s\n+%s\n%s\n", - seq->id1.s, - seq->seq.s, - seq->id2.s, - seq->qual.s ); - } - - /* FASTA */ - else { - fprintf(fout, ">%s\n%s\n", - seq->id1.s, - seq->seq.s ); - } + fprintf(fout, "@%s\n%s\n+%s\n%s\n", + seq->id1.s, + seq->seq.s, + seq->id2.s, + seq->qual.s ); } diff --git a/src/parse.h b/src/parse.h index d9bbac6..8500f0c 100644 --- a/src/parse.h +++ b/src/parse.h @@ -11,10 +11,10 @@ #ifndef FASTQ_TOOLS_PARSE_H #define FASTQ_TOOLS_PARSE_H -#include -#include - +#include +#include +/* A string structure to keep-track of a reserved space. */ typedef struct { char* s; /* null-terminated string */ @@ -23,7 +23,7 @@ typedef struct } str_t; - +/* A single fastq entry. */ typedef struct { str_t id1; @@ -33,25 +33,44 @@ typedef struct } seq_t; -seq_t* fastq_alloc_seq(); -void fastq_free_seq(seq_t*); +/* Allocate a new empty seq_t. */ +seq_t* seq_create(); -typedef struct -{ - gzFile file; - int state; - char* buf; - char* c; -} fastq_t; +/* Free a seq allocated with seq_create. */ +void seq_free(seq_t* seq); + + +/* Internal data for the fastq parser. */ +typedef struct fastq_t_ fastq_t; + + +/* Create a new fastq parser object. + * + * Args: + * file: A file that has been opened for reading. + */ +fastq_t* fastq_create(FILE* file); + + +/* Free memory associated with a fastq_t object. */ +void fastq_free(fastq_t*); -fastq_t* fastq_open(FILE*); -void fastq_close(fastq_t*); -int fastq_next(fastq_t*, seq_t*); -void fastq_rewind(fastq_t*); +/* Read one fastq entry. + * + * Args: + * f: A fastq_t parser object. + * seq: A seq_t object that has been allocated with seq_create. + * + * Returns: + * True if an entry was read, false if end-of-file was reached. + */ +bool fastq_read(fastq_t* f, seq_t* seq); + -void fastq_print(FILE* fout, seq_t* seq); +/* Print a fastq entry. */ +void fastq_print(FILE* fout, const seq_t* seq); #endif diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..3bd6e06 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,6 @@ + +check_PROGRAMS = random_fastq cat_fastq + +TESTS = parse_fastq + + diff --git a/tests/cat_fastq.c b/tests/cat_fastq.c new file mode 100644 index 0000000..449ee5d --- /dev/null +++ b/tests/cat_fastq.c @@ -0,0 +1,21 @@ + +#include "../src/parse.c" +#include "../src/common.c" + +int main() +{ + seq_t* seq = seq_create(); + fastq_t* f = fastq_create(stdin); + + while (fastq_read(f, seq)) { + fastq_print(stdout, seq); + } + + fastq_free(f); + seq_free(seq); + + return EXIT_SUCCESS; +} + + + diff --git a/tests/parse_fastq b/tests/parse_fastq new file mode 100755 index 0000000..0be008a --- /dev/null +++ b/tests/parse_fastq @@ -0,0 +1,5 @@ +#!/bin/sh + +./random_fastq | head -n 40000 | ./cat_fastq > /dev/null + + diff --git a/tests/random_fastq.c b/tests/random_fastq.c new file mode 100644 index 0000000..b4853ce --- /dev/null +++ b/tests/random_fastq.c @@ -0,0 +1,146 @@ + +/* + random_fastq + ------------ + Generate random data in FASTQ format. +*/ + +#include +#include +#include + +static void print_help() +{ + printf( +"Usage: random_fastq [option]...\n" +"Generate an endless stream of random FASTQ data to standard out.\n\n" +"Options:\n" +" TODO\n\n" +"Beware: the only purpose of this program is test quip.\n" +"No particular guarantees are made.\n\n"); +} + + +/* Draw n samples from a categorial distribution of size k with cumulative + * distribution given by cs and element us. The sample is stored in xs which + * is assumed to be of appropriate length. */ +void randcat(const char* us, const double* cs, size_t k, char* xs, size_t n) +{ + size_t i; + double r; + for (i = 0; i < n; ++i) { + r = drand48(); + int a = 0, b = (int) k, c; + do { + c = (a + b) / 2; + if (r <= cs[c]) b = c; + else a = c + 1; + } while (a < b); + + xs[i] = us[a]; + } +} + + +int main(int argc, char* argv[]) +{ + static struct option long_options[] = + { + {"min-length", required_argument, NULL, 'm'}, + {"max-length", required_argument, NULL, 'M'}, + {"length", required_argument, NULL, 'l'}, + {"id-length", required_argument, NULL, 'i'}, + {"help", no_argument, NULL, 'h'}, + {NULL, 0, NULL, 0} + }; + + size_t min_len = 100, max_len = 100; + size_t id_len = 50; + + int opt, opt_idx; + while (1) { + opt = getopt_long(argc, argv, "h", long_options, &opt_idx); + + if (opt == -1) break; + + switch (opt) { + case 'm': + min_len = (size_t) strtoul(optarg, NULL, 10); + break; + + case 'M': + max_len = (size_t) strtoul(optarg, NULL, 10); + break; + + case 'l': + min_len = max_len = (size_t) strtoul(optarg, NULL, 10); + break; + + case 'i': + id_len = (size_t) strtoul(optarg, NULL, 10); + break; + + case 'h': + print_help(); + return EXIT_SUCCESS; + + case '?': + return EXIT_FAILURE; + + default: + abort(); + } + } + + char nucleotides[5] = {'A', 'C', 'G', 'T', 'N'}; + double nuc_cs[5] = {0.28, 0.49, 0.70, 0.90, 1.00}; + + char qualities[64]; + double qual_cs[64]; + size_t i; + double last_c = 0.0; + for (i = 0; i < 64; ++i) { + qualities[i] = '!' + i; + last_c = qual_cs[i] = last_c + 1.0 / 64.0; + } + + char id_chars[94]; + double id_cs[94]; + last_c = 0.0; + for (i = 0; i < 94; ++i) { + id_chars[i] = '!' + i; + last_c = id_cs[i] = last_c + 1.0 / 94.0; + } + + char* id = malloc(id_len + 1); + char* seq = malloc(max_len + 1); + char* qual = malloc(max_len + 1); + size_t len = min_len; + + while (1) { + if (max_len > min_len) { + len = min_len + (size_t) (drand48() * (double) (1 + max_len - min_len)); + } + + randcat(id_chars, id_cs, 94, id, id_len); + id[id_len] = '\0'; + + randcat(nucleotides, nuc_cs, 5, seq, len); + seq[len] = '\0'; + + randcat(qualities, qual_cs, 64, qual, len); + qual[len] = '\0'; + + printf( + "@%s\n%s\n+\n%s\n", + id, seq, qual); + } + + /* Yeah, right. As if we'll ever get here. */ + free(id); + free(seq); + free(qual); + + return EXIT_SUCCESS; +} + -- 2.39.2