ACLOCAL_AMFLAGS = -I m4
-SUBDIRS = src doc
+SUBDIRS = src tests doc
AC_CHECK_FUNC(fileno, ,
AC_MSG_ERROR([The 'fileno' function is missing.]))
-# check zlib
-AX_CHECK_ZLIB
-
# check pcre
AC_CHECK_PROG(HAVE_PCRE, pcre-config, yes, no)
if test "x$HAVE_PCRE" = "xno"
AC_CONFIG_FILES([Makefile
src/Makefile
+ tests/Makefile
doc/Makefile
src/version.h])
int ovector[3];
size_t count = 0;
- fastq_t* fqf = fastq_open(fin);
- seq_t* seq = fastq_alloc_seq();
+ fastq_t* fqf = fastq_create(fin);
+ seq_t* seq = seq_create();
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
rc = pcre_exec(re, /* pattern */
NULL, /* extra data */
}
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
if (count_flag) fprintf(fout, "%zu\n", count);
}
void count_fastq_kmers(FILE* fin, uint32_t* cs)
{
- seq_t* seq = fastq_alloc_seq();
- fastq_t* fqf = fastq_open(fin);
+ seq_t* seq = seq_create();
+ fastq_t* fqf = fastq_create(fin);
int i;
int n;
uint32_t kmer;
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
n = (int)seq->seq.n - k + 1;
for (i = 0; i < n; i++) {
if( packkmer(seq->seq.s + i, &kmer, k) ) {
}
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
}
{
int score;
- fastq_t* fqf = fastq_open(fin);
- seq_t* seq = fastq_alloc_seq();
+ fastq_t* fqf = fastq_create(fin);
+ seq_t* seq = seq_create();
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
fprintf(fout, "%s\t", seq->seq.s);
fastq_sw_conv_seq((unsigned char*)seq->seq.s, seq->seq.n);
fprintf(fout, "%d\n", score);
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
}
void tally_quals(FILE* fin, unsigned int** xs, size_t* n)
{
- seq_t* seq = fastq_alloc_seq();
- fastq_t* fqf = fastq_open(fin);
+ seq_t* seq = seq_create();
+ fastq_t* fqf = fastq_create(fin);
size_t i;
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
if (seq->qual.n > *n) {
*xs = realloc_or_die(*xs, 255 * seq->qual.n * sizeof(unsigned int));
memset(*xs + *n, 0, 255 * (seq->qual.n - *n) * sizeof(unsigned int));
}
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
}
void fastq_qualadj(FILE* fin, FILE* fout, int offset)
{
- fastq_t* fqf = fastq_open(fin);
- seq_t* seq = fastq_alloc_seq();
+ fastq_t* fqf = fastq_create(fin);
+ seq_t* seq = seq_create();
size_t i;
int c;
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
for (i = 0; i < seq->qual.n; ++i) {
c = (int) seq->qual.s[i] + offset;
c = c < 0 ? 0 : (c > 126 ? 126: c);
fastq_print(fout, seq);
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
}
"Options:\n"
" -n N the number of reads to sample (default: 10000)\n"
" -p N the proportion of the total reads to sample\n"
-" -o, --output=PREFIX output file prefix\n"
+" -o, --output=PREFIX output file prefix\n"
" -c, --complement-output=PREFIX\n"
" output reads not included in the random sample to\n"
" a file (or files) with the given prefix (by default,\n"
/* count the number of entries in a fastq file */
unsigned long count_entries(fastq_t* fqf)
{
- seq_t* seq = fastq_alloc_seq();
+ seq_t* seq = seq_create();
unsigned long n = 0;
- while (fastq_next(fqf, seq)) ++n;
- fastq_free_seq(seq);
+ while (fastq_read(fqf, seq)) ++n;
+ seq_free(seq);
return n;
}
unsigned long n, n2;
- fastq_t* f1 = fastq_open(file1);
- fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2);
+ fastq_t* f1 = fastq_create(file1);
+ fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2);
n = count_entries(f1);
if (f2 != NULL) {
unsigned long j = 0; // index into xs
int ret;
- seq_t* seq1 = fastq_alloc_seq();
- seq_t* seq2 = fastq_alloc_seq();
+ seq_t* seq1 = seq_create();
+ seq_t* seq2 = seq_create();
- while (j < k && fastq_next(f1, seq1)) {
+ while (j < k && fastq_read(f1, seq1)) {
if (f2 != NULL){
- ret = fastq_next(f2, seq2);
+ ret = fastq_read(f2, seq2);
if (ret == 0) {
fputs("Input files have differing numbers of entries.\n", stderr);
exit(1);
++i;
}
- fastq_free_seq(seq1);
- fastq_free_seq(seq2);
- fastq_close(f1);
- if (f2 != NULL) fastq_close(f2);
+ seq_free(seq1);
+ seq_free(seq2);
+ fastq_free(f1);
+ if (f2 != NULL) fastq_free(f2);
fclose(fout1);
if (fout2 != NULL) fclose(fout2);
void fastq_hash(FILE* fin, hash_table* T)
{
- fastq_t* fqf = fastq_open(fin);
- seq_t* seq = fastq_alloc_seq();
+ fastq_t* fqf = fastq_create(fin);
+ seq_t* seq = seq_create();
- while (fastq_next(fqf, seq)) {
+ while (fastq_read(fqf, seq)) {
inc_hash_table(T, seq->seq.s, seq->seq.n);
total_reads++;
}
}
- fastq_free_seq(seq);
- fastq_close(fqf);
+ seq_free(seq);
+ fastq_free(fqf);
}
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
#include "parse.h"
#include "common.h"
-#include <stdlib.h>
-#include <ctype.h>
-
-static const size_t init_str_size = 128;
-static const size_t fastq_buf_size = 4096;
-static void fastq_alloc_str(str_t* s)
+static void str_init(str_t* str)
{
- s->s = malloc_or_die(init_str_size);
- s->s[0] = '\0';
- s->n = 0;
- s->size = init_str_size;
+ str->n = 0;
+ str->size = 128;
+ str->s = malloc_or_die(str->size = 0);
+ str->s[0] = '\0';
}
-static void fastq_expand_str(str_t* s)
+static void str_free(str_t* str)
{
- s->size *= 2;
- realloc_or_die(s->s, s->size);
+ free(str->s);
}
-seq_t* fastq_alloc_seq()
+/* Reserve space for `size` more characters. */
+static void str_reserve_extra(str_t* str, size_t size)
{
- seq_t* seq = malloc_or_die(sizeof(seq_t));
- fastq_alloc_str(&seq->id1);
- fastq_alloc_str(&seq->seq);
- fastq_alloc_str(&seq->id2);
- fastq_alloc_str(&seq->qual);
-
- return seq;
+ if (str->n + size > str->size) {
+ if (str->n + size > 2 * str->size) {
+ str->size = str->n + size;
+ }
+ else str->size *= 2;
+ str->s = realloc_or_die(str->s, str->size * sizeof(char));
+ }
}
-void fastq_free_seq(seq_t* seq)
+/* Copy n characters from c to the end of str. */
+static void str_append(str_t* str, char* c, size_t n)
{
- free(seq->id1.s);
- free(seq->seq.s);
- free(seq->id2.s);
- free(seq->qual.s);
- free(seq);
+ str_reserve_extra(str, n);
+ memcpy(str->s + str->n, c, n);
+ str->n += n;
+ str->s[str->n] = '\0';
}
-typedef enum
+seq_t* seq_create()
{
- STATE_EOF,
- STATE_ERROR,
- STATE_ID1,
- STATE_SEQ,
- STATE_ID2,
- STATE_QUAL
-
-} fastq_state;
-
-
-fastq_t* fastq_open(FILE* f)
-{
- fastq_t* fqf = malloc_or_die(sizeof(fastq_t));
- or_die((int)((fqf->file = gzdopen(fileno(f), "rb")) != NULL),
- "Can not open gzip file.");
-
- fqf->state = STATE_ID1;
- fqf->buf = malloc_or_die(fastq_buf_size);
- fqf->buf[0] = '\0';
- fqf->c = fqf->buf;
-
- return fqf;
+ seq_t* seq = malloc_or_die(sizeof(seq_t));
+ str_init(&seq->id1);
+ str_init(&seq->seq);
+ str_init(&seq->id2);
+ str_init(&seq->qual);
+ return seq;
}
-void fastq_close(fastq_t* fqf)
+void seq_free(seq_t* seq)
{
- gzclose(fqf->file);
- free(fqf->buf);
- free(fqf);
+ str_free(&seq->id1);
+ str_free(&seq->seq);
+ str_free(&seq->id2);
+ str_free(&seq->qual);
+ free(seq);
}
-void fastq_refill(fastq_t* f)
-{
- int errnum;
- const char* errmsg;
+static const size_t parser_buf_size = 1000000;
- int n = gzread(f->file, f->buf, fastq_buf_size - 1);
- if (n <= 0) {
- if (gzeof(f->file)) {
- f->state = STATE_EOF;
- n = 0;
- }
- else {
- errmsg = gzerror(f->file, &errnum);
- fprintf(stderr, "I/O error: %s\n", errmsg);
- exit(1);
- }
- }
+struct fastq_t_
+{
+ FILE* file;
+ size_t readlen;
+ char* buf;
+ char* next;
+ bool linestart;
+};
- f->buf[n] = '\0';
- f->c = f->buf;
-}
-void fastq_get_line(fastq_t* f, str_t* s)
+fastq_t* fastq_create(FILE* file)
{
- size_t i = 0;
-
- if (f->state == STATE_EOF) goto fastq_get_line_done;
-
- while (1) {
- switch (*f->c) {
- case '\0':
- fastq_refill(f);
- if (f->state == STATE_EOF) goto fastq_get_line_done;
- break;
-
- case '\r':
- f->c++;
- break;
-
- case '\n':
- goto fastq_get_line_done;
-
- default:
- while (s->size < i + 2) {
- fastq_expand_str(s);
- }
- if (s) s->s[i++] = *f->c;
- f->c++;
- }
-
- }
-
-fastq_get_line_done:
- if (s) {
- s->s[i] = '\0';
- s->n = i;
- }
+ fastq_t* f = malloc_or_die(sizeof(fastq_t));
+ f->file = file;
+ f->next = f->buf = malloc_or_die(parser_buf_size);
+ f->readlen = 0;
+ f->linestart = true;
+ return f;
}
-
-int fastq_next(fastq_t* f, seq_t* seq)
+void fastq_free(fastq_t* f)
{
- if (f->state == STATE_EOF) return 0;
+ free(f->buf);
+ free(f);
+}
- while (1) {
- /* read more, if needed */
- if (*f->c == '\0' ) {
- fastq_refill(f);
- if (f->state == STATE_EOF) return 0;
- continue;
- }
+typedef enum {
+ FASTQ_STATE_ID1, /* Reading ID1. */
+ FASTQ_STATE_SEQ, /* Reading the sequence. */
+ FASTQ_STATE_ID2, /* Reading ID2. */
+ FASTQ_STATE_QUAL, /* Reading quality scores. */
+} fastq_parser_state_t;
- /* skip over leading whitespace */
- else if (isspace(*f->c)) {
- /* do nothing */
- }
- /* skip comments */
- /*
- else if (*f->c == ';') {
- fastq_get_line(f, NULL);
- if (f->state == STATE_EOF) return 0;
- }
- */
-
- /* read id1 */
- else if (f->state == STATE_ID1) {
- if (*f->c == '@' || *f->c == '>') {
- f->c++;
- fastq_get_line(f, &seq->id1);
- if (f->state == STATE_EOF) return 0;
-
- f->state = STATE_SEQ;
- }
- else {
- fprintf(stderr,
- "Malformed FASTQ file: expecting an '@' or '>', saw a '%c'\n",
- *f->c);
- exit(1);
+bool fastq_read(fastq_t* f, seq_t* seq)
+{
+ seq->id1.n = seq->seq.n = seq->id2.n = seq->qual.n = 0;
+ fastq_parser_state_t state = FASTQ_STATE_ID1;
+ char* end = f->buf + f->readlen;
+ do {
+ while (f->next < end) {
+ /* Consume pointless special characters prefixing IDs */
+ if ((state == FASTQ_STATE_ID1 && f->linestart && f->next[0] == '@') ||
+ (state == FASTQ_STATE_ID2 && f->linestart && f->next[0] == '+')) {
+ f->linestart = false;
+ ++f->next;
+ continue;
}
- }
-
- /* read sequence */
- else if (f->state == STATE_SEQ) {
- fastq_get_line(f, &seq->seq);
- if (f->state == STATE_EOF) return 0;
-
- f->state = STATE_ID2;
- }
-
- /* read id2 */
- else if (f->state == STATE_ID2) {
- if (*f->c == '+') {
- f->c++;
- fastq_get_line(f, &seq->id2);
- if (f->state == STATE_EOF) return 0;
- f->state = STATE_QUAL;
+ char* u = memchr(f->next, '\n', end - f->next);
+ if (u == NULL) {
+ f->linestart = false;
+ u = end;
}
- else {
- /* fasta style entry */
- seq->id2.s[0] = '\0';
- seq->qual.s[0] = '\0';
-
- f->state = STATE_ID1;
- break;
+ else f->linestart = true;
+
+ switch (state) {
+ case FASTQ_STATE_ID1:
+ str_append(&seq->id1, f->next, u - f->next);
+ if (f->linestart) state = FASTQ_STATE_SEQ;
+ break;
+
+ case FASTQ_STATE_SEQ:
+ str_append(&seq->seq, f->next, u - f->next);
+ if (f->linestart) state = FASTQ_STATE_ID2;
+ break;
+
+ case FASTQ_STATE_ID2:
+ str_append(&seq->id2, f->next, u - f->next);
+ if (f->linestart) state = FASTQ_STATE_QUAL;
+ break;
+
+ case FASTQ_STATE_QUAL:
+ str_append(&seq->qual, f->next, u - f->next);
+ if (f->linestart) {
+ f->next = u + 1;
+ return true;
+ }
+ break;
}
- }
- /* read quality string */
- else if (f->state == STATE_QUAL) {
- fastq_get_line(f, &seq->qual);
- if (f->state == STATE_EOF) return 1;
-
- f->state = STATE_ID1;
- break;
+ f->next = u + 1;
}
- else {
- fputs("Inexplicable error in fastq parser.\n", stderr);
- exit(1);
- }
+ /* Try to read more. */
+ f->readlen = fread(f->buf, 1, parser_buf_size, f->file);
+ f->next = f->buf;
+ end = f->buf + f->readlen;
+ } while (f->readlen);
- f->c++;
- }
-
- return 1;
+ return false;
}
-void fastq_rewind(fastq_t* fqf)
+void fastq_print(FILE* fout, const seq_t* seq)
{
- gzrewind(fqf->file);
- fqf->state = STATE_ID1;
- fqf->buf[0] = '\0';
- fqf->c = fqf->buf;
-}
-
-
-void fastq_print(FILE* fout, seq_t* seq)
-{
- /* FASTQ */
- if (seq->qual.n > 0) {
- fprintf(fout, "@%s\n%s\n+%s\n%s\n",
- seq->id1.s,
- seq->seq.s,
- seq->id2.s,
- seq->qual.s );
- }
-
- /* FASTA */
- else {
- fprintf(fout, ">%s\n%s\n",
- seq->id1.s,
- seq->seq.s );
- }
+ fprintf(fout, "@%s\n%s\n+%s\n%s\n",
+ seq->id1.s,
+ seq->seq.s,
+ seq->id2.s,
+ seq->qual.s );
}
#ifndef FASTQ_TOOLS_PARSE_H
#define FASTQ_TOOLS_PARSE_H
-#include <stdio.h>
-#include <zlib.h>
-
+#include <stdbool.h>
+#include <stdlib.h>
+/* A string structure to keep-track of a reserved space. */
typedef struct
{
char* s; /* null-terminated string */
} str_t;
-
+/* A single fastq entry. */
typedef struct
{
str_t id1;
} seq_t;
-seq_t* fastq_alloc_seq();
-void fastq_free_seq(seq_t*);
+/* Allocate a new empty seq_t. */
+seq_t* seq_create();
-typedef struct
-{
- gzFile file;
- int state;
- char* buf;
- char* c;
-} fastq_t;
+/* Free a seq allocated with seq_create. */
+void seq_free(seq_t* seq);
+
+
+/* Internal data for the fastq parser. */
+typedef struct fastq_t_ fastq_t;
+
+
+/* Create a new fastq parser object.
+ *
+ * Args:
+ * file: A file that has been opened for reading.
+ */
+fastq_t* fastq_create(FILE* file);
+
+
+/* Free memory associated with a fastq_t object. */
+void fastq_free(fastq_t*);
-fastq_t* fastq_open(FILE*);
-void fastq_close(fastq_t*);
-int fastq_next(fastq_t*, seq_t*);
-void fastq_rewind(fastq_t*);
+/* Read one fastq entry.
+ *
+ * Args:
+ * f: A fastq_t parser object.
+ * seq: A seq_t object that has been allocated with seq_create.
+ *
+ * Returns:
+ * True if an entry was read, false if end-of-file was reached.
+ */
+bool fastq_read(fastq_t* f, seq_t* seq);
+
-void fastq_print(FILE* fout, seq_t* seq);
+/* Print a fastq entry. */
+void fastq_print(FILE* fout, const seq_t* seq);
#endif
--- /dev/null
+
+check_PROGRAMS = random_fastq cat_fastq
+
+TESTS = parse_fastq
+
+
--- /dev/null
+
+#include "../src/parse.c"
+#include "../src/common.c"
+
+int main()
+{
+ seq_t* seq = seq_create();
+ fastq_t* f = fastq_create(stdin);
+
+ while (fastq_read(f, seq)) {
+ fastq_print(stdout, seq);
+ }
+
+ fastq_free(f);
+ seq_free(seq);
+
+ return EXIT_SUCCESS;
+}
+
+
+
--- /dev/null
+#!/bin/sh
+
+./random_fastq | head -n 40000 | ./cat_fastq > /dev/null
+
+
--- /dev/null
+
+/*
+ random_fastq
+ ------------
+ Generate random data in FASTQ format.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <getopt.h>
+
+static void print_help()
+{
+ printf(
+"Usage: random_fastq [option]...\n"
+"Generate an endless stream of random FASTQ data to standard out.\n\n"
+"Options:\n"
+" TODO\n\n"
+"Beware: the only purpose of this program is test quip.\n"
+"No particular guarantees are made.\n\n");
+}
+
+
+/* Draw n samples from a categorial distribution of size k with cumulative
+ * distribution given by cs and element us. The sample is stored in xs which
+ * is assumed to be of appropriate length. */
+void randcat(const char* us, const double* cs, size_t k, char* xs, size_t n)
+{
+ size_t i;
+ double r;
+ for (i = 0; i < n; ++i) {
+ r = drand48();
+ int a = 0, b = (int) k, c;
+ do {
+ c = (a + b) / 2;
+ if (r <= cs[c]) b = c;
+ else a = c + 1;
+ } while (a < b);
+
+ xs[i] = us[a];
+ }
+}
+
+
+int main(int argc, char* argv[])
+{
+ static struct option long_options[] =
+ {
+ {"min-length", required_argument, NULL, 'm'},
+ {"max-length", required_argument, NULL, 'M'},
+ {"length", required_argument, NULL, 'l'},
+ {"id-length", required_argument, NULL, 'i'},
+ {"help", no_argument, NULL, 'h'},
+ {NULL, 0, NULL, 0}
+ };
+
+ size_t min_len = 100, max_len = 100;
+ size_t id_len = 50;
+
+ int opt, opt_idx;
+ while (1) {
+ opt = getopt_long(argc, argv, "h", long_options, &opt_idx);
+
+ if (opt == -1) break;
+
+ switch (opt) {
+ case 'm':
+ min_len = (size_t) strtoul(optarg, NULL, 10);
+ break;
+
+ case 'M':
+ max_len = (size_t) strtoul(optarg, NULL, 10);
+ break;
+
+ case 'l':
+ min_len = max_len = (size_t) strtoul(optarg, NULL, 10);
+ break;
+
+ case 'i':
+ id_len = (size_t) strtoul(optarg, NULL, 10);
+ break;
+
+ case 'h':
+ print_help();
+ return EXIT_SUCCESS;
+
+ case '?':
+ return EXIT_FAILURE;
+
+ default:
+ abort();
+ }
+ }
+
+ char nucleotides[5] = {'A', 'C', 'G', 'T', 'N'};
+ double nuc_cs[5] = {0.28, 0.49, 0.70, 0.90, 1.00};
+
+ char qualities[64];
+ double qual_cs[64];
+ size_t i;
+ double last_c = 0.0;
+ for (i = 0; i < 64; ++i) {
+ qualities[i] = '!' + i;
+ last_c = qual_cs[i] = last_c + 1.0 / 64.0;
+ }
+
+ char id_chars[94];
+ double id_cs[94];
+ last_c = 0.0;
+ for (i = 0; i < 94; ++i) {
+ id_chars[i] = '!' + i;
+ last_c = id_cs[i] = last_c + 1.0 / 94.0;
+ }
+
+ char* id = malloc(id_len + 1);
+ char* seq = malloc(max_len + 1);
+ char* qual = malloc(max_len + 1);
+ size_t len = min_len;
+
+ while (1) {
+ if (max_len > min_len) {
+ len = min_len + (size_t) (drand48() * (double) (1 + max_len - min_len));
+ }
+
+ randcat(id_chars, id_cs, 94, id, id_len);
+ id[id_len] = '\0';
+
+ randcat(nucleotides, nuc_cs, 5, seq, len);
+ seq[len] = '\0';
+
+ randcat(qualities, qual_cs, 64, qual, len);
+ qual[len] = '\0';
+
+ printf(
+ "@%s\n%s\n+\n%s\n",
+ id, seq, qual);
+ }
+
+ /* Yeah, right. As if we'll ever get here. */
+ free(id);
+ free(seq);
+ free(qual);
+
+ return EXIT_SUCCESS;
+}
+