-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq
-fastq_common_src=fastq-common.h fastq-common.c
-fastq_parse_src=fastq-parse.h fastq-parse.c
-fastq_sw_src=fastq-sw.h fastq-sw.c
+fastq_common_src=common.h common.c
+fastq_parse_src=parse.h parse.c
+fastq_sw_src=sw.h sw.c
+fastq_hash_src=hash.h hash.c
fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
fastq_grep_LDADD = $(PCRE_LIBS) -lz
fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src)
fastq_match_LDADD = -lz
+fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
+fastq_uniq_LDADD = -lz
+
--- /dev/null
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+
+#include "common.h"
+#include <stdlib.h>
+
+
+void or_die(int b, const char* msg)
+{
+ if (b == 0) {
+ fputs(msg, stderr);
+ exit(1);
+ }
+}
+
+
+void* malloc_or_die(size_t n)
+{
+ void* p = malloc(n);
+ if (p == NULL) {
+ fprintf(stderr, "Can not allocate %zu bytes.\n", n);
+ exit(1);
+ }
+ return p;
+}
+
+
+void* realloc_or_die(void* ptr, size_t n)
+{
+ void* p = realloc(ptr, n);
+ if (p == NULL) {
+ fprintf(stderr, "Can not allocate %zu bytes.\n", n);
+ exit(1);
+ }
+ return p;
+}
+
+
+FILE* fopen_or_die(const char* path, const char* mode)
+{
+ FILE* f = fopen(path, mode);
+ if (f == NULL) {
+ fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode);
+ exit(1);
+ }
+ return f;
+}
+
+
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * common :
+ * A few common functions, primarily for crashing whilst retaining our dignity.
+ *
+ */
+
+#ifndef FASTQ_TOOLS_COMMON_H
+#define FASTQ_TOOLS_COMMON_H
+
+#include <stdio.h>
+
+void or_die(int b, const char* msg);
+
+void* malloc_or_die(size_t);
+void* realloc_or_die(void*, size_t);
+FILE* fopen_or_die(const char*, const char*);
+
+#endif
+
+++ /dev/null
-
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- */
-
-
-#include "fastq-common.h"
-#include <stdlib.h>
-
-
-void or_die(int b, const char* msg)
-{
- if (b == 0) {
- fputs(msg, stderr);
- exit(1);
- }
-}
-
-
-void* malloc_or_die(size_t n)
-{
- void* p = malloc(n);
- if (p == NULL) {
- fprintf(stderr, "Can not allocate %zu bytes.\n", n);
- exit(1);
- }
- return p;
-}
-
-
-void* realloc_or_die(void* ptr, size_t n)
-{
- void* p = realloc(ptr, n);
- if (p == NULL) {
- fprintf(stderr, "Can not allocate %zu bytes.\n", n);
- exit(1);
- }
- return p;
-}
-
-
-FILE* fopen_or_die(const char* path, const char* mode)
-{
- FILE* f = fopen(path, mode);
- if (f == NULL) {
- fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode);
- exit(1);
- }
- return f;
-}
-
-
-
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- * common :
- * A few common functions, primarily for crashing whilst retaining our dignity.
- *
- */
-
-#ifndef FASTQ_TOOLS_COMMON_H
-#define FASTQ_TOOLS_COMMON_H
-
-#include <stdio.h>
-
-void or_die(int b, const char* msg);
-
-void* malloc_or_die(size_t);
-void* realloc_or_die(void*, size_t);
-FILE* fopen_or_die(const char*, const char*);
-
-#endif
-
*/
-#include "fastq-common.h"
-#include "fastq-parse.h"
+#include "common.h"
+#include "parse.h"
#include <stdio.h>
#include <string.h>
#include <getopt.h>
*
*/
-#include "fastq-common.h"
-#include "fastq-parse.h"
+#include "common.h"
+#include "parse.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
*/
-#include "fastq-common.h"
-#include "fastq-parse.h"
-#include "fastq-sw.h"
+#include "common.h"
+#include "parse.h"
+#include "sw.h"
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
void print_help()
{
- fprintf( stderr,
+ fprintf(stderr,
"fastq-match [OPTION]... QUERY [FILE]...\n"
"Perform Smith-Waterman local alignment of a query sequence\n"
"against each sequence in a fastq file.\n\n"
while (1) {
opt = getopt_long(argc, argv, "h", long_options, &opt_idx);
- if( opt == -1 ) break;
+ if (opt == -1) break;
switch (opt) {
case 0:
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- */
-
-#include "fastq-parse.h"
-#include "fastq-common.h"
-#include <stdlib.h>
-#include <ctype.h>
-
-
-static const size_t init_str_size = 128;
-static const size_t fastq_buf_size = 4096;
-
-static void fastq_alloc_str(str_t* s)
-{
- s->s = malloc_or_die(init_str_size);
- s->s[0] = '\0';
- s->n = 0;
- s->size = init_str_size;
-}
-
-
-static void fastq_expand_str(str_t* s)
-{
- s->size *= 2;
- realloc_or_die(s->s, s->size);
-}
-
-
-seq_t* fastq_alloc_seq()
-{
- seq_t* seq = malloc_or_die(sizeof(seq_t));
- fastq_alloc_str(&seq->id1);
- fastq_alloc_str(&seq->seq);
- fastq_alloc_str(&seq->id2);
- fastq_alloc_str(&seq->qual);
-
- return seq;
-}
-
-
-void fastq_free_seq(seq_t* seq)
-{
- free(seq->id1.s);
- free(seq->seq.s);
- free(seq->id2.s);
- free(seq->qual.s);
- free(seq);
-}
-
-
-typedef enum
-{
- STATE_EOF,
- STATE_ERROR,
- STATE_ID1,
- STATE_SEQ,
- STATE_ID2,
- STATE_QUAL
-
-} fastq_state;
-
-
-fastq_t* fastq_open(FILE* f)
-{
- fastq_t* fqf = malloc_or_die(sizeof(fastq_t));
- or_die((int)(fqf->file = gzdopen(fileno(f), "rb")),
- "Can not open gzip file.");
-
- fqf->state = STATE_ID1;
- fqf->buf = malloc_or_die(fastq_buf_size);
- fqf->buf[0] = '\0';
- fqf->c = fqf->buf;
-
- return fqf;
-}
-
-
-void fastq_close(fastq_t* fqf)
-{
- gzclose(fqf->file);
- free(fqf->buf);
- free(fqf);
-}
-
-
-void fastq_refill(fastq_t* f)
-{
- int errnum;
- const char* errmsg;
-
- int n = gzread(f->file, f->buf, fastq_buf_size - 1);
-
- if (n <= 0) {
- if (gzeof(f->file)) {
- f->state = STATE_EOF;
- n = 0;
- }
- else {
- errmsg = gzerror(f->file, &errnum);
- fprintf(stderr, "I/O error: %s\n", errmsg);
- exit(1);
- }
- }
-
- f->buf[n] = '\0';
- f->c = f->buf;
-}
-
-
-void fastq_get_line(fastq_t* f, str_t* s)
-{
- int i = 0;
-
- if (f->state == STATE_EOF) goto fastq_get_line_done;
-
- while (1) {
- switch (*f->c) {
- case '\0':
- fastq_refill(f);
- if (f->state == STATE_EOF) goto fastq_get_line_done;
- break;
-
- case '\r':
- f->c++;
- break;
-
- case '\n':
- goto fastq_get_line_done;
-
- default:
- while (s->size < i + 2) {
- fastq_expand_str(s);
- }
- if (s) s->s[i++] = *f->c;
- f->c++;
- }
-
- }
-
-fastq_get_line_done:
- if (s) {
- s->s[i] = '\0';
- s->n = i;
- }
-}
-
-
-
-int fastq_next(fastq_t* f, seq_t* seq)
-{
- if (f->state == STATE_EOF) return 0;
-
- while (1) {
-
- /* read more, if needed */
- if (*f->c == '\0' ) {
- fastq_refill(f);
- if (f->state == STATE_EOF) return 0;
- continue;
- }
-
- /* skip over leading whitespace */
- else if (isspace(*f->c)) {
- /* do nothing */
- }
-
- /* skip comments */
- else if (*f->c == ';') {
- fastq_get_line(f, NULL);
- if (f->state == STATE_EOF) return 0;
- }
-
- /* read id1 */
- else if (f->state == STATE_ID1) {
- if (*f->c == '@') {
- f->c++;
- fastq_get_line(f, &seq->id1);
- if (f->state == STATE_EOF) return 0;
-
- f->state = STATE_SEQ;
- }
- else {
- fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c);
- exit(1);
- }
- }
-
- /* read sequence */
- else if (f->state == STATE_SEQ) {
- fastq_get_line(f, &seq->seq);
- if (f->state == STATE_EOF) return 0;
-
- f->state = STATE_ID2;
- }
-
- /* read id2 */
- else if (f->state == STATE_ID2) {
- if (*f->c == '+') {
- f->c++;
- fastq_get_line(f, &seq->id2);
- if (f->state == STATE_EOF) return 0;
-
- f->state = STATE_QUAL;
- }
- else {
- fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c);
- exit(1);
- }
- }
-
- /* read quality string */
- else if (f->state == STATE_QUAL) {
- fastq_get_line(f, &seq->qual);
- if (f->state == STATE_EOF) return 1;
-
- f->state = STATE_ID1;
- break;
- }
-
- else {
- fputs("Inexplicable error in fastq parser.\n", stderr);
- exit(1);
- }
-
- f->c++;
- }
-
- return 1;
-}
-
-
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- * fastq-parse :
- * A parser for FASTQ files.
- *
- * This parser is mostly derivative of Heng Li's.
- * See: http://lh3lh3.users.sourceforge.net/kseq.shtml
- *
- */
-
-#ifndef FASTQ_TOOLS_PARSE_H
-#define FASTQ_TOOLS_PARSE_H
-
-#include <stdio.h>
-#include <zlib.h>
-
-
-typedef struct
-{
- char* s; /* null-terminated string */
- size_t n; /* length of s */
- size_t size; /* bytes allocated for s */
-} str_t;
-
-
-
-typedef struct
-{
- str_t id1;
- str_t seq;
- str_t id2;
- str_t qual;
-} seq_t;
-
-
-seq_t* fastq_alloc_seq();
-void fastq_free_seq(seq_t*);
-
-
-typedef struct
-{
- gzFile file;
- int state;
- char* buf;
- char* c;
-} fastq_t;
-
-
-fastq_t* fastq_open(FILE*);
-void fastq_close(fastq_t*);
-int fastq_next(fastq_t*, seq_t*);
-
-#endif
-
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- * This is a very simple, 'fast enough' implementation of the Smith-Waterman
- * algorithm specifically for short nucleotide sequences, working in O(mn) time
- * and O(m) space, implemented according to the original Gotoh paper and
- * Phil Green's implementation in cross_match.
- *
- * There is no fancy bit packing or vectorization, but such features would offer
- * diminishing returns when aligning short sequences such as high throughput
- * sequencing data. For example the Farrar SSE2 algorithm might be a tiny bit
- * faster, but would diminish portability.
- *
- */
-
-
-#include "fastq-sw.h"
-#include "fastq-common.h"
-#include <string.h>
-#include <stdlib.h>
-
-
-static const int sw_default_d[25] =
- /* A C G T N */
- { 1, -2, -2, -2 , 0,
- -2, 1, -2, -2, 0,
- -2, -2, 1, -2, 0,
- -2, -2, -2, 1, 0,
- 0, 0, 0, 0, 0 };
-
-static const int sw_mat50_d[25] =
- { 2, -2, 0, -2 , 0,
- -2, 2, -2, 0, 0,
- 0, -2, 2, -2, 0,
- -2, 0, -2, 2, 0,
- 0, 0, 0, 0, 0 };
-
-static const int sw_mat70_d[25] =
- { 2, -2, -1, -2 , 0,
- -2, 2, -2, -1, 0,
- -1, -2, 2, -2, 0,
- -2, -1, -2, 2, 0,
- 0, 0, 0, 0, 0 };
-
-
-static inline int imax(int a, int b)
-{
- return a > b ? a : b;
-}
-
-static inline int imax4(int a, int b, int c, int d)
-{
- return imax(imax(a,b), imax(c,d));
-}
-
-
-
-void fastq_sw_conv_seq(unsigned char* seq, int n)
-{
- while (*seq && n) {
- switch (*seq) {
- case 'A' :
- case 'a':
- case 'U':
- case 'u':
- *seq = 0;
- break;
-
- case 'C':
- case 'c':
- *seq = 1;
- break;
-
- case 'G':
- case 'g':
- *seq = 2;
- break;
-
- case 'T':
- case 't':
- *seq = 3;
- break;
-
- case 'N':
- case 'n':
- default:
- *seq = 4;
- }
-
- seq++;
- n--;
- }
-}
-
-
-sw_t* fastq_alloc_sw(const unsigned char* subject, int size)
-{
- sw_t* sw = malloc_or_die(sizeof(sw_t));
-
- sw->subject = malloc_or_die(size);
- memcpy(sw->subject, subject, size);
-
- /* default cost matrix */
- memcpy(sw->d, sw_default_d, 25 * sizeof(int));
-
- /* default gap costs */
- sw->gap_open = -4;
- sw->gap_extend = -3;
-
- sw->work0 = malloc_or_die(size * sizeof(int));
- sw->work1 = malloc_or_die(size * sizeof(int));
- sw->size = size;
-
- return sw;
-}
-
-
-void fastq_free_sw(sw_t* sw)
-{
- free(sw->subject);
- free(sw->work0);
- free(sw->work1);
- free(sw);
-}
-
-
-
-int fastq_sw(sw_t* sw, const unsigned char* x, int n)
-{
- /* conveniance */
- int* maxstu = sw->work0;
- int* t = sw->work1;
- int m = sw->size;
- const int* d = sw->d;
- int gap_op = sw->gap_open;
- int gap_ex = sw->gap_extend;
- unsigned char* y = sw->subject;
-
- int i, j;
-
- int score = 0;
-
- /* zero maxstu row */
- memset(maxstu, 0, m * sizeof(int));
-
- /* initialize t row to a negative value to prohibit
- * leading with gap extensions */
- for (j = 0; j < m; j++) t[j] = -1;
-
- int u, s;
- int maxstu0;
-
- for (i = 0; i < n; i++) {
-
- /* special case for column 0 */
- t[0] = imax(maxstu[0] + gap_op, t[0] + gap_ex);
- u = gap_op;
- maxstu[0] = imax4(0, d[5 * y[0] + x[0]], t[0], u);
- maxstu0 = 0;
- score = imax(score, maxstu[0]);
-
-
- for (j = 1; j < m; j++) {
- t[j] = imax(maxstu[j] + gap_op, t[j] + gap_ex);
- u = imax(maxstu[j-1] + gap_op, u + gap_ex);
- s = maxstu0 + d[5 * y[j] + x[i]];
- maxstu0 = maxstu[j];
-
- maxstu[j] = imax4(0, s, t[j], u);
- score = imax(score, maxstu[j]);
- }
- }
-
- return score;
-}
-
-
-
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- * fastq-sw :
- * Local alignments of nucleotide sequences via Smith-Waterman.
- *
- */
-
-#ifndef FASTQ_TOOLS_SW_H
-#define FASTQ_TOOLS_SW_H
-
-
-typedef struct
-{
- unsigned char* subject;
- int size;
-
- int d[25]; /* cost matrix */
- int gap_open; /* gap open */
- int gap_extend; /* gap extend */
-
- /* matrix rows, used internally */
- int* work0;
- int* work1;
-
-} sw_t;
-
-/* convert a n ASCII nucleotide sequence to one suitable for fastq_sw */
-void fastq_sw_conv_seq(unsigned char*, int n);
-
-sw_t* fastq_alloc_sw(const unsigned char *subject, int size);
-void fastq_free_sw(sw_t*);
-
-int fastq_sw(sw_t*, const unsigned char* query, int size);
-
-#endif
-
--- /dev/null
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-uniq :
+ * Collapsing a fastq file into only unique read sequences.
+ */
+
+
+#include "common.h"
+#include "hash.h"
+#include "parse.h"
+#include <string.h>
+#include <zlib.h>
+#include <getopt.h>
+
+
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+# include <fcntl.h>
+# include <io.h>
+# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+# define SET_BINARY_MODE(file)
+#endif
+
+
+static int help_flag;
+static int verbose_flag;
+size_t total_reads;
+
+void print_help()
+{
+ fprintf(stderr,
+"fastq-uniq [OPTION] [FILE]...\n"
+"Output a non-redundant FASTQ file, in which there are no duplicate reads.\n"
+"(Warning: this program can be somewhat memory intensive.)\n\n"
+"Options:\n"
+" -h, --help print this message\n"
+" -v, --verbose print status along the way\n"
+ );
+}
+
+
+void fastq_hash(FILE* fin, hash_table* T)
+{
+ fastq_t* fqf = fastq_open(fin);
+ seq_t* seq = fastq_alloc_seq();
+
+ while (fastq_next(fqf, seq)) {
+ inc_hash_table(T, seq->seq.s, seq->seq.n);
+
+ total_reads++;
+ if (verbose_flag && total_reads % 100000 == 0) {
+ fprintf(stderr, "%zu reads processed ...\n", total_reads);
+ }
+ }
+
+ fastq_free_seq(seq);
+ fastq_close(fqf);
+}
+
+
+int compare_hashed_value_count(const void* x, const void* y)
+{
+ hashed_value* const * a = x;
+ hashed_value* const * b = y;
+
+ if( (*a)->count > (*b)->count ) return -1;
+ if( (*a)->count < (*b)->count ) return 1;
+ return 0;
+}
+
+
+
+void print_hash_table(FILE* fout, hash_table* T)
+{
+ hashed_value** S = dump_hash_table(T);
+ qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count);
+
+ size_t i;
+ for (i = 0; i < T->m; i++) {
+ fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count);
+ fwrite(S[i]->value, S[i]->len, sizeof(char), fout);
+ fprintf(fout, "\n");
+ }
+ free(S);
+}
+
+
+
+int main(int argc, char* argv[])
+{
+ SET_BINARY_MODE(stdin);
+ SET_BINARY_MODE(stdout);
+
+ hash_table* T = create_hash_table();
+
+ FILE* fin ;
+
+ help_flag = 0;
+
+ int opt;
+ int opt_idx;
+
+ static struct option long_options[] =
+ {
+ {"help", no_argument, &help_flag, 1},
+ {"verbose", no_argument, &verbose_flag, 1},
+ {0, 0, 0, 0}
+ };
+
+ while (1) {
+ opt = getopt_long(argc, argv, "hv", long_options, &opt_idx);
+
+ if (opt == -1) break;
+
+ switch (opt) {
+ case 0:
+ if (long_options[opt_idx].flag != 0) break;
+ if (optarg) {
+ }
+ break;
+
+ case 'h':
+ help_flag = 1;
+ break;
+
+ case 'v':
+ verbose_flag = 1;
+ break;
+
+ case '?':
+ return 1;
+
+ default:
+ abort();
+ }
+ }
+
+ if (help_flag) {
+ print_help();
+ return 0;
+ }
+
+ if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
+ fastq_hash(stdin, T);
+ }
+ else {
+ for (; optind < argc; optind++) {
+ fin = fopen(argv[optind], "rb");
+ if (fin == NULL) {
+ fprintf(stderr, "No such file '%s'.\n", argv[optind]);
+ continue;
+ }
+
+ fastq_hash(fin, T);
+ }
+ }
+
+ print_hash_table(stdout, T);
+
+ destroy_hash_table(T);
+ return 0;
+}
--- /dev/null
+
+#include "hash.h"
+#include "common.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+static const size_t INITIAL_TABLE_SIZE = 128;
+static const double MAX_LOAD = 0.75;
+
+
+/*
+ * Paul Hsieh's SuperFastHash
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+static uint32_t hash(const char * data, size_t len) {
+ uint32_t hash = len, tmp;
+ int rem;
+
+ if (len <= 0 || data == NULL) return 0;
+
+ rem = len & 3;
+ len >>= 2;
+
+ /* Main loop */
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (uint16_t);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases */
+ switch (rem) {
+ case 3: hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= data[sizeof (uint16_t)] << 18;
+ hash += hash >> 11;
+ break;
+ case 2: hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1: hash += *data;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n);
+static void clear_hash_table(hash_table*);
+
+
+
+hash_table* create_hash_table()
+{
+ hash_table* T = malloc_or_die(sizeof(hash_table));
+ T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+ memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+ T->n = INITIAL_TABLE_SIZE;
+ T->m = 0;
+ T->max_m = T->n * MAX_LOAD;
+
+ return T;
+}
+
+
+void destroy_hash_table(hash_table* T)
+{
+ if (T != NULL) {
+ clear_hash_table(T);
+ free(T->A);
+ free(T);
+ }
+}
+
+
+
+void clear_hash_table(hash_table* T)
+{
+ hashed_value* u;
+ size_t i;
+ for (i = 0; i < T->n; i++) {
+ while (T->A[i]) {
+ u = T->A[i]->next;
+ free(T->A[i]->value);
+ free(T->A[i]);
+ T->A[i] = u;
+ }
+ }
+
+ T->m = 0;
+}
+
+
+static void insert_without_copy(hash_table* T, hashed_value* V)
+{
+ uint32_t h = hash(V->value, V->len) % T->n;
+ V->next = T->A[h];
+ T->A[h] = V;
+ T->m++;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n)
+{
+ hash_table U;
+ U.n = new_n;
+ U.m = 0;
+ U.max_m = U.n * MAX_LOAD;
+ U.A = malloc_or_die(U.n * sizeof(hashed_value*));
+ memset(U.A, 0, U.n * sizeof(hashed_value*));
+
+ hashed_value *j, *k;
+ size_t i;
+ for (i = 0; i < T->n; i++) {
+ j = T->A[i];
+ while (j) {
+ k = j->next;
+ insert_without_copy(&U, j);
+ j = k;
+ }
+ T->A[i] = NULL;
+ }
+
+ free(T->A);
+ T->A = U.A;
+ T->n = U.n;
+ T->max_m = U.max_m;
+}
+
+
+void inc_hash_table(hash_table* T, const char* value, size_t len)
+{
+ if (T->m >= T->max_m) rehash(T, T->n * 2);
+
+ uint32_t h = hash(value, len) % T->n;
+
+ hashed_value* u = T->A[h];
+
+ while (u) {
+ if (u->len == len && memcmp(u->value, value, len) == 0) {
+ u->count++;
+ return;
+ }
+
+ u = u->next;
+ }
+
+ u = malloc_or_die(sizeof(hashed_value));
+ u->value = malloc_or_die(len);
+ memcpy(u->value, value, len);
+ u->len = len;
+ u->count = 1;
+
+ u->next = T->A[h];
+ T->A[h] = u;
+
+ T->m++;
+}
+
+
+
+hashed_value** dump_hash_table(hash_table* T)
+{
+ hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*));
+
+ hashed_value* u;
+ size_t i, j;
+ for (i = 0, j = 0; i < T->n; i++) {
+ u = T->A[i];
+ while (u) {
+ D[j++] = u;
+ u = u->next;
+ }
+ }
+
+ return D;
+}
+
+
+
--- /dev/null
+
+#ifndef FASTQ_TOOLS_HASH_H
+#define FASTQ_TOOLS_HASH_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+
+typedef struct hashed_value_
+{
+ char* value;
+ size_t len;
+ uint32_t count;
+ struct hashed_value_* next;
+} hashed_value;
+
+
+typedef struct
+{
+ hashed_value** A; /* table proper */
+ size_t n; /* table size */
+ size_t m; /* hashed items */
+ size_t max_m; /* max hashed items before rehash */
+} hash_table;
+
+
+hash_table* create_hash_table();
+
+void destroy_hash_table(hash_table*);
+
+void inc_hash_table(hash_table*, const char* value, size_t len);
+
+hashed_value** dump_hash_table(hash_table*);
+
+
+#endif
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+#include "parse.h"
+#include "common.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+
+static const size_t init_str_size = 128;
+static const size_t fastq_buf_size = 4096;
+
+static void fastq_alloc_str(str_t* s)
+{
+ s->s = malloc_or_die(init_str_size);
+ s->s[0] = '\0';
+ s->n = 0;
+ s->size = init_str_size;
+}
+
+
+static void fastq_expand_str(str_t* s)
+{
+ s->size *= 2;
+ realloc_or_die(s->s, s->size);
+}
+
+
+seq_t* fastq_alloc_seq()
+{
+ seq_t* seq = malloc_or_die(sizeof(seq_t));
+ fastq_alloc_str(&seq->id1);
+ fastq_alloc_str(&seq->seq);
+ fastq_alloc_str(&seq->id2);
+ fastq_alloc_str(&seq->qual);
+
+ return seq;
+}
+
+
+void fastq_free_seq(seq_t* seq)
+{
+ free(seq->id1.s);
+ free(seq->seq.s);
+ free(seq->id2.s);
+ free(seq->qual.s);
+ free(seq);
+}
+
+
+typedef enum
+{
+ STATE_EOF,
+ STATE_ERROR,
+ STATE_ID1,
+ STATE_SEQ,
+ STATE_ID2,
+ STATE_QUAL
+
+} fastq_state;
+
+
+fastq_t* fastq_open(FILE* f)
+{
+ fastq_t* fqf = malloc_or_die(sizeof(fastq_t));
+ or_die((int)(fqf->file = gzdopen(fileno(f), "rb")),
+ "Can not open gzip file.");
+
+ fqf->state = STATE_ID1;
+ fqf->buf = malloc_or_die(fastq_buf_size);
+ fqf->buf[0] = '\0';
+ fqf->c = fqf->buf;
+
+ return fqf;
+}
+
+
+void fastq_close(fastq_t* fqf)
+{
+ gzclose(fqf->file);
+ free(fqf->buf);
+ free(fqf);
+}
+
+
+void fastq_refill(fastq_t* f)
+{
+ int errnum;
+ const char* errmsg;
+
+ int n = gzread(f->file, f->buf, fastq_buf_size - 1);
+
+ if (n <= 0) {
+ if (gzeof(f->file)) {
+ f->state = STATE_EOF;
+ n = 0;
+ }
+ else {
+ errmsg = gzerror(f->file, &errnum);
+ fprintf(stderr, "I/O error: %s\n", errmsg);
+ exit(1);
+ }
+ }
+
+ f->buf[n] = '\0';
+ f->c = f->buf;
+}
+
+
+void fastq_get_line(fastq_t* f, str_t* s)
+{
+ int i = 0;
+
+ if (f->state == STATE_EOF) goto fastq_get_line_done;
+
+ while (1) {
+ switch (*f->c) {
+ case '\0':
+ fastq_refill(f);
+ if (f->state == STATE_EOF) goto fastq_get_line_done;
+ break;
+
+ case '\r':
+ f->c++;
+ break;
+
+ case '\n':
+ goto fastq_get_line_done;
+
+ default:
+ while (s->size < i + 2) {
+ fastq_expand_str(s);
+ }
+ if (s) s->s[i++] = *f->c;
+ f->c++;
+ }
+
+ }
+
+fastq_get_line_done:
+ if (s) {
+ s->s[i] = '\0';
+ s->n = i;
+ }
+}
+
+
+
+int fastq_next(fastq_t* f, seq_t* seq)
+{
+ if (f->state == STATE_EOF) return 0;
+
+ while (1) {
+
+ /* read more, if needed */
+ if (*f->c == '\0' ) {
+ fastq_refill(f);
+ if (f->state == STATE_EOF) return 0;
+ continue;
+ }
+
+ /* skip over leading whitespace */
+ else if (isspace(*f->c)) {
+ /* do nothing */
+ }
+
+ /* skip comments */
+ else if (*f->c == ';') {
+ fastq_get_line(f, NULL);
+ if (f->state == STATE_EOF) return 0;
+ }
+
+ /* read id1 */
+ else if (f->state == STATE_ID1) {
+ if (*f->c == '@') {
+ f->c++;
+ fastq_get_line(f, &seq->id1);
+ if (f->state == STATE_EOF) return 0;
+
+ f->state = STATE_SEQ;
+ }
+ else {
+ fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c);
+ exit(1);
+ }
+ }
+
+ /* read sequence */
+ else if (f->state == STATE_SEQ) {
+ fastq_get_line(f, &seq->seq);
+ if (f->state == STATE_EOF) return 0;
+
+ f->state = STATE_ID2;
+ }
+
+ /* read id2 */
+ else if (f->state == STATE_ID2) {
+ if (*f->c == '+') {
+ f->c++;
+ fastq_get_line(f, &seq->id2);
+ if (f->state == STATE_EOF) return 0;
+
+ f->state = STATE_QUAL;
+ }
+ else {
+ fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c);
+ exit(1);
+ }
+ }
+
+ /* read quality string */
+ else if (f->state == STATE_QUAL) {
+ fastq_get_line(f, &seq->qual);
+ if (f->state == STATE_EOF) return 1;
+
+ f->state = STATE_ID1;
+ break;
+ }
+
+ else {
+ fputs("Inexplicable error in fastq parser.\n", stderr);
+ exit(1);
+ }
+
+ f->c++;
+ }
+
+ return 1;
+}
+
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-parse :
+ * A parser for FASTQ files.
+ *
+ * This parser is mostly derivative of Heng Li's.
+ * See: http://lh3lh3.users.sourceforge.net/kseq.shtml
+ *
+ */
+
+#ifndef FASTQ_TOOLS_PARSE_H
+#define FASTQ_TOOLS_PARSE_H
+
+#include <stdio.h>
+#include <zlib.h>
+
+
+typedef struct
+{
+ char* s; /* null-terminated string */
+ size_t n; /* length of s */
+ size_t size; /* bytes allocated for s */
+} str_t;
+
+
+
+typedef struct
+{
+ str_t id1;
+ str_t seq;
+ str_t id2;
+ str_t qual;
+} seq_t;
+
+
+seq_t* fastq_alloc_seq();
+void fastq_free_seq(seq_t*);
+
+
+typedef struct
+{
+ gzFile file;
+ int state;
+ char* buf;
+ char* c;
+} fastq_t;
+
+
+fastq_t* fastq_open(FILE*);
+void fastq_close(fastq_t*);
+int fastq_next(fastq_t*, seq_t*);
+
+#endif
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * This is a very simple, 'fast enough' implementation of the Smith-Waterman
+ * algorithm specifically for short nucleotide sequences, working in O(mn) time
+ * and O(m) space, implemented according to the original Gotoh paper and
+ * Phil Green's implementation in cross_match.
+ *
+ * There is no fancy bit packing or vectorization, but such features would offer
+ * diminishing returns when aligning short sequences such as high throughput
+ * sequencing data. For example the Farrar SSE2 algorithm might be a tiny bit
+ * faster, but would diminish portability.
+ *
+ */
+
+
+#include "sw.h"
+#include "common.h"
+#include <string.h>
+#include <stdlib.h>
+
+
+static const int sw_default_d[25] =
+ /* A C G T N */
+ { 1, -2, -2, -2 , 0,
+ -2, 1, -2, -2, 0,
+ -2, -2, 1, -2, 0,
+ -2, -2, -2, 1, 0,
+ 0, 0, 0, 0, 0 };
+
+static const int sw_mat50_d[25] =
+ { 2, -2, 0, -2 , 0,
+ -2, 2, -2, 0, 0,
+ 0, -2, 2, -2, 0,
+ -2, 0, -2, 2, 0,
+ 0, 0, 0, 0, 0 };
+
+static const int sw_mat70_d[25] =
+ { 2, -2, -1, -2 , 0,
+ -2, 2, -2, -1, 0,
+ -1, -2, 2, -2, 0,
+ -2, -1, -2, 2, 0,
+ 0, 0, 0, 0, 0 };
+
+
+static inline int imax(int a, int b)
+{
+ return a > b ? a : b;
+}
+
+static inline int imax4(int a, int b, int c, int d)
+{
+ return imax(imax(a,b), imax(c,d));
+}
+
+
+
+void fastq_sw_conv_seq(unsigned char* seq, int n)
+{
+ while (*seq && n) {
+ switch (*seq) {
+ case 'A' :
+ case 'a':
+ case 'U':
+ case 'u':
+ *seq = 0;
+ break;
+
+ case 'C':
+ case 'c':
+ *seq = 1;
+ break;
+
+ case 'G':
+ case 'g':
+ *seq = 2;
+ break;
+
+ case 'T':
+ case 't':
+ *seq = 3;
+ break;
+
+ case 'N':
+ case 'n':
+ default:
+ *seq = 4;
+ }
+
+ seq++;
+ n--;
+ }
+}
+
+
+sw_t* fastq_alloc_sw(const unsigned char* subject, int size)
+{
+ sw_t* sw = malloc_or_die(sizeof(sw_t));
+
+ sw->subject = malloc_or_die(size);
+ memcpy(sw->subject, subject, size);
+
+ /* default cost matrix */
+ memcpy(sw->d, sw_default_d, 25 * sizeof(int));
+
+ /* default gap costs */
+ sw->gap_open = -4;
+ sw->gap_extend = -3;
+
+ sw->work0 = malloc_or_die(size * sizeof(int));
+ sw->work1 = malloc_or_die(size * sizeof(int));
+ sw->size = size;
+
+ return sw;
+}
+
+
+void fastq_free_sw(sw_t* sw)
+{
+ free(sw->subject);
+ free(sw->work0);
+ free(sw->work1);
+ free(sw);
+}
+
+
+
+int fastq_sw(sw_t* sw, const unsigned char* x, int n)
+{
+ /* conveniance */
+ int* maxstu = sw->work0;
+ int* t = sw->work1;
+ int m = sw->size;
+ const int* d = sw->d;
+ int gap_op = sw->gap_open;
+ int gap_ex = sw->gap_extend;
+ unsigned char* y = sw->subject;
+
+ int i, j;
+
+ int score = 0;
+
+ /* zero maxstu row */
+ memset(maxstu, 0, m * sizeof(int));
+
+ /* initialize t row to a negative value to prohibit
+ * leading with gap extensions */
+ for (j = 0; j < m; j++) t[j] = -1;
+
+ int u, s;
+ int maxstu0;
+
+ for (i = 0; i < n; i++) {
+
+ /* special case for column 0 */
+ t[0] = imax(maxstu[0] + gap_op, t[0] + gap_ex);
+ u = gap_op;
+ maxstu[0] = imax4(0, d[5 * y[0] + x[0]], t[0], u);
+ maxstu0 = 0;
+ score = imax(score, maxstu[0]);
+
+
+ for (j = 1; j < m; j++) {
+ t[j] = imax(maxstu[j] + gap_op, t[j] + gap_ex);
+ u = imax(maxstu[j-1] + gap_op, u + gap_ex);
+ s = maxstu0 + d[5 * y[j] + x[i]];
+ maxstu0 = maxstu[j];
+
+ maxstu[j] = imax4(0, s, t[j], u);
+ score = imax(score, maxstu[j]);
+ }
+ }
+
+ return score;
+}
+
+
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-sw :
+ * Local alignments of nucleotide sequences via Smith-Waterman.
+ *
+ */
+
+#ifndef FASTQ_TOOLS_SW_H
+#define FASTQ_TOOLS_SW_H
+
+
+typedef struct
+{
+ unsigned char* subject;
+ int size;
+
+ int d[25]; /* cost matrix */
+ int gap_open; /* gap open */
+ int gap_extend; /* gap extend */
+
+ /* matrix rows, used internally */
+ int* work0;
+ int* work1;
+
+} sw_t;
+
+/* convert a n ASCII nucleotide sequence to one suitable for fastq_sw */
+void fastq_sw_conv_seq(unsigned char*, int n);
+
+sw_t* fastq_alloc_sw(const unsigned char *subject, int size);
+void fastq_free_sw(sw_t*);
+
+int fastq_sw(sw_t*, const unsigned char* query, int size);
+
+#endif
+