From: Daniel Jones Date: Thu, 17 Mar 2011 12:53:12 +0000 (-0700) Subject: a program to output a nonredundant list of readss X-Git-Url: https://git.donarmstrong.com/?p=fastq-tools.git;a=commitdiff_plain;h=40ab4c0cde1bfee1616777995998b0cbc5ffc741 a program to output a nonredundant list of readss --- diff --git a/src/Makefile.am b/src/Makefile.am index e28add9..989ae69 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,9 +1,10 @@ -bin_PROGRAMS = fastq-grep fastq-kmers fastq-match +bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq -fastq_common_src=fastq-common.h fastq-common.c -fastq_parse_src=fastq-parse.h fastq-parse.c -fastq_sw_src=fastq-sw.h fastq-sw.c +fastq_common_src=common.h common.c +fastq_parse_src=parse.h parse.c +fastq_sw_src=sw.h sw.c +fastq_hash_src=hash.h hash.c fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src) fastq_grep_LDADD = $(PCRE_LIBS) -lz @@ -14,4 +15,7 @@ fastq_kmers_LDADD = -lz fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src) fastq_match_LDADD = -lz +fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src) +fastq_uniq_LDADD = -lz + diff --git a/src/common.c b/src/common.c new file mode 100644 index 0000000..4598819 --- /dev/null +++ b/src/common.c @@ -0,0 +1,56 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + + +#include "common.h" +#include + + +void or_die(int b, const char* msg) +{ + if (b == 0) { + fputs(msg, stderr); + exit(1); + } +} + + +void* malloc_or_die(size_t n) +{ + void* p = malloc(n); + if (p == NULL) { + fprintf(stderr, "Can not allocate %zu bytes.\n", n); + exit(1); + } + return p; +} + + +void* realloc_or_die(void* ptr, size_t n) +{ + void* p = realloc(ptr, n); + if (p == NULL) { + fprintf(stderr, "Can not allocate %zu bytes.\n", n); + exit(1); + } + return p; +} + + +FILE* fopen_or_die(const char* path, const char* mode) +{ + FILE* f = fopen(path, mode); + if (f == NULL) { + fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode); + exit(1); + } + return f; +} + + + diff --git a/src/common.h b/src/common.h new file mode 100644 index 0000000..cdc89e8 --- /dev/null +++ b/src/common.h @@ -0,0 +1,23 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * common : + * A few common functions, primarily for crashing whilst retaining our dignity. + * + */ + +#ifndef FASTQ_TOOLS_COMMON_H +#define FASTQ_TOOLS_COMMON_H + +#include + +void or_die(int b, const char* msg); + +void* malloc_or_die(size_t); +void* realloc_or_die(void*, size_t); +FILE* fopen_or_die(const char*, const char*); + +#endif + diff --git a/src/fastq-common.c b/src/fastq-common.c deleted file mode 100644 index ae1ecac..0000000 --- a/src/fastq-common.c +++ /dev/null @@ -1,56 +0,0 @@ - -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - */ - - -#include "fastq-common.h" -#include - - -void or_die(int b, const char* msg) -{ - if (b == 0) { - fputs(msg, stderr); - exit(1); - } -} - - -void* malloc_or_die(size_t n) -{ - void* p = malloc(n); - if (p == NULL) { - fprintf(stderr, "Can not allocate %zu bytes.\n", n); - exit(1); - } - return p; -} - - -void* realloc_or_die(void* ptr, size_t n) -{ - void* p = realloc(ptr, n); - if (p == NULL) { - fprintf(stderr, "Can not allocate %zu bytes.\n", n); - exit(1); - } - return p; -} - - -FILE* fopen_or_die(const char* path, const char* mode) -{ - FILE* f = fopen(path, mode); - if (f == NULL) { - fprintf(stderr, "Can not open file %s with mode %s.\n", path, mode); - exit(1); - } - return f; -} - - - diff --git a/src/fastq-common.h b/src/fastq-common.h deleted file mode 100644 index cdc89e8..0000000 --- a/src/fastq-common.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - * common : - * A few common functions, primarily for crashing whilst retaining our dignity. - * - */ - -#ifndef FASTQ_TOOLS_COMMON_H -#define FASTQ_TOOLS_COMMON_H - -#include - -void or_die(int b, const char* msg); - -void* malloc_or_die(size_t); -void* realloc_or_die(void*, size_t); -FILE* fopen_or_die(const char*, const char*); - -#endif - diff --git a/src/fastq-grep.c b/src/fastq-grep.c index f7b3913..67c9190 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -10,8 +10,8 @@ */ -#include "fastq-common.h" -#include "fastq-parse.h" +#include "common.h" +#include "parse.h" #include #include #include diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c index d78a453..57c75c2 100644 --- a/src/fastq-kmers.c +++ b/src/fastq-kmers.c @@ -9,8 +9,8 @@ * */ -#include "fastq-common.h" -#include "fastq-parse.h" +#include "common.h" +#include "parse.h" #include #include #include diff --git a/src/fastq-match.c b/src/fastq-match.c index 7b55f07..09da29b 100644 --- a/src/fastq-match.c +++ b/src/fastq-match.c @@ -10,9 +10,9 @@ */ -#include "fastq-common.h" -#include "fastq-parse.h" -#include "fastq-sw.h" +#include "common.h" +#include "parse.h" +#include "sw.h" #include #include #include @@ -32,7 +32,7 @@ static int help_flag; void print_help() { - fprintf( stderr, + fprintf(stderr, "fastq-match [OPTION]... QUERY [FILE]...\n" "Perform Smith-Waterman local alignment of a query sequence\n" "against each sequence in a fastq file.\n\n" @@ -97,7 +97,7 @@ int main(int argc, char* argv[]) while (1) { opt = getopt_long(argc, argv, "h", long_options, &opt_idx); - if( opt == -1 ) break; + if (opt == -1) break; switch (opt) { case 0: diff --git a/src/fastq-parse.c b/src/fastq-parse.c deleted file mode 100644 index ec5b478..0000000 --- a/src/fastq-parse.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - */ - -#include "fastq-parse.h" -#include "fastq-common.h" -#include -#include - - -static const size_t init_str_size = 128; -static const size_t fastq_buf_size = 4096; - -static void fastq_alloc_str(str_t* s) -{ - s->s = malloc_or_die(init_str_size); - s->s[0] = '\0'; - s->n = 0; - s->size = init_str_size; -} - - -static void fastq_expand_str(str_t* s) -{ - s->size *= 2; - realloc_or_die(s->s, s->size); -} - - -seq_t* fastq_alloc_seq() -{ - seq_t* seq = malloc_or_die(sizeof(seq_t)); - fastq_alloc_str(&seq->id1); - fastq_alloc_str(&seq->seq); - fastq_alloc_str(&seq->id2); - fastq_alloc_str(&seq->qual); - - return seq; -} - - -void fastq_free_seq(seq_t* seq) -{ - free(seq->id1.s); - free(seq->seq.s); - free(seq->id2.s); - free(seq->qual.s); - free(seq); -} - - -typedef enum -{ - STATE_EOF, - STATE_ERROR, - STATE_ID1, - STATE_SEQ, - STATE_ID2, - STATE_QUAL - -} fastq_state; - - -fastq_t* fastq_open(FILE* f) -{ - fastq_t* fqf = malloc_or_die(sizeof(fastq_t)); - or_die((int)(fqf->file = gzdopen(fileno(f), "rb")), - "Can not open gzip file."); - - fqf->state = STATE_ID1; - fqf->buf = malloc_or_die(fastq_buf_size); - fqf->buf[0] = '\0'; - fqf->c = fqf->buf; - - return fqf; -} - - -void fastq_close(fastq_t* fqf) -{ - gzclose(fqf->file); - free(fqf->buf); - free(fqf); -} - - -void fastq_refill(fastq_t* f) -{ - int errnum; - const char* errmsg; - - int n = gzread(f->file, f->buf, fastq_buf_size - 1); - - if (n <= 0) { - if (gzeof(f->file)) { - f->state = STATE_EOF; - n = 0; - } - else { - errmsg = gzerror(f->file, &errnum); - fprintf(stderr, "I/O error: %s\n", errmsg); - exit(1); - } - } - - f->buf[n] = '\0'; - f->c = f->buf; -} - - -void fastq_get_line(fastq_t* f, str_t* s) -{ - int i = 0; - - if (f->state == STATE_EOF) goto fastq_get_line_done; - - while (1) { - switch (*f->c) { - case '\0': - fastq_refill(f); - if (f->state == STATE_EOF) goto fastq_get_line_done; - break; - - case '\r': - f->c++; - break; - - case '\n': - goto fastq_get_line_done; - - default: - while (s->size < i + 2) { - fastq_expand_str(s); - } - if (s) s->s[i++] = *f->c; - f->c++; - } - - } - -fastq_get_line_done: - if (s) { - s->s[i] = '\0'; - s->n = i; - } -} - - - -int fastq_next(fastq_t* f, seq_t* seq) -{ - if (f->state == STATE_EOF) return 0; - - while (1) { - - /* read more, if needed */ - if (*f->c == '\0' ) { - fastq_refill(f); - if (f->state == STATE_EOF) return 0; - continue; - } - - /* skip over leading whitespace */ - else if (isspace(*f->c)) { - /* do nothing */ - } - - /* skip comments */ - else if (*f->c == ';') { - fastq_get_line(f, NULL); - if (f->state == STATE_EOF) return 0; - } - - /* read id1 */ - else if (f->state == STATE_ID1) { - if (*f->c == '@') { - f->c++; - fastq_get_line(f, &seq->id1); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_SEQ; - } - else { - fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c); - exit(1); - } - } - - /* read sequence */ - else if (f->state == STATE_SEQ) { - fastq_get_line(f, &seq->seq); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_ID2; - } - - /* read id2 */ - else if (f->state == STATE_ID2) { - if (*f->c == '+') { - f->c++; - fastq_get_line(f, &seq->id2); - if (f->state == STATE_EOF) return 0; - - f->state = STATE_QUAL; - } - else { - fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c); - exit(1); - } - } - - /* read quality string */ - else if (f->state == STATE_QUAL) { - fastq_get_line(f, &seq->qual); - if (f->state == STATE_EOF) return 1; - - f->state = STATE_ID1; - break; - } - - else { - fputs("Inexplicable error in fastq parser.\n", stderr); - exit(1); - } - - f->c++; - } - - return 1; -} - - diff --git a/src/fastq-parse.h b/src/fastq-parse.h deleted file mode 100644 index 56a074a..0000000 --- a/src/fastq-parse.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - * fastq-parse : - * A parser for FASTQ files. - * - * This parser is mostly derivative of Heng Li's. - * See: http://lh3lh3.users.sourceforge.net/kseq.shtml - * - */ - -#ifndef FASTQ_TOOLS_PARSE_H -#define FASTQ_TOOLS_PARSE_H - -#include -#include - - -typedef struct -{ - char* s; /* null-terminated string */ - size_t n; /* length of s */ - size_t size; /* bytes allocated for s */ -} str_t; - - - -typedef struct -{ - str_t id1; - str_t seq; - str_t id2; - str_t qual; -} seq_t; - - -seq_t* fastq_alloc_seq(); -void fastq_free_seq(seq_t*); - - -typedef struct -{ - gzFile file; - int state; - char* buf; - char* c; -} fastq_t; - - -fastq_t* fastq_open(FILE*); -void fastq_close(fastq_t*); -int fastq_next(fastq_t*, seq_t*); - -#endif - diff --git a/src/fastq-sw.c b/src/fastq-sw.c deleted file mode 100644 index eb26ff5..0000000 --- a/src/fastq-sw.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - * This is a very simple, 'fast enough' implementation of the Smith-Waterman - * algorithm specifically for short nucleotide sequences, working in O(mn) time - * and O(m) space, implemented according to the original Gotoh paper and - * Phil Green's implementation in cross_match. - * - * There is no fancy bit packing or vectorization, but such features would offer - * diminishing returns when aligning short sequences such as high throughput - * sequencing data. For example the Farrar SSE2 algorithm might be a tiny bit - * faster, but would diminish portability. - * - */ - - -#include "fastq-sw.h" -#include "fastq-common.h" -#include -#include - - -static const int sw_default_d[25] = - /* A C G T N */ - { 1, -2, -2, -2 , 0, - -2, 1, -2, -2, 0, - -2, -2, 1, -2, 0, - -2, -2, -2, 1, 0, - 0, 0, 0, 0, 0 }; - -static const int sw_mat50_d[25] = - { 2, -2, 0, -2 , 0, - -2, 2, -2, 0, 0, - 0, -2, 2, -2, 0, - -2, 0, -2, 2, 0, - 0, 0, 0, 0, 0 }; - -static const int sw_mat70_d[25] = - { 2, -2, -1, -2 , 0, - -2, 2, -2, -1, 0, - -1, -2, 2, -2, 0, - -2, -1, -2, 2, 0, - 0, 0, 0, 0, 0 }; - - -static inline int imax(int a, int b) -{ - return a > b ? a : b; -} - -static inline int imax4(int a, int b, int c, int d) -{ - return imax(imax(a,b), imax(c,d)); -} - - - -void fastq_sw_conv_seq(unsigned char* seq, int n) -{ - while (*seq && n) { - switch (*seq) { - case 'A' : - case 'a': - case 'U': - case 'u': - *seq = 0; - break; - - case 'C': - case 'c': - *seq = 1; - break; - - case 'G': - case 'g': - *seq = 2; - break; - - case 'T': - case 't': - *seq = 3; - break; - - case 'N': - case 'n': - default: - *seq = 4; - } - - seq++; - n--; - } -} - - -sw_t* fastq_alloc_sw(const unsigned char* subject, int size) -{ - sw_t* sw = malloc_or_die(sizeof(sw_t)); - - sw->subject = malloc_or_die(size); - memcpy(sw->subject, subject, size); - - /* default cost matrix */ - memcpy(sw->d, sw_default_d, 25 * sizeof(int)); - - /* default gap costs */ - sw->gap_open = -4; - sw->gap_extend = -3; - - sw->work0 = malloc_or_die(size * sizeof(int)); - sw->work1 = malloc_or_die(size * sizeof(int)); - sw->size = size; - - return sw; -} - - -void fastq_free_sw(sw_t* sw) -{ - free(sw->subject); - free(sw->work0); - free(sw->work1); - free(sw); -} - - - -int fastq_sw(sw_t* sw, const unsigned char* x, int n) -{ - /* conveniance */ - int* maxstu = sw->work0; - int* t = sw->work1; - int m = sw->size; - const int* d = sw->d; - int gap_op = sw->gap_open; - int gap_ex = sw->gap_extend; - unsigned char* y = sw->subject; - - int i, j; - - int score = 0; - - /* zero maxstu row */ - memset(maxstu, 0, m * sizeof(int)); - - /* initialize t row to a negative value to prohibit - * leading with gap extensions */ - for (j = 0; j < m; j++) t[j] = -1; - - int u, s; - int maxstu0; - - for (i = 0; i < n; i++) { - - /* special case for column 0 */ - t[0] = imax(maxstu[0] + gap_op, t[0] + gap_ex); - u = gap_op; - maxstu[0] = imax4(0, d[5 * y[0] + x[0]], t[0], u); - maxstu0 = 0; - score = imax(score, maxstu[0]); - - - for (j = 1; j < m; j++) { - t[j] = imax(maxstu[j] + gap_op, t[j] + gap_ex); - u = imax(maxstu[j-1] + gap_op, u + gap_ex); - s = maxstu0 + d[5 * y[j] + x[i]]; - maxstu0 = maxstu[j]; - - maxstu[j] = imax4(0, s, t[j], u); - score = imax(score, maxstu[j]); - } - } - - return score; -} - - - diff --git a/src/fastq-sw.h b/src/fastq-sw.h deleted file mode 100644 index 6e35a77..0000000 --- a/src/fastq-sw.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - * fastq-sw : - * Local alignments of nucleotide sequences via Smith-Waterman. - * - */ - -#ifndef FASTQ_TOOLS_SW_H -#define FASTQ_TOOLS_SW_H - - -typedef struct -{ - unsigned char* subject; - int size; - - int d[25]; /* cost matrix */ - int gap_open; /* gap open */ - int gap_extend; /* gap extend */ - - /* matrix rows, used internally */ - int* work0; - int* work1; - -} sw_t; - -/* convert a n ASCII nucleotide sequence to one suitable for fastq_sw */ -void fastq_sw_conv_seq(unsigned char*, int n); - -sw_t* fastq_alloc_sw(const unsigned char *subject, int size); -void fastq_free_sw(sw_t*); - -int fastq_sw(sw_t*, const unsigned char* query, int size); - -#endif - diff --git a/src/fastq-uniq.c b/src/fastq-uniq.c new file mode 100644 index 0000000..fddb52a --- /dev/null +++ b/src/fastq-uniq.c @@ -0,0 +1,166 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-uniq : + * Collapsing a fastq file into only unique read sequences. + */ + + +#include "common.h" +#include "hash.h" +#include "parse.h" +#include +#include +#include + + +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) +# include +# include +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +#else +# define SET_BINARY_MODE(file) +#endif + + +static int help_flag; +static int verbose_flag; +size_t total_reads; + +void print_help() +{ + fprintf(stderr, +"fastq-uniq [OPTION] [FILE]...\n" +"Output a non-redundant FASTQ file, in which there are no duplicate reads.\n" +"(Warning: this program can be somewhat memory intensive.)\n\n" +"Options:\n" +" -h, --help print this message\n" +" -v, --verbose print status along the way\n" + ); +} + + +void fastq_hash(FILE* fin, hash_table* T) +{ + fastq_t* fqf = fastq_open(fin); + seq_t* seq = fastq_alloc_seq(); + + while (fastq_next(fqf, seq)) { + inc_hash_table(T, seq->seq.s, seq->seq.n); + + total_reads++; + if (verbose_flag && total_reads % 100000 == 0) { + fprintf(stderr, "%zu reads processed ...\n", total_reads); + } + } + + fastq_free_seq(seq); + fastq_close(fqf); +} + + +int compare_hashed_value_count(const void* x, const void* y) +{ + hashed_value* const * a = x; + hashed_value* const * b = y; + + if( (*a)->count > (*b)->count ) return -1; + if( (*a)->count < (*b)->count ) return 1; + return 0; +} + + + +void print_hash_table(FILE* fout, hash_table* T) +{ + hashed_value** S = dump_hash_table(T); + qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count); + + size_t i; + for (i = 0; i < T->m; i++) { + fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count); + fwrite(S[i]->value, S[i]->len, sizeof(char), fout); + fprintf(fout, "\n"); + } + free(S); +} + + + +int main(int argc, char* argv[]) +{ + SET_BINARY_MODE(stdin); + SET_BINARY_MODE(stdout); + + hash_table* T = create_hash_table(); + + FILE* fin ; + + help_flag = 0; + + int opt; + int opt_idx; + + static struct option long_options[] = + { + {"help", no_argument, &help_flag, 1}, + {"verbose", no_argument, &verbose_flag, 1}, + {0, 0, 0, 0} + }; + + while (1) { + opt = getopt_long(argc, argv, "hv", long_options, &opt_idx); + + if (opt == -1) break; + + switch (opt) { + case 0: + if (long_options[opt_idx].flag != 0) break; + if (optarg) { + } + break; + + case 'h': + help_flag = 1; + break; + + case 'v': + verbose_flag = 1; + break; + + case '?': + return 1; + + default: + abort(); + } + } + + if (help_flag) { + print_help(); + return 0; + } + + if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { + fastq_hash(stdin, T); + } + else { + for (; optind < argc; optind++) { + fin = fopen(argv[optind], "rb"); + if (fin == NULL) { + fprintf(stderr, "No such file '%s'.\n", argv[optind]); + continue; + } + + fastq_hash(fin, T); + } + } + + print_hash_table(stdout, T); + + destroy_hash_table(T); + return 0; +} diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 0000000..9b1a043 --- /dev/null +++ b/src/hash.c @@ -0,0 +1,210 @@ + +#include "hash.h" +#include "common.h" +#include +#include +#include + + +static const size_t INITIAL_TABLE_SIZE = 128; +static const double MAX_LOAD = 0.75; + + +/* + * Paul Hsieh's SuperFastHash + * http://www.azillionmonkeys.com/qed/hash.html + */ + + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +static uint32_t hash(const char * data, size_t len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + + + +static void rehash(hash_table* T, size_t new_n); +static void clear_hash_table(hash_table*); + + + +hash_table* create_hash_table() +{ + hash_table* T = malloc_or_die(sizeof(hash_table)); + T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + T->n = INITIAL_TABLE_SIZE; + T->m = 0; + T->max_m = T->n * MAX_LOAD; + + return T; +} + + +void destroy_hash_table(hash_table* T) +{ + if (T != NULL) { + clear_hash_table(T); + free(T->A); + free(T); + } +} + + + +void clear_hash_table(hash_table* T) +{ + hashed_value* u; + size_t i; + for (i = 0; i < T->n; i++) { + while (T->A[i]) { + u = T->A[i]->next; + free(T->A[i]->value); + free(T->A[i]); + T->A[i] = u; + } + } + + T->m = 0; +} + + +static void insert_without_copy(hash_table* T, hashed_value* V) +{ + uint32_t h = hash(V->value, V->len) % T->n; + V->next = T->A[h]; + T->A[h] = V; + T->m++; +} + + + +static void rehash(hash_table* T, size_t new_n) +{ + hash_table U; + U.n = new_n; + U.m = 0; + U.max_m = U.n * MAX_LOAD; + U.A = malloc_or_die(U.n * sizeof(hashed_value*)); + memset(U.A, 0, U.n * sizeof(hashed_value*)); + + hashed_value *j, *k; + size_t i; + for (i = 0; i < T->n; i++) { + j = T->A[i]; + while (j) { + k = j->next; + insert_without_copy(&U, j); + j = k; + } + T->A[i] = NULL; + } + + free(T->A); + T->A = U.A; + T->n = U.n; + T->max_m = U.max_m; +} + + +void inc_hash_table(hash_table* T, const char* value, size_t len) +{ + if (T->m >= T->max_m) rehash(T, T->n * 2); + + uint32_t h = hash(value, len) % T->n; + + hashed_value* u = T->A[h]; + + while (u) { + if (u->len == len && memcmp(u->value, value, len) == 0) { + u->count++; + return; + } + + u = u->next; + } + + u = malloc_or_die(sizeof(hashed_value)); + u->value = malloc_or_die(len); + memcpy(u->value, value, len); + u->len = len; + u->count = 1; + + u->next = T->A[h]; + T->A[h] = u; + + T->m++; +} + + + +hashed_value** dump_hash_table(hash_table* T) +{ + hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*)); + + hashed_value* u; + size_t i, j; + for (i = 0, j = 0; i < T->n; i++) { + u = T->A[i]; + while (u) { + D[j++] = u; + u = u->next; + } + } + + return D; +} + + + diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..f101824 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,37 @@ + +#ifndef FASTQ_TOOLS_HASH_H +#define FASTQ_TOOLS_HASH_H + +#include +#include + + +typedef struct hashed_value_ +{ + char* value; + size_t len; + uint32_t count; + struct hashed_value_* next; +} hashed_value; + + +typedef struct +{ + hashed_value** A; /* table proper */ + size_t n; /* table size */ + size_t m; /* hashed items */ + size_t max_m; /* max hashed items before rehash */ +} hash_table; + + +hash_table* create_hash_table(); + +void destroy_hash_table(hash_table*); + +void inc_hash_table(hash_table*, const char* value, size_t len); + +hashed_value** dump_hash_table(hash_table*); + + +#endif + diff --git a/src/parse.c b/src/parse.c new file mode 100644 index 0000000..bec5ac3 --- /dev/null +++ b/src/parse.c @@ -0,0 +1,235 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + +#include "parse.h" +#include "common.h" +#include +#include + + +static const size_t init_str_size = 128; +static const size_t fastq_buf_size = 4096; + +static void fastq_alloc_str(str_t* s) +{ + s->s = malloc_or_die(init_str_size); + s->s[0] = '\0'; + s->n = 0; + s->size = init_str_size; +} + + +static void fastq_expand_str(str_t* s) +{ + s->size *= 2; + realloc_or_die(s->s, s->size); +} + + +seq_t* fastq_alloc_seq() +{ + seq_t* seq = malloc_or_die(sizeof(seq_t)); + fastq_alloc_str(&seq->id1); + fastq_alloc_str(&seq->seq); + fastq_alloc_str(&seq->id2); + fastq_alloc_str(&seq->qual); + + return seq; +} + + +void fastq_free_seq(seq_t* seq) +{ + free(seq->id1.s); + free(seq->seq.s); + free(seq->id2.s); + free(seq->qual.s); + free(seq); +} + + +typedef enum +{ + STATE_EOF, + STATE_ERROR, + STATE_ID1, + STATE_SEQ, + STATE_ID2, + STATE_QUAL + +} fastq_state; + + +fastq_t* fastq_open(FILE* f) +{ + fastq_t* fqf = malloc_or_die(sizeof(fastq_t)); + or_die((int)(fqf->file = gzdopen(fileno(f), "rb")), + "Can not open gzip file."); + + fqf->state = STATE_ID1; + fqf->buf = malloc_or_die(fastq_buf_size); + fqf->buf[0] = '\0'; + fqf->c = fqf->buf; + + return fqf; +} + + +void fastq_close(fastq_t* fqf) +{ + gzclose(fqf->file); + free(fqf->buf); + free(fqf); +} + + +void fastq_refill(fastq_t* f) +{ + int errnum; + const char* errmsg; + + int n = gzread(f->file, f->buf, fastq_buf_size - 1); + + if (n <= 0) { + if (gzeof(f->file)) { + f->state = STATE_EOF; + n = 0; + } + else { + errmsg = gzerror(f->file, &errnum); + fprintf(stderr, "I/O error: %s\n", errmsg); + exit(1); + } + } + + f->buf[n] = '\0'; + f->c = f->buf; +} + + +void fastq_get_line(fastq_t* f, str_t* s) +{ + int i = 0; + + if (f->state == STATE_EOF) goto fastq_get_line_done; + + while (1) { + switch (*f->c) { + case '\0': + fastq_refill(f); + if (f->state == STATE_EOF) goto fastq_get_line_done; + break; + + case '\r': + f->c++; + break; + + case '\n': + goto fastq_get_line_done; + + default: + while (s->size < i + 2) { + fastq_expand_str(s); + } + if (s) s->s[i++] = *f->c; + f->c++; + } + + } + +fastq_get_line_done: + if (s) { + s->s[i] = '\0'; + s->n = i; + } +} + + + +int fastq_next(fastq_t* f, seq_t* seq) +{ + if (f->state == STATE_EOF) return 0; + + while (1) { + + /* read more, if needed */ + if (*f->c == '\0' ) { + fastq_refill(f); + if (f->state == STATE_EOF) return 0; + continue; + } + + /* skip over leading whitespace */ + else if (isspace(*f->c)) { + /* do nothing */ + } + + /* skip comments */ + else if (*f->c == ';') { + fastq_get_line(f, NULL); + if (f->state == STATE_EOF) return 0; + } + + /* read id1 */ + else if (f->state == STATE_ID1) { + if (*f->c == '@') { + f->c++; + fastq_get_line(f, &seq->id1); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_SEQ; + } + else { + fprintf(stderr, "Malformed FASTQ file: expecting an '@', saw a '%c'\n", *f->c); + exit(1); + } + } + + /* read sequence */ + else if (f->state == STATE_SEQ) { + fastq_get_line(f, &seq->seq); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_ID2; + } + + /* read id2 */ + else if (f->state == STATE_ID2) { + if (*f->c == '+') { + f->c++; + fastq_get_line(f, &seq->id2); + if (f->state == STATE_EOF) return 0; + + f->state = STATE_QUAL; + } + else { + fprintf(stderr, "Malformed FASTQ file: expecting an '+', saw a '%c'\n", *f->c); + exit(1); + } + } + + /* read quality string */ + else if (f->state == STATE_QUAL) { + fastq_get_line(f, &seq->qual); + if (f->state == STATE_EOF) return 1; + + f->state = STATE_ID1; + break; + } + + else { + fputs("Inexplicable error in fastq parser.\n", stderr); + exit(1); + } + + f->c++; + } + + return 1; +} + + diff --git a/src/parse.h b/src/parse.h new file mode 100644 index 0000000..56a074a --- /dev/null +++ b/src/parse.h @@ -0,0 +1,57 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-parse : + * A parser for FASTQ files. + * + * This parser is mostly derivative of Heng Li's. + * See: http://lh3lh3.users.sourceforge.net/kseq.shtml + * + */ + +#ifndef FASTQ_TOOLS_PARSE_H +#define FASTQ_TOOLS_PARSE_H + +#include +#include + + +typedef struct +{ + char* s; /* null-terminated string */ + size_t n; /* length of s */ + size_t size; /* bytes allocated for s */ +} str_t; + + + +typedef struct +{ + str_t id1; + str_t seq; + str_t id2; + str_t qual; +} seq_t; + + +seq_t* fastq_alloc_seq(); +void fastq_free_seq(seq_t*); + + +typedef struct +{ + gzFile file; + int state; + char* buf; + char* c; +} fastq_t; + + +fastq_t* fastq_open(FILE*); +void fastq_close(fastq_t*); +int fastq_next(fastq_t*, seq_t*); + +#endif + diff --git a/src/sw.c b/src/sw.c new file mode 100644 index 0000000..3db345f --- /dev/null +++ b/src/sw.c @@ -0,0 +1,180 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * This is a very simple, 'fast enough' implementation of the Smith-Waterman + * algorithm specifically for short nucleotide sequences, working in O(mn) time + * and O(m) space, implemented according to the original Gotoh paper and + * Phil Green's implementation in cross_match. + * + * There is no fancy bit packing or vectorization, but such features would offer + * diminishing returns when aligning short sequences such as high throughput + * sequencing data. For example the Farrar SSE2 algorithm might be a tiny bit + * faster, but would diminish portability. + * + */ + + +#include "sw.h" +#include "common.h" +#include +#include + + +static const int sw_default_d[25] = + /* A C G T N */ + { 1, -2, -2, -2 , 0, + -2, 1, -2, -2, 0, + -2, -2, 1, -2, 0, + -2, -2, -2, 1, 0, + 0, 0, 0, 0, 0 }; + +static const int sw_mat50_d[25] = + { 2, -2, 0, -2 , 0, + -2, 2, -2, 0, 0, + 0, -2, 2, -2, 0, + -2, 0, -2, 2, 0, + 0, 0, 0, 0, 0 }; + +static const int sw_mat70_d[25] = + { 2, -2, -1, -2 , 0, + -2, 2, -2, -1, 0, + -1, -2, 2, -2, 0, + -2, -1, -2, 2, 0, + 0, 0, 0, 0, 0 }; + + +static inline int imax(int a, int b) +{ + return a > b ? a : b; +} + +static inline int imax4(int a, int b, int c, int d) +{ + return imax(imax(a,b), imax(c,d)); +} + + + +void fastq_sw_conv_seq(unsigned char* seq, int n) +{ + while (*seq && n) { + switch (*seq) { + case 'A' : + case 'a': + case 'U': + case 'u': + *seq = 0; + break; + + case 'C': + case 'c': + *seq = 1; + break; + + case 'G': + case 'g': + *seq = 2; + break; + + case 'T': + case 't': + *seq = 3; + break; + + case 'N': + case 'n': + default: + *seq = 4; + } + + seq++; + n--; + } +} + + +sw_t* fastq_alloc_sw(const unsigned char* subject, int size) +{ + sw_t* sw = malloc_or_die(sizeof(sw_t)); + + sw->subject = malloc_or_die(size); + memcpy(sw->subject, subject, size); + + /* default cost matrix */ + memcpy(sw->d, sw_default_d, 25 * sizeof(int)); + + /* default gap costs */ + sw->gap_open = -4; + sw->gap_extend = -3; + + sw->work0 = malloc_or_die(size * sizeof(int)); + sw->work1 = malloc_or_die(size * sizeof(int)); + sw->size = size; + + return sw; +} + + +void fastq_free_sw(sw_t* sw) +{ + free(sw->subject); + free(sw->work0); + free(sw->work1); + free(sw); +} + + + +int fastq_sw(sw_t* sw, const unsigned char* x, int n) +{ + /* conveniance */ + int* maxstu = sw->work0; + int* t = sw->work1; + int m = sw->size; + const int* d = sw->d; + int gap_op = sw->gap_open; + int gap_ex = sw->gap_extend; + unsigned char* y = sw->subject; + + int i, j; + + int score = 0; + + /* zero maxstu row */ + memset(maxstu, 0, m * sizeof(int)); + + /* initialize t row to a negative value to prohibit + * leading with gap extensions */ + for (j = 0; j < m; j++) t[j] = -1; + + int u, s; + int maxstu0; + + for (i = 0; i < n; i++) { + + /* special case for column 0 */ + t[0] = imax(maxstu[0] + gap_op, t[0] + gap_ex); + u = gap_op; + maxstu[0] = imax4(0, d[5 * y[0] + x[0]], t[0], u); + maxstu0 = 0; + score = imax(score, maxstu[0]); + + + for (j = 1; j < m; j++) { + t[j] = imax(maxstu[j] + gap_op, t[j] + gap_ex); + u = imax(maxstu[j-1] + gap_op, u + gap_ex); + s = maxstu0 + d[5 * y[j] + x[i]]; + maxstu0 = maxstu[j]; + + maxstu[j] = imax4(0, s, t[j], u); + score = imax(score, maxstu[j]); + } + } + + return score; +} + + + diff --git a/src/sw.h b/src/sw.h new file mode 100644 index 0000000..6e35a77 --- /dev/null +++ b/src/sw.h @@ -0,0 +1,39 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-sw : + * Local alignments of nucleotide sequences via Smith-Waterman. + * + */ + +#ifndef FASTQ_TOOLS_SW_H +#define FASTQ_TOOLS_SW_H + + +typedef struct +{ + unsigned char* subject; + int size; + + int d[25]; /* cost matrix */ + int gap_open; /* gap open */ + int gap_extend; /* gap extend */ + + /* matrix rows, used internally */ + int* work0; + int* work1; + +} sw_t; + +/* convert a n ASCII nucleotide sequence to one suitable for fastq_sw */ +void fastq_sw_conv_seq(unsigned char*, int n); + +sw_t* fastq_alloc_sw(const unsigned char *subject, int size); +void fastq_free_sw(sw_t*); + +int fastq_sw(sw_t*, const unsigned char* query, int size); + +#endif +