From 40ab4c0cde1bfee1616777995998b0cbc5ffc741 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Thu, 17 Mar 2011 05:53:12 -0700 Subject: [PATCH] a program to output a nonredundant list of readss --- src/Makefile.am | 12 +- src/{fastq-common.c => common.c} | 2 +- src/{fastq-common.h => common.h} | 0 src/fastq-grep.c | 4 +- src/fastq-kmers.c | 4 +- src/fastq-match.c | 10 +- src/fastq-uniq.c | 166 ++++++++++++++++++++++++ src/hash.c | 210 +++++++++++++++++++++++++++++++ src/hash.h | 37 ++++++ src/{fastq-parse.c => parse.c} | 4 +- src/{fastq-parse.h => parse.h} | 0 src/{fastq-sw.c => sw.c} | 4 +- src/{fastq-sw.h => sw.h} | 0 13 files changed, 435 insertions(+), 18 deletions(-) rename src/{fastq-common.c => common.c} (97%) rename src/{fastq-common.h => common.h} (100%) create mode 100644 src/fastq-uniq.c create mode 100644 src/hash.c create mode 100644 src/hash.h rename src/{fastq-parse.c => parse.c} (98%) rename src/{fastq-parse.h => parse.h} (100%) rename src/{fastq-sw.c => sw.c} (98%) rename src/{fastq-sw.h => sw.h} (100%) diff --git a/src/Makefile.am b/src/Makefile.am index e28add9..989ae69 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,9 +1,10 @@ -bin_PROGRAMS = fastq-grep fastq-kmers fastq-match +bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq -fastq_common_src=fastq-common.h fastq-common.c -fastq_parse_src=fastq-parse.h fastq-parse.c -fastq_sw_src=fastq-sw.h fastq-sw.c +fastq_common_src=common.h common.c +fastq_parse_src=parse.h parse.c +fastq_sw_src=sw.h sw.c +fastq_hash_src=hash.h hash.c fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src) fastq_grep_LDADD = $(PCRE_LIBS) -lz @@ -14,4 +15,7 @@ fastq_kmers_LDADD = -lz fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src) fastq_match_LDADD = -lz +fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src) +fastq_uniq_LDADD = -lz + diff --git a/src/fastq-common.c b/src/common.c similarity index 97% rename from src/fastq-common.c rename to src/common.c index ae1ecac..4598819 100644 --- a/src/fastq-common.c +++ b/src/common.c @@ -7,7 +7,7 @@ */ -#include "fastq-common.h" +#include "common.h" #include diff --git a/src/fastq-common.h b/src/common.h similarity index 100% rename from src/fastq-common.h rename to src/common.h diff --git a/src/fastq-grep.c b/src/fastq-grep.c index f7b3913..67c9190 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -10,8 +10,8 @@ */ -#include "fastq-common.h" -#include "fastq-parse.h" +#include "common.h" +#include "parse.h" #include #include #include diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c index d78a453..57c75c2 100644 --- a/src/fastq-kmers.c +++ b/src/fastq-kmers.c @@ -9,8 +9,8 @@ * */ -#include "fastq-common.h" -#include "fastq-parse.h" +#include "common.h" +#include "parse.h" #include #include #include diff --git a/src/fastq-match.c b/src/fastq-match.c index 7b55f07..09da29b 100644 --- a/src/fastq-match.c +++ b/src/fastq-match.c @@ -10,9 +10,9 @@ */ -#include "fastq-common.h" -#include "fastq-parse.h" -#include "fastq-sw.h" +#include "common.h" +#include "parse.h" +#include "sw.h" #include #include #include @@ -32,7 +32,7 @@ static int help_flag; void print_help() { - fprintf( stderr, + fprintf(stderr, "fastq-match [OPTION]... QUERY [FILE]...\n" "Perform Smith-Waterman local alignment of a query sequence\n" "against each sequence in a fastq file.\n\n" @@ -97,7 +97,7 @@ int main(int argc, char* argv[]) while (1) { opt = getopt_long(argc, argv, "h", long_options, &opt_idx); - if( opt == -1 ) break; + if (opt == -1) break; switch (opt) { case 0: diff --git a/src/fastq-uniq.c b/src/fastq-uniq.c new file mode 100644 index 0000000..fddb52a --- /dev/null +++ b/src/fastq-uniq.c @@ -0,0 +1,166 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-uniq : + * Collapsing a fastq file into only unique read sequences. + */ + + +#include "common.h" +#include "hash.h" +#include "parse.h" +#include +#include +#include + + +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) +# include +# include +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +#else +# define SET_BINARY_MODE(file) +#endif + + +static int help_flag; +static int verbose_flag; +size_t total_reads; + +void print_help() +{ + fprintf(stderr, +"fastq-uniq [OPTION] [FILE]...\n" +"Output a non-redundant FASTQ file, in which there are no duplicate reads.\n" +"(Warning: this program can be somewhat memory intensive.)\n\n" +"Options:\n" +" -h, --help print this message\n" +" -v, --verbose print status along the way\n" + ); +} + + +void fastq_hash(FILE* fin, hash_table* T) +{ + fastq_t* fqf = fastq_open(fin); + seq_t* seq = fastq_alloc_seq(); + + while (fastq_next(fqf, seq)) { + inc_hash_table(T, seq->seq.s, seq->seq.n); + + total_reads++; + if (verbose_flag && total_reads % 100000 == 0) { + fprintf(stderr, "%zu reads processed ...\n", total_reads); + } + } + + fastq_free_seq(seq); + fastq_close(fqf); +} + + +int compare_hashed_value_count(const void* x, const void* y) +{ + hashed_value* const * a = x; + hashed_value* const * b = y; + + if( (*a)->count > (*b)->count ) return -1; + if( (*a)->count < (*b)->count ) return 1; + return 0; +} + + + +void print_hash_table(FILE* fout, hash_table* T) +{ + hashed_value** S = dump_hash_table(T); + qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count); + + size_t i; + for (i = 0; i < T->m; i++) { + fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count); + fwrite(S[i]->value, S[i]->len, sizeof(char), fout); + fprintf(fout, "\n"); + } + free(S); +} + + + +int main(int argc, char* argv[]) +{ + SET_BINARY_MODE(stdin); + SET_BINARY_MODE(stdout); + + hash_table* T = create_hash_table(); + + FILE* fin ; + + help_flag = 0; + + int opt; + int opt_idx; + + static struct option long_options[] = + { + {"help", no_argument, &help_flag, 1}, + {"verbose", no_argument, &verbose_flag, 1}, + {0, 0, 0, 0} + }; + + while (1) { + opt = getopt_long(argc, argv, "hv", long_options, &opt_idx); + + if (opt == -1) break; + + switch (opt) { + case 0: + if (long_options[opt_idx].flag != 0) break; + if (optarg) { + } + break; + + case 'h': + help_flag = 1; + break; + + case 'v': + verbose_flag = 1; + break; + + case '?': + return 1; + + default: + abort(); + } + } + + if (help_flag) { + print_help(); + return 0; + } + + if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { + fastq_hash(stdin, T); + } + else { + for (; optind < argc; optind++) { + fin = fopen(argv[optind], "rb"); + if (fin == NULL) { + fprintf(stderr, "No such file '%s'.\n", argv[optind]); + continue; + } + + fastq_hash(fin, T); + } + } + + print_hash_table(stdout, T); + + destroy_hash_table(T); + return 0; +} diff --git a/src/hash.c b/src/hash.c new file mode 100644 index 0000000..9b1a043 --- /dev/null +++ b/src/hash.c @@ -0,0 +1,210 @@ + +#include "hash.h" +#include "common.h" +#include +#include +#include + + +static const size_t INITIAL_TABLE_SIZE = 128; +static const double MAX_LOAD = 0.75; + + +/* + * Paul Hsieh's SuperFastHash + * http://www.azillionmonkeys.com/qed/hash.html + */ + + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +static uint32_t hash(const char * data, size_t len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + + + +static void rehash(hash_table* T, size_t new_n); +static void clear_hash_table(hash_table*); + + + +hash_table* create_hash_table() +{ + hash_table* T = malloc_or_die(sizeof(hash_table)); + T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + T->n = INITIAL_TABLE_SIZE; + T->m = 0; + T->max_m = T->n * MAX_LOAD; + + return T; +} + + +void destroy_hash_table(hash_table* T) +{ + if (T != NULL) { + clear_hash_table(T); + free(T->A); + free(T); + } +} + + + +void clear_hash_table(hash_table* T) +{ + hashed_value* u; + size_t i; + for (i = 0; i < T->n; i++) { + while (T->A[i]) { + u = T->A[i]->next; + free(T->A[i]->value); + free(T->A[i]); + T->A[i] = u; + } + } + + T->m = 0; +} + + +static void insert_without_copy(hash_table* T, hashed_value* V) +{ + uint32_t h = hash(V->value, V->len) % T->n; + V->next = T->A[h]; + T->A[h] = V; + T->m++; +} + + + +static void rehash(hash_table* T, size_t new_n) +{ + hash_table U; + U.n = new_n; + U.m = 0; + U.max_m = U.n * MAX_LOAD; + U.A = malloc_or_die(U.n * sizeof(hashed_value*)); + memset(U.A, 0, U.n * sizeof(hashed_value*)); + + hashed_value *j, *k; + size_t i; + for (i = 0; i < T->n; i++) { + j = T->A[i]; + while (j) { + k = j->next; + insert_without_copy(&U, j); + j = k; + } + T->A[i] = NULL; + } + + free(T->A); + T->A = U.A; + T->n = U.n; + T->max_m = U.max_m; +} + + +void inc_hash_table(hash_table* T, const char* value, size_t len) +{ + if (T->m >= T->max_m) rehash(T, T->n * 2); + + uint32_t h = hash(value, len) % T->n; + + hashed_value* u = T->A[h]; + + while (u) { + if (u->len == len && memcmp(u->value, value, len) == 0) { + u->count++; + return; + } + + u = u->next; + } + + u = malloc_or_die(sizeof(hashed_value)); + u->value = malloc_or_die(len); + memcpy(u->value, value, len); + u->len = len; + u->count = 1; + + u->next = T->A[h]; + T->A[h] = u; + + T->m++; +} + + + +hashed_value** dump_hash_table(hash_table* T) +{ + hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*)); + + hashed_value* u; + size_t i, j; + for (i = 0, j = 0; i < T->n; i++) { + u = T->A[i]; + while (u) { + D[j++] = u; + u = u->next; + } + } + + return D; +} + + + diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..f101824 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,37 @@ + +#ifndef FASTQ_TOOLS_HASH_H +#define FASTQ_TOOLS_HASH_H + +#include +#include + + +typedef struct hashed_value_ +{ + char* value; + size_t len; + uint32_t count; + struct hashed_value_* next; +} hashed_value; + + +typedef struct +{ + hashed_value** A; /* table proper */ + size_t n; /* table size */ + size_t m; /* hashed items */ + size_t max_m; /* max hashed items before rehash */ +} hash_table; + + +hash_table* create_hash_table(); + +void destroy_hash_table(hash_table*); + +void inc_hash_table(hash_table*, const char* value, size_t len); + +hashed_value** dump_hash_table(hash_table*); + + +#endif + diff --git a/src/fastq-parse.c b/src/parse.c similarity index 98% rename from src/fastq-parse.c rename to src/parse.c index ec5b478..bec5ac3 100644 --- a/src/fastq-parse.c +++ b/src/parse.c @@ -5,8 +5,8 @@ * */ -#include "fastq-parse.h" -#include "fastq-common.h" +#include "parse.h" +#include "common.h" #include #include diff --git a/src/fastq-parse.h b/src/parse.h similarity index 100% rename from src/fastq-parse.h rename to src/parse.h diff --git a/src/fastq-sw.c b/src/sw.c similarity index 98% rename from src/fastq-sw.c rename to src/sw.c index eb26ff5..3db345f 100644 --- a/src/fastq-sw.c +++ b/src/sw.c @@ -16,8 +16,8 @@ */ -#include "fastq-sw.h" -#include "fastq-common.h" +#include "sw.h" +#include "common.h" #include #include diff --git a/src/fastq-sw.h b/src/sw.h similarity index 100% rename from src/fastq-sw.h rename to src/sw.h -- 2.39.5