From: Daniel Jones Date: Sun, 9 Dec 2012 03:41:17 +0000 (-0800) Subject: Rename hash to hash_table. X-Git-Url: https://git.donarmstrong.com/?p=fastq-tools.git;a=commitdiff_plain;h=42c2a6b10b6313b7f87189f1467996fc3cf18e24 Rename hash to hash_table. --- diff --git a/TODO b/TODO index 4afc59a..7b06a07 100644 --- a/TODO +++ b/TODO @@ -1,35 +1,16 @@ -general -======= +Rename hash.h/c to hash_table.h/c. -Man pages! +Seperate murmurhash into it's own file: hash.h/hash.c. +Document fastq-sort. -I would like to encorporate some domain-specific compression algorithm. Options -I know of: - * G-SQZ (http://www.tgen.org/research/gsqueez.cfm) : not open source (i.e. not an option) - * fqzcomp (http://seqanswers.com/forums/archive/index.php/t-6349.html) - * DSRC (http://bioinformatics.oxfordjournals.org/cgi/content/short/27/6/860?rss=1) : - Not yet available? - * Invent my own. +More comparison functions for fastq-sort. +Link against quip so we can run these tools on quip/fastq/bam/sam with one +interface. - - - - -program specific -================ - - -fastq-grep ----------- - - - -fastq-kmers ------------ - +Make a website for fastq-tools. fastq-match @@ -43,9 +24,3 @@ might look to Phil Green's cross_match implementation for ideas. An option to return the sequences that match past a certain threshold would be useful. - - -fastq-uniq ----------- - - diff --git a/configure.ac b/configure.ac index 93e89b0..f1e45d9 100644 --- a/configure.ac +++ b/configure.ac @@ -16,8 +16,8 @@ m4_ifdef([AC_TYPE_UINT64_T], [AC_TYPE_UINT64_T]) m4_ifdef([AC_TYPE_SIZE_T], [AC_TYPE_SIZE_T]) m4_ifdef([AC_HEADER_STDBOOL], [AC_HEADER_STDBOOL]) -opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3" -dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0" +opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3 -D_C99_SOURCE" +dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0 -D_C99_SOURCE" AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], diff --git a/src/Makefile.am b/src/Makefile.am index 1fc7d3a..e3cbee3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -4,7 +4,7 @@ bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sa fastq_common_src=common.h common.c fastq_parse_src=parse.h parse.c fastq_sw_src=sw.h sw.c -fastq_hash_src=hash.h hash.c +fastq_hash_table_src=hash_table.h hash_table.c fastq_rng_src=rng.h rng.c fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src) @@ -14,7 +14,7 @@ fastq_kmers_SOURCES = fastq-kmers.c $(fastq_common_src) $(fastq_parse_src) fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src) -fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src) +fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_table_src) fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) diff --git a/src/fastq-sort.c b/src/fastq-sort.c index ff77441..2876b9f 100644 --- a/src/fastq-sort.c +++ b/src/fastq-sort.c @@ -3,8 +3,8 @@ * * Copyright (c) 2012 by Daniel C. Jones * - * fastq-sample : - * Sample reads with or without replacement from a FASTQ file. + * fastq-sort: + * Sort fastq files efficiently. * */ diff --git a/src/fastq-uniq.c b/src/fastq-uniq.c index 3a92565..f77a10c 100644 --- a/src/fastq-uniq.c +++ b/src/fastq-uniq.c @@ -10,7 +10,7 @@ #include "common.h" -#include "hash.h" +#include "hash_table.h" #include "parse.h" #include #include diff --git a/src/hash.c b/src/hash.c deleted file mode 100644 index 25202a4..0000000 --- a/src/hash.c +++ /dev/null @@ -1,218 +0,0 @@ - -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - */ - - -#include "hash.h" -#include "common.h" -#include -#include -#include - - -static const size_t INITIAL_TABLE_SIZE = 128; -static const double MAX_LOAD = 0.75; - - -/* - * Paul Hsieh's SuperFastHash - * http://www.azillionmonkeys.com/qed/hash.html - */ - - -#undef get16bits -#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ - || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) -#define get16bits(d) (*((const uint16_t *) (d))) -#endif - -#if !defined (get16bits) -#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ - +(uint32_t)(((const uint8_t *)(d))[0]) ) -#endif - -static uint32_t hash(const char * data, size_t len) { - uint32_t hash = len, tmp; - int rem; - - if (len <= 0 || data == NULL) return 0; - - rem = len & 3; - len >>= 2; - - /* Main loop */ - for (;len > 0; len--) { - hash += get16bits (data); - tmp = (get16bits (data+2) << 11) ^ hash; - hash = (hash << 16) ^ tmp; - data += 2*sizeof (uint16_t); - hash += hash >> 11; - } - - /* Handle end cases */ - switch (rem) { - case 3: hash += get16bits (data); - hash ^= hash << 16; - hash ^= data[sizeof (uint16_t)] << 18; - hash += hash >> 11; - break; - case 2: hash += get16bits (data); - hash ^= hash << 11; - hash += hash >> 17; - break; - case 1: hash += *data; - hash ^= hash << 10; - hash += hash >> 1; - } - - /* Force "avalanching" of final 127 bits */ - hash ^= hash << 3; - hash += hash >> 5; - hash ^= hash << 4; - hash += hash >> 17; - hash ^= hash << 25; - hash += hash >> 6; - - return hash; -} - - - -static void rehash(hash_table* T, size_t new_n); -static void clear_hash_table(hash_table*); - - - -hash_table* create_hash_table() -{ - hash_table* T = malloc_or_die(sizeof(hash_table)); - T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*)); - memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*)); - T->n = INITIAL_TABLE_SIZE; - T->m = 0; - T->max_m = T->n * MAX_LOAD; - - return T; -} - - -void destroy_hash_table(hash_table* T) -{ - if (T != NULL) { - clear_hash_table(T); - free(T->A); - free(T); - } -} - - - -void clear_hash_table(hash_table* T) -{ - hashed_value* u; - size_t i; - for (i = 0; i < T->n; i++) { - while (T->A[i]) { - u = T->A[i]->next; - free(T->A[i]->value); - free(T->A[i]); - T->A[i] = u; - } - } - - T->m = 0; -} - - -static void insert_without_copy(hash_table* T, hashed_value* V) -{ - uint32_t h = hash(V->value, V->len) % T->n; - V->next = T->A[h]; - T->A[h] = V; - T->m++; -} - - - -static void rehash(hash_table* T, size_t new_n) -{ - hash_table U; - U.n = new_n; - U.m = 0; - U.max_m = U.n * MAX_LOAD; - U.A = malloc_or_die(U.n * sizeof(hashed_value*)); - memset(U.A, 0, U.n * sizeof(hashed_value*)); - - hashed_value *j, *k; - size_t i; - for (i = 0; i < T->n; i++) { - j = T->A[i]; - while (j) { - k = j->next; - insert_without_copy(&U, j); - j = k; - } - T->A[i] = NULL; - } - - free(T->A); - T->A = U.A; - T->n = U.n; - T->max_m = U.max_m; -} - - -void inc_hash_table(hash_table* T, const char* value, size_t len) -{ - if (T->m >= T->max_m) rehash(T, T->n * 2); - - uint32_t h = hash(value, len) % T->n; - - hashed_value* u = T->A[h]; - - while (u) { - if (u->len == len && memcmp(u->value, value, len) == 0) { - u->count++; - return; - } - - u = u->next; - } - - u = malloc_or_die(sizeof(hashed_value)); - u->value = malloc_or_die(len); - memcpy(u->value, value, len); - u->len = len; - u->count = 1; - - u->next = T->A[h]; - T->A[h] = u; - - T->m++; -} - - - -hashed_value** dump_hash_table(hash_table* T) -{ - hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*)); - - hashed_value* u; - size_t i, j; - for (i = 0, j = 0; i < T->n; i++) { - u = T->A[i]; - while (u) { - D[j++] = u; - u = u->next; - } - } - - return D; -} - - - diff --git a/src/hash.h b/src/hash.h deleted file mode 100644 index e2386a1..0000000 --- a/src/hash.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * This file is part of fastq-tools. - * - * Copyright (c) 2011 by Daniel C. Jones - * - * hash : - * A quick and simple all-purpose hash table. - * - */ - - -#ifndef FASTQ_TOOLS_HASH_H -#define FASTQ_TOOLS_HASH_H - -#include -#include - - -typedef struct hashed_value_ -{ - char* value; - size_t len; - uint32_t count; - struct hashed_value_* next; -} hashed_value; - - -typedef struct -{ - hashed_value** A; /* table proper */ - size_t n; /* table size */ - size_t m; /* hashed items */ - size_t max_m; /* max hashed items before rehash */ -} hash_table; - - -hash_table* create_hash_table(); - -void destroy_hash_table(hash_table*); - -void inc_hash_table(hash_table*, const char* value, size_t len); - -hashed_value** dump_hash_table(hash_table*); - - -#endif - diff --git a/src/hash_table.c b/src/hash_table.c new file mode 100644 index 0000000..c8878a0 --- /dev/null +++ b/src/hash_table.c @@ -0,0 +1,218 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + */ + + +#include "hash_table.h" +#include "common.h" +#include +#include +#include + + +static const size_t INITIAL_TABLE_SIZE = 128; +static const double MAX_LOAD = 0.75; + + +/* + * Paul Hsieh's SuperFastHash + * http://www.azillionmonkeys.com/qed/hash.html + */ + + +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif + +static uint32_t hash(const char * data, size_t len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) return 0; + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + + + +static void rehash(hash_table* T, size_t new_n); +static void clear_hash_table(hash_table*); + + + +hash_table* create_hash_table() +{ + hash_table* T = malloc_or_die(sizeof(hash_table)); + T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*)); + T->n = INITIAL_TABLE_SIZE; + T->m = 0; + T->max_m = T->n * MAX_LOAD; + + return T; +} + + +void destroy_hash_table(hash_table* T) +{ + if (T != NULL) { + clear_hash_table(T); + free(T->A); + free(T); + } +} + + + +void clear_hash_table(hash_table* T) +{ + hashed_value* u; + size_t i; + for (i = 0; i < T->n; i++) { + while (T->A[i]) { + u = T->A[i]->next; + free(T->A[i]->value); + free(T->A[i]); + T->A[i] = u; + } + } + + T->m = 0; +} + + +static void insert_without_copy(hash_table* T, hashed_value* V) +{ + uint32_t h = hash(V->value, V->len) % T->n; + V->next = T->A[h]; + T->A[h] = V; + T->m++; +} + + + +static void rehash(hash_table* T, size_t new_n) +{ + hash_table U; + U.n = new_n; + U.m = 0; + U.max_m = U.n * MAX_LOAD; + U.A = malloc_or_die(U.n * sizeof(hashed_value*)); + memset(U.A, 0, U.n * sizeof(hashed_value*)); + + hashed_value *j, *k; + size_t i; + for (i = 0; i < T->n; i++) { + j = T->A[i]; + while (j) { + k = j->next; + insert_without_copy(&U, j); + j = k; + } + T->A[i] = NULL; + } + + free(T->A); + T->A = U.A; + T->n = U.n; + T->max_m = U.max_m; +} + + +void inc_hash_table(hash_table* T, const char* value, size_t len) +{ + if (T->m >= T->max_m) rehash(T, T->n * 2); + + uint32_t h = hash(value, len) % T->n; + + hashed_value* u = T->A[h]; + + while (u) { + if (u->len == len && memcmp(u->value, value, len) == 0) { + u->count++; + return; + } + + u = u->next; + } + + u = malloc_or_die(sizeof(hashed_value)); + u->value = malloc_or_die(len); + memcpy(u->value, value, len); + u->len = len; + u->count = 1; + + u->next = T->A[h]; + T->A[h] = u; + + T->m++; +} + + + +hashed_value** dump_hash_table(hash_table* T) +{ + hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*)); + + hashed_value* u; + size_t i, j; + for (i = 0, j = 0; i < T->n; i++) { + u = T->A[i]; + while (u) { + D[j++] = u; + u = u->next; + } + } + + return D; +} + + + diff --git a/src/hash_table.h b/src/hash_table.h new file mode 100644 index 0000000..e2386a1 --- /dev/null +++ b/src/hash_table.h @@ -0,0 +1,47 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * hash : + * A quick and simple all-purpose hash table. + * + */ + + +#ifndef FASTQ_TOOLS_HASH_H +#define FASTQ_TOOLS_HASH_H + +#include +#include + + +typedef struct hashed_value_ +{ + char* value; + size_t len; + uint32_t count; + struct hashed_value_* next; +} hashed_value; + + +typedef struct +{ + hashed_value** A; /* table proper */ + size_t n; /* table size */ + size_t m; /* hashed items */ + size_t max_m; /* max hashed items before rehash */ +} hash_table; + + +hash_table* create_hash_table(); + +void destroy_hash_table(hash_table*); + +void inc_hash_table(hash_table*, const char* value, size_t len); + +hashed_value** dump_hash_table(hash_table*); + + +#endif +