-general
-=======
+Rename hash.h/c to hash_table.h/c.
-Man pages!
+Seperate murmurhash into it's own file: hash.h/hash.c.
+Document fastq-sort.
-I would like to encorporate some domain-specific compression algorithm. Options
-I know of:
- * G-SQZ (http://www.tgen.org/research/gsqueez.cfm) : not open source (i.e. not an option)
- * fqzcomp (http://seqanswers.com/forums/archive/index.php/t-6349.html)
- * DSRC (http://bioinformatics.oxfordjournals.org/cgi/content/short/27/6/860?rss=1) :
- Not yet available?
- * Invent my own.
+More comparison functions for fastq-sort.
+Link against quip so we can run these tools on quip/fastq/bam/sam with one
+interface.
-
-
-
-
-program specific
-================
-
-
-fastq-grep
-----------
-
-
-
-fastq-kmers
------------
-
+Make a website for fastq-tools.
fastq-match
An option to return the sequences that match past a certain threshold would be
useful.
-
-
-fastq-uniq
-----------
-
-
m4_ifdef([AC_TYPE_SIZE_T], [AC_TYPE_SIZE_T])
m4_ifdef([AC_HEADER_STDBOOL], [AC_HEADER_STDBOOL])
-opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3"
-dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0"
+opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3 -D_C99_SOURCE"
+dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0 -D_C99_SOURCE"
AC_ARG_ENABLE([debug],
[AS_HELP_STRING([--enable-debug],
fastq_common_src=common.h common.c
fastq_parse_src=parse.h parse.c
fastq_sw_src=sw.h sw.c
-fastq_hash_src=hash.h hash.c
+fastq_hash_table_src=hash_table.h hash_table.c
fastq_rng_src=rng.h rng.c
fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src)
-fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
+fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_table_src)
fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src)
*
* Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
*
- * fastq-sample :
- * Sample reads with or without replacement from a FASTQ file.
+ * fastq-sort:
+ * Sort fastq files efficiently.
*
*/
#include "common.h"
-#include "hash.h"
+#include "hash_table.h"
#include "parse.h"
#include <string.h>
#include <inttypes.h>
+++ /dev/null
-
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- */
-
-
-#include "hash.h"
-#include "common.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-
-
-static const size_t INITIAL_TABLE_SIZE = 128;
-static const double MAX_LOAD = 0.75;
-
-
-/*
- * Paul Hsieh's SuperFastHash
- * http://www.azillionmonkeys.com/qed/hash.html
- */
-
-
-#undef get16bits
-#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
- || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
-#define get16bits(d) (*((const uint16_t *) (d)))
-#endif
-
-#if !defined (get16bits)
-#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
- +(uint32_t)(((const uint8_t *)(d))[0]) )
-#endif
-
-static uint32_t hash(const char * data, size_t len) {
- uint32_t hash = len, tmp;
- int rem;
-
- if (len <= 0 || data == NULL) return 0;
-
- rem = len & 3;
- len >>= 2;
-
- /* Main loop */
- for (;len > 0; len--) {
- hash += get16bits (data);
- tmp = (get16bits (data+2) << 11) ^ hash;
- hash = (hash << 16) ^ tmp;
- data += 2*sizeof (uint16_t);
- hash += hash >> 11;
- }
-
- /* Handle end cases */
- switch (rem) {
- case 3: hash += get16bits (data);
- hash ^= hash << 16;
- hash ^= data[sizeof (uint16_t)] << 18;
- hash += hash >> 11;
- break;
- case 2: hash += get16bits (data);
- hash ^= hash << 11;
- hash += hash >> 17;
- break;
- case 1: hash += *data;
- hash ^= hash << 10;
- hash += hash >> 1;
- }
-
- /* Force "avalanching" of final 127 bits */
- hash ^= hash << 3;
- hash += hash >> 5;
- hash ^= hash << 4;
- hash += hash >> 17;
- hash ^= hash << 25;
- hash += hash >> 6;
-
- return hash;
-}
-
-
-
-static void rehash(hash_table* T, size_t new_n);
-static void clear_hash_table(hash_table*);
-
-
-
-hash_table* create_hash_table()
-{
- hash_table* T = malloc_or_die(sizeof(hash_table));
- T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*));
- memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*));
- T->n = INITIAL_TABLE_SIZE;
- T->m = 0;
- T->max_m = T->n * MAX_LOAD;
-
- return T;
-}
-
-
-void destroy_hash_table(hash_table* T)
-{
- if (T != NULL) {
- clear_hash_table(T);
- free(T->A);
- free(T);
- }
-}
-
-
-
-void clear_hash_table(hash_table* T)
-{
- hashed_value* u;
- size_t i;
- for (i = 0; i < T->n; i++) {
- while (T->A[i]) {
- u = T->A[i]->next;
- free(T->A[i]->value);
- free(T->A[i]);
- T->A[i] = u;
- }
- }
-
- T->m = 0;
-}
-
-
-static void insert_without_copy(hash_table* T, hashed_value* V)
-{
- uint32_t h = hash(V->value, V->len) % T->n;
- V->next = T->A[h];
- T->A[h] = V;
- T->m++;
-}
-
-
-
-static void rehash(hash_table* T, size_t new_n)
-{
- hash_table U;
- U.n = new_n;
- U.m = 0;
- U.max_m = U.n * MAX_LOAD;
- U.A = malloc_or_die(U.n * sizeof(hashed_value*));
- memset(U.A, 0, U.n * sizeof(hashed_value*));
-
- hashed_value *j, *k;
- size_t i;
- for (i = 0; i < T->n; i++) {
- j = T->A[i];
- while (j) {
- k = j->next;
- insert_without_copy(&U, j);
- j = k;
- }
- T->A[i] = NULL;
- }
-
- free(T->A);
- T->A = U.A;
- T->n = U.n;
- T->max_m = U.max_m;
-}
-
-
-void inc_hash_table(hash_table* T, const char* value, size_t len)
-{
- if (T->m >= T->max_m) rehash(T, T->n * 2);
-
- uint32_t h = hash(value, len) % T->n;
-
- hashed_value* u = T->A[h];
-
- while (u) {
- if (u->len == len && memcmp(u->value, value, len) == 0) {
- u->count++;
- return;
- }
-
- u = u->next;
- }
-
- u = malloc_or_die(sizeof(hashed_value));
- u->value = malloc_or_die(len);
- memcpy(u->value, value, len);
- u->len = len;
- u->count = 1;
-
- u->next = T->A[h];
- T->A[h] = u;
-
- T->m++;
-}
-
-
-
-hashed_value** dump_hash_table(hash_table* T)
-{
- hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*));
-
- hashed_value* u;
- size_t i, j;
- for (i = 0, j = 0; i < T->n; i++) {
- u = T->A[i];
- while (u) {
- D[j++] = u;
- u = u->next;
- }
- }
-
- return D;
-}
-
-
-
+++ /dev/null
-/*
- * This file is part of fastq-tools.
- *
- * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
- *
- * hash :
- * A quick and simple all-purpose hash table.
- *
- */
-
-
-#ifndef FASTQ_TOOLS_HASH_H
-#define FASTQ_TOOLS_HASH_H
-
-#include <stdlib.h>
-#include <stdint.h>
-
-
-typedef struct hashed_value_
-{
- char* value;
- size_t len;
- uint32_t count;
- struct hashed_value_* next;
-} hashed_value;
-
-
-typedef struct
-{
- hashed_value** A; /* table proper */
- size_t n; /* table size */
- size_t m; /* hashed items */
- size_t max_m; /* max hashed items before rehash */
-} hash_table;
-
-
-hash_table* create_hash_table();
-
-void destroy_hash_table(hash_table*);
-
-void inc_hash_table(hash_table*, const char* value, size_t len);
-
-hashed_value** dump_hash_table(hash_table*);
-
-
-#endif
-
--- /dev/null
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ */
+
+
+#include "hash_table.h"
+#include "common.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+static const size_t INITIAL_TABLE_SIZE = 128;
+static const double MAX_LOAD = 0.75;
+
+
+/*
+ * Paul Hsieh's SuperFastHash
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+ || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+ +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+static uint32_t hash(const char * data, size_t len) {
+ uint32_t hash = len, tmp;
+ int rem;
+
+ if (len <= 0 || data == NULL) return 0;
+
+ rem = len & 3;
+ len >>= 2;
+
+ /* Main loop */
+ for (;len > 0; len--) {
+ hash += get16bits (data);
+ tmp = (get16bits (data+2) << 11) ^ hash;
+ hash = (hash << 16) ^ tmp;
+ data += 2*sizeof (uint16_t);
+ hash += hash >> 11;
+ }
+
+ /* Handle end cases */
+ switch (rem) {
+ case 3: hash += get16bits (data);
+ hash ^= hash << 16;
+ hash ^= data[sizeof (uint16_t)] << 18;
+ hash += hash >> 11;
+ break;
+ case 2: hash += get16bits (data);
+ hash ^= hash << 11;
+ hash += hash >> 17;
+ break;
+ case 1: hash += *data;
+ hash ^= hash << 10;
+ hash += hash >> 1;
+ }
+
+ /* Force "avalanching" of final 127 bits */
+ hash ^= hash << 3;
+ hash += hash >> 5;
+ hash ^= hash << 4;
+ hash += hash >> 17;
+ hash ^= hash << 25;
+ hash += hash >> 6;
+
+ return hash;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n);
+static void clear_hash_table(hash_table*);
+
+
+
+hash_table* create_hash_table()
+{
+ hash_table* T = malloc_or_die(sizeof(hash_table));
+ T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+ memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+ T->n = INITIAL_TABLE_SIZE;
+ T->m = 0;
+ T->max_m = T->n * MAX_LOAD;
+
+ return T;
+}
+
+
+void destroy_hash_table(hash_table* T)
+{
+ if (T != NULL) {
+ clear_hash_table(T);
+ free(T->A);
+ free(T);
+ }
+}
+
+
+
+void clear_hash_table(hash_table* T)
+{
+ hashed_value* u;
+ size_t i;
+ for (i = 0; i < T->n; i++) {
+ while (T->A[i]) {
+ u = T->A[i]->next;
+ free(T->A[i]->value);
+ free(T->A[i]);
+ T->A[i] = u;
+ }
+ }
+
+ T->m = 0;
+}
+
+
+static void insert_without_copy(hash_table* T, hashed_value* V)
+{
+ uint32_t h = hash(V->value, V->len) % T->n;
+ V->next = T->A[h];
+ T->A[h] = V;
+ T->m++;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n)
+{
+ hash_table U;
+ U.n = new_n;
+ U.m = 0;
+ U.max_m = U.n * MAX_LOAD;
+ U.A = malloc_or_die(U.n * sizeof(hashed_value*));
+ memset(U.A, 0, U.n * sizeof(hashed_value*));
+
+ hashed_value *j, *k;
+ size_t i;
+ for (i = 0; i < T->n; i++) {
+ j = T->A[i];
+ while (j) {
+ k = j->next;
+ insert_without_copy(&U, j);
+ j = k;
+ }
+ T->A[i] = NULL;
+ }
+
+ free(T->A);
+ T->A = U.A;
+ T->n = U.n;
+ T->max_m = U.max_m;
+}
+
+
+void inc_hash_table(hash_table* T, const char* value, size_t len)
+{
+ if (T->m >= T->max_m) rehash(T, T->n * 2);
+
+ uint32_t h = hash(value, len) % T->n;
+
+ hashed_value* u = T->A[h];
+
+ while (u) {
+ if (u->len == len && memcmp(u->value, value, len) == 0) {
+ u->count++;
+ return;
+ }
+
+ u = u->next;
+ }
+
+ u = malloc_or_die(sizeof(hashed_value));
+ u->value = malloc_or_die(len);
+ memcpy(u->value, value, len);
+ u->len = len;
+ u->count = 1;
+
+ u->next = T->A[h];
+ T->A[h] = u;
+
+ T->m++;
+}
+
+
+
+hashed_value** dump_hash_table(hash_table* T)
+{
+ hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*));
+
+ hashed_value* u;
+ size_t i, j;
+ for (i = 0, j = 0; i < T->n; i++) {
+ u = T->A[i];
+ while (u) {
+ D[j++] = u;
+ u = u->next;
+ }
+ }
+
+ return D;
+}
+
+
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * hash :
+ * A quick and simple all-purpose hash table.
+ *
+ */
+
+
+#ifndef FASTQ_TOOLS_HASH_H
+#define FASTQ_TOOLS_HASH_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+
+typedef struct hashed_value_
+{
+ char* value;
+ size_t len;
+ uint32_t count;
+ struct hashed_value_* next;
+} hashed_value;
+
+
+typedef struct
+{
+ hashed_value** A; /* table proper */
+ size_t n; /* table size */
+ size_t m; /* hashed items */
+ size_t max_m; /* max hashed items before rehash */
+} hash_table;
+
+
+hash_table* create_hash_table();
+
+void destroy_hash_table(hash_table*);
+
+void inc_hash_table(hash_table*, const char* value, size_t len);
+
+hashed_value** dump_hash_table(hash_table*);
+
+
+#endif
+