From 40ab4c0cde1bfee1616777995998b0cbc5ffc741 Mon Sep 17 00:00:00 2001
From: Daniel Jones <dcjones@cs.washington.edu>
Date: Thu, 17 Mar 2011 05:53:12 -0700
Subject: [PATCH] a program to output a nonredundant list of readss

---
 src/Makefile.am                  |  12 +-
 src/{fastq-common.c => common.c} |   2 +-
 src/{fastq-common.h => common.h} |   0
 src/fastq-grep.c                 |   4 +-
 src/fastq-kmers.c                |   4 +-
 src/fastq-match.c                |  10 +-
 src/fastq-uniq.c                 | 166 ++++++++++++++++++++++++
 src/hash.c                       | 210 +++++++++++++++++++++++++++++++
 src/hash.h                       |  37 ++++++
 src/{fastq-parse.c => parse.c}   |   4 +-
 src/{fastq-parse.h => parse.h}   |   0
 src/{fastq-sw.c => sw.c}         |   4 +-
 src/{fastq-sw.h => sw.h}         |   0
 13 files changed, 435 insertions(+), 18 deletions(-)
 rename src/{fastq-common.c => common.c} (97%)
 rename src/{fastq-common.h => common.h} (100%)
 create mode 100644 src/fastq-uniq.c
 create mode 100644 src/hash.c
 create mode 100644 src/hash.h
 rename src/{fastq-parse.c => parse.c} (98%)
 rename src/{fastq-parse.h => parse.h} (100%)
 rename src/{fastq-sw.c => sw.c} (98%)
 rename src/{fastq-sw.h => sw.h} (100%)

diff --git a/src/Makefile.am b/src/Makefile.am
index e28add9..989ae69 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,9 +1,10 @@
 
-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq
 
-fastq_common_src=fastq-common.h fastq-common.c
-fastq_parse_src=fastq-parse.h fastq-parse.c
-fastq_sw_src=fastq-sw.h fastq-sw.c
+fastq_common_src=common.h common.c
+fastq_parse_src=parse.h parse.c
+fastq_sw_src=sw.h sw.c
+fastq_hash_src=hash.h hash.c
 
 fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
 fastq_grep_LDADD = $(PCRE_LIBS) -lz
@@ -14,4 +15,7 @@ fastq_kmers_LDADD = -lz
 fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src)
 fastq_match_LDADD   = -lz
 
+fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
+fastq_uniq_LDADD   = -lz
+
 
diff --git a/src/fastq-common.c b/src/common.c
similarity index 97%
rename from src/fastq-common.c
rename to src/common.c
index ae1ecac..4598819 100644
--- a/src/fastq-common.c
+++ b/src/common.c
@@ -7,7 +7,7 @@
  */
 
 
-#include "fastq-common.h"
+#include "common.h"
 #include <stdlib.h>
 
 
diff --git a/src/fastq-common.h b/src/common.h
similarity index 100%
rename from src/fastq-common.h
rename to src/common.h
diff --git a/src/fastq-grep.c b/src/fastq-grep.c
index f7b3913..67c9190 100644
--- a/src/fastq-grep.c
+++ b/src/fastq-grep.c
@@ -10,8 +10,8 @@
  */
 
 
-#include "fastq-common.h"
-#include "fastq-parse.h"
+#include "common.h"
+#include "parse.h"
 #include <stdio.h>
 #include <string.h>
 #include <getopt.h>
diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c
index d78a453..57c75c2 100644
--- a/src/fastq-kmers.c
+++ b/src/fastq-kmers.c
@@ -9,8 +9,8 @@
  *
  */
 
-#include "fastq-common.h"
-#include "fastq-parse.h"
+#include "common.h"
+#include "parse.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
diff --git a/src/fastq-match.c b/src/fastq-match.c
index 7b55f07..09da29b 100644
--- a/src/fastq-match.c
+++ b/src/fastq-match.c
@@ -10,9 +10,9 @@
  */
 
 
-#include "fastq-common.h"
-#include "fastq-parse.h"
-#include "fastq-sw.h"
+#include "common.h"
+#include "parse.h"
+#include "sw.h"
 #include <stdlib.h>
 #include <string.h>
 #include <zlib.h>
@@ -32,7 +32,7 @@ static int help_flag;
 
 void print_help()
 {
-    fprintf( stderr, 
+    fprintf(stderr, 
 "fastq-match [OPTION]... QUERY [FILE]...\n"
 "Perform Smith-Waterman local alignment of a query sequence\n"
 "against each sequence in a fastq file.\n\n"
@@ -97,7 +97,7 @@ int main(int argc, char* argv[])
     while (1) {
         opt = getopt_long(argc, argv, "h", long_options, &opt_idx);
 
-        if( opt == -1 ) break;
+        if (opt == -1) break;
 
         switch (opt) {
             case 0:
diff --git a/src/fastq-uniq.c b/src/fastq-uniq.c
new file mode 100644
index 0000000..fddb52a
--- /dev/null
+++ b/src/fastq-uniq.c
@@ -0,0 +1,166 @@
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-uniq :
+ * Collapsing a fastq file into only unique read sequences.
+ */
+
+
+#include "common.h"
+#include "hash.h"
+#include "parse.h"
+#include <string.h>
+#include <zlib.h>
+#include <getopt.h>
+
+
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+#  include <fcntl.h>
+#  include <io.h>
+#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+#  define SET_BINARY_MODE(file)
+#endif
+
+
+static int help_flag;
+static int verbose_flag;
+size_t total_reads;
+
+void print_help()
+{
+    fprintf(stderr,
+"fastq-uniq [OPTION] [FILE]...\n"
+"Output a non-redundant FASTQ file, in which there are no duplicate reads.\n"
+"(Warning: this program can be somewhat memory intensive.)\n\n"
+"Options:\n"
+"  -h, --help              print this message\n"
+"  -v, --verbose           print status along the way\n"
+    );
+}
+
+
+void fastq_hash(FILE* fin, hash_table* T)
+{
+    fastq_t* fqf = fastq_open(fin);
+    seq_t* seq = fastq_alloc_seq();
+
+    while (fastq_next(fqf, seq)) {
+        inc_hash_table(T, seq->seq.s, seq->seq.n);
+
+        total_reads++;
+        if (verbose_flag && total_reads % 100000 == 0) {
+            fprintf(stderr, "%zu reads processed ...\n", total_reads);
+        }
+    }
+
+    fastq_free_seq(seq);
+    fastq_close(fqf);
+}
+
+
+int compare_hashed_value_count(const void* x, const void* y)
+{
+    hashed_value* const * a = x;
+    hashed_value* const * b = y;
+
+    if( (*a)->count > (*b)->count ) return -1;
+    if( (*a)->count < (*b)->count ) return 1;
+    return 0;
+}
+
+
+
+void print_hash_table(FILE* fout, hash_table* T)
+{
+    hashed_value** S = dump_hash_table(T);
+    qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count);
+
+    size_t i;
+    for (i = 0; i < T->m; i++) {
+        fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count);
+        fwrite(S[i]->value, S[i]->len, sizeof(char), fout);
+        fprintf(fout, "\n");
+    }
+    free(S);
+}
+
+
+
+int main(int argc, char* argv[])
+{
+    SET_BINARY_MODE(stdin);
+    SET_BINARY_MODE(stdout);
+
+    hash_table* T = create_hash_table();
+
+    FILE* fin   ;
+
+    help_flag = 0;
+
+    int opt;
+    int opt_idx;
+
+    static struct option long_options[] =
+    {
+        {"help",    no_argument, &help_flag,    1},
+        {"verbose", no_argument, &verbose_flag, 1},
+        {0, 0, 0, 0}
+    };
+
+    while (1) {
+        opt = getopt_long(argc, argv, "hv", long_options, &opt_idx);
+
+        if (opt == -1) break;
+
+        switch (opt) {
+            case 0:
+                if (long_options[opt_idx].flag != 0) break;
+                if (optarg) {
+                }
+                break;
+
+            case 'h':
+                help_flag = 1;
+                break;
+
+            case 'v':
+                verbose_flag = 1;
+                break;
+
+            case '?':
+                return 1;
+
+            default:
+                abort();
+        }
+    }
+
+    if (help_flag) {
+        print_help();
+        return 0;
+    }
+
+    if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
+        fastq_hash(stdin, T);
+    }
+    else {
+        for (; optind < argc; optind++) {
+            fin = fopen(argv[optind], "rb");
+            if (fin == NULL) {
+                fprintf(stderr, "No such file '%s'.\n", argv[optind]);
+                continue;
+            }
+
+            fastq_hash(fin, T);
+        }
+    }
+
+    print_hash_table(stdout, T);
+
+    destroy_hash_table(T);
+    return 0;
+}
diff --git a/src/hash.c b/src/hash.c
new file mode 100644
index 0000000..9b1a043
--- /dev/null
+++ b/src/hash.c
@@ -0,0 +1,210 @@
+
+#include "hash.h"
+#include "common.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+
+static const size_t INITIAL_TABLE_SIZE = 128;
+static const double MAX_LOAD = 0.75;
+
+
+/*
+ * Paul Hsieh's SuperFastHash
+ * http://www.azillionmonkeys.com/qed/hash.html
+ */
+
+
+#undef get16bits
+#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
+    || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
+#define get16bits(d) (*((const uint16_t *) (d)))
+#endif
+
+#if !defined (get16bits)
+#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
+        +(uint32_t)(((const uint8_t *)(d))[0]) )
+#endif
+
+static uint32_t hash(const char * data, size_t len) {
+    uint32_t hash = len, tmp;
+    int rem;
+
+    if (len <= 0 || data == NULL) return 0;
+
+    rem = len & 3;
+    len >>= 2;
+
+    /* Main loop */
+    for (;len > 0; len--) {
+        hash  += get16bits (data);
+        tmp    = (get16bits (data+2) << 11) ^ hash;
+        hash   = (hash << 16) ^ tmp;
+        data  += 2*sizeof (uint16_t);
+        hash  += hash >> 11;
+    }
+
+    /* Handle end cases */
+    switch (rem) {
+        case 3: hash += get16bits (data);
+                hash ^= hash << 16;
+                hash ^= data[sizeof (uint16_t)] << 18;
+                hash += hash >> 11;
+                break;
+        case 2: hash += get16bits (data);
+                hash ^= hash << 11;
+                hash += hash >> 17;
+                break;
+        case 1: hash += *data;
+                hash ^= hash << 10;
+                hash += hash >> 1;
+    }
+
+    /* Force "avalanching" of final 127 bits */
+    hash ^= hash << 3;
+    hash += hash >> 5;
+    hash ^= hash << 4;
+    hash += hash >> 17;
+    hash ^= hash << 25;
+    hash += hash >> 6;
+
+    return hash;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n);
+static void clear_hash_table(hash_table*);
+
+
+
+hash_table* create_hash_table()
+{
+    hash_table* T = malloc_or_die(sizeof(hash_table));
+    T->A = malloc_or_die(INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+    memset(T->A, 0, INITIAL_TABLE_SIZE * sizeof(hashed_value*));
+    T->n = INITIAL_TABLE_SIZE;
+    T->m = 0;
+    T->max_m = T->n * MAX_LOAD;
+
+    return T;
+}
+
+
+void destroy_hash_table(hash_table* T)
+{
+    if (T != NULL) {
+        clear_hash_table(T);
+        free(T->A);
+        free(T);
+    }
+}
+
+
+
+void clear_hash_table(hash_table* T)
+{
+    hashed_value* u;
+    size_t i;
+    for (i = 0; i < T->n; i++) {
+        while (T->A[i]) {
+            u = T->A[i]->next;
+            free(T->A[i]->value);
+            free(T->A[i]);
+            T->A[i] = u;
+        }
+    }
+
+    T->m = 0;
+}
+
+
+static void insert_without_copy(hash_table* T, hashed_value* V)
+{
+    uint32_t h = hash(V->value, V->len) % T->n;
+    V->next = T->A[h];
+    T->A[h] = V;
+    T->m++;
+}
+
+
+
+static void rehash(hash_table* T, size_t new_n)
+{
+    hash_table U;
+    U.n = new_n;
+    U.m = 0;
+    U.max_m = U.n * MAX_LOAD;
+    U.A = malloc_or_die(U.n * sizeof(hashed_value*));
+    memset(U.A, 0, U.n * sizeof(hashed_value*));
+
+    hashed_value *j, *k;
+    size_t i;
+    for (i = 0; i < T->n; i++) {
+        j = T->A[i];
+        while (j) {
+            k = j->next;
+            insert_without_copy(&U, j);
+            j = k;
+        }
+        T->A[i] = NULL;
+    }
+
+    free(T->A);
+    T->A = U.A;
+    T->n = U.n;
+    T->max_m = U.max_m;
+}
+
+
+void inc_hash_table(hash_table* T, const char* value, size_t len)
+{
+    if (T->m >= T->max_m) rehash(T, T->n * 2);
+
+    uint32_t h = hash(value, len) % T->n;
+
+    hashed_value* u = T->A[h];
+
+    while (u) {
+        if (u->len == len && memcmp(u->value, value, len) == 0) {
+            u->count++;
+            return;
+        }
+
+        u = u->next;
+    }
+
+    u = malloc_or_die(sizeof(hashed_value));
+    u->value = malloc_or_die(len);
+    memcpy(u->value, value, len);
+    u->len = len;
+    u->count = 1;
+
+    u->next = T->A[h];
+    T->A[h] = u;
+
+    T->m++;
+}
+
+
+
+hashed_value** dump_hash_table(hash_table* T)
+{
+    hashed_value** D = malloc_or_die(T->m * sizeof(hashed_value*));
+
+    hashed_value* u;
+    size_t i, j;
+    for (i = 0, j = 0; i < T->n; i++) {
+        u = T->A[i];
+        while (u) {
+            D[j++] = u; 
+            u = u->next;
+        }
+    }
+
+    return D;
+}
+
+
+
diff --git a/src/hash.h b/src/hash.h
new file mode 100644
index 0000000..f101824
--- /dev/null
+++ b/src/hash.h
@@ -0,0 +1,37 @@
+
+#ifndef FASTQ_TOOLS_HASH_H
+#define FASTQ_TOOLS_HASH_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+
+typedef struct hashed_value_
+{
+    char*    value;
+    size_t   len;
+    uint32_t count;
+    struct hashed_value_* next;
+} hashed_value;
+
+
+typedef struct
+{
+    hashed_value** A; /* table proper */
+    size_t n;         /* table size */
+    size_t m;         /* hashed items */
+    size_t max_m;     /* max hashed items before rehash */
+} hash_table;
+
+
+hash_table* create_hash_table();
+
+void destroy_hash_table(hash_table*);
+
+void inc_hash_table(hash_table*, const char* value, size_t len);
+
+hashed_value** dump_hash_table(hash_table*);
+
+
+#endif
+
diff --git a/src/fastq-parse.c b/src/parse.c
similarity index 98%
rename from src/fastq-parse.c
rename to src/parse.c
index ec5b478..bec5ac3 100644
--- a/src/fastq-parse.c
+++ b/src/parse.c
@@ -5,8 +5,8 @@
  *
  */
 
-#include "fastq-parse.h"
-#include "fastq-common.h"
+#include "parse.h"
+#include "common.h"
 #include <stdlib.h>
 #include <ctype.h>
 
diff --git a/src/fastq-parse.h b/src/parse.h
similarity index 100%
rename from src/fastq-parse.h
rename to src/parse.h
diff --git a/src/fastq-sw.c b/src/sw.c
similarity index 98%
rename from src/fastq-sw.c
rename to src/sw.c
index eb26ff5..3db345f 100644
--- a/src/fastq-sw.c
+++ b/src/sw.c
@@ -16,8 +16,8 @@
  */
 
 
-#include "fastq-sw.h"
-#include "fastq-common.h"
+#include "sw.h"
+#include "common.h"
 #include <string.h>
 #include <stdlib.h>
 
diff --git a/src/fastq-sw.h b/src/sw.h
similarity index 100%
rename from src/fastq-sw.h
rename to src/sw.h
-- 
2.39.5