WIP on a fastq-sort program.

author Daniel Jones <dcjones@cs.washington.edu>

Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)

committer Daniel Jones <dcjones@cs.washington.edu>

Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)
author Daniel Jones <dcjones@cs.washington.edu>
Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)
committer Daniel Jones <dcjones@cs.washington.edu>
Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)
diff --git a/src/Makefile.am b/src/Makefile.am

index aeb9c85f6a2df5288462021a6613c246b65b6f0d..1fc7d3a6572e342aa5a3bb5d2b47c161125b96a1 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,5 +1,5 @@
  
-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq-qualadj
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq-qualadj fastq-sort
  
  fastq_common_src=common.h common.c
  fastq_parse_src=parse.h parse.c
@@ -21,3 +21,6 @@ fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src)
  fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src)
  
  fastq_qualadj_SOURCES = fastq-qualadj.c $(fastq_common_src) $(fastq_parse_src)
+
+fastq_sort_SOURCES = fastq-sort.c $(fastq_common_src) $(fastq_parse_src)
+
diff --git a/src/fastq-grep.c b/src/fastq-grep.c

index 4d1603f7ced45f3d34e28d00e60204689031d4a8..b09b2bff44cc485f12d8bac635098de87f48605c 100644 (file)
--- a/src/fastq-grep.c
+++ b/src/fastq-grep.c
@@ -33,7 +33,7 @@ static const char* prog_name = "fastq-grep";
  
  void print_help()
  {
-    fprintf(stdout, 
+    fprintf(stdout,
  "fastq-grep [OPTION]... PATTERN [FILE]...\n"
  "Search for PATTERN in the read sequences in each FILE or standard input.\n"
  "PATTERN, by default, is a perl compatible regular expression.\n\n"
diff --git a/src/fastq-sample.c b/src/fastq-sample.c

index 0adf02dd27c6d6d73c3c1ae08ddd36ec4729084b..f12620b29e28a2b364cee9aebd6a41c3958c1b30 100644 (file)
--- a/src/fastq-sample.c
+++ b/src/fastq-sample.c
@@ -1,4 +1,3 @@
-
  /*
   * This file is part of fastq-tools.
   *
@@ -8,6 +7,7 @@
   * Sample reads with or without replacement from a FASTQ file.
   *
   */
+
  #include <getopt.h>
  #include <math.h>
  #include <stdio.h>
diff --git a/src/fastq-sort.c b/src/fastq-sort.c

new file mode 100644 (file)

index 0000000..522c0ce
--- /dev/null
+++ b/src/fastq-sort.c
@@ -0,0 +1,245 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-sample :
+ * Sample reads with or without replacement from a FASTQ file.
+ *
+ */
+
+#include <assert.h>
+#include <getopt.h>
+#include <string.h>
+
+#include "common.h"
+#include "parse.h"
+
+
+typedef struct seq_array_t_
+{
+    seq_t* seqs;
+
+    /* Number of seq objects. */
+    size_t n;
+
+    /* Size reserved in seqs. */
+    size_t size;
+
+    /* Space reserved for strings. */
+    char* data;
+
+    /* Data used. */
+    size_t data_used;
+
+    /* Total data size. */
+    size_t data_size;
+} seq_array_t;
+
+
+seq_array_t* seq_array_create(size_t data_size)
+{
+    seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
+    a->size = 1024;
+    a->n = 0;
+    a->seqs = malloc_or_die(sizeof(seq_t));
+
+    a->data_size = data_size;
+    a->data_used = 0;
+    a->data = malloc_or_die(data_size);
+
+    return a;
+}
+
+
+void seq_array_free(seq_array_t* a)
+{
+    free(a->seqs);
+    free(a->data);
+    free(a);
+}
+
+
+/* Push a fastq entry to back of the array. Return false if there was not enough
+ * space. */
+bool seq_array_push(seq_array_t* a, const seq_t* seq)
+{
+    size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
+                         (seq->id2.n + 1) + (seq->qual.n + 1);
+
+    if (size_needed > a->data_size - a->data_used) return false;
+
+    if (a->n == a->size) {
+        a->size *= 2;
+        a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
+    }
+
+    memcpy(seq->id1.s, &a->data[a->data_used], seq->id1.n + 1);
+    a->seqs[a->n].id1.s = &a->data[a->data_used];
+    a->seqs[a->n].id1.n = seq->id1.n + 1;
+    a->data_used += seq->id1.n + 1;
+
+    memcpy(seq->seq.s, &a->data[a->data_used], seq->seq.n + 1);
+    a->seqs[a->n].seq.s = &a->data[a->data_used];
+    a->seqs[a->n].seq.n = seq->seq.n + 1;
+    a->data_used += seq->seq.n + 1;
+
+    memcpy(seq->id2.s, &a->data[a->data_used], seq->id2.n + 1);
+    a->seqs[a->n].id2.s = &a->data[a->data_used];
+    a->seqs[a->n].id2.n = seq->id2.n + 1;
+    a->data_used += seq->id2.n + 1;
+
+    memcpy(seq->qual.s, &a->data[a->data_used], seq->qual.n + 1);
+    a->seqs[a->n].qual.s = &a->data[a->data_used];
+    a->seqs[a->n].qual.n = seq->qual.n + 1;
+    a->data_used += seq->qual.n + 1;
+
+    ++a->n;
+
+    return true;
+}
+
+
+void seq_array_clear(seq_array_t* a)
+{
+    a->n = 0;
+    a->data_used = 0;
+}
+
+
+void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
+{
+    qsort(a->seqs, a->n, sizeof(seq_t), cmp);
+}
+
+
+int seq_cmp_hash(const void* a_, const void* b_)
+{
+    const seq_t* a = (seq_t*) a_;
+    const seq_t* b = (seq_t*) b_;
+    /* TODO: hash and compare. */
+    return 0;
+}
+
+
+static const char* prog_name = "fastq-sort";
+
+
+void print_help()
+{
+    fprintf(stdout,
+"fastq-sort [OPTION]... [FILE]...\n"
+"Concatenate and sort FASTQ files and write to standard output.\n"
+"Options:\n"
+"  -h, --help      print this message\n"
+"  -V, --version   output version information and exit\n"
+   );
+}
+
+
+
+int main(int argc, char* argv[])
+{
+    int opt, opt_idx;
+    size_t buffer_size = 100000000;
+    int (*cmp)(const void*, const void*) = seq_cmp_hash;;
+
+    static struct option long_options[] =
+    {
+        {"help",    no_argument, NULL, 'h'},
+        {"version", no_argument, NULL, 'V'},
+        {0, 0, 0, 0}
+    };
+
+    while (true) {
+        opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
+        if (opt == -1) break;
+
+        switch (opt) {
+            case 'h':
+                print_help();
+                return 0;
+
+            case 'V':
+                print_version(stdout, prog_name);
+                return 0;
+
+            case '?':
+                return 1;
+
+            default:
+                abort();
+        }
+    }
+
+    seq_array_t* a = seq_array_create(buffer_size);
+    seq_t* seq = seq_create();
+
+    fastq_t* f;
+    if (optind >= argc) {
+        f = fastq_create(stdin);
+        while (fastq_read(f, seq)) {
+            if (!seq_array_push(a, seq)) {
+                seq_array_sort(a, cmp);
+
+                /* TODO: dump a to a temporary file. Push that file name to an
+                 * array somewhere.
+                 * */
+
+                seq_array_clear(a);
+                if (seq_array_push(a, seq)) {
+                    fprintf(stderr, "The buffer size is to small.\n");
+                    return EXIT_FAILURE;
+                }
+            }
+        }
+        fastq_free(f);
+    }
+    else {
+        FILE* file;
+        for (; optind < argc; ++optind) {
+            file = fopen(argv[optind], "rb");
+            if (file == NULL) {
+                fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
+                return EXIT_FAILURE;
+            }
+            f = fastq_create(file);
+
+            while (fastq_read(f, seq)) {
+                if (!seq_array_push(a, seq)) {
+                    seq_array_sort(a, cmp);
+
+                    /* TODO: dump a to a temporary file. Push that file name to
+                     * an array somewhere.  */
+
+                    seq_array_clear(a);
+                    if (seq_array_push(a, seq)) {
+                        fprintf(stderr, "The buffer size is to small.\n");
+                        return EXIT_FAILURE;
+                    }
+                }
+            }
+
+            fastq_free(f);
+            fclose(file);
+        }
+    }
+
+    if (a->n > 0) {
+        seq_array_sort(a, cmp);
+
+        /* TODO: special case if everything fit into a. Just dump it to output.
+         */
+
+        /* TODO: dump to a temp file, push file name to the stack. */
+    }
+
+    /* TODO: Merge sort on the external files. */
+
+    seq_free(seq);
+    seq_array_free(a);
+
+    return EXIT_SUCCESS;
+}
+
+
diff --git a/src/parse.c b/src/parse.c

index 8a6123e9b97f14e6818421e8d2a9f06303fad78c..fd139218506c0ab91a4aa449cd22217a6b11d1cd 100644 (file)
--- a/src/parse.c
+++ b/src/parse.c
@@ -66,6 +66,88 @@ void seq_free(seq_t* seq)
  }
  
  
+/* This is MurmurHash3. The original C++ code was placed in the public domain
+ * by its author, Austin Appleby. */
+
+static inline uint32_t fmix(uint32_t h)
+{
+    h ^= h >> 16;
+    h *= 0x85ebca6b;
+    h ^= h >> 13;
+    h *= 0xc2b2ae35;
+    h ^= h >> 16;
+
+    return h;
+}
+
+
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+    return (x << r) | (x >> (32 - r));
+}
+
+
+uint32_t murmurhash3(const uint8_t* data, size_t len_)
+{
+    const int len = (int) len_;
+    const int nblocks = len / 4;
+
+    uint32_t h1 = 0xc062fb4a;
+
+    uint32_t c1 = 0xcc9e2d51;
+    uint32_t c2 = 0x1b873593;
+
+    //----------
+    // body
+
+    const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4);
+
+    int i;
+    for(i = -nblocks; i; i++)
+    {
+        uint32_t k1 = blocks[i];
+
+        k1 *= c1;
+        k1 = rotl32(k1, 15);
+        k1 *= c2;
+
+        h1 ^= k1;
+        h1 = rotl32(h1, 13);
+        h1 = h1*5+0xe6546b64;
+    }
+
+    //----------
+    // tail
+
+    const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+    uint32_t k1 = 0;
+
+    switch(len & 3)
+    {
+        case 3: k1 ^= tail[2] << 16;
+        case 2: k1 ^= tail[1] << 8;
+        case 1: k1 ^= tail[0];
+              k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+    }
+
+    //----------
+    // finalization
+
+    h1 ^= len;
+
+    h1 = fmix(h1);
+
+    return h1;
+}
+
+
+uint32_t seq_hash(const seq_t* seq)
+{
+    /* TODO */
+    return 0;
+}
+
  static const size_t parser_buf_size = 1000000;
  
  
diff --git a/src/parse.h b/src/parse.h

index 6224ab8d07368e735cfcb7d504ea67da05994b6e..4a0a0dded6d8668c03ad81e20400e60bec14ecfe 100644 (file)
--- a/src/parse.h
+++ b/src/parse.h
@@ -12,6 +12,7 @@
  #define FASTQ_TOOLS_PARSE_H
  
  #include <stdbool.h>
+#include <stdint.h>
  #include <stdlib.h>
  
  /* A string structure to keep-track of a reserved space. */
@@ -41,6 +42,10 @@ seq_t* seq_create();
  void seq_free(seq_t* seq);
  
  
+/* Hash a fastq entry. */
+uint32_t seq_hash(const seq_t* seq);
+
+
  /* Internal data for the fastq parser. */
  typedef struct fastq_t_ fastq_t;
author	Daniel Jones <dcjones@cs.washington.edu>
	Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)
committer	Daniel Jones <dcjones@cs.washington.edu>
	Tue, 9 Oct 2012 17:37:19 +0000 (10:37 -0700)
src/Makefile.am		patch \| blob \| history
src/fastq-grep.c		patch \| blob \| history
src/fastq-sample.c		patch \| blob \| history
src/fastq-sort.c	[new file with mode: 0644]	patch \| blob
src/parse.c		patch \| blob \| history
src/parse.h		patch \| blob \| history