--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-sample :
+ * Sample reads with or without replacement from a FASTQ file.
+ *
+ */
+
+#include <assert.h>
+#include <getopt.h>
+#include <string.h>
+
+#include "common.h"
+#include "parse.h"
+
+
+typedef struct seq_array_t_
+{
+ seq_t* seqs;
+
+ /* Number of seq objects. */
+ size_t n;
+
+ /* Size reserved in seqs. */
+ size_t size;
+
+ /* Space reserved for strings. */
+ char* data;
+
+ /* Data used. */
+ size_t data_used;
+
+ /* Total data size. */
+ size_t data_size;
+} seq_array_t;
+
+
+seq_array_t* seq_array_create(size_t data_size)
+{
+ seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
+ a->size = 1024;
+ a->n = 0;
+ a->seqs = malloc_or_die(sizeof(seq_t));
+
+ a->data_size = data_size;
+ a->data_used = 0;
+ a->data = malloc_or_die(data_size);
+
+ return a;
+}
+
+
+void seq_array_free(seq_array_t* a)
+{
+ free(a->seqs);
+ free(a->data);
+ free(a);
+}
+
+
+/* Push a fastq entry to back of the array. Return false if there was not enough
+ * space. */
+bool seq_array_push(seq_array_t* a, const seq_t* seq)
+{
+ size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
+ (seq->id2.n + 1) + (seq->qual.n + 1);
+
+ if (size_needed > a->data_size - a->data_used) return false;
+
+ if (a->n == a->size) {
+ a->size *= 2;
+ a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
+ }
+
+ memcpy(seq->id1.s, &a->data[a->data_used], seq->id1.n + 1);
+ a->seqs[a->n].id1.s = &a->data[a->data_used];
+ a->seqs[a->n].id1.n = seq->id1.n + 1;
+ a->data_used += seq->id1.n + 1;
+
+ memcpy(seq->seq.s, &a->data[a->data_used], seq->seq.n + 1);
+ a->seqs[a->n].seq.s = &a->data[a->data_used];
+ a->seqs[a->n].seq.n = seq->seq.n + 1;
+ a->data_used += seq->seq.n + 1;
+
+ memcpy(seq->id2.s, &a->data[a->data_used], seq->id2.n + 1);
+ a->seqs[a->n].id2.s = &a->data[a->data_used];
+ a->seqs[a->n].id2.n = seq->id2.n + 1;
+ a->data_used += seq->id2.n + 1;
+
+ memcpy(seq->qual.s, &a->data[a->data_used], seq->qual.n + 1);
+ a->seqs[a->n].qual.s = &a->data[a->data_used];
+ a->seqs[a->n].qual.n = seq->qual.n + 1;
+ a->data_used += seq->qual.n + 1;
+
+ ++a->n;
+
+ return true;
+}
+
+
+void seq_array_clear(seq_array_t* a)
+{
+ a->n = 0;
+ a->data_used = 0;
+}
+
+
+void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
+{
+ qsort(a->seqs, a->n, sizeof(seq_t), cmp);
+}
+
+
+int seq_cmp_hash(const void* a_, const void* b_)
+{
+ const seq_t* a = (seq_t*) a_;
+ const seq_t* b = (seq_t*) b_;
+ /* TODO: hash and compare. */
+ return 0;
+}
+
+
+static const char* prog_name = "fastq-sort";
+
+
+void print_help()
+{
+ fprintf(stdout,
+"fastq-sort [OPTION]... [FILE]...\n"
+"Concatenate and sort FASTQ files and write to standard output.\n"
+"Options:\n"
+" -h, --help print this message\n"
+" -V, --version output version information and exit\n"
+ );
+}
+
+
+
+int main(int argc, char* argv[])
+{
+ int opt, opt_idx;
+ size_t buffer_size = 100000000;
+ int (*cmp)(const void*, const void*) = seq_cmp_hash;;
+
+ static struct option long_options[] =
+ {
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'V'},
+ {0, 0, 0, 0}
+ };
+
+ while (true) {
+ opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
+ if (opt == -1) break;
+
+ switch (opt) {
+ case 'h':
+ print_help();
+ return 0;
+
+ case 'V':
+ print_version(stdout, prog_name);
+ return 0;
+
+ case '?':
+ return 1;
+
+ default:
+ abort();
+ }
+ }
+
+ seq_array_t* a = seq_array_create(buffer_size);
+ seq_t* seq = seq_create();
+
+ fastq_t* f;
+ if (optind >= argc) {
+ f = fastq_create(stdin);
+ while (fastq_read(f, seq)) {
+ if (!seq_array_push(a, seq)) {
+ seq_array_sort(a, cmp);
+
+ /* TODO: dump a to a temporary file. Push that file name to an
+ * array somewhere.
+ * */
+
+ seq_array_clear(a);
+ if (seq_array_push(a, seq)) {
+ fprintf(stderr, "The buffer size is to small.\n");
+ return EXIT_FAILURE;
+ }
+ }
+ }
+ fastq_free(f);
+ }
+ else {
+ FILE* file;
+ for (; optind < argc; ++optind) {
+ file = fopen(argv[optind], "rb");
+ if (file == NULL) {
+ fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
+ return EXIT_FAILURE;
+ }
+ f = fastq_create(file);
+
+ while (fastq_read(f, seq)) {
+ if (!seq_array_push(a, seq)) {
+ seq_array_sort(a, cmp);
+
+ /* TODO: dump a to a temporary file. Push that file name to
+ * an array somewhere. */
+
+ seq_array_clear(a);
+ if (seq_array_push(a, seq)) {
+ fprintf(stderr, "The buffer size is to small.\n");
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ fastq_free(f);
+ fclose(file);
+ }
+ }
+
+ if (a->n > 0) {
+ seq_array_sort(a, cmp);
+
+ /* TODO: special case if everything fit into a. Just dump it to output.
+ */
+
+ /* TODO: dump to a temp file, push file name to the stack. */
+ }
+
+ /* TODO: Merge sort on the external files. */
+
+ seq_free(seq);
+ seq_array_free(a);
+
+ return EXIT_SUCCESS;
+}
+
+
}
+/* This is MurmurHash3. The original C++ code was placed in the public domain
+ * by its author, Austin Appleby. */
+
+static inline uint32_t fmix(uint32_t h)
+{
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+ return (x << r) | (x >> (32 - r));
+}
+
+
+uint32_t murmurhash3(const uint8_t* data, size_t len_)
+{
+ const int len = (int) len_;
+ const int nblocks = len / 4;
+
+ uint32_t h1 = 0xc062fb4a;
+
+ uint32_t c1 = 0xcc9e2d51;
+ uint32_t c2 = 0x1b873593;
+
+ //----------
+ // body
+
+ const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4);
+
+ int i;
+ for(i = -nblocks; i; i++)
+ {
+ uint32_t k1 = blocks[i];
+
+ k1 *= c1;
+ k1 = rotl32(k1, 15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = rotl32(h1, 13);
+ h1 = h1*5+0xe6546b64;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+ uint32_t k1 = 0;
+
+ switch(len & 3)
+ {
+ case 3: k1 ^= tail[2] << 16;
+ case 2: k1 ^= tail[1] << 8;
+ case 1: k1 ^= tail[0];
+ k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+
+ h1 = fmix(h1);
+
+ return h1;
+}
+
+
+uint32_t seq_hash(const seq_t* seq)
+{
+ /* TODO */
+ return 0;
+}
+
static const size_t parser_buf_size = 1000000;