*
* Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
*
- * fastq-sample :
- * Sample reads with or without replacement from a FASTQ file.
+ * fastq-sort:
+ * Sort fastq files efficiently.
*
*/
#include <assert.h>
+#include <ctype.h>
#include <getopt.h>
#include <string.h>
#include <unistd.h>
#include "parse.h"
+/* Comparison function chosen by the user. main() initializes it to
+ * seq_cmp_id and replaces it according to the -i / -s / -R options. */
+static int (*user_cmp)(const void*, const void*);
+
+/* Comparison function in effect: main() sets it to user_cmp, or to rev_cmp
+ * when a reversed sort (-r) is requested. */
+static int (*cmp)(const void*, const void*);
+
+/* Compare a and b in the opposite order of user_cmp.
+ *
+ * The arguments are swapped rather than the result negated: negating would
+ * overflow (undefined behavior) if user_cmp ever returned INT_MIN.
+ *
+ * Returns a negative, zero, or positive value when a sorts after, equal to,
+ * or before b under user_cmp, respectively.
+ */
+int rev_cmp(const void* a, const void* b)
+{
+    return user_cmp(b, a);
+}
+
+
/* A collection of filenames of sorted chunks of fastq. */
typedef struct seq_dumps_t_
{
memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
a->seqs[a->n].id1.s = &a->data[a->data_used];
- a->seqs[a->n].id1.n = seq->id1.n + 1;
+ a->seqs[a->n].id1.n = seq->id1.n;
a->data_used += seq->id1.n + 1;
memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
a->seqs[a->n].seq.s = &a->data[a->data_used];
- a->seqs[a->n].seq.n = seq->seq.n + 1;
+ a->seqs[a->n].seq.n = seq->seq.n;
a->data_used += seq->seq.n + 1;
memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
a->seqs[a->n].id2.s = &a->data[a->data_used];
- a->seqs[a->n].id2.n = seq->id2.n + 1;
+ a->seqs[a->n].id2.n = seq->id2.n;
a->data_used += seq->id2.n + 1;
memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
a->seqs[a->n].qual.s = &a->data[a->data_used];
- a->seqs[a->n].qual.n = seq->qual.n + 1;
+ a->seqs[a->n].qual.n = seq->qual.n;
a->data_used += seq->qual.n + 1;
++a->n;
}
+/* Parse a size specification: a decimal number optionally followed by a
+ * K, M, or G suffix (case-insensitive; decimal multiples 10^3, 10^6, 10^9).
+ *
+ * Anything after the suffix is ignored. A string with no leading digits
+ * yields 0, because strtoul() returns 0 when it performs no conversion.
+ * If the scaled value does not fit in a size_t, the result saturates at the
+ * largest size_t value instead of silently wrapping around.
+ */
+size_t parse_size(const char* str)
+{
+    const size_t size_max = (size_t) -1;
+    char* endptr;
+    unsigned long size = strtoul(str, &endptr, 10);
+    unsigned long scale = 1;
+
+    /* <ctype.h> functions need a value representable as unsigned char;
+     * passing a negative plain char is undefined behavior. */
+    switch (toupper((unsigned char) *endptr)) {
+        case 'K': scale = 1000UL;       break;
+        case 'M': scale = 1000000UL;    break;
+        case 'G': scale = 1000000000UL; break;
+        default:  break;
+    }
+
+    /* Saturate rather than wrap when size * scale would overflow. */
+    if (size > size_max / scale) return size_max;
+
+    return (size_t) size * (size_t) scale;
+}
+
+
int main(int argc, char* argv[])
{
int opt, opt_idx;
size_t buffer_size = 100000000;
- int (*cmp)(const void*, const void*) = seq_cmp_hash;;
+ bool reverse_sort = false;
+ user_cmp = seq_cmp_id;
static struct option long_options[] =
{
- {"id", no_argument, NULL, 'I'},
- {"seq", no_argument, NULL, 'S'},
- {"random", no_argument, NULL, 'R'},
- {"help", no_argument, NULL, 'h'},
- {"version", no_argument, NULL, 'V'},
+ {"buffer-size", required_argument, NULL, 'S'},
+ {"reverse", no_argument, NULL, 'r'},
+ {"id", no_argument, NULL, 'i'},
+ {"seq", no_argument, NULL, 's'},
+ {"random", no_argument, NULL, 'R'},
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'V'},
{0, 0, 0, 0}
};
while (true) {
- opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
+ opt = getopt_long(argc, argv, "S:risRhV", long_options, &opt_idx);
if (opt == -1) break;
switch (opt) {
- case 'I':
- cmp = seq_cmp_id;
+ case 'S':
+ buffer_size = parse_size(optarg);
break;
- case 'S':
- cmp = seq_cmp_seq;
+ case 'r':
+ reverse_sort = true;
+ break;
+
+ case 'i':
+ user_cmp = seq_cmp_id;
+ break;
+
+ case 's':
+ user_cmp = seq_cmp_seq;
break;
case 'R':
- cmp = seq_cmp_hash;
+ user_cmp = seq_cmp_hash;
break;
case 'h':
}
}
+ cmp = reverse_sort ? rev_cmp : user_cmp;
+
seq_array_t* a = seq_array_create(buffer_size);
seq_dumps_t* d = seq_dumps_create();
seq_t* seq = seq_create();