X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ffastq-sort.c;h=9799b075348ad087ef6df654f075618db0af4e50;hb=52779892ccc2ac676654a34baff8d1e7645121de;hp=171b46a501280fa3adb53b24c4cc0584291d5feb;hpb=88680e52e8c58cd20f29d0de15dceefe15e81c8f;p=fastq-tools.git diff --git a/src/fastq-sort.c b/src/fastq-sort.c index 171b46a..9799b07 100644 --- a/src/fastq-sort.c +++ b/src/fastq-sort.c @@ -3,12 +3,13 @@ * * Copyright (c) 2012 by Daniel C. Jones * - * fastq-sample : - * Sample reads with or without replacement from a FASTQ file. + * fastq-sort: + * Sort fastq files efficiently. * */ #include +#include #include #include #include @@ -17,6 +18,16 @@ #include "parse.h" +/* User comparison function. */ +static int (*user_cmp)(const void*, const void*); +static int (*cmp)(const void*, const void*); + +int rev_cmp(const void* a, const void* b) +{ + return -user_cmp(a, b); +} + + /* A collection of filenames of sorted chunks of fastq. */ typedef struct seq_dumps_t_ { @@ -235,22 +246,22 @@ bool seq_array_push(seq_array_t* a, const seq_t* seq) memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1); a->seqs[a->n].id1.s = &a->data[a->data_used]; - a->seqs[a->n].id1.n = seq->id1.n + 1; + a->seqs[a->n].id1.n = seq->id1.n; a->data_used += seq->id1.n + 1; memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1); a->seqs[a->n].seq.s = &a->data[a->data_used]; - a->seqs[a->n].seq.n = seq->seq.n + 1; + a->seqs[a->n].seq.n = seq->seq.n; a->data_used += seq->seq.n + 1; memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1); a->seqs[a->n].id2.s = &a->data[a->data_used]; - a->seqs[a->n].id2.n = seq->id2.n + 1; + a->seqs[a->n].id2.n = seq->id2.n; a->data_used += seq->id2.n + 1; memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1); a->seqs[a->n].qual.s = &a->data[a->data_used]; - a->seqs[a->n].qual.n = seq->qual.n + 1; + a->seqs[a->n].qual.n = seq->qual.n; a->data_used += seq->qual.n + 1; ++a->n; @@ -347,37 +358,62 @@ void print_help() } +/* Parse a size specification, which is just a number with a K, M, G suffix. */ +size_t parse_size(const char* str) +{ + char* endptr; + unsigned long size = strtoul(str, &endptr, 10); + + if (toupper(*endptr) == 'K') size *= 1000; + else if (toupper(*endptr) == 'M') size *= 1000000; + else if (toupper(*endptr) == 'G') size *= 1000000000; + + return size; +} + + int main(int argc, char* argv[]) { int opt, opt_idx; size_t buffer_size = 100000000; - int (*cmp)(const void*, const void*) = seq_cmp_hash;; + bool reverse_sort = false; + user_cmp = seq_cmp_id; static struct option long_options[] = { - {"id", no_argument, NULL, 'I'}, - {"seq", no_argument, NULL, 'S'}, - {"random", no_argument, NULL, 'R'}, - {"help", no_argument, NULL, 'h'}, - {"version", no_argument, NULL, 'V'}, + {"buffer-size", required_argument, NULL, 'S'}, + {"reverse", no_argument, NULL, 'r'}, + {"id", no_argument, NULL, 'i'}, + {"seq", no_argument, NULL, 's'}, + {"random", no_argument, NULL, 'R'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, {0, 0, 0, 0} }; while (true) { - opt = getopt_long(argc, argv, "hV", long_options, &opt_idx); + opt = getopt_long(argc, argv, "S:risRhV", long_options, &opt_idx); if (opt == -1) break; switch (opt) { - case 'I': - cmp = seq_cmp_id; + case 'S': + buffer_size = parse_size(optarg); break; - case 'S': - cmp = seq_cmp_seq; + case 'r': + reverse_sort = true; + break; + + case 'i': + user_cmp = seq_cmp_id; + break; + + case 's': + user_cmp = seq_cmp_seq; break; case 'R': - cmp = seq_cmp_hash; + user_cmp = seq_cmp_hash; break; case 'h': @@ -396,6 +432,8 @@ int main(int argc, char* argv[]) } } + cmp = reverse_sort ? rev_cmp : user_cmp; + seq_array_t* a = seq_array_create(buffer_size); seq_dumps_t* d = seq_dumps_create(); seq_t* seq = seq_create();