*
* Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
*
- * fastq-sample :
- * Sample reads with or without replacement from a FASTQ file.
+ * fastq-sort:
+ * Sort fastq files efficiently.
*
*/
#include <assert.h>
+#include <ctype.h>
#include <getopt.h>
#include <string.h>
+#include <time.h>
#include <unistd.h>
#include "common.h"
memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
a->seqs[a->n].id1.s = &a->data[a->data_used];
- a->seqs[a->n].id1.n = seq->id1.n + 1;
+ a->seqs[a->n].id1.n = seq->id1.n;
a->data_used += seq->id1.n + 1;
memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
a->seqs[a->n].seq.s = &a->data[a->data_used];
- a->seqs[a->n].seq.n = seq->seq.n + 1;
+ a->seqs[a->n].seq.n = seq->seq.n;
a->data_used += seq->seq.n + 1;
memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
a->seqs[a->n].id2.s = &a->data[a->data_used];
- a->seqs[a->n].id2.n = seq->id2.n + 1;
+ a->seqs[a->n].id2.n = seq->id2.n;
a->data_used += seq->id2.n + 1;
memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
a->seqs[a->n].qual.s = &a->data[a->data_used];
- a->seqs[a->n].qual.n = seq->qual.n + 1;
+ a->seqs[a->n].qual.n = seq->qual.n;
a->data_used += seq->qual.n + 1;
++a->n;
}
+static float seq_gc(const seq_t* s)
+{
+ if (s->seq.n == 0) return 0.0;
+
+ float gc = 0;
+ size_t i;
+ for (i = 0; i < s->seq.n; ++i) {
+ if (s->seq.s[i] == 'g' || s->seq.s[i] == 'G' ||
+ s->seq.s[i] == 'c' || s->seq.s[i] == 'C') {
+ ++gc;
+ }
+ }
+
+ return gc / (float) s->seq.n;
+}
+
+
+int seq_cmp_gc(const void* a, const void* b)
+{
+ float gc_a = seq_gc((seq_t*) a);
+ float gc_b = seq_gc((seq_t*) b);
+
+ if (gc_a < gc_b) return -1;
+ else if (gc_a > gc_b) return 1;
+ else return 0;
+}
+
+
+static float seq_mean_qual(const seq_t* s)
+{
+ if (s->qual.n == 0) return 0.0;
+
+ float q = 0.0;
+ size_t i;
+ for (i = 0; i < s->qual.n; ++i) {
+ q += (float) s->qual.s[i];
+ }
+ return q / (float) s->qual.n;
+}
+
+
+int seq_cmp_mean_qual(const void* a, const void* b)
+{
+ float mq_a = seq_mean_qual((seq_t*) a);
+ float mq_b = seq_mean_qual((seq_t*) b);
+
+ if (mq_a < mq_b) return -1;
+ else if (mq_a > mq_b) return 1;
+ else return 0;
+}
+
+
void seq_array_dump(seq_dumps_t* d, const seq_array_t* a)
{
const char* template = "/tmp/fastq_sort.XXXXXXXX";
"fastq-sort [OPTION]... [FILE]...\n"
"Concatenate and sort FASTQ files and write to standard output.\n"
"Options:\n"
+" -r, --reverse sort in reverse (i.e., descending) order\n"
" -I, --id sort alphabetically by read identifier\n"
" -S, --seq sort alphabetically by sequence\n"
" -R, --random randomly shuffle the sequences\n"
-" -G, --gc sort by GC content\n" /* TODO */
-" -M, --median-qual sort by median quality score\n" /* TODO */
+" --seed[=SEED] seed to use for random shuffle.\n"
+" -G, --gc sort by GC content\n"
+" -M, --mean-qual sort by median quality score\n" /* TODO */
" -h, --help print this message\n"
" -V, --version output version information and exit\n"
);
}
+/* Parse a size specification, which is just a number with a K, M, G suffix. */
+size_t parse_size(const char* str)
+{
+ char* endptr;
+ unsigned long size = strtoul(str, &endptr, 10);
+
+ if (toupper(*endptr) == 'K') size *= 1000;
+ else if (toupper(*endptr) == 'M') size *= 1000000;
+ else if (toupper(*endptr) == 'G') size *= 1000000000;
+
+ return size;
+}
+
+
int main(int argc, char* argv[])
{
int opt, opt_idx;
static struct option long_options[] =
{
- {"reverse", no_argument, NULL, 'r'},
- {"id", no_argument, NULL, 'I'},
- {"seq", no_argument, NULL, 'S'},
- {"random", no_argument, NULL, 'R'},
- {"help", no_argument, NULL, 'h'},
- {"version", no_argument, NULL, 'V'},
+ {"buffer-size", required_argument, NULL, 'S'},
+ {"reverse", no_argument, NULL, 'r'},
+ {"id", no_argument, NULL, 'i'},
+ {"seq", no_argument, NULL, 's'},
+ {"random", no_argument, NULL, 'R'},
+ {"seed", optional_argument, NULL, 0},
+ {"gc", no_argument, NULL, 'G'},
+ {"mean-qual", no_argument, NULL, 'M'},
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'V'},
{0, 0, 0, 0}
};
while (true) {
- opt = getopt_long(argc, argv, "rISRhV", long_options, &opt_idx);
+ opt = getopt_long(argc, argv, "S:risRGhV", long_options, &opt_idx);
if (opt == -1) break;
switch (opt) {
+ case 'S':
+ buffer_size = parse_size(optarg);
+ break;
+
case 'r':
reverse_sort = true;
break;
- case 'I':
+ case 'i':
user_cmp = seq_cmp_id;
break;
- case 'S':
+ case 's':
user_cmp = seq_cmp_seq;
break;
user_cmp = seq_cmp_hash;
break;
+ case 'G':
+ user_cmp = seq_cmp_gc;
+ break;
+
+ case 'M':
+ user_cmp = seq_cmp_mean_qual;
+ break;
+
case 'h':
print_help();
return 0;
print_version(stdout, prog_name);
return 0;
+ case 0:
+ if (strcmp(long_options[opt_idx].name, "seed") == 0) {
+ uint32_t seed;
+ if (optarg) {
+ seed = strtoul(optarg, NULL, 10);
+ }
+ else {
+ seed = (uint32_t) time(NULL);
+ }
+ seq_hash_set_seed(seed);
+ }
+ break;
+
case '?':
return 1;