2 * This file is part of fastq-tools.
4 * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
7 * Sort fastq files efficiently.
22 /* User comparison function. */
23 static int (*user_cmp)(const void*, const void*);
24 static int (*cmp)(const void*, const void*);
26 int rev_cmp(const void* a, const void* b)
28 return -user_cmp(a, b);
32 /* A collection of filenames of sorted chunks of fastq. */
33 typedef struct seq_dumps_t_
41 seq_dumps_t* seq_dumps_create()
43 seq_dumps_t* d = malloc_or_die(sizeof(seq_dumps_t));
46 d->fns = malloc_or_die(d->size * sizeof(char*));
51 void seq_dumps_free(seq_dumps_t* d)
54 for (i = 0; i < d->n; ++i) {
55 if (unlink(d->fns[i]) == -1) {
56 fprintf(stderr, "Warning: unable to remove temporary file %s.\n",
66 static inline size_t heap_parent(size_t i) { return (i - 1) / 2; }
67 static inline size_t heap_left (size_t i) { return 2 * i + 1; }
68 static inline size_t heap_right (size_t i) { return 2 * i + 2; }
71 /* Enqueue an item in the heap.
74 * heap: A binary heap in which stores indexs into seqs.
75 * n: Fixed maximum size of the heap.
76 * m: Current size of the heap.
78 * idx: Index to enqueue.
79 * cmp: Comparison function.
82 void heap_push(size_t* heap, size_t n, size_t* m, seq_t** seqs,
83 int (*cmp)(const void*, const void*), size_t idx)
86 size_t tmp, j, i = (*m)++;
92 if (cmp(seqs[heap[i]], seqs[heap[j]]) < 0) {
103 /* Dequeue an item from a heap. */
104 size_t heap_pop(size_t* heap, size_t* m, seq_t** seqs,
105 int (*cmp)(const void*, const void*))
108 size_t ans = heap[0];
109 heap[0] = heap[--(*m)];
110 size_t tmp, l, r, j, i = 0;
115 if (l >= (*m)) break;
121 j = cmp(seqs[heap[l]], seqs[heap[r]]) < 0 ? l : r;
124 if (cmp(seqs[heap[i]], seqs[heap[j]]) > 0) {
137 /* n-way merge sort to stdout */
138 void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
140 FILE** files = malloc_or_die(d->n * sizeof(FILE*));
142 for (i = 0; i < d->n; ++i) {
143 files[i] = fopen(d->fns[i], "rb");
144 if (files[i] == NULL) {
145 fprintf(stderr, "Cannot open temporary file %s for reading.\n",
151 fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
152 seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
153 for (i = 0; i < d->n; ++i) {
154 fs[i] = fastq_create(files[i]);
155 seqs[i] = seq_create();
158 /* A binary heap of indexes to fs. We use this to repeatedly pop the
159 * smallest fastq entry. */
160 size_t* heap = malloc_or_die(d->n * sizeof(size_t));
165 for (i = 0; i < d->n; ++i) {
166 if (fastq_read(fs[i], seqs[i])) {
167 heap_push(heap, d->n, &m, seqs, cmp, i);
172 i = heap_pop(heap, &m, seqs, cmp);
173 fastq_print(stdout, seqs[i]);
174 if (fastq_read(fs[i], seqs[i])) {
175 heap_push(heap, d->n, &m, seqs, cmp, i);
179 for (i = 0; i < d->n; ++i) {
190 typedef struct seq_array_t_
194 /* Number of seq objects. */
197 /* Size reserved in seqs. */
200 /* Space reserved for strings. */
206 /* Total data size. */
211 seq_array_t* seq_array_create(size_t data_size)
213 seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
216 a->seqs = malloc_or_die(a->size * sizeof(seq_t));
218 a->data_size = data_size;
220 a->data = malloc_or_die(data_size);
226 void seq_array_free(seq_array_t* a)
234 /* Push a fastq entry to back of the array. Return false if there was not enough
236 bool seq_array_push(seq_array_t* a, const seq_t* seq)
238 size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
239 (seq->id2.n + 1) + (seq->qual.n + 1);
241 if (size_needed > a->data_size - a->data_used) return false;
243 if (a->n == a->size) {
245 a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
248 memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
249 a->seqs[a->n].id1.s = &a->data[a->data_used];
250 a->seqs[a->n].id1.n = seq->id1.n;
251 a->data_used += seq->id1.n + 1;
253 memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
254 a->seqs[a->n].seq.s = &a->data[a->data_used];
255 a->seqs[a->n].seq.n = seq->seq.n;
256 a->data_used += seq->seq.n + 1;
258 memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
259 a->seqs[a->n].id2.s = &a->data[a->data_used];
260 a->seqs[a->n].id2.n = seq->id2.n;
261 a->data_used += seq->id2.n + 1;
263 memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
264 a->seqs[a->n].qual.s = &a->data[a->data_used];
265 a->seqs[a->n].qual.n = seq->qual.n;
266 a->data_used += seq->qual.n + 1;
274 void seq_array_clear(seq_array_t* a)
281 void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
283 qsort(a->seqs, a->n, sizeof(seq_t), cmp);
287 int seq_cmp_hash(const void* a, const void* b)
289 uint32_t ha = seq_hash((seq_t*) a);
290 uint32_t hb = seq_hash((seq_t*) b);
292 if (ha < hb) return -1;
293 else if (ha > hb) return 1;
299 int seq_cmp_id(const void* a, const void* b)
301 return strcmp(((seq_t*) a)->id1.s, ((seq_t*) b)->id1.s);
305 int seq_cmp_seq(const void* a, const void* b)
307 return strcmp(((seq_t*) a)->seq.s, ((seq_t*) b)->seq.s);
311 static float seq_gc(const seq_t* s)
313 if (s->seq.n == 0) return 0.0;
317 for (i = 0; i < s->seq.n; ++i) {
318 if (s->seq.s[i] == 'g' || s->seq.s[i] == 'G' ||
319 s->seq.s[i] == 'c' || s->seq.s[i] == 'C') {
324 return gc / (float) s->seq.n;
328 int seq_cmp_gc(const void* a, const void* b)
330 float gc_a = seq_gc((seq_t*) a);
331 float gc_b = seq_gc((seq_t*) b);
333 if (gc_a < gc_b) return -1;
334 else if (gc_a > gc_b) return 1;
339 static float seq_mean_qual(const seq_t* s)
341 if (s->qual.n == 0) return 0.0;
345 for (i = 0; i < s->qual.n; ++i) {
346 q += (float) s->qual.s[i];
348 return q / (float) s->qual.n;
352 int seq_cmp_mean_qual(const void* a, const void* b)
354 float mq_a = seq_mean_qual((seq_t*) a);
355 float mq_b = seq_mean_qual((seq_t*) b);
357 if (mq_a < mq_b) return -1;
358 else if (mq_a > mq_b) return 1;
363 void seq_array_dump(seq_dumps_t* d, const seq_array_t* a)
365 const char* template = "/tmp/fastq_sort.XXXXXXXX";
366 char* fn = malloc_or_die(strlen(template) + 1);
367 memcpy(fn, template, strlen(template) + 1);
368 if (mktemp(fn) == NULL) {
369 fprintf(stderr, "Unable to create a temporary file.\n");
373 FILE* f = fopen(fn, "wb");
375 fprintf(stderr, "Unable to open temporary file %s for writing.\n", fn);
380 for (i = 0; i < a->n; ++i) {
381 fastq_print(f, &a->seqs[i]);
384 if (d->n == d->size) {
386 d->fns = realloc_or_die(d->fns, d->size * sizeof(char*));
394 static const char* prog_name = "fastq-sort";
400 "fastq-sort [OPTION]... [FILE]...\n"
401 "Concatenate and sort FASTQ files and write to standard output.\n"
403 " -r, --reverse sort in reverse (i.e., descending) order\n"
404 " -I, --id sort alphabetically by read identifier\n"
405 " -S, --seq sort alphabetically by sequence\n"
406 " -R, --random randomly shuffle the sequences\n"
407 " --seed[=SEED] seed to use for random shuffle.\n"
408 " -G, --gc sort by GC content\n"
409 " -M, --mean-qual sort by median quality score\n"
410 " -h, --help print this message\n"
411 " -V, --version output version information and exit\n"
416 /* Parse a size specification, which is just a number with a K, M, G suffix. */
417 size_t parse_size(const char* str)
420 unsigned long size = strtoul(str, &endptr, 10);
422 if (toupper(*endptr) == 'K') size *= 1000;
423 else if (toupper(*endptr) == 'M') size *= 1000000;
424 else if (toupper(*endptr) == 'G') size *= 1000000000;
430 int main(int argc, char* argv[])
433 size_t buffer_size = 100000000;
434 bool reverse_sort = false;
435 user_cmp = seq_cmp_id;
437 static struct option long_options[] =
439 {"buffer-size", required_argument, NULL, 'S'},
440 {"reverse", no_argument, NULL, 'r'},
441 {"id", no_argument, NULL, 'i'},
442 {"seq", no_argument, NULL, 's'},
443 {"random", no_argument, NULL, 'R'},
444 {"seed", optional_argument, NULL, 0},
445 {"gc", no_argument, NULL, 'G'},
446 {"mean-qual", no_argument, NULL, 'M'},
447 {"help", no_argument, NULL, 'h'},
448 {"version", no_argument, NULL, 'V'},
453 opt = getopt_long(argc, argv, "S:risRGhV", long_options, &opt_idx);
454 if (opt == -1) break;
458 buffer_size = parse_size(optarg);
466 user_cmp = seq_cmp_id;
470 user_cmp = seq_cmp_seq;
474 user_cmp = seq_cmp_hash;
478 user_cmp = seq_cmp_gc;
482 user_cmp = seq_cmp_mean_qual;
490 print_version(stdout, prog_name);
494 if (strcmp(long_options[opt_idx].name, "seed") == 0) {
497 seed = strtoul(optarg, NULL, 10);
500 seed = (uint32_t) time(NULL);
502 seq_hash_set_seed(seed);
514 cmp = reverse_sort ? rev_cmp : user_cmp;
516 seq_array_t* a = seq_array_create(buffer_size);
517 seq_dumps_t* d = seq_dumps_create();
518 seq_t* seq = seq_create();
521 if (optind >= argc) {
522 f = fastq_create(stdin);
523 while (fastq_read(f, seq)) {
524 if (!seq_array_push(a, seq)) {
525 seq_array_sort(a, cmp);
526 seq_array_dump(d, a);
528 if (!seq_array_push(a, seq)) {
529 fprintf(stderr, "The buffer size is to small.\n");
538 for (; optind < argc; ++optind) {
539 file = fopen(argv[optind], "rb");
541 fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
544 f = fastq_create(file);
546 while (fastq_read(f, seq)) {
547 if (!seq_array_push(a, seq)) {
548 seq_array_sort(a, cmp);
549 seq_array_dump(d, a);
551 if (!seq_array_push(a, seq)) {
552 fprintf(stderr, "The buffer size is to small.\n");
564 seq_array_sort(a, cmp);
566 /* We were able to sort everything in memory. */
569 for (i = 0; i < a->n; ++i) {
570 fastq_print(stdout, &a->seqs[i]);
574 seq_array_dump(d, a);