2 * This file is part of fastq-tools.
4 * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
7 * Sample reads with or without replacement from a FASTQ file.
21 /* User comparison function. */
22 static int (*user_cmp)(const void*, const void*);
23 static int (*cmp)(const void*, const void*);
25 int rev_cmp(const void* a, const void* b)
27 return -user_cmp(a, b);
31 /* A collection of filenames of sorted chunks of fastq. */
32 typedef struct seq_dumps_t_
40 seq_dumps_t* seq_dumps_create()
42 seq_dumps_t* d = malloc_or_die(sizeof(seq_dumps_t));
45 d->fns = malloc_or_die(d->size * sizeof(char*));
50 void seq_dumps_free(seq_dumps_t* d)
53 for (i = 0; i < d->n; ++i) {
54 if (unlink(d->fns[i]) == -1) {
55 fprintf(stderr, "Warning: unable to remove temporary file %s.\n",
65 static inline size_t heap_parent(size_t i) { return (i - 1) / 2; }
66 static inline size_t heap_left (size_t i) { return 2 * i + 1; }
67 static inline size_t heap_right (size_t i) { return 2 * i + 2; }
70 /* Enqueue an item in the heap.
73 * heap: A binary heap in which stores indexs into seqs.
74 * n: Fixed maximum size of the heap.
75 * m: Current size of the heap.
77 * idx: Index to enqueue.
78 * cmp: Comparison function.
81 void heap_push(size_t* heap, size_t n, size_t* m, seq_t** seqs,
82 int (*cmp)(const void*, const void*), size_t idx)
85 size_t tmp, j, i = (*m)++;
91 if (cmp(seqs[heap[i]], seqs[heap[j]]) < 0) {
102 /* Dequeue an item from a heap. */
103 size_t heap_pop(size_t* heap, size_t* m, seq_t** seqs,
104 int (*cmp)(const void*, const void*))
107 size_t ans = heap[0];
108 heap[0] = heap[--(*m)];
109 size_t tmp, l, r, j, i = 0;
114 if (l >= (*m)) break;
120 j = cmp(seqs[heap[l]], seqs[heap[r]]) < 0 ? l : r;
123 if (cmp(seqs[heap[i]], seqs[heap[j]]) > 0) {
136 /* n-way merge sort to stdout */
137 void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
139 FILE** files = malloc_or_die(d->n * sizeof(FILE*));
141 for (i = 0; i < d->n; ++i) {
142 files[i] = fopen(d->fns[i], "rb");
143 if (files[i] == NULL) {
144 fprintf(stderr, "Cannot open temporary file %s for reading.\n",
150 fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
151 seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
152 for (i = 0; i < d->n; ++i) {
153 fs[i] = fastq_create(files[i]);
154 seqs[i] = seq_create();
157 /* A binary heap of indexes to fs. We use this to repeatedly pop the
158 * smallest fastq entry. */
159 size_t* heap = malloc_or_die(d->n * sizeof(size_t));
164 for (i = 0; i < d->n; ++i) {
165 if (fastq_read(fs[i], seqs[i])) {
166 heap_push(heap, d->n, &m, seqs, cmp, i);
171 i = heap_pop(heap, &m, seqs, cmp);
172 fastq_print(stdout, seqs[i]);
173 if (fastq_read(fs[i], seqs[i])) {
174 heap_push(heap, d->n, &m, seqs, cmp, i);
178 for (i = 0; i < d->n; ++i) {
189 typedef struct seq_array_t_
193 /* Number of seq objects. */
196 /* Size reserved in seqs. */
199 /* Space reserved for strings. */
205 /* Total data size. */
210 seq_array_t* seq_array_create(size_t data_size)
212 seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
215 a->seqs = malloc_or_die(a->size * sizeof(seq_t));
217 a->data_size = data_size;
219 a->data = malloc_or_die(data_size);
225 void seq_array_free(seq_array_t* a)
233 /* Push a fastq entry to back of the array. Return false if there was not enough
235 bool seq_array_push(seq_array_t* a, const seq_t* seq)
237 size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
238 (seq->id2.n + 1) + (seq->qual.n + 1);
240 if (size_needed > a->data_size - a->data_used) return false;
242 if (a->n == a->size) {
244 a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
247 memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
248 a->seqs[a->n].id1.s = &a->data[a->data_used];
249 a->seqs[a->n].id1.n = seq->id1.n + 1;
250 a->data_used += seq->id1.n + 1;
252 memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
253 a->seqs[a->n].seq.s = &a->data[a->data_used];
254 a->seqs[a->n].seq.n = seq->seq.n + 1;
255 a->data_used += seq->seq.n + 1;
257 memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
258 a->seqs[a->n].id2.s = &a->data[a->data_used];
259 a->seqs[a->n].id2.n = seq->id2.n + 1;
260 a->data_used += seq->id2.n + 1;
262 memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
263 a->seqs[a->n].qual.s = &a->data[a->data_used];
264 a->seqs[a->n].qual.n = seq->qual.n + 1;
265 a->data_used += seq->qual.n + 1;
273 void seq_array_clear(seq_array_t* a)
280 void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
282 qsort(a->seqs, a->n, sizeof(seq_t), cmp);
286 int seq_cmp_hash(const void* a, const void* b)
288 uint32_t ha = seq_hash((seq_t*) a);
289 uint32_t hb = seq_hash((seq_t*) b);
291 if (ha < hb) return -1;
292 else if (ha > hb) return 1;
298 int seq_cmp_id(const void* a, const void* b)
300 return strcmp(((seq_t*) a)->id1.s, ((seq_t*) b)->id1.s);
304 int seq_cmp_seq(const void* a, const void* b)
306 return strcmp(((seq_t*) a)->seq.s, ((seq_t*) b)->seq.s);
310 void seq_array_dump(seq_dumps_t* d, const seq_array_t* a)
312 const char* template = "/tmp/fastq_sort.XXXXXXXX";
313 char* fn = malloc_or_die(strlen(template) + 1);
314 memcpy(fn, template, strlen(template) + 1);
315 if (mktemp(fn) == NULL) {
316 fprintf(stderr, "Unable to create a temporary file.\n");
320 FILE* f = fopen(fn, "wb");
322 fprintf(stderr, "Unable to open temporary file %s for writing.\n", fn);
327 for (i = 0; i < a->n; ++i) {
328 fastq_print(f, &a->seqs[i]);
331 if (d->n == d->size) {
333 d->fns = realloc_or_die(d->fns, d->size * sizeof(char*));
341 static const char* prog_name = "fastq-sort";
347 "fastq-sort [OPTION]... [FILE]...\n"
348 "Concatenate and sort FASTQ files and write to standard output.\n"
350 " -I, --id sort alphabetically by read identifier\n"
351 " -S, --seq sort alphabetically by sequence\n"
352 " -R, --random randomly shuffle the sequences\n"
353 " -G, --gc sort by GC content\n" /* TODO */
354 " -M, --median-qual sort by median quality score\n" /* TODO */
355 " -h, --help print this message\n"
356 " -V, --version output version information and exit\n"
361 /* Parse a size specification, which is just a number with a K, M, G suffix. */
362 size_t parse_size(const char* str)
365 unsigned long size = strtoul(str, &endptr, 10);
367 if (toupper(*endptr) == 'K') size *= 1000;
368 else if (toupper(*endptr) == 'M') size *= 1000000;
369 else if (toupper(*endptr) == 'G') size *= 1000000000;
375 int main(int argc, char* argv[])
378 size_t buffer_size = 100000000;
379 bool reverse_sort = false;
380 user_cmp = seq_cmp_id;
382 static struct option long_options[] =
384 {"buffer-size", required_argument, NULL, 'S'},
385 {"reverse", no_argument, NULL, 'r'},
386 {"id", no_argument, NULL, 'i'},
387 {"seq", no_argument, NULL, 's'},
388 {"random", no_argument, NULL, 'R'},
389 {"help", no_argument, NULL, 'h'},
390 {"version", no_argument, NULL, 'V'},
395 opt = getopt_long(argc, argv, "S:risRhV", long_options, &opt_idx);
396 if (opt == -1) break;
400 buffer_size = parse_size(optarg);
408 user_cmp = seq_cmp_id;
412 user_cmp = seq_cmp_seq;
416 user_cmp = seq_cmp_hash;
424 print_version(stdout, prog_name);
435 cmp = reverse_sort ? rev_cmp : user_cmp;
437 seq_array_t* a = seq_array_create(buffer_size);
438 seq_dumps_t* d = seq_dumps_create();
439 seq_t* seq = seq_create();
442 if (optind >= argc) {
443 f = fastq_create(stdin);
444 while (fastq_read(f, seq)) {
445 if (!seq_array_push(a, seq)) {
446 seq_array_sort(a, cmp);
447 seq_array_dump(d, a);
449 if (!seq_array_push(a, seq)) {
450 fprintf(stderr, "The buffer size is to small.\n");
459 for (; optind < argc; ++optind) {
460 file = fopen(argv[optind], "rb");
462 fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
465 f = fastq_create(file);
467 while (fastq_read(f, seq)) {
468 if (!seq_array_push(a, seq)) {
469 seq_array_sort(a, cmp);
470 seq_array_dump(d, a);
472 if (!seq_array_push(a, seq)) {
473 fprintf(stderr, "The buffer size is to small.\n");
485 seq_array_sort(a, cmp);
487 /* We were able to sort everything in memory. */
490 for (i = 0; i < a->n; ++i) {
491 fastq_print(stdout, &a->seqs[i]);
495 seq_array_dump(d, a);