2 * This file is part of fastq-tools.
4 * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
7 * Sample reads with or without replacement from a FASTQ file.
20 /* User comparison function. */
21 static int (*user_cmp)(const void*, const void*);
22 static int (*cmp)(const void*, const void*);
24 int rev_cmp(const void* a, const void* b)
26 return -user_cmp(a, b);
30 /* A collection of filenames of sorted chunks of fastq. */
31 typedef struct seq_dumps_t_
39 seq_dumps_t* seq_dumps_create()
41 seq_dumps_t* d = malloc_or_die(sizeof(seq_dumps_t));
44 d->fns = malloc_or_die(d->size * sizeof(char*));
49 void seq_dumps_free(seq_dumps_t* d)
52 for (i = 0; i < d->n; ++i) {
53 if (unlink(d->fns[i]) == -1) {
54 fprintf(stderr, "Warning: unable to remove temporary file %s.\n",
64 static inline size_t heap_parent(size_t i) { return (i - 1) / 2; }
65 static inline size_t heap_left (size_t i) { return 2 * i + 1; }
66 static inline size_t heap_right (size_t i) { return 2 * i + 2; }
69 /* Enqueue an item in the heap.
72 * heap: A binary heap in which stores indexs into seqs.
73 * n: Fixed maximum size of the heap.
74 * m: Current size of the heap.
76 * idx: Index to enqueue.
77 * cmp: Comparison function.
80 void heap_push(size_t* heap, size_t n, size_t* m, seq_t** seqs,
81 int (*cmp)(const void*, const void*), size_t idx)
84 size_t tmp, j, i = (*m)++;
90 if (cmp(seqs[heap[i]], seqs[heap[j]]) < 0) {
101 /* Dequeue an item from a heap. */
102 size_t heap_pop(size_t* heap, size_t* m, seq_t** seqs,
103 int (*cmp)(const void*, const void*))
106 size_t ans = heap[0];
107 heap[0] = heap[--(*m)];
108 size_t tmp, l, r, j, i = 0;
113 if (l >= (*m)) break;
119 j = cmp(seqs[heap[l]], seqs[heap[r]]) < 0 ? l : r;
122 if (cmp(seqs[heap[i]], seqs[heap[j]]) > 0) {
135 /* n-way merge sort to stdout */
136 void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
138 FILE** files = malloc_or_die(d->n * sizeof(FILE*));
140 for (i = 0; i < d->n; ++i) {
141 files[i] = fopen(d->fns[i], "rb");
142 if (files[i] == NULL) {
143 fprintf(stderr, "Cannot open temporary file %s for reading.\n",
149 fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
150 seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
151 for (i = 0; i < d->n; ++i) {
152 fs[i] = fastq_create(files[i]);
153 seqs[i] = seq_create();
156 /* A binary heap of indexes to fs. We use this to repeatedly pop the
157 * smallest fastq entry. */
158 size_t* heap = malloc_or_die(d->n * sizeof(size_t));
163 for (i = 0; i < d->n; ++i) {
164 if (fastq_read(fs[i], seqs[i])) {
165 heap_push(heap, d->n, &m, seqs, cmp, i);
170 i = heap_pop(heap, &m, seqs, cmp);
171 fastq_print(stdout, seqs[i]);
172 if (fastq_read(fs[i], seqs[i])) {
173 heap_push(heap, d->n, &m, seqs, cmp, i);
177 for (i = 0; i < d->n; ++i) {
188 typedef struct seq_array_t_
192 /* Number of seq objects. */
195 /* Size reserved in seqs. */
198 /* Space reserved for strings. */
204 /* Total data size. */
209 seq_array_t* seq_array_create(size_t data_size)
211 seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
214 a->seqs = malloc_or_die(a->size * sizeof(seq_t));
216 a->data_size = data_size;
218 a->data = malloc_or_die(data_size);
224 void seq_array_free(seq_array_t* a)
232 /* Push a fastq entry to back of the array. Return false if there was not enough
234 bool seq_array_push(seq_array_t* a, const seq_t* seq)
236 size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
237 (seq->id2.n + 1) + (seq->qual.n + 1);
239 if (size_needed > a->data_size - a->data_used) return false;
241 if (a->n == a->size) {
243 a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
246 memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
247 a->seqs[a->n].id1.s = &a->data[a->data_used];
248 a->seqs[a->n].id1.n = seq->id1.n + 1;
249 a->data_used += seq->id1.n + 1;
251 memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
252 a->seqs[a->n].seq.s = &a->data[a->data_used];
253 a->seqs[a->n].seq.n = seq->seq.n + 1;
254 a->data_used += seq->seq.n + 1;
256 memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
257 a->seqs[a->n].id2.s = &a->data[a->data_used];
258 a->seqs[a->n].id2.n = seq->id2.n + 1;
259 a->data_used += seq->id2.n + 1;
261 memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
262 a->seqs[a->n].qual.s = &a->data[a->data_used];
263 a->seqs[a->n].qual.n = seq->qual.n + 1;
264 a->data_used += seq->qual.n + 1;
272 void seq_array_clear(seq_array_t* a)
279 void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
281 qsort(a->seqs, a->n, sizeof(seq_t), cmp);
285 int seq_cmp_hash(const void* a, const void* b)
287 uint32_t ha = seq_hash((seq_t*) a);
288 uint32_t hb = seq_hash((seq_t*) b);
290 if (ha < hb) return -1;
291 else if (ha > hb) return 1;
297 int seq_cmp_id(const void* a, const void* b)
299 return strcmp(((seq_t*) a)->id1.s, ((seq_t*) b)->id1.s);
303 int seq_cmp_seq(const void* a, const void* b)
305 return strcmp(((seq_t*) a)->seq.s, ((seq_t*) b)->seq.s);
309 void seq_array_dump(seq_dumps_t* d, const seq_array_t* a)
311 const char* template = "/tmp/fastq_sort.XXXXXXXX";
312 char* fn = malloc_or_die(strlen(template) + 1);
313 memcpy(fn, template, strlen(template) + 1);
314 if (mktemp(fn) == NULL) {
315 fprintf(stderr, "Unable to create a temporary file.\n");
319 FILE* f = fopen(fn, "wb");
321 fprintf(stderr, "Unable to open temporary file %s for writing.\n", fn);
326 for (i = 0; i < a->n; ++i) {
327 fastq_print(f, &a->seqs[i]);
330 if (d->n == d->size) {
332 d->fns = realloc_or_die(d->fns, d->size * sizeof(char*));
340 static const char* prog_name = "fastq-sort";
346 "fastq-sort [OPTION]... [FILE]...\n"
347 "Concatenate and sort FASTQ files and write to standard output.\n"
349 " -I, --id sort alphabetically by read identifier\n"
350 " -S, --seq sort alphabetically by sequence\n"
351 " -R, --random randomly shuffle the sequences\n"
352 " -G, --gc sort by GC content\n" /* TODO */
353 " -M, --median-qual sort by median quality score\n" /* TODO */
354 " -h, --help print this message\n"
355 " -V, --version output version information and exit\n"
360 int main(int argc, char* argv[])
363 size_t buffer_size = 100000000;
364 bool reverse_sort = false;
365 user_cmp = seq_cmp_id;
367 static struct option long_options[] =
369 {"reverse", no_argument, NULL, 'r'},
370 {"id", no_argument, NULL, 'I'},
371 {"seq", no_argument, NULL, 'S'},
372 {"random", no_argument, NULL, 'R'},
373 {"help", no_argument, NULL, 'h'},
374 {"version", no_argument, NULL, 'V'},
379 opt = getopt_long(argc, argv, "rISRhV", long_options, &opt_idx);
380 if (opt == -1) break;
388 user_cmp = seq_cmp_id;
392 user_cmp = seq_cmp_seq;
396 user_cmp = seq_cmp_hash;
404 print_version(stdout, prog_name);
415 cmp = reverse_sort ? rev_cmp : user_cmp;
417 seq_array_t* a = seq_array_create(buffer_size);
418 seq_dumps_t* d = seq_dumps_create();
419 seq_t* seq = seq_create();
422 if (optind >= argc) {
423 f = fastq_create(stdin);
424 while (fastq_read(f, seq)) {
425 if (!seq_array_push(a, seq)) {
426 seq_array_sort(a, cmp);
427 seq_array_dump(d, a);
429 if (!seq_array_push(a, seq)) {
430 fprintf(stderr, "The buffer size is to small.\n");
439 for (; optind < argc; ++optind) {
440 file = fopen(argv[optind], "rb");
442 fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
445 f = fastq_create(file);
447 while (fastq_read(f, seq)) {
448 if (!seq_array_push(a, seq)) {
449 seq_array_sort(a, cmp);
450 seq_array_dump(d, a);
452 if (!seq_array_push(a, seq)) {
453 fprintf(stderr, "The buffer size is to small.\n");
465 seq_array_sort(a, cmp);
467 /* We were able to sort everything in memory. */
470 for (i = 0; i < a->n; ++i) {
471 fastq_print(stdout, &a->seqs[i]);
475 seq_array_dump(d, a);