2 * This file is part of fastq-tools.
4 * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
7 * Sample reads with or without replacement from a FASTQ file.
20 /* A collection of filenames of sorted chunks of fastq. */
21 typedef struct seq_dumps_t_
29 seq_dumps_t* seq_dumps_create()
31 seq_dumps_t* d = malloc_or_die(sizeof(seq_dumps_t));
34 d->fns = malloc_or_die(d->size * sizeof(char*));
39 void seq_dumps_free(seq_dumps_t* d)
42 for (i = 0; i < d->n; ++i) {
43 if (unlink(d->fns[i]) == -1) {
44 fprintf(stderr, "Warning: unable to remove temporary file %s.\n",
54 static inline size_t heap_parent(size_t i) { return (i - 1) / 2; }
55 static inline size_t heap_left (size_t i) { return 2 * i + 1; }
56 static inline size_t heap_right (size_t i) { return 2 * i + 2; }
59 /* Enqueue an item in the heap.
62 * heap: A binary heap in which stores indexs into seqs.
63 * n: Fixed maximum size of the heap.
64 * m: Current size of the heap.
66 * idx: Index to enqueue.
67 * cmp: Comparison function.
70 void heap_push(size_t* heap, size_t n, size_t* m, seq_t** seqs,
71 int (*cmp)(const void*, const void*), size_t idx)
74 size_t tmp, j, i = (*m)++;
80 if (cmp(seqs[heap[i]], seqs[heap[j]]) < 0) {
91 /* Dequeue an item from a heap. */
92 size_t heap_pop(size_t* heap, size_t* m, seq_t** seqs,
93 int (*cmp)(const void*, const void*))
97 heap[0] = heap[--(*m)];
98 size_t tmp, l, r, j, i = 0;
103 if (l >= (*m)) break;
109 j = cmp(seqs[heap[l]], seqs[heap[r]]) < 0 ? l : r;
112 if (cmp(seqs[heap[i]], seqs[heap[j]]) > 0) {
125 /* n-way merge sort to stdout */
126 void merge_sort(const seq_dumps_t* d, int (*cmp)(const void*, const void*))
128 FILE** files = malloc_or_die(d->n * sizeof(FILE*));
130 for (i = 0; i < d->n; ++i) {
131 files[i] = fopen(d->fns[i], "rb");
132 if (files[i] == NULL) {
133 fprintf(stderr, "Cannot open temporary file %s for reading.\n",
139 fastq_t** fs = malloc_or_die(d->n * sizeof(fastq_t*));
140 seq_t** seqs = malloc_or_die(d->n * sizeof(seq_t*));
141 for (i = 0; i < d->n; ++i) {
142 fs[i] = fastq_create(files[i]);
143 seqs[i] = seq_create();
146 /* A binary heap of indexes to fs. We use this to repeatedly pop the
147 * smallest fastq entry. */
148 size_t* heap = malloc_or_die(d->n * sizeof(size_t));
153 for (i = 0; i < d->n; ++i) {
154 if (fastq_read(fs[i], seqs[i])) {
155 heap_push(heap, d->n, &m, seqs, cmp, i);
160 i = heap_pop(heap, &m, seqs, cmp);
161 fastq_print(stdout, seqs[i]);
162 if (fastq_read(fs[i], seqs[i])) {
163 heap_push(heap, d->n, &m, seqs, cmp, i);
167 for (i = 0; i < d->n; ++i) {
178 typedef struct seq_array_t_
182 /* Number of seq objects. */
185 /* Size reserved in seqs. */
188 /* Space reserved for strings. */
194 /* Total data size. */
199 seq_array_t* seq_array_create(size_t data_size)
201 seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
204 a->seqs = malloc_or_die(a->size * sizeof(seq_t));
206 a->data_size = data_size;
208 a->data = malloc_or_die(data_size);
214 void seq_array_free(seq_array_t* a)
222 /* Push a fastq entry to back of the array. Return false if there was not enough
224 bool seq_array_push(seq_array_t* a, const seq_t* seq)
226 size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
227 (seq->id2.n + 1) + (seq->qual.n + 1);
229 if (size_needed > a->data_size - a->data_used) return false;
231 if (a->n == a->size) {
233 a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
236 memcpy(&a->data[a->data_used], seq->id1.s, seq->id1.n + 1);
237 a->seqs[a->n].id1.s = &a->data[a->data_used];
238 a->seqs[a->n].id1.n = seq->id1.n + 1;
239 a->data_used += seq->id1.n + 1;
241 memcpy(&a->data[a->data_used], seq->seq.s, seq->seq.n + 1);
242 a->seqs[a->n].seq.s = &a->data[a->data_used];
243 a->seqs[a->n].seq.n = seq->seq.n + 1;
244 a->data_used += seq->seq.n + 1;
246 memcpy(&a->data[a->data_used], seq->id2.s, seq->id2.n + 1);
247 a->seqs[a->n].id2.s = &a->data[a->data_used];
248 a->seqs[a->n].id2.n = seq->id2.n + 1;
249 a->data_used += seq->id2.n + 1;
251 memcpy(&a->data[a->data_used], seq->qual.s, seq->qual.n + 1);
252 a->seqs[a->n].qual.s = &a->data[a->data_used];
253 a->seqs[a->n].qual.n = seq->qual.n + 1;
254 a->data_used += seq->qual.n + 1;
262 void seq_array_clear(seq_array_t* a)
269 void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
271 qsort(a->seqs, a->n, sizeof(seq_t), cmp);
275 int seq_cmp_hash(const void* a, const void* b)
277 uint32_t ha = seq_hash((seq_t*) a);
278 uint32_t hb = seq_hash((seq_t*) b);
280 if (ha < hb) return -1;
281 else if (ha > hb) return 1;
287 int seq_cmp_id(const void* a, const void* b)
289 return strcmp(((seq_t*) a)->id1.s, ((seq_t*) b)->id1.s);
293 int seq_cmp_seq(const void* a, const void* b)
295 return strcmp(((seq_t*) a)->seq.s, ((seq_t*) b)->seq.s);
299 void seq_array_dump(seq_dumps_t* d, const seq_array_t* a)
301 const char* template = "/tmp/fastq_sort.XXXXXXXX";
302 char* fn = malloc_or_die(strlen(template) + 1);
303 memcpy(fn, template, strlen(template) + 1);
304 if (mktemp(fn) == NULL) {
305 fprintf(stderr, "Unable to create a temporary file.\n");
309 FILE* f = fopen(fn, "wb");
311 fprintf(stderr, "Unable to open temporary file %s for writing.\n", fn);
316 for (i = 0; i < a->n; ++i) {
317 fastq_print(f, &a->seqs[i]);
320 if (d->n == d->size) {
322 d->fns = realloc_or_die(d->fns, d->size * sizeof(char*));
330 static const char* prog_name = "fastq-sort";
336 "fastq-sort [OPTION]... [FILE]...\n"
337 "Concatenate and sort FASTQ files and write to standard output.\n"
339 " -I, --id sort alphabetically by read identifier\n"
340 " -S, --seq sort alphabetically by sequence\n"
341 " -R, --random randomly shuffle the sequences\n"
342 " -G, --gc sort by GC content\n" /* TODO */
343 " -M, --median-qual sort by median quality score\n" /* TODO */
344 " -h, --help print this message\n"
345 " -V, --version output version information and exit\n"
350 int main(int argc, char* argv[])
353 size_t buffer_size = 100000000;
354 int (*cmp)(const void*, const void*) = seq_cmp_hash;;
356 static struct option long_options[] =
358 {"id", no_argument, NULL, 'I'},
359 {"seq", no_argument, NULL, 'S'},
360 {"random", no_argument, NULL, 'R'},
361 {"help", no_argument, NULL, 'h'},
362 {"version", no_argument, NULL, 'V'},
367 opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
368 if (opt == -1) break;
388 print_version(stdout, prog_name);
399 seq_array_t* a = seq_array_create(buffer_size);
400 seq_dumps_t* d = seq_dumps_create();
401 seq_t* seq = seq_create();
404 if (optind >= argc) {
405 f = fastq_create(stdin);
406 while (fastq_read(f, seq)) {
407 if (!seq_array_push(a, seq)) {
408 seq_array_sort(a, cmp);
409 seq_array_dump(d, a);
411 if (!seq_array_push(a, seq)) {
412 fprintf(stderr, "The buffer size is to small.\n");
421 for (; optind < argc; ++optind) {
422 file = fopen(argv[optind], "rb");
424 fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
427 f = fastq_create(file);
429 while (fastq_read(f, seq)) {
430 if (!seq_array_push(a, seq)) {
431 seq_array_sort(a, cmp);
432 seq_array_dump(d, a);
434 if (!seq_array_push(a, seq)) {
435 fprintf(stderr, "The buffer size is to small.\n");
447 seq_array_sort(a, cmp);
449 /* We were able to sort everything in memory. */
452 for (i = 0; i < a->n; ++i) {
453 fastq_print(stdout, &a->seqs[i]);
457 seq_array_dump(d, a);