/*
 * This file is part of fastq-tools.
 *
 * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
 *
 * fastq-sort:
 * Concatenate and sort the reads from one or more FASTQ files.
 */
19 typedef struct seq_array_t_
23 /* Number of seq objects. */
26 /* Size reserved in seqs. */
29 /* Space reserved for strings. */
35 /* Total data size. */
40 seq_array_t* seq_array_create(size_t data_size)
42 seq_array_t* a = malloc_or_die(sizeof(seq_array_t));
45 a->seqs = malloc_or_die(sizeof(seq_t));
47 a->data_size = data_size;
49 a->data = malloc_or_die(data_size);
55 void seq_array_free(seq_array_t* a)
63 /* Push a fastq entry to back of the array. Return false if there was not enough
65 bool seq_array_push(seq_array_t* a, const seq_t* seq)
67 size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) +
68 (seq->id2.n + 1) + (seq->qual.n + 1);
70 if (size_needed > a->data_size - a->data_used) return false;
72 if (a->n == a->size) {
74 a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t));
77 memcpy(seq->id1.s, &a->data[a->data_used], seq->id1.n + 1);
78 a->seqs[a->n].id1.s = &a->data[a->data_used];
79 a->seqs[a->n].id1.n = seq->id1.n + 1;
80 a->data_used += seq->id1.n + 1;
82 memcpy(seq->seq.s, &a->data[a->data_used], seq->seq.n + 1);
83 a->seqs[a->n].seq.s = &a->data[a->data_used];
84 a->seqs[a->n].seq.n = seq->seq.n + 1;
85 a->data_used += seq->seq.n + 1;
87 memcpy(seq->id2.s, &a->data[a->data_used], seq->id2.n + 1);
88 a->seqs[a->n].id2.s = &a->data[a->data_used];
89 a->seqs[a->n].id2.n = seq->id2.n + 1;
90 a->data_used += seq->id2.n + 1;
92 memcpy(seq->qual.s, &a->data[a->data_used], seq->qual.n + 1);
93 a->seqs[a->n].qual.s = &a->data[a->data_used];
94 a->seqs[a->n].qual.n = seq->qual.n + 1;
95 a->data_used += seq->qual.n + 1;
103 void seq_array_clear(seq_array_t* a)
110 void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*))
112 qsort(a->seqs, a->n, sizeof(seq_t), cmp);
116 int seq_cmp_hash(const void* a_, const void* b_)
118 const seq_t* a = (seq_t*) a_;
119 const seq_t* b = (seq_t*) b_;
120 /* TODO: hash and compare. */
/* Program name, passed to print_version when reporting the version. */
static const char* prog_name = "fastq-sort";
131 "fastq-sort [OPTION]... [FILE]...\n"
132 "Concatenate and sort FASTQ files and write to standard output.\n"
134 " -h, --help print this message\n"
135 " -V, --version output version information and exit\n"
141 int main(int argc, char* argv[])
144 size_t buffer_size = 100000000;
145 int (*cmp)(const void*, const void*) = seq_cmp_hash;;
147 static struct option long_options[] =
149 {"help", no_argument, NULL, 'h'},
150 {"version", no_argument, NULL, 'V'},
155 opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
156 if (opt == -1) break;
164 print_version(stdout, prog_name);
175 seq_array_t* a = seq_array_create(buffer_size);
176 seq_t* seq = seq_create();
179 if (optind >= argc) {
180 f = fastq_create(stdin);
181 while (fastq_read(f, seq)) {
182 if (!seq_array_push(a, seq)) {
183 seq_array_sort(a, cmp);
185 /* TODO: dump a to a temporary file. Push that file name to an
190 if (seq_array_push(a, seq)) {
191 fprintf(stderr, "The buffer size is to small.\n");
200 for (; optind < argc; ++optind) {
201 file = fopen(argv[optind], "rb");
203 fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]);
206 f = fastq_create(file);
208 while (fastq_read(f, seq)) {
209 if (!seq_array_push(a, seq)) {
210 seq_array_sort(a, cmp);
212 /* TODO: dump a to a temporary file. Push that file name to
213 * an array somewhere. */
216 if (seq_array_push(a, seq)) {
217 fprintf(stderr, "The buffer size is to small.\n");
229 seq_array_sort(a, cmp);
231 /* TODO: special case if everything fit into a. Just dump it to output.
234 /* TODO: dump to a temp file, push file name to the stack. */
237 /* TODO: Merge sort on the external files. */