3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Sample reads with or without replacement from a FASTQ file.
21 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
24 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
26 # define SET_BINARY_MODE(file)
29 static const char* prog_name = "fastq-sample";
31 static int replacement_flag;
37 "fastq-sample [OPTION]... FILE [FILE2]\n"
38 "Sample random reads from a FASTQ file."
40 " -n N the number of reads to sample (default: 10000)\n"
41 " -p N the proportion of the total reads to sample\n"
42 " -o, --output=PREFIX output file prefix\n"
43 " -c, --complement-output=PREFIX\n"
44 " output reads not included in the random sample to\n"
45 " a file (or files) with the given prefix (by default,\n"
46 " they are not output).\n"
47 " -r, --with-replacement sample with relpacement\n"
48 " -s, --seed=SEED a manual seed to the random number generator\n"
49 " -h, --help print this message\n"
50 " -V, --version output version information and exit\n"
55 /* count the number of entries in a fastq file */
56 unsigned long count_entries(fastq_t* fqf)
58 seq_t* seq = fastq_alloc_seq();
60 while (fastq_next(fqf, seq)) ++n;
67 /* compare two unsigned integers (for qsort) */
68 int cmpul( const void* a, const void* b ) {
69 if( *(unsigned long*) a < *(unsigned long*) b ) return -1;
70 else if( *(unsigned long*) a > *(unsigned long*) b ) return 1;
75 /* randomly shuffle an array of unsigned longs */
76 void shuffle(rng_t* rng, unsigned long* xs, unsigned long n)
78 unsigned long i, j, k;
79 for (i = n - 1; i > 0; --i) {
80 j = fastq_rng_uniform_int(rng, i + 1);
81 k = xs[j]; xs[j] = xs[i]; xs[i] = k; // swap
86 unsigned long* index_with_replacement(rng_t* rng, unsigned long n, unsigned long k)
88 unsigned long* xs = malloc_or_die(k * sizeof(unsigned long));
90 for (i = 0; i < k; ++i) xs[i] = fastq_rng_uniform_int(rng, n);
95 unsigned long* index_without_replacement(rng_t* rng, unsigned long n)
97 unsigned long* xs = malloc_or_die(n * sizeof(unsigned long));
99 for (i = 0; i < n; ++i) xs[i] = i;
105 void fastq_sample(unsigned long rng_seed,
106 const char* prefix, const char* cprefix,
107 FILE* file1, FILE* file2, unsigned long k, double p)
110 * The basic idea is this:
112 * 1. Count the number of lines in the file, n.
114 * 2a. If sampling with replacement, generate k random integers in [0, n-1].
116 * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
117 * shuffle with fisher-yates, then consider the first k.
119 * 3. Sort the integer list.
121 * 3. Read through the file again, when the number at the front of the integer
122 * list matches the index of the fastq etry, print the entry, and pop the
129 fastq_t* f1 = fastq_open(file1);
130 fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2);
132 n = count_entries(f1);
134 n2 = count_entries(f2);
136 fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2);
142 if (f2 != NULL) fastq_rewind(f2);
145 k = (unsigned long) (p * (double) n);
146 if (!replacement_flag && k > n) k = n;
149 rng_t* rng = fastq_rng_alloc();
150 fastq_rng_seed(rng, rng_seed);
153 if (replacement_flag) xs = index_with_replacement(rng, n, k);
154 else xs = index_without_replacement(rng, n);
156 qsort(xs, k, sizeof(unsigned long), cmpul);
166 output_len = strlen(prefix) + 7;
167 output_name = malloc_or_die((output_len + 1) * sizeof(char));
169 snprintf(output_name, output_len, "%s.fastq", prefix);
170 fout1 = fopen(output_name, "wb");
172 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
181 output_len = strlen(prefix) + 9;
182 output_name = malloc_or_die((output_len + 1) * sizeof(char));
184 snprintf(output_name, output_len, "%s.1.fastq", prefix);
185 fout1 = fopen(output_name, "wb");
187 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
191 snprintf(output_name, output_len, "%s.2.fastq", prefix);
192 fout2 = fopen(output_name, "wb");
194 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
202 /* open complement output */
206 if (cprefix != NULL && file2 == NULL) {
207 output_len = strlen(cprefix) + 7;
208 output_name = malloc_or_die((output_len + 1) * sizeof(char));
210 snprintf(output_name, output_len, "%s.fastq", cprefix);
211 cfout1 = fopen(output_name, "wb");
212 if (cfout1 == NULL) {
213 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
221 else if (cprefix != NULL) {
222 output_len = strlen(cprefix) + 9;
223 output_name = malloc_or_die((output_len + 1) * sizeof(char));
225 snprintf(output_name, output_len, "%s.1.fastq", cprefix);
226 cfout1 = fopen(output_name, "wb");
227 if (cfout1 == NULL) {
228 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
232 snprintf(output_name, output_len, "%s.2.fastq", cprefix);
233 cfout2 = fopen(output_name, "wb");
234 if (cfout1 == NULL) {
235 fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
244 unsigned long i = 0; // read number
245 unsigned long j = 0; // index into xs
248 seq_t* seq1 = fastq_alloc_seq();
249 seq_t* seq2 = fastq_alloc_seq();
251 while (j < k && fastq_next(f1, seq1)) {
253 ret = fastq_next(f2, seq2);
255 fputs("Input files have differing numbers of entries.\n", stderr);
261 while (j < k && xs[j] == i) {
262 fastq_print(fout1, seq1);
263 if (f2 != NULL) fastq_print(fout2, seq2);
267 else if (cfout1 != NULL) {
268 fastq_print(cfout1, seq1);
269 if (f2 != NULL) fastq_print(cfout2, seq2);
275 fastq_free_seq(seq1);
276 fastq_free_seq(seq2);
278 if (f2 != NULL) fastq_close(f2);
281 if (fout2 != NULL) fclose(fout2);
283 if (cfout1 != NULL) fclose(cfout1);
284 if (cfout2 != NULL) fclose(cfout2);
291 int main(int argc, char* argv[])
293 SET_BINARY_MODE(stdin);
294 SET_BINARY_MODE(stdout);
299 const char* prefix = NULL;
300 const char* cprefix = NULL;
301 unsigned long rng_seed = 4357;
302 unsigned long k = 10000; // number of reads to sample
303 double p = -1; // proportion of reads to sample
306 static struct option long_options[] =
308 {"with-replacement", no_argument, NULL, 'r'},
309 {"complement-output", required_argument, NULL, 'c'},
310 {"seed", required_argument, NULL, 's'},
311 {"output", no_argument, NULL, 'o'},
312 {"help", no_argument, NULL, 'h'},
313 {"version", no_argument, NULL, 'V'},
317 replacement_flag = 0;
320 opt = getopt_long(argc, argv, "n:p:o:rs:hV", long_options, &opt_idx);
322 if( opt == -1 ) break;
326 if (long_options[opt_idx].flag != 0) break;
332 k = strtoul(optarg, NULL, 10);
338 fputs("Sample proportion ('-p') is less than zero.\n", stderr);
344 replacement_flag = 1;
348 rng_seed = strtoul(optarg, NULL, 10);
364 print_version(stdout, prog_name);
378 char* prefix_alloc = NULL;
380 if (optind >= argc) {
381 fputs("An input file must be given.\n", stderr);
386 file1 = fopen(argv[optind], "rb");
388 fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
392 if (prefix == NULL) {
393 /* guess at a reasonable output refix by trimming the
394 * trailing file extension, if any. */
398 tmp = strrchr(argv[optind], '/');
399 if (tmp != NULL) argv[optind] = tmp + 1;
401 /* exclude file suffixes */
402 tmp = strchr(argv[optind], '.');
403 if (tmp == NULL) prefix = argv[optind];
405 prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char));
406 memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char));
407 prefix_alloc[tmp - argv[optind]] = '\0';
408 prefix = prefix_alloc;
414 file2 = fopen(argv[optind], "rb");
416 fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
422 fastq_sample(rng_seed, prefix, cprefix, file1, file2, k, p);