From 55b423636a17f6477e090dc670064cb8263e119c Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Tue, 25 Oct 2011 13:45:51 -0700 Subject: [PATCH] A program to randomly sample reads from a fastq file. --- configure.ac | 2 +- doc/Makefile.am | 2 +- doc/fastq-sample.1 | 45 ++++++ src/Makefile.am | 8 +- src/fastq-sample.c | 377 +++++++++++++++++++++++++++++++++++++++++++++ src/parse.c | 9 ++ src/parse.h | 1 + src/rng.c | 241 +++++++++++++++++++++++++++++ src/rng.h | 24 +++ 9 files changed, 705 insertions(+), 4 deletions(-) create mode 100644 doc/fastq-sample.1 create mode 100644 src/fastq-sample.c create mode 100644 src/rng.c create mode 100644 src/rng.h diff --git a/configure.ac b/configure.ac index 522949e..a1039a0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ -AC_INIT( [fastq-tools], [0.2], [dcjones@cs.washington.ed] ) +AC_INIT( [fastq-tools], [0.3], [dcjones@cs.washington.ed] ) AM_INIT_AUTOMAKE( [foreign -Wall -Werror] ) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) diff --git a/doc/Makefile.am b/doc/Makefile.am index da53738..8695fbd 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,4 +1,4 @@ -man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 +dist_man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 fastq-sample.1 diff --git a/doc/fastq-sample.1 b/doc/fastq-sample.1 new file mode 100644 index 0000000..53434e0 --- /dev/null +++ b/doc/fastq-sample.1 @@ -0,0 +1,45 @@ +.TH FASTQ-SAMPLE 1 + +.SH NAME +fastq-sample - sample random reads from a fastq file + +.SH SYNOPSIS +.B fastq-sample [OPTION]... [[FILE] [FILE2]] + +.SH DESCRIPTION +Given a FASTQ file, random reads are sampled and output, with or without +replacement, according to the '-r' option. The number of reads to sample can be +specifed with the '-n' option, or in terms of the proportion of total reads +using '-p' option. + +If two files are given, the input is treated as paired-end, and matching pairs +are sampled and output into seperate files: [prefix].1.fastq and +[prefix].2.fastq, where [prefix] is set with the '-o' option. + +.SH OPTIONS +.TP +\fB\-n N\fR +The number of reads to sample and output +.TP +\fB\-p N\fR +The number of reads to sample in terms of the proportion of total reads. If +sampling with replacement, this number may be greater than 1.0. +.TP +\fB\-o\fR, \fB\-\-output=PREFIX\fR +The filename prefix to which output should be written. If single-end data is +being sampled, the output file is [PREFIX].fastq, and with paired-end, +[PREFIX].1.fastq and [PREFIX].2.fastq. +.TP +\fB\-r\fR, \fB\-\-with\-replacement\fR +Sample with replacement. (By default, sampling is done without replacement.) +.TP +\fB\-h\fR, \fB\-\-help\fR +Output a help message and exit. +.TP +\fB\-V\fR, \fB\-\-version\fR +Output version information and exit. + + +.SH AUTHOR +Written by Daniel C. Jones + diff --git a/src/Makefile.am b/src/Makefile.am index 080e884..7bc48d0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,10 +1,11 @@ -bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual +bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq_common_src=common.h common.c fastq_parse_src=parse.h parse.c fastq_sw_src=sw.h sw.c fastq_hash_src=hash.h hash.c +fastq_rng_src=rng.h rng.c fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src) fastq_grep_LDADD = $(PCRE_LIBS) -lz @@ -18,6 +19,9 @@ fastq_match_LDADD = -lz fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src) fastq_uniq_LDADD = -lz -fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) $(fastq_qual_src) +fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) fastq_qual_LDADD = -lz +fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src) +fastq_sample_LDADD = -lz + diff --git a/src/fastq-sample.c b/src/fastq-sample.c new file mode 100644 index 0000000..279b6cf --- /dev/null +++ b/src/fastq-sample.c @@ -0,0 +1,377 @@ + +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * fastq-sample : + * Sample reads with or without replacement from a FASTQ file. + * + */ + +#include "common.h" +#include "parse.h" +#include "rng.h" +#include +#include +#include +#include +#include + +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) +# include +# include +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +#else +# define SET_BINARY_MODE(file) +#endif + +static const char* prog_name = "fastq-sample"; + +static int replacement_flag; + + +void print_help() +{ + fprintf(stdout, +"fastq-sample [OPTION]... [FILE [FILE2]]\n" +"Sample random reads from a FASTQ file." +"Options:\n" +" -n N the number of reads to sample (default: 10000)\n" +" -p N the proportion of the total reads to sample\n" +" -o, --output=PREFIX output file prefix\n" +" -r, --with-replacement sample with relpacement\n" +" -h, --help print this message\n" +" -V, --version output version information and exit\n" + ); +} + + +/* count the number of entries in a fastq file */ +unsigned long count_entries(fastq_t* fqf) +{ + seq_t* seq = fastq_alloc_seq(); + unsigned long n = 0; + while (fastq_next(fqf, seq)) ++n; + fastq_free_seq(seq); + + return n; +} + + +/* compare two unsigned integers (for qsort) */ +int cmpul( const void* a, const void* b ) { + if( *(unsigned long*) a < *(unsigned long*) b ) return -1; + else if( *(unsigned long*) a > *(unsigned long*) b ) return 1; + else return 0; +} + + +/* randomly shuffle an array of unsigned longs */ +void shuffle(rng_t* rng, unsigned long* xs, unsigned long n) +{ + unsigned long i, j, k; + for (i = n - 1; i > 0; --i) { + j = fastq_rng_uniform_int(rng, i + 1); + k = xs[j]; xs[j] = xs[i]; xs[i] = k; // swap + } +} + + +unsigned long* index_with_replacement(rng_t* rng, unsigned long n, unsigned long k) +{ + unsigned long* xs = malloc_or_die(k * sizeof(unsigned long)); + size_t i; + for (i = 0; i < k; ++i) xs[i] = fastq_rng_uniform_int(rng, n); + return xs; +} + + +unsigned long* index_without_replacement(rng_t* rng, unsigned long n) +{ + unsigned long* xs = malloc_or_die(n * sizeof(unsigned long)); + size_t i; + for (i = 0; i < n; ++i) xs[i] = i; + shuffle(rng, xs, n); + return xs; +} + + +void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, double p) +{ + /* + * The basic idea is this: + * + * 1. Count the number of lines in the file, n. + * + * 2a. If sampling with replacement, generate k random integers in [0, n-1]. + * + * 2b. If sampling without replacement, generate a list of integers 0..(n-1), + * shuffle with fisher-yates, then consider the first k. + * + * 3. Sort the integer list. + * + * 3. Read through the file again, when the number at the front of the integer + * list matches the index of the fastq etry, print the entry, and pop the + * number. + */ + + + unsigned long n, n2; + + fastq_t* f1 = fastq_open(file1); + fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2); + + n = count_entries(f1); + if (f2 != NULL) { + n2 = count_entries(f2); + if (n != n2) { + fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2); + exit(1); + } + } + + fastq_rewind(f1); + if (f2 != NULL) fastq_rewind(f2); + + if (p > 0.0) { + k = (unsigned long) (p * (double) n); + if (!replacement_flag && k > n) k = n; + } + + rng_t* rng = fastq_rng_alloc(); + + unsigned long* xs; + if (replacement_flag) xs = index_with_replacement(rng, n, k); + else xs = index_without_replacement(rng, n); + + qsort(xs, k, sizeof(unsigned long), cmpul); + + + /* open output */ + FILE* fout1; + FILE* fout2; + + char* output_name; + size_t output_len; + if (file2 == NULL) { + output_len = strlen(prefix) + 7; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.fastq", prefix); + fout1 = fopen(output_name, "wb"); + if (fout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + fout2 = NULL; + + free(output_name); + } + else { + output_len = strlen(prefix) + 9; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.1.fastq", prefix); + fout1 = fopen(output_name, "wb"); + if (fout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + snprintf(output_name, output_len, "%s.2.fastq", prefix); + fout2 = fopen(output_name, "wb"); + if (fout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + free(output_name); + } + + + + unsigned long i = 0; // read number + unsigned long j = 0; // index into xs + + int ret; + seq_t* seq1 = fastq_alloc_seq(); + seq_t* seq2 = fastq_alloc_seq(); + + while (j < k && fastq_next(f1, seq1)) { + if (f2 != NULL){ + ret = fastq_next(f2, seq2); + if (ret == 0) { + fputs("Input files have differing numbers of entries.\n", stderr); + exit(1); + } + } + + while (j < k && xs[j] == i) { + fastq_print(fout1, seq1); + if (f2 != NULL) fastq_print(fout2, seq2); + ++j; + } + + ++i; + } + + fastq_free_seq(seq1); + fastq_free_seq(seq2); + fastq_close(f1); + if (f2 != NULL) fastq_close(f2); + + fclose(fout1); + if (fout2 != NULL) fclose(fout2); + + fastq_rng_free(rng); + free(xs); +} + + +int main(int argc, char* argv[]) +{ + SET_BINARY_MODE(stdin); + SET_BINARY_MODE(stdout); + + int opt; + int opt_idx; + + const char* prefix = NULL; + unsigned long k = 10000; // number of reads to sample + double p = -1; // proportion of reads to sample + + + static struct option long_options[] = + { + {"with-replacement", no_argument, NULL, 'r'}, + {"output", no_argument, NULL, 'o'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {0, 0, 0, 0} + }; + + replacement_flag = 0; + + while (1) { + opt = getopt_long(argc, argv, "n:p:o:rhV", long_options, &opt_idx); + + if( opt == -1 ) break; + + switch (opt) { + case 0: + if (long_options[opt_idx].flag != 0) break; + if (optarg) { + } + break; + + case 'n': + k = strtoul(optarg, NULL, 10); + break; + + case 'p': + p = atof(optarg); + if (p < 0.0) { + fputs("Sample proportion ('-p') is less than zero.\n", stderr); + return 1; + } + break; + + case 'r': + replacement_flag = 1; + break; + + case 'o': + prefix = optarg; + break; + + case 'h': + print_help(); + return 0; + + case 'V': + print_version(stdout, prog_name); + return 0; + + case '?': + return 1; + + default: + abort(); + } + } + + FILE* file1 = NULL; + FILE* file2 = NULL; + + char* prefix_alloc = NULL; + + if (optind >= argc) { + file1 = stdin; + } + else { + if (strcmp(argv[optind], "-") == 0) { + file1 = stdin; + if (prefix == NULL) prefix = "stdin.sample"; + } + else { + file1 = fopen(argv[optind], "rb"); + if (file1 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; + } + + if (prefix == NULL) { + /* guess at a reasonable output refix by trimming the + * trailing file extension, if any. */ + char* tmp; + + /* base name */ + tmp = strrchr(argv[optind], '/'); + if (tmp != NULL) argv[optind] = tmp + 1; + + /* exclude file suffixes */ + tmp = strchr(argv[optind], '.'); + if (tmp == NULL) prefix = argv[optind]; + else { + prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); + memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); + prefix_alloc[tmp - argv[optind]] = '\0'; + prefix = prefix_alloc; + } + } + } + ++optind; + + if (optind < argc) { + if (strcmp(argv[optind], "-") == 0) { + if (file1 == stdin) { + fprintf(stderr, "Both input files cannot be standard input.\n"); + return 1; + } + file2 = stdin; + } + else { + file2 = fopen(argv[optind], "rb"); + if (file2 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; + } + } + } + } + + fastq_sample(prefix, file1, file2, k, p); + + free(prefix_alloc); + return 0; +} + + + + + + + + diff --git a/src/parse.c b/src/parse.c index 1ccc7de..61e3efa 100644 --- a/src/parse.c +++ b/src/parse.c @@ -241,6 +241,15 @@ int fastq_next(fastq_t* f, seq_t* seq) } +void fastq_rewind(fastq_t* fqf) +{ + gzrewind(fqf->file); + fqf->state = STATE_ID1; + fqf->buf[0] = '\0'; + fqf->c = fqf->buf; +} + + void fastq_print(FILE* fout, seq_t* seq) { /* FASTQ */ diff --git a/src/parse.h b/src/parse.h index d2e2612..d9bbac6 100644 --- a/src/parse.h +++ b/src/parse.h @@ -49,6 +49,7 @@ typedef struct fastq_t* fastq_open(FILE*); void fastq_close(fastq_t*); int fastq_next(fastq_t*, seq_t*); +void fastq_rewind(fastq_t*); void fastq_print(FILE* fout, seq_t* seq); diff --git a/src/rng.c b/src/rng.c new file mode 100644 index 0000000..b503699 --- /dev/null +++ b/src/rng.c @@ -0,0 +1,241 @@ + +/* This code taken from the GNU Scientific Library and adapted for use in + * fastq-tools. It is subject to the following licence. + */ + +/* This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. You should have received + a copy of the GNU General Public License along with this program; + if not, write to the Free Foundation, Inc., 59 Temple Place, Suite + 330, Boston, MA 02111-1307 USA + + Original implementation was copyright (C) 1997 Makoto Matsumoto and + Takuji Nishimura. Coded by Takuji Nishimura, considering the + suggestions by Topher Cooper and Marc Rieffel in July-Aug. 1997, "A + C-program for MT19937: Integer version (1998/4/6)" + + This implementation copyright (C) 1998 Brian Gough. I reorganized + the code to use the module framework of GSL. The license on this + implementation was changed from LGPL to GPL, following paragraph 3 + of the LGPL, version 2. + + Update: + + The seeding procedure has been updated to match the 10/99 release + of MT19937. + + Update: + + The seeding procedure has been updated again to match the 2002 + release of MT19937 + + The original code included the comment: "When you use this, send an + email to: matumoto@math.keio.ac.jp with an appropriate reference to + your work". + + Makoto Matsumoto has a web page with more information about the + generator, http://www.math.keio.ac.jp/~matumoto/emt.html. + + The paper below has details of the algorithm. + + From: Makoto Matsumoto and Takuji Nishimura, "Mersenne Twister: A + 623-dimensionally equidistributerd uniform pseudorandom number + generator". ACM Transactions on Modeling and Computer Simulation, + Vol. 8, No. 1 (Jan. 1998), Pages 3-30 + + You can obtain the paper directly from Makoto Matsumoto's web page. + + The period of this generator is 2^{19937} - 1. + +*/ + +#include "rng.h" +#include "common.h" +#include +#include + + +/* default seed */ +unsigned long int default_seed = 4357; + + +#define N 624 /* Period parameters */ +#define M 397 + +/* most significant w-r bits */ +static const unsigned long UPPER_MASK = 0x80000000UL; + +/* least significant r bits */ +static const unsigned long LOWER_MASK = 0x7fffffffUL; + + +static const unsigned long RNG_MIN = 0; +static const unsigned long RNG_MAX = 0xffffffffUL; + + +struct rng_t_ +{ + unsigned long mt[N]; + int mti; +}; + + +static inline unsigned long mt_get(rng_t* state) +{ + unsigned long k ; + unsigned long int *const mt = state->mt; + +#define MAGIC(y) (((y)&0x1) ? 0x9908b0dfUL : 0) + + if (state->mti >= N) + { /* generate N words at one time */ + int kk; + + for (kk = 0; kk < N - M; kk++) + { + unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); + mt[kk] = mt[kk + M] ^ (y >> 1) ^ MAGIC(y); + } + for (; kk < N - 1; kk++) + { + unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); + mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ MAGIC(y); + } + + { + unsigned long y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); + mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ MAGIC(y); + } + + state->mti = 0; + } + + /* Tempering */ + + k = mt[state->mti]; + k ^= (k >> 11); + k ^= (k << 7) & 0x9d2c5680UL; + k ^= (k << 15) & 0xefc60000UL; + k ^= (k >> 18); + + state->mti++; + + return k; +} + +double mt_get_double(rng_t* state) +{ + return mt_get(state) / 4294967296.0; +} + +void mt_set(rng_t* state, unsigned long int s) +{ + int i; + + if (s == 0) + s = 4357; /* the default seed is 4357 */ + + state->mt[0]= s & 0xffffffffUL; + + for (i = 1; i < N; i++) + { + /* See Knuth's "Art of Computer Programming" Vol. 2, 3rd + Ed. p.106 for multiplier. */ + + state->mt[i] = + (1812433253UL * (state->mt[i-1] ^ (state->mt[i-1] >> 30)) + i); + + state->mt[i] &= 0xffffffffUL; + } + + state->mti = i; +} + +#if 0 // these two seeding procedures are not used +static void mt_1999_set(rng_t* state, unsigned long int s) +{ + int i; + + if (s == 0) + s = 4357; /* the default seed is 4357 */ + + /* This is the October 1999 version of the seeding procedure. It + was updated by the original developers to avoid the periodicity + in the simple congruence originally used. + + Note that an ANSI-C unsigned long integer arithmetic is + automatically modulo 2^32 (or a higher power of two), so we can + safely ignore overflow. */ + +#define LCG(x) ((69069 * x) + 1) &0xffffffffUL + + for (i = 0; i < N; i++) + { + state->mt[i] = s & 0xffff0000UL; + s = LCG(s); + state->mt[i] |= (s &0xffff0000UL) >> 16; + s = LCG(s); + } + + state->mti = i; +} + +/* This is the original version of the seeding procedure, no longer + used but available for compatibility with the original MT19937. */ + +static void mt_1998_set(rng_t* state, unsigned long int s) +{ + int i; + + if (s == 0) + s = 4357; /* the default seed is 4357 */ + + state->mt[0] = s & 0xffffffffUL; + +#define LCG1998(n) ((69069 * n) & 0xffffffffUL) + + for (i = 1; i < N; i++) + state->mt[i] = LCG1998 (state->mt[i - 1]); + + state->mti = i; +} +#endif + +rng_t* fastq_rng_alloc() +{ + rng_t* rng = malloc_or_die(sizeof(rng_t)); + mt_set(rng, default_seed); + return rng; +} + + +void fastq_rng_free(rng_t* rng) +{ + free(rng); +} + +void fastq_rng_seed(rng_t* rng, unsigned long seed) +{ + mt_set(rng, seed); +} + +unsigned long fastq_rng_uniform_int(rng_t* rng, unsigned long k) +{ + unsigned long scale = (RNG_MAX - RNG_MIN) / k; + assert(scale > 0); + unsigned long r; + + do { + r = (mt_get(rng) - RNG_MIN) / scale; + } while (r >= k); + + return r; +} + diff --git a/src/rng.h b/src/rng.h new file mode 100644 index 0000000..469b6ed --- /dev/null +++ b/src/rng.h @@ -0,0 +1,24 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2011 by Daniel C. Jones + * + * rng : + * Robust pseudo-random number generation. + * + */ + +#ifndef FASTQ_TOOLS_RNG_H +#define FASTQ_TOOLS_RNG_H + +typedef struct rng_t_ rng_t; + +rng_t* fastq_rng_alloc(); +void fastq_rng_free(rng_t*); +void fastq_rng_seed(rng_t*, unsigned long seed); + +/* Uniform integer in [0, k-1] */ +unsigned long fastq_rng_uniform_int(rng_t*, unsigned long k); + +#endif + -- 2.39.2