-AC_INIT( [fastq-tools], [0.2], [dcjones@cs.washington.ed] )
+AC_INIT( [fastq-tools], [0.3], [dcjones@cs.washington.ed] )
AM_INIT_AUTOMAKE( [foreign -Wall -Werror] )
m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
-man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1
+dist_man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 fastq-sample.1
--- /dev/null
+.TH FASTQ-SAMPLE 1
+
+.SH NAME
+fastq-sample - sample random reads from a fastq file
+
+.SH SYNOPSIS
+.B fastq-sample [OPTION]... [[FILE] [FILE2]]
+
+.SH DESCRIPTION
+Given a FASTQ file, random reads are sampled and output, with or without
+replacement, according to the '-r' option. The number of reads to sample can be
+specifed with the '-n' option, or in terms of the proportion of total reads
+using '-p' option.
+
+If two files are given, the input is treated as paired-end, and matching pairs
+are sampled and output into seperate files: [prefix].1.fastq and
+[prefix].2.fastq, where [prefix] is set with the '-o' option.
+
+.SH OPTIONS
+.TP
+\fB\-n N\fR
+The number of reads to sample and output
+.TP
+\fB\-p N\fR
+The number of reads to sample in terms of the proportion of total reads. If
+sampling with replacement, this number may be greater than 1.0.
+.TP
+\fB\-o\fR, \fB\-\-output=PREFIX\fR
+The filename prefix to which output should be written. If single-end data is
+being sampled, the output file is [PREFIX].fastq, and with paired-end,
+[PREFIX].1.fastq and [PREFIX].2.fastq.
+.TP
+\fB\-r\fR, \fB\-\-with\-replacement\fR
+Sample with replacement. (By default, sampling is done without replacement.)
+.TP
+\fB\-h\fR, \fB\-\-help\fR
+Output a help message and exit.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Output version information and exit.
+
+
+.SH AUTHOR
+Written by Daniel C. Jones <dcjones@cs.washington.edu>
+
-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample
fastq_common_src=common.h common.c
fastq_parse_src=parse.h parse.c
fastq_sw_src=sw.h sw.c
fastq_hash_src=hash.h hash.c
+fastq_rng_src=rng.h rng.c
fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
fastq_grep_LDADD = $(PCRE_LIBS) -lz
fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
fastq_uniq_LDADD = -lz
-fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) $(fastq_qual_src)
+fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src)
fastq_qual_LDADD = -lz
+fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src)
+fastq_sample_LDADD = -lz
+
--- /dev/null
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-sample :
+ * Sample reads with or without replacement from a FASTQ file.
+ *
+ */
+
+#include "common.h"
+#include "parse.h"
+#include "rng.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <getopt.h>
+#include <zlib.h>
+
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+# include <fcntl.h>
+# include <io.h>
+# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+# define SET_BINARY_MODE(file)
+#endif
+
+static const char* prog_name = "fastq-sample";
+
+static int replacement_flag;
+
+
+void print_help()
+{
+ fprintf(stdout,
+"fastq-sample [OPTION]... [FILE [FILE2]]\n"
+"Sample random reads from a FASTQ file."
+"Options:\n"
+" -n N the number of reads to sample (default: 10000)\n"
+" -p N the proportion of the total reads to sample\n"
+" -o, --output=PREFIX output file prefix\n"
+" -r, --with-replacement sample with relpacement\n"
+" -h, --help print this message\n"
+" -V, --version output version information and exit\n"
+ );
+}
+
+
+/* count the number of entries in a fastq file */
+unsigned long count_entries(fastq_t* fqf)
+{
+ seq_t* seq = fastq_alloc_seq();
+ unsigned long n = 0;
+ while (fastq_next(fqf, seq)) ++n;
+ fastq_free_seq(seq);
+
+ return n;
+}
+
+
+/* compare two unsigned integers (for qsort) */
+int cmpul( const void* a, const void* b ) {
+ if( *(unsigned long*) a < *(unsigned long*) b ) return -1;
+ else if( *(unsigned long*) a > *(unsigned long*) b ) return 1;
+ else return 0;
+}
+
+
+/* randomly shuffle an array of unsigned longs */
+void shuffle(rng_t* rng, unsigned long* xs, unsigned long n)
+{
+ unsigned long i, j, k;
+ for (i = n - 1; i > 0; --i) {
+ j = fastq_rng_uniform_int(rng, i + 1);
+ k = xs[j]; xs[j] = xs[i]; xs[i] = k; // swap
+ }
+}
+
+
+unsigned long* index_with_replacement(rng_t* rng, unsigned long n, unsigned long k)
+{
+ unsigned long* xs = malloc_or_die(k * sizeof(unsigned long));
+ size_t i;
+ for (i = 0; i < k; ++i) xs[i] = fastq_rng_uniform_int(rng, n);
+ return xs;
+}
+
+
+unsigned long* index_without_replacement(rng_t* rng, unsigned long n)
+{
+ unsigned long* xs = malloc_or_die(n * sizeof(unsigned long));
+ size_t i;
+ for (i = 0; i < n; ++i) xs[i] = i;
+ shuffle(rng, xs, n);
+ return xs;
+}
+
+
+void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, double p)
+{
+ /*
+ * The basic idea is this:
+ *
+ * 1. Count the number of lines in the file, n.
+ *
+ * 2a. If sampling with replacement, generate k random integers in [0, n-1].
+ *
+ * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
+ * shuffle with fisher-yates, then consider the first k.
+ *
+ * 3. Sort the integer list.
+ *
+ * 3. Read through the file again, when the number at the front of the integer
+ * list matches the index of the fastq etry, print the entry, and pop the
+ * number.
+ */
+
+
+ unsigned long n, n2;
+
+ fastq_t* f1 = fastq_open(file1);
+ fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2);
+
+ n = count_entries(f1);
+ if (f2 != NULL) {
+ n2 = count_entries(f2);
+ if (n != n2) {
+ fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2);
+ exit(1);
+ }
+ }
+
+ fastq_rewind(f1);
+ if (f2 != NULL) fastq_rewind(f2);
+
+ if (p > 0.0) {
+ k = (unsigned long) (p * (double) n);
+ if (!replacement_flag && k > n) k = n;
+ }
+
+ rng_t* rng = fastq_rng_alloc();
+
+ unsigned long* xs;
+ if (replacement_flag) xs = index_with_replacement(rng, n, k);
+ else xs = index_without_replacement(rng, n);
+
+ qsort(xs, k, sizeof(unsigned long), cmpul);
+
+
+ /* open output */
+ FILE* fout1;
+ FILE* fout2;
+
+ char* output_name;
+ size_t output_len;
+ if (file2 == NULL) {
+ output_len = strlen(prefix) + 7;
+ output_name = malloc_or_die((output_len + 1) * sizeof(char));
+
+ snprintf(output_name, output_len, "%s.fastq", prefix);
+ fout1 = fopen(output_name, "wb");
+ if (fout1 == NULL) {
+ fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+ exit(1);
+ }
+
+ fout2 = NULL;
+
+ free(output_name);
+ }
+ else {
+ output_len = strlen(prefix) + 9;
+ output_name = malloc_or_die((output_len + 1) * sizeof(char));
+
+ snprintf(output_name, output_len, "%s.1.fastq", prefix);
+ fout1 = fopen(output_name, "wb");
+ if (fout1 == NULL) {
+ fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+ exit(1);
+ }
+
+ snprintf(output_name, output_len, "%s.2.fastq", prefix);
+ fout2 = fopen(output_name, "wb");
+ if (fout1 == NULL) {
+ fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+ exit(1);
+ }
+
+ free(output_name);
+ }
+
+
+
+ unsigned long i = 0; // read number
+ unsigned long j = 0; // index into xs
+
+ int ret;
+ seq_t* seq1 = fastq_alloc_seq();
+ seq_t* seq2 = fastq_alloc_seq();
+
+ while (j < k && fastq_next(f1, seq1)) {
+ if (f2 != NULL){
+ ret = fastq_next(f2, seq2);
+ if (ret == 0) {
+ fputs("Input files have differing numbers of entries.\n", stderr);
+ exit(1);
+ }
+ }
+
+ while (j < k && xs[j] == i) {
+ fastq_print(fout1, seq1);
+ if (f2 != NULL) fastq_print(fout2, seq2);
+ ++j;
+ }
+
+ ++i;
+ }
+
+ fastq_free_seq(seq1);
+ fastq_free_seq(seq2);
+ fastq_close(f1);
+ if (f2 != NULL) fastq_close(f2);
+
+ fclose(fout1);
+ if (fout2 != NULL) fclose(fout2);
+
+ fastq_rng_free(rng);
+ free(xs);
+}
+
+
+int main(int argc, char* argv[])
+{
+ SET_BINARY_MODE(stdin);
+ SET_BINARY_MODE(stdout);
+
+ int opt;
+ int opt_idx;
+
+ const char* prefix = NULL;
+ unsigned long k = 10000; // number of reads to sample
+ double p = -1; // proportion of reads to sample
+
+
+ static struct option long_options[] =
+ {
+ {"with-replacement", no_argument, NULL, 'r'},
+ {"output", no_argument, NULL, 'o'},
+ {"help", no_argument, NULL, 'h'},
+ {"version", no_argument, NULL, 'V'},
+ {0, 0, 0, 0}
+ };
+
+ replacement_flag = 0;
+
+ while (1) {
+ opt = getopt_long(argc, argv, "n:p:o:rhV", long_options, &opt_idx);
+
+ if( opt == -1 ) break;
+
+ switch (opt) {
+ case 0:
+ if (long_options[opt_idx].flag != 0) break;
+ if (optarg) {
+ }
+ break;
+
+ case 'n':
+ k = strtoul(optarg, NULL, 10);
+ break;
+
+ case 'p':
+ p = atof(optarg);
+ if (p < 0.0) {
+ fputs("Sample proportion ('-p') is less than zero.\n", stderr);
+ return 1;
+ }
+ break;
+
+ case 'r':
+ replacement_flag = 1;
+ break;
+
+ case 'o':
+ prefix = optarg;
+ break;
+
+ case 'h':
+ print_help();
+ return 0;
+
+ case 'V':
+ print_version(stdout, prog_name);
+ return 0;
+
+ case '?':
+ return 1;
+
+ default:
+ abort();
+ }
+ }
+
+ FILE* file1 = NULL;
+ FILE* file2 = NULL;
+
+ char* prefix_alloc = NULL;
+
+ if (optind >= argc) {
+ file1 = stdin;
+ }
+ else {
+ if (strcmp(argv[optind], "-") == 0) {
+ file1 = stdin;
+ if (prefix == NULL) prefix = "stdin.sample";
+ }
+ else {
+ file1 = fopen(argv[optind], "rb");
+ if (file1 == NULL) {
+ fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+ return 1;
+ }
+
+ if (prefix == NULL) {
+ /* guess at a reasonable output refix by trimming the
+ * trailing file extension, if any. */
+ char* tmp;
+
+ /* base name */
+ tmp = strrchr(argv[optind], '/');
+ if (tmp != NULL) argv[optind] = tmp + 1;
+
+ /* exclude file suffixes */
+ tmp = strchr(argv[optind], '.');
+ if (tmp == NULL) prefix = argv[optind];
+ else {
+ prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char));
+ memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char));
+ prefix_alloc[tmp - argv[optind]] = '\0';
+ prefix = prefix_alloc;
+ }
+ }
+ }
+ ++optind;
+
+ if (optind < argc) {
+ if (strcmp(argv[optind], "-") == 0) {
+ if (file1 == stdin) {
+ fprintf(stderr, "Both input files cannot be standard input.\n");
+ return 1;
+ }
+ file2 = stdin;
+ }
+ else {
+ file2 = fopen(argv[optind], "rb");
+ if (file2 == NULL) {
+ fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+ return 1;
+ }
+ }
+ }
+ }
+
+ fastq_sample(prefix, file1, file2, k, p);
+
+ free(prefix_alloc);
+ return 0;
+}
+
+
+
+
+
+
+
+
}
+void fastq_rewind(fastq_t* fqf)
+{
+ gzrewind(fqf->file);
+ fqf->state = STATE_ID1;
+ fqf->buf[0] = '\0';
+ fqf->c = fqf->buf;
+}
+
+
void fastq_print(FILE* fout, seq_t* seq)
{
/* FASTQ */
fastq_t* fastq_open(FILE*);
void fastq_close(fastq_t*);
int fastq_next(fastq_t*, seq_t*);
+void fastq_rewind(fastq_t*);
void fastq_print(FILE* fout, seq_t* seq);
--- /dev/null
+
+/* This code taken from the GNU Scientific Library and adapted for use in
+ * fastq-tools. It is subject to the following licence.
+ */
+
+/* This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 3 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details. You should have received
+ a copy of the GNU General Public License along with this program;
+ if not, write to the Free Foundation, Inc., 59 Temple Place, Suite
+ 330, Boston, MA 02111-1307 USA
+
+ Original implementation was copyright (C) 1997 Makoto Matsumoto and
+ Takuji Nishimura. Coded by Takuji Nishimura, considering the
+ suggestions by Topher Cooper and Marc Rieffel in July-Aug. 1997, "A
+ C-program for MT19937: Integer version (1998/4/6)"
+
+ This implementation copyright (C) 1998 Brian Gough. I reorganized
+ the code to use the module framework of GSL. The license on this
+ implementation was changed from LGPL to GPL, following paragraph 3
+ of the LGPL, version 2.
+
+ Update:
+
+ The seeding procedure has been updated to match the 10/99 release
+ of MT19937.
+
+ Update:
+
+ The seeding procedure has been updated again to match the 2002
+ release of MT19937
+
+ The original code included the comment: "When you use this, send an
+ email to: matumoto@math.keio.ac.jp with an appropriate reference to
+ your work".
+
+ Makoto Matsumoto has a web page with more information about the
+ generator, http://www.math.keio.ac.jp/~matumoto/emt.html.
+
+ The paper below has details of the algorithm.
+
+ From: Makoto Matsumoto and Takuji Nishimura, "Mersenne Twister: A
+ 623-dimensionally equidistributerd uniform pseudorandom number
+ generator". ACM Transactions on Modeling and Computer Simulation,
+ Vol. 8, No. 1 (Jan. 1998), Pages 3-30
+
+ You can obtain the paper directly from Makoto Matsumoto's web page.
+
+ The period of this generator is 2^{19937} - 1.
+
+*/
+
+#include "rng.h"
+#include "common.h"
+#include <stdlib.h>
+#include <assert.h>
+
+
+/* default seed */
+unsigned long int default_seed = 4357;
+
+
+#define N 624 /* Period parameters */
+#define M 397
+
+/* most significant w-r bits */
+static const unsigned long UPPER_MASK = 0x80000000UL;
+
+/* least significant r bits */
+static const unsigned long LOWER_MASK = 0x7fffffffUL;
+
+
+static const unsigned long RNG_MIN = 0;
+static const unsigned long RNG_MAX = 0xffffffffUL;
+
+
+struct rng_t_
+{
+ unsigned long mt[N];
+ int mti;
+};
+
+
+static inline unsigned long mt_get(rng_t* state)
+{
+ unsigned long k ;
+ unsigned long int *const mt = state->mt;
+
+#define MAGIC(y) (((y)&0x1) ? 0x9908b0dfUL : 0)
+
+ if (state->mti >= N)
+ { /* generate N words at one time */
+ int kk;
+
+ for (kk = 0; kk < N - M; kk++)
+ {
+ unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+ mt[kk] = mt[kk + M] ^ (y >> 1) ^ MAGIC(y);
+ }
+ for (; kk < N - 1; kk++)
+ {
+ unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+ mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ MAGIC(y);
+ }
+
+ {
+ unsigned long y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+ mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ MAGIC(y);
+ }
+
+ state->mti = 0;
+ }
+
+ /* Tempering */
+
+ k = mt[state->mti];
+ k ^= (k >> 11);
+ k ^= (k << 7) & 0x9d2c5680UL;
+ k ^= (k << 15) & 0xefc60000UL;
+ k ^= (k >> 18);
+
+ state->mti++;
+
+ return k;
+}
+
+double mt_get_double(rng_t* state)
+{
+ return mt_get(state) / 4294967296.0;
+}
+
+void mt_set(rng_t* state, unsigned long int s)
+{
+ int i;
+
+ if (s == 0)
+ s = 4357; /* the default seed is 4357 */
+
+ state->mt[0]= s & 0xffffffffUL;
+
+ for (i = 1; i < N; i++)
+ {
+ /* See Knuth's "Art of Computer Programming" Vol. 2, 3rd
+ Ed. p.106 for multiplier. */
+
+ state->mt[i] =
+ (1812433253UL * (state->mt[i-1] ^ (state->mt[i-1] >> 30)) + i);
+
+ state->mt[i] &= 0xffffffffUL;
+ }
+
+ state->mti = i;
+}
+
+#if 0 // these two seeding procedures are not used
+static void mt_1999_set(rng_t* state, unsigned long int s)
+{
+ int i;
+
+ if (s == 0)
+ s = 4357; /* the default seed is 4357 */
+
+ /* This is the October 1999 version of the seeding procedure. It
+ was updated by the original developers to avoid the periodicity
+ in the simple congruence originally used.
+
+ Note that an ANSI-C unsigned long integer arithmetic is
+ automatically modulo 2^32 (or a higher power of two), so we can
+ safely ignore overflow. */
+
+#define LCG(x) ((69069 * x) + 1) &0xffffffffUL
+
+ for (i = 0; i < N; i++)
+ {
+ state->mt[i] = s & 0xffff0000UL;
+ s = LCG(s);
+ state->mt[i] |= (s &0xffff0000UL) >> 16;
+ s = LCG(s);
+ }
+
+ state->mti = i;
+}
+
+/* This is the original version of the seeding procedure, no longer
+ used but available for compatibility with the original MT19937. */
+
+static void mt_1998_set(rng_t* state, unsigned long int s)
+{
+ int i;
+
+ if (s == 0)
+ s = 4357; /* the default seed is 4357 */
+
+ state->mt[0] = s & 0xffffffffUL;
+
+#define LCG1998(n) ((69069 * n) & 0xffffffffUL)
+
+ for (i = 1; i < N; i++)
+ state->mt[i] = LCG1998 (state->mt[i - 1]);
+
+ state->mti = i;
+}
+#endif
+
+rng_t* fastq_rng_alloc()
+{
+ rng_t* rng = malloc_or_die(sizeof(rng_t));
+ mt_set(rng, default_seed);
+ return rng;
+}
+
+
+void fastq_rng_free(rng_t* rng)
+{
+ free(rng);
+}
+
+void fastq_rng_seed(rng_t* rng, unsigned long seed)
+{
+ mt_set(rng, seed);
+}
+
+unsigned long fastq_rng_uniform_int(rng_t* rng, unsigned long k)
+{
+ unsigned long scale = (RNG_MAX - RNG_MIN) / k;
+ assert(scale > 0);
+ unsigned long r;
+
+ do {
+ r = (mt_get(rng) - RNG_MIN) / scale;
+ } while (r >= k);
+
+ return r;
+}
+
--- /dev/null
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * rng :
+ * Robust pseudo-random number generation.
+ *
+ */
+
+#ifndef FASTQ_TOOLS_RNG_H
+#define FASTQ_TOOLS_RNG_H
+
+typedef struct rng_t_ rng_t;
+
+rng_t* fastq_rng_alloc();
+void fastq_rng_free(rng_t*);
+void fastq_rng_seed(rng_t*, unsigned long seed);
+
+/* Uniform integer in [0, k-1] */
+unsigned long fastq_rng_uniform_int(rng_t*, unsigned long k);
+
+#endif
+