A program to randomly sample reads from a fastq file.

author Daniel Jones <dcjones@cs.washington.edu>

Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)

committer Daniel Jones <dcjones@cs.washington.edu>

Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)
author Daniel Jones <dcjones@cs.washington.edu>
Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)
committer Daniel Jones <dcjones@cs.washington.edu>
Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)
diff --git a/configure.ac b/configure.ac

index 522949e39e50207dbb7df6d28724fd73796b6e24..a1039a075350b1d07ba4e1feb0a719cb2fe51a0f 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
  
-AC_INIT( [fastq-tools], [0.2], [dcjones@cs.washington.ed] )
+AC_INIT( [fastq-tools], [0.3], [dcjones@cs.washington.ed] )
  AM_INIT_AUTOMAKE( [foreign -Wall -Werror] )
  m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
  
diff --git a/doc/Makefile.am b/doc/Makefile.am

index da537385207e2a4eec797c243ddd1168ae1361b1..8695fbdb48daf26f3bb24b3d397beb6fe1320d2d 100644 (file)
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,4 +1,4 @@
  
-man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1
+dist_man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 fastq-sample.1
  
  
diff --git a/doc/fastq-sample.1 b/doc/fastq-sample.1

new file mode 100644 (file)

index 0000000..53434e0
--- /dev/null
+++ b/doc/fastq-sample.1
@@ -0,0 +1,45 @@
+.TH FASTQ-SAMPLE 1
+
+.SH NAME
+fastq-sample - sample random reads from a fastq file
+
+.SH SYNOPSIS
+.B fastq-sample [OPTION]... [[FILE] [FILE2]]
+
+.SH DESCRIPTION
+Given a FASTQ file, random reads are sampled and output, with or without
+replacement, according to the '-r' option. The number of reads to sample can be
+specifed with the '-n' option, or in terms of the proportion of total reads
+using '-p' option.
+
+If two files are given, the input is treated as paired-end, and matching pairs
+are sampled and output into seperate files: [prefix].1.fastq and
+[prefix].2.fastq, where [prefix] is set with the '-o' option.
+
+.SH OPTIONS
+.TP
+\fB\-n N\fR
+The number of reads to sample and output
+.TP
+\fB\-p N\fR
+The number of reads to sample in terms of the proportion of total reads. If
+sampling with replacement, this number may be greater than 1.0.
+.TP
+\fB\-o\fR, \fB\-\-output=PREFIX\fR
+The filename prefix to which output should be written. If single-end data is
+being sampled, the output file is [PREFIX].fastq, and with paired-end,
+[PREFIX].1.fastq and [PREFIX].2.fastq.
+.TP
+\fB\-r\fR, \fB\-\-with\-replacement\fR
+Sample with replacement. (By default, sampling is done without replacement.)
+.TP
+\fB\-h\fR, \fB\-\-help\fR
+Output a help message and exit.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Output version information and exit.
+
+
+.SH AUTHOR
+Written by Daniel C. Jones <dcjones@cs.washington.edu>
+
diff --git a/src/Makefile.am b/src/Makefile.am

index 080e884de39ef1e55937c09d1522474cc9733568..7bc48d0a98227d61d086d743718d132fbaedf53d 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,10 +1,11 @@
  
-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample
  
  fastq_common_src=common.h common.c
  fastq_parse_src=parse.h parse.c
  fastq_sw_src=sw.h sw.c
  fastq_hash_src=hash.h hash.c
+fastq_rng_src=rng.h rng.c
  
  fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
  fastq_grep_LDADD = $(PCRE_LIBS) -lz
@@ -18,6 +19,9 @@ fastq_match_LDADD   = -lz
  fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
  fastq_uniq_LDADD   = -lz
  
-fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) $(fastq_qual_src)
+fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src)
  fastq_qual_LDADD   = -lz
  
+fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src)
+fastq_sample_LDADD   = -lz
+
diff --git a/src/fastq-sample.c b/src/fastq-sample.c

new file mode 100644 (file)

index 0000000..279b6cf
--- /dev/null
+++ b/src/fastq-sample.c
@@ -0,0 +1,377 @@
+
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-sample :
+ * Sample reads with or without replacement from a FASTQ file.
+ *
+ */
+
+#include "common.h"
+#include "parse.h"
+#include "rng.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <getopt.h>
+#include <zlib.h>
+
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+#  include <fcntl.h>
+#  include <io.h>
+#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+#  define SET_BINARY_MODE(file)
+#endif
+
+static const char* prog_name = "fastq-sample";
+
+static int replacement_flag;
+
+
+void print_help()
+{
+    fprintf(stdout,
+"fastq-sample [OPTION]... [FILE [FILE2]]\n"
+"Sample random reads from a FASTQ file."
+"Options:\n"
+"  -n N                    the number of reads to sample (default: 10000)\n"
+"  -p N                    the proportion of the total reads to sample\n"
+"  -o, --output=PREFIX     output file prefix\n" 
+"  -r, --with-replacement  sample with relpacement\n"
+"  -h, --help              print this message\n"
+"  -V, --version           output version information and exit\n"
+    );
+}
+
+
+/* count the number of entries in a fastq file */
+unsigned long count_entries(fastq_t* fqf)
+{
+    seq_t* seq = fastq_alloc_seq();
+    unsigned long n = 0;
+    while (fastq_next(fqf, seq)) ++n;
+    fastq_free_seq(seq);
+
+    return n;
+}
+
+
+/* compare two unsigned integers (for qsort) */
+int cmpul( const void* a, const void* b ) {
+    if(      *(unsigned long*) a < *(unsigned long*) b ) return -1;
+    else if( *(unsigned long*) a > *(unsigned long*) b ) return 1;
+    else                                                 return 0;
+}
+
+
+/* randomly shuffle an array of unsigned longs */
+void shuffle(rng_t* rng, unsigned long* xs, unsigned long n)
+{
+    unsigned long i, j, k;
+    for (i = n - 1; i > 0; --i) {
+        j = fastq_rng_uniform_int(rng, i + 1);
+        k = xs[j]; xs[j] = xs[i]; xs[i] = k; // swap
+    }
+}
+
+
+unsigned long* index_with_replacement(rng_t* rng, unsigned long n, unsigned long k)
+{
+    unsigned long* xs = malloc_or_die(k * sizeof(unsigned long));
+    size_t i;
+    for (i = 0; i < k; ++i) xs[i] = fastq_rng_uniform_int(rng, n);
+    return xs;
+}
+
+
+unsigned long* index_without_replacement(rng_t* rng, unsigned long n)
+{
+    unsigned long* xs = malloc_or_die(n * sizeof(unsigned long));
+    size_t i;
+    for (i = 0; i < n; ++i) xs[i] = i;
+    shuffle(rng, xs, n);
+    return xs;
+}
+
+
+void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, double p)
+{
+     /*
+      * The basic idea is this:
+      *
+      * 1. Count the number of lines in the file, n.
+      *
+      * 2a. If sampling with replacement, generate k random integers in [0, n-1].
+      *
+      * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
+      *     shuffle with fisher-yates, then consider the first k.
+      *
+      * 3. Sort the integer list.
+      *
+      * 3. Read through the file again, when the number at the front of the integer
+      *    list matches the index of the fastq etry, print the entry, and pop the
+      *    number.
+      */
+
+
+    unsigned long n, n2;
+
+    fastq_t* f1 = fastq_open(file1);
+    fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2);
+
+    n = count_entries(f1);
+    if (f2 != NULL) {
+        n2 = count_entries(f2);
+        if (n != n2) {
+            fprintf(stderr, "Input files have differing numbers of entries (%lu != %lu).\n", n, n2);
+            exit(1);
+        }
+    }
+
+    fastq_rewind(f1);
+    if (f2 != NULL) fastq_rewind(f2);
+
+    if (p > 0.0) {
+        k = (unsigned long) (p * (double) n);
+        if (!replacement_flag && k > n) k = n;
+    }
+
+    rng_t* rng = fastq_rng_alloc();
+
+    unsigned long* xs;
+    if (replacement_flag) xs = index_with_replacement(rng, n, k);
+    else                  xs = index_without_replacement(rng, n);
+
+    qsort(xs, k, sizeof(unsigned long), cmpul);
+
+
+    /* open output */
+    FILE* fout1;
+    FILE* fout2;
+
+    char* output_name;
+    size_t output_len;
+    if (file2 == NULL) {
+        output_len = strlen(prefix) + 7;
+        output_name = malloc_or_die((output_len + 1) * sizeof(char));
+
+        snprintf(output_name, output_len, "%s.fastq", prefix);
+        fout1 = fopen(output_name, "wb");
+        if (fout1 == NULL) {
+            fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+            exit(1);
+        }
+
+        fout2 = NULL;
+
+        free(output_name);
+    }
+    else {
+        output_len = strlen(prefix) + 9;
+        output_name = malloc_or_die((output_len + 1) * sizeof(char));
+
+        snprintf(output_name, output_len, "%s.1.fastq", prefix);
+        fout1 = fopen(output_name, "wb");
+        if (fout1 == NULL) {
+            fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+            exit(1);
+        }
+
+        snprintf(output_name, output_len, "%s.2.fastq", prefix);
+        fout2 = fopen(output_name, "wb");
+        if (fout1 == NULL) {
+            fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
+            exit(1);
+        }
+
+        free(output_name);
+    }
+
+
+
+    unsigned long i = 0; // read number
+    unsigned long j = 0; // index into xs
+
+    int ret;
+    seq_t* seq1 = fastq_alloc_seq();
+    seq_t* seq2 = fastq_alloc_seq();
+
+    while (j < k && fastq_next(f1, seq1)) {
+        if (f2 != NULL){
+            ret = fastq_next(f2, seq2);
+            if (ret == 0) {
+                fputs("Input files have differing numbers of entries.\n", stderr);
+                exit(1);
+            }
+        }
+
+        while (j < k && xs[j] == i) {
+            fastq_print(fout1, seq1);
+            if (f2 != NULL) fastq_print(fout2, seq2);
+            ++j;
+        }
+
+        ++i;
+    }
+
+    fastq_free_seq(seq1);
+    fastq_free_seq(seq2);
+    fastq_close(f1);
+    if (f2 != NULL) fastq_close(f2);
+
+    fclose(fout1);
+    if (fout2 != NULL) fclose(fout2);
+
+    fastq_rng_free(rng);
+    free(xs);
+}
+
+
+int main(int argc, char* argv[])
+{
+    SET_BINARY_MODE(stdin);
+    SET_BINARY_MODE(stdout);
+
+    int opt;
+    int opt_idx;
+
+    const char* prefix = NULL;
+    unsigned long k = 10000; // number of reads to sample
+    double        p = -1;    // proportion of reads to sample
+
+
+    static struct option long_options[] =
+        { 
+          {"with-replacement", no_argument, NULL, 'r'},
+          {"output",           no_argument, NULL, 'o'},
+          {"help",             no_argument, NULL, 'h'},
+          {"version",          no_argument, NULL, 'V'},
+          {0, 0, 0, 0}
+        };
+
+    replacement_flag = 0;
+
+    while (1) {
+        opt = getopt_long(argc, argv, "n:p:o:rhV", long_options, &opt_idx);
+
+        if( opt == -1 ) break;
+
+        switch (opt) {
+            case 0:
+                if (long_options[opt_idx].flag != 0) break;
+                if (optarg) {
+                }
+                break;
+
+            case 'n':
+                k = strtoul(optarg, NULL, 10);
+                break;
+
+            case 'p':
+                p = atof(optarg);
+                if (p < 0.0) {
+                    fputs("Sample proportion ('-p') is less than zero.\n", stderr);
+                    return 1;
+                }
+                break;
+
+            case 'r':
+                replacement_flag = 1;
+                break;
+
+            case 'o':
+                prefix = optarg;
+                break;
+
+            case 'h':
+                print_help();
+                return 0;
+
+            case 'V':
+                print_version(stdout, prog_name);
+                return 0;
+
+            case '?':
+                return 1;
+
+            default:
+                abort();
+        }
+    }
+
+    FILE* file1 = NULL;
+    FILE* file2 = NULL;
+
+    char* prefix_alloc = NULL;
+
+    if (optind >= argc) {
+        file1 = stdin;
+    }
+    else {
+        if (strcmp(argv[optind], "-") == 0) {
+            file1 = stdin;
+            if (prefix == NULL) prefix = "stdin.sample";
+        }
+        else {
+            file1 = fopen(argv[optind], "rb");
+            if (file1 == NULL) {
+                fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+                return 1;
+            }
+
+            if (prefix == NULL) {
+                /* guess at a reasonable output refix by trimming the
+                 * trailing file extension, if any.  */
+                char* tmp;
+
+                /* base name */
+                tmp = strrchr(argv[optind], '/');
+                if (tmp != NULL) argv[optind] = tmp + 1;
+
+                /* exclude file suffixes */
+                tmp = strchr(argv[optind], '.');
+                if (tmp == NULL) prefix = argv[optind];
+                else {
+                    prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char));
+                    memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char));
+                    prefix_alloc[tmp - argv[optind]] = '\0';
+                    prefix = prefix_alloc;
+                }
+            }
+        }
+        ++optind;
+
+        if (optind < argc) {
+            if (strcmp(argv[optind], "-") == 0) {
+                if (file1 == stdin) {
+                    fprintf(stderr, "Both input files cannot be standard input.\n");
+                    return 1;
+                }
+                file2 = stdin;
+            }
+            else {
+                file2 = fopen(argv[optind], "rb");
+                if (file2 == NULL) {
+                    fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+                    return 1;
+                }
+            }
+        }
+    }
+
+    fastq_sample(prefix, file1, file2, k, p);
+
+    free(prefix_alloc);
+    return 0;
+}
+
+
+
+
+
+
+
+
diff --git a/src/parse.c b/src/parse.c

index 1ccc7de241306890665c413a1ab34e64c568b37d..61e3efa3cab29160ce59e4823d9aaa726e335059 100644 (file)
--- a/src/parse.c
+++ b/src/parse.c
@@ -241,6 +241,15 @@ int fastq_next(fastq_t* f, seq_t* seq)
  }
  
  
+void fastq_rewind(fastq_t* fqf)
+{
+    gzrewind(fqf->file);
+    fqf->state = STATE_ID1;
+    fqf->buf[0] = '\0';
+    fqf->c = fqf->buf;
+}
+
+
  void fastq_print(FILE* fout, seq_t* seq)
  {
      /* FASTQ */
diff --git a/src/parse.h b/src/parse.h

index d2e2612bea3261dbbf6d3db2537e71e4eecd6f8e..d9bbac69c4ece25babbba03e12d93d160380a466 100644 (file)
--- a/src/parse.h
+++ b/src/parse.h
@@ -49,6 +49,7 @@ typedef struct
  fastq_t* fastq_open(FILE*);
  void fastq_close(fastq_t*);
  int  fastq_next(fastq_t*, seq_t*);
+void fastq_rewind(fastq_t*);
  
  void fastq_print(FILE* fout, seq_t* seq);
  
diff --git a/src/rng.c b/src/rng.c

new file mode 100644 (file)

index 0000000..b503699
--- /dev/null
+++ b/src/rng.c
@@ -0,0 +1,241 @@
+
+/* This code taken from the GNU Scientific Library and adapted for use in
+ * fastq-tools. It is subject to the following licence.
+ */
+
+/* This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.  You should have received
+   a copy of the GNU General Public License along with this program;
+   if not, write to the Free Foundation, Inc., 59 Temple Place, Suite
+   330, Boston, MA 02111-1307 USA
+
+   Original implementation was copyright (C) 1997 Makoto Matsumoto and
+   Takuji Nishimura. Coded by Takuji Nishimura, considering the
+   suggestions by Topher Cooper and Marc Rieffel in July-Aug. 1997, "A
+   C-program for MT19937: Integer version (1998/4/6)"
+
+   This implementation copyright (C) 1998 Brian Gough. I reorganized
+   the code to use the module framework of GSL.  The license on this
+   implementation was changed from LGPL to GPL, following paragraph 3
+   of the LGPL, version 2.
+
+   Update:
+
+   The seeding procedure has been updated to match the 10/99 release
+   of MT19937.
+
+   Update:
+
+   The seeding procedure has been updated again to match the 2002
+   release of MT19937
+
+   The original code included the comment: "When you use this, send an
+   email to: matumoto@math.keio.ac.jp with an appropriate reference to
+   your work".
+
+   Makoto Matsumoto has a web page with more information about the
+   generator, http://www.math.keio.ac.jp/~matumoto/emt.html. 
+
+   The paper below has details of the algorithm.
+
+   From: Makoto Matsumoto and Takuji Nishimura, "Mersenne Twister: A
+   623-dimensionally equidistributerd uniform pseudorandom number
+   generator". ACM Transactions on Modeling and Computer Simulation,
+   Vol. 8, No. 1 (Jan. 1998), Pages 3-30
+
+   You can obtain the paper directly from Makoto Matsumoto's web page.
+
+   The period of this generator is 2^{19937} - 1.
+
+*/
+
+#include "rng.h"
+#include "common.h"
+#include <stdlib.h>
+#include <assert.h>
+
+
+/* default seed */
+unsigned long int default_seed = 4357;
+
+
+#define N 624   /* Period parameters */
+#define M 397
+
+/* most significant w-r bits */
+static const unsigned long UPPER_MASK = 0x80000000UL;   
+
+/* least significant r bits */
+static const unsigned long LOWER_MASK = 0x7fffffffUL;   
+
+
+static const unsigned long RNG_MIN = 0;
+static const unsigned long RNG_MAX = 0xffffffffUL;
+
+
+struct rng_t_
+{
+    unsigned long mt[N];
+    int mti;
+};
+
+
+static inline unsigned long mt_get(rng_t* state)
+{
+  unsigned long k ;
+  unsigned long int *const mt = state->mt;
+
+#define MAGIC(y) (((y)&0x1) ? 0x9908b0dfUL : 0)
+
+  if (state->mti >= N)
+    {   /* generate N words at one time */
+      int kk;
+
+      for (kk = 0; kk < N - M; kk++)
+        {
+          unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+          mt[kk] = mt[kk + M] ^ (y >> 1) ^ MAGIC(y);
+        }
+      for (; kk < N - 1; kk++)
+        {
+          unsigned long y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+          mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ MAGIC(y);
+        }
+
+      {
+        unsigned long y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ MAGIC(y);
+      }
+
+      state->mti = 0;
+    }
+
+  /* Tempering */
+  
+  k = mt[state->mti];
+  k ^= (k >> 11);
+  k ^= (k << 7) & 0x9d2c5680UL;
+  k ^= (k << 15) & 0xefc60000UL;
+  k ^= (k >> 18);
+
+  state->mti++;
+
+  return k;
+}
+
+double mt_get_double(rng_t* state)
+{
+  return mt_get(state) / 4294967296.0;
+}
+
+void mt_set(rng_t* state, unsigned long int s)
+{
+  int i;
+
+  if (s == 0)
+    s = 4357;   /* the default seed is 4357 */
+
+  state->mt[0]= s & 0xffffffffUL;
+
+  for (i = 1; i < N; i++)
+    {
+      /* See Knuth's "Art of Computer Programming" Vol. 2, 3rd
+         Ed. p.106 for multiplier. */
+
+      state->mt[i] =
+        (1812433253UL * (state->mt[i-1] ^ (state->mt[i-1] >> 30)) + i);
+      
+      state->mt[i] &= 0xffffffffUL;
+    }
+
+  state->mti = i;
+}
+
+#if 0 // these two seeding procedures are not used
+static void mt_1999_set(rng_t* state, unsigned long int s)
+{
+  int i;
+
+  if (s == 0)
+    s = 4357;   /* the default seed is 4357 */
+
+  /* This is the October 1999 version of the seeding procedure. It
+     was updated by the original developers to avoid the periodicity
+     in the simple congruence originally used.
+
+     Note that an ANSI-C unsigned long integer arithmetic is
+     automatically modulo 2^32 (or a higher power of two), so we can
+     safely ignore overflow. */
+
+#define LCG(x) ((69069 * x) + 1) &0xffffffffUL
+
+  for (i = 0; i < N; i++)
+    {
+      state->mt[i] = s & 0xffff0000UL;
+      s = LCG(s);
+      state->mt[i] |= (s &0xffff0000UL) >> 16;
+      s = LCG(s);
+    }
+
+  state->mti = i;
+}
+
+/* This is the original version of the seeding procedure, no longer
+   used but available for compatibility with the original MT19937. */
+
+static void mt_1998_set(rng_t* state, unsigned long int s)
+{
+  int i;
+
+  if (s == 0)
+    s = 4357;   /* the default seed is 4357 */
+
+  state->mt[0] = s & 0xffffffffUL;
+
+#define LCG1998(n) ((69069 * n) & 0xffffffffUL)
+
+  for (i = 1; i < N; i++)
+    state->mt[i] = LCG1998 (state->mt[i - 1]);
+
+  state->mti = i;
+}
+#endif
+
+rng_t* fastq_rng_alloc()
+{
+    rng_t* rng = malloc_or_die(sizeof(rng_t));
+    mt_set(rng, default_seed);
+    return rng;
+}
+
+
+void fastq_rng_free(rng_t* rng)
+{
+    free(rng);
+}
+
+void fastq_rng_seed(rng_t* rng, unsigned long seed)
+{
+    mt_set(rng, seed);
+}
+
+unsigned long fastq_rng_uniform_int(rng_t* rng, unsigned long k)
+{
+    unsigned long scale = (RNG_MAX - RNG_MIN) / k;
+    assert(scale > 0);
+    unsigned long r;
+
+    do {
+        r = (mt_get(rng) - RNG_MIN) / scale;
+    } while (r >= k);
+
+    return r;
+}
+
diff --git a/src/rng.h b/src/rng.h

new file mode 100644 (file)

index 0000000..469b6ed
--- /dev/null
+++ b/src/rng.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * rng :
+ * Robust pseudo-random number generation.
+ *
+ */
+
+#ifndef FASTQ_TOOLS_RNG_H
+#define FASTQ_TOOLS_RNG_H
+
+typedef struct rng_t_ rng_t;
+
+rng_t* fastq_rng_alloc();
+void   fastq_rng_free(rng_t*);
+void fastq_rng_seed(rng_t*, unsigned long seed);
+
+/* Uniform integer in [0, k-1] */
+unsigned long fastq_rng_uniform_int(rng_t*, unsigned long k);
+
+#endif
+
author	Daniel Jones <dcjones@cs.washington.edu>
	Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)
committer	Daniel Jones <dcjones@cs.washington.edu>
	Tue, 25 Oct 2011 20:45:51 +0000 (13:45 -0700)
configure.ac		patch \| blob \| history
doc/Makefile.am		patch \| blob \| history
doc/fastq-sample.1	[new file with mode: 0644]	patch \| blob
src/Makefile.am		patch \| blob \| history
src/fastq-sample.c	[new file with mode: 0644]	patch \| blob
src/parse.c		patch \| blob \| history
src/parse.h		patch \| blob \| history
src/rng.c	[new file with mode: 0644]	patch \| blob
src/rng.h	[new file with mode: 0644]	patch \| blob