AC_PROG_CC
AM_PROG_CC_C_O
+m4_ifdef([AC_TYPE_UINT8_T], [AC_TYPE_UINT8_T])
+m4_ifdef([AC_TYPE_UINT16_T], [AC_TYPE_UINT16_T])
+m4_ifdef([AC_TYPE_INT32_T], [AC_TYPE_INT32_T])
+m4_ifdef([AC_TYPE_UINT32_T], [AC_TYPE_UINT32_T])
+m4_ifdef([AC_TYPE_UINT64_T], [AC_TYPE_UINT64_T])
+m4_ifdef([AC_TYPE_SIZE_T], [AC_TYPE_SIZE_T])
+m4_ifdef([AC_HEADER_STDBOOL], [AC_HEADER_STDBOOL])
+
opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3"
dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0"
\fB\-o\fR, \fB\-\-output=PREFIX\fR
The filename prefix to which output should be written. If single-end data is
being sampled, the output file is [PREFIX].fastq, and with paired-end,
-[PREFIX].1.fastq and [PREFIX].2.fastq.
+[PREFIX].1.fastq and [PREFIX].2.fastq. (Default: \"sample\")
.TP
\fB\-r\fR, \fB\-\-with\-replacement\fR
Sample with replacement. (By default, sampling is done without replacement.)
+++ /dev/null
-# ===========================================================================
-# http://www.gnu.org/software/autoconf-archive/ax_check_zlib.html
-# ===========================================================================
-#
-# SYNOPSIS
-#
-# AX_CHECK_ZLIB()
-#
-# DESCRIPTION
-#
-# This macro searches for an installed zlib library. If nothing was
-# specified when calling configure, it searches first in /usr/local and
-# then in /usr, /opt/local and /sw. If the --with-zlib=DIR is specified,
-# it will try to find it in DIR/include/zlib.h and DIR/lib/libz.a. If
-# --without-zlib is specified, the library is not searched at all.
-#
-# If either the header file (zlib.h) or the library (libz) is not found,
-# the configuration exits on error, asking for a valid zlib installation
-# directory or --without-zlib.
-#
-# The macro defines the symbol HAVE_LIBZ if the library is found. You
-# should use autoheader to include a definition for this symbol in a
-# config.h file. Sample usage in a C/C++ source is as follows:
-#
-# #ifdef HAVE_LIBZ
-# #include <zlib.h>
-# #endif /* HAVE_LIBZ */
-#
-# LICENSE
-#
-# Copyright (c) 2008 Loic Dachary <loic@senga.org>
-# Copyright (c) 2010 Bastien Chevreux <bach@chevreux.org>
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation; either version 2 of the License, or (at your
-# option) any later version.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
-# Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# As a special exception, the respective Autoconf Macro's copyright owner
-# gives unlimited permission to copy, distribute and modify the configure
-# scripts that are the output of Autoconf when processing the Macro. You
-# need not follow the terms of the GNU General Public License when using
-# or distributing such scripts, even though portions of the text of the
-# Macro appear in them. The GNU General Public License (GPL) does govern
-# all other use of the material that constitutes the Autoconf Macro.
-#
-# This special exception to the GPL applies to versions of the Autoconf
-# Macro released by the Autoconf Archive. When you make and distribute a
-# modified version of the Autoconf Macro, you may extend this special
-# exception to the GPL to apply to your modified version as well.
-
-#serial 8
-
-AU_ALIAS([CHECK_ZLIB], [AX_CHECK_ZLIB])
-AC_DEFUN([AX_CHECK_ZLIB],
-#
-# Handle user hints
-#
-[AC_MSG_CHECKING(if zlib is wanted)
-AC_ARG_WITH(zlib,
-[ --with-zlib=DIR root directory path of zlib installation [defaults to
- /usr/local or /usr if not found in /usr/local]
- --without-zlib to disable zlib usage completely],
-[if test "$withval" != no ; then
- zlib_places="/usr/local /usr /opt/local /sw"
- AC_MSG_RESULT(yes)
- if test -d "$withval"
- then
- zlib_places="$withval $zlib_places"
- else
- AC_MSG_WARN([Sorry, $withval does not exist, checking usual places])
- fi
-else
- AC_MSG_RESULT(no)
-fi],
-[AC_MSG_RESULT(yes)])
-
-#
-# Locate zlib, if wanted
-#
-if test -n "${zlib_places}"
-then
- # check the user supplied or any other more or less 'standard' place:
- # Most UNIX systems : /usr/local and /usr
- # MacPorts / Fink on OSX : /opt/local respectively /sw
- for ZLIB_HOME in ${zlib_places} ; do
- if test -f "${ZLIB_HOME}/include/zlib.h"; then break; fi
- ZLIB_HOME=""
- done
-
- # if zlib.h was nowhere to be found, give a notice and bail out
- if test ! -n "${ZLIB_HOME}"; then
- AC_MSG_ERROR(No zlib.h in any include directory of ${zlib_places}: either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib)
- fi
-
- ZLIB_OLD_LDFLAGS=$LDFLAGS
- ZLIB_OLD_CPPFLAGS=$LDFLAGS
- LDFLAGS="$LDFLAGS -L${ZLIB_HOME}/lib"
- CPPFLAGS="$CPPFLAGS -I${ZLIB_HOME}/include"
- AC_LANG_SAVE
- AC_LANG_C
- AC_CHECK_LIB(z, inflateEnd, [zlib_cv_libz=yes], [zlib_cv_libz=no])
- AC_CHECK_HEADER(zlib.h, [zlib_cv_zlib_h=yes], [zlib_cv_zlib_h=no])
- AC_LANG_RESTORE
- if test "$zlib_cv_libz" = "yes" -a "$zlib_cv_zlib_h" = "yes"
- then
- #
- # If both library and header were found, use them
- #
- AC_CHECK_LIB(z, inflateEnd)
- AC_MSG_CHECKING(zlib in ${ZLIB_HOME})
- AC_MSG_RESULT(ok)
- else
- #
- # If either header or library was not found, revert and bomb
- #
- AC_MSG_CHECKING(zlib in ${ZLIB_HOME})
- LDFLAGS="$ZLIB_OLD_LDFLAGS"
- CPPFLAGS="$ZLIB_OLD_CPPFLAGS"
- AC_MSG_RESULT(failed)
- AC_MSG_ERROR(either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib)
- fi
-fi
-
-])
fastq_rng_src=rng.h rng.c
fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src)
-fastq_grep_LDADD = $(PCRE_LIBS) -lz
+fastq_grep_LDADD = $(PCRE_LIBS)
fastq_kmers_SOURCES = fastq-kmers.c $(fastq_common_src) $(fastq_parse_src)
-fastq_kmers_LDADD = -lz
fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src)
-fastq_match_LDADD = -lz
fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src)
-fastq_uniq_LDADD = -lz
fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src)
-fastq_qual_LDADD = -lz
fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src)
-fastq_sample_LDADD = -lz
fastq_qualadj_SOURCES = fastq-qualadj.c $(fastq_common_src) $(fastq_parse_src)
-fastq_qualadj_LDADD = -lz
*
*/
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
#include "common.h"
#include "version.h"
-#include <stdlib.h>
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
void print_version(FILE *f, const char* prog_name)
}
+/* Open a file for writing, creating it if it doesn't exist, and complaining if
+ * it does. */
+FILE* open_without_clobber(const char* filename)
+{
+ int fd = open(filename, O_WRONLY | O_CREAT | O_BINARY | O_EXCL,
+ S_IRUSR | S_IWUSR);
+
+ if (fd == -1) {
+ if (errno == EEXIST) {
+ fprintf(stderr, "Refusing to overwrite %s.\n", filename);
+ exit(EXIT_FAILURE);
+ }
+ else {
+ fprintf(stderr, "Cannot open %s for writing.\n", filename);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ FILE* f = fdopen(fd, "wb");
+ if (f == NULL) {
+ fprintf(stderr, "Cannot open %s for writing.\n", filename);
+ exit(EXIT_FAILURE);
+ }
+
+ return f;
+}
+
void* realloc_or_die(void*, size_t);
FILE* fopen_or_die(const char*, const char*);
+/* Open a file for reading, creating it if it doesn't exist, and complaining if
+ * it does. */
+FILE* open_without_clobber(const char* filename);
+
#endif
* Sample reads with or without replacement from a FASTQ file.
*
*/
+#include <getopt.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
#include "common.h"
#include "parse.h"
#include "rng.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <getopt.h>
-#include <zlib.h>
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
-# include <fcntl.h>
-# include <io.h>
-# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
-#else
-# define SET_BINARY_MODE(file)
-#endif
static const char* prog_name = "fastq-sample";
"Options:\n"
" -n N the number of reads to sample (default: 10000)\n"
" -p N the proportion of the total reads to sample\n"
-" -o, --output=PREFIX output file prefix\n"
+" -o, --output=PREFIX output file prefix\n (Default: \"sample\")"
" -c, --complement-output=PREFIX\n"
" output reads not included in the random sample to\n"
" a file (or files) with the given prefix (by default,\n"
" -s, --seed=SEED a manual seed to the random number generator\n"
" -h, --help print this message\n"
" -V, --version output version information and exit\n"
- );
+);
}
void fastq_sample(unsigned long rng_seed,
- const char* prefix, const char* cprefix,
- FILE* file1, FILE* file2, unsigned long k, double p)
+ const char* prefix, const char* cprefix,
+ FILE* file1, FILE* file2, unsigned long k, double p)
{
- /*
- * The basic idea is this:
- *
- * 1. Count the number of lines in the file, n.
- *
- * 2a. If sampling with replacement, generate k random integers in [0, n-1].
- *
- * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
- * shuffle with fisher-yates, then consider the first k.
- *
- * 3. Sort the integer list.
- *
- * 3. Read through the file again, when the number at the front of the integer
- * list matches the index of the fastq etry, print the entry, and pop the
- * number.
- */
+ /*
+ * The basic idea is this:
+ *
+ * 1. Count the number of lines in the file, n.
+ *
+ * 2a. If sampling with replacement, generate k random integers in [0, n-1].
+ *
+ * 2b. If sampling without replacement, generate a list of integers 0..(n-1),
+ * shuffle with fisher-yates, then consider the first k.
+ *
+ * 3. Sort the integer list.
+ *
+ * 3. Read through the file again, when the number at the front of the integer
+ * list matches the index of the fastq etry, print the entry, and pop the
+ * number.
+ */
unsigned long n, n2;
if (f2 != NULL) fastq_rewind(f2);
if (p > 0.0) {
- k = (unsigned long) (p * (double) n);
+ k = (unsigned long) round(p * (double) n);
if (!replacement_flag && k > n) k = n;
}
qsort(xs, k, sizeof(unsigned long), cmpul);
-
/* open output */
FILE* fout1;
FILE* fout2;
output_name = malloc_or_die((output_len + 1) * sizeof(char));
snprintf(output_name, output_len, "%s.fastq", prefix);
- fout1 = fopen(output_name, "wb");
+ fout1 = open_without_clobber(output_name);
if (fout1 == NULL) {
fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
exit(1);
output_name = malloc_or_die((output_len + 1) * sizeof(char));
snprintf(output_name, output_len, "%s.1.fastq", prefix);
- fout1 = fopen(output_name, "wb");
+ fout1 = open_without_clobber(output_name);
if (fout1 == NULL) {
fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
exit(1);
}
snprintf(output_name, output_len, "%s.2.fastq", prefix);
- fout2 = fopen(output_name, "wb");
+ fout1 = open_without_clobber(output_name);
if (fout1 == NULL) {
fprintf(stderr, "Cannot open file %s for writing.\n", output_name);
exit(1);
free(output_name);
}
-
/* open complement output */
FILE* cfout1 = NULL;
FILE* cfout2 = NULL;
free(output_name);
}
-
-
unsigned long i = 0; // read number
unsigned long j = 0; // index into xs
int main(int argc, char* argv[])
{
- SET_BINARY_MODE(stdin);
- SET_BINARY_MODE(stdout);
-
int opt;
int opt_idx;
- const char* prefix = NULL;
- const char* cprefix = NULL;
+ const char* prefix = "sample";
+ const char* cprefix = NULL;
unsigned long rng_seed = 4357;
unsigned long k = 10000; // number of reads to sample
double p = -1; // proportion of reads to sample
-
static struct option long_options[] =
- {
+ {
{"with-replacement", no_argument, NULL, 'r'},
{"complement-output", required_argument, NULL, 'c'},
{"seed", required_argument, NULL, 's'},
FILE* file1 = NULL;
FILE* file2 = NULL;
- char* prefix_alloc = NULL;
-
if (optind >= argc) {
fputs("An input file must be given.\n", stderr);
print_help();
- exit(1);
+ return EXIT_FAILURE;
}
- else {
- file1 = fopen(argv[optind], "rb");
- if (file1 == NULL) {
- fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
- return 1;
- }
- if (prefix == NULL) {
- /* guess at a reasonable output refix by trimming the
- * trailing file extension, if any. */
- char* tmp;
-
- /* base name */
- tmp = strrchr(argv[optind], '/');
- if (tmp != NULL) argv[optind] = tmp + 1;
-
- /* exclude file suffixes */
- tmp = strchr(argv[optind], '.');
- if (tmp == NULL) prefix = argv[optind];
- else {
- prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char));
- memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char));
- prefix_alloc[tmp - argv[optind]] = '\0';
- prefix = prefix_alloc;
- }
- }
- ++optind;
+ file1 = fopen(argv[optind], "rb");
+ if (file1 == NULL) {
+ fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+ return 1;
+ }
- if (optind < argc) {
- file2 = fopen(argv[optind], "rb");
- if (file2 == NULL) {
- fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
- return 1;
- }
+ if (++optind < argc) {
+ file2 = fopen(argv[optind], "rb");
+ if (file2 == NULL) {
+ fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]);
+ return 1;
}
}
fastq_sample(rng_seed, prefix, cprefix, file1, file2, k, p);
- free(prefix_alloc);
- return 0;
+ return EXIT_SUCCESS;
}
-
-
-
-
-
-
-
}
+void fastq_rewind(fastq_t* f)
+{
+ rewind(f->file);
+ f->next = f->buf;
+ f->readlen = 0;
+}
+
+
void fastq_print(FILE* fout, const seq_t* seq)
{
fprintf(fout, "@%s\n%s\n+%s\n%s\n",
bool fastq_read(fastq_t* f, seq_t* seq);
+/* Rewind the fastq file.
+ *
+ * The FILE passed to fastq_create must be seekable for this to work.
+ */
+void fastq_rewind(fastq_t* f);
+
+
/* Print a fastq entry. */
void fastq_print(FILE* fout, const seq_t* seq);
your work".
Makoto Matsumoto has a web page with more information about the
- generator, http://www.math.keio.ac.jp/~matumoto/emt.html.
+ generator, http://www.math.keio.ac.jp/~matumoto/emt.html.
The paper below has details of the algorithm.