From a7a328a691ab370ed7fd61c9c04af9def02395d5 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Tue, 9 Oct 2012 00:06:12 -0700 Subject: [PATCH] Fix the weird output behavior of fastq-sample. --- configure.ac | 8 +++ doc/fastq-sample.1 | 2 +- m4/ax_check_zlib.m4 | 133 ------------------------------------------- src/Makefile.am | 8 +-- src/common.c | 35 +++++++++++- src/common.h | 4 ++ src/fastq-sample.c | 135 +++++++++++++++----------------------------- src/parse.c | 8 +++ src/parse.h | 7 +++ src/rng.c | 2 +- 10 files changed, 108 insertions(+), 234 deletions(-) delete mode 100644 m4/ax_check_zlib.m4 diff --git a/configure.ac b/configure.ac index 4e6c266..93e89b0 100644 --- a/configure.ac +++ b/configure.ac @@ -8,6 +8,14 @@ AC_CONFIG_MACRO_DIR([m4]) AC_PROG_CC AM_PROG_CC_C_O +m4_ifdef([AC_TYPE_UINT8_T], [AC_TYPE_UINT8_T]) +m4_ifdef([AC_TYPE_UINT16_T], [AC_TYPE_UINT16_T]) +m4_ifdef([AC_TYPE_INT32_T], [AC_TYPE_INT32_T]) +m4_ifdef([AC_TYPE_UINT32_T], [AC_TYPE_UINT32_T]) +m4_ifdef([AC_TYPE_UINT64_T], [AC_TYPE_UINT64_T]) +m4_ifdef([AC_TYPE_SIZE_T], [AC_TYPE_SIZE_T]) +m4_ifdef([AC_HEADER_STDBOOL], [AC_HEADER_STDBOOL]) + opt_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O3" dbg_CFLAGS="--std=c99 -Wall -Wextra -pedantic -g -O0" diff --git a/doc/fastq-sample.1 b/doc/fastq-sample.1 index 4ce0cb4..0ec7d72 100644 --- a/doc/fastq-sample.1 +++ b/doc/fastq-sample.1 @@ -28,7 +28,7 @@ sampling with replacement, this number may be greater than 1.0. \fB\-o\fR, \fB\-\-output=PREFIX\fR The filename prefix to which output should be written. If single-end data is being sampled, the output file is [PREFIX].fastq, and with paired-end, -[PREFIX].1.fastq and [PREFIX].2.fastq. +[PREFIX].1.fastq and [PREFIX].2.fastq. (Default: \"sample\") .TP \fB\-r\fR, \fB\-\-with\-replacement\fR Sample with replacement. (By default, sampling is done without replacement.) diff --git a/m4/ax_check_zlib.m4 b/m4/ax_check_zlib.m4 deleted file mode 100644 index d308183..0000000 --- a/m4/ax_check_zlib.m4 +++ /dev/null @@ -1,133 +0,0 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_check_zlib.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_ZLIB() -# -# DESCRIPTION -# -# This macro searches for an installed zlib library. If nothing was -# specified when calling configure, it searches first in /usr/local and -# then in /usr, /opt/local and /sw. If the --with-zlib=DIR is specified, -# it will try to find it in DIR/include/zlib.h and DIR/lib/libz.a. If -# --without-zlib is specified, the library is not searched at all. -# -# If either the header file (zlib.h) or the library (libz) is not found, -# the configuration exits on error, asking for a valid zlib installation -# directory or --without-zlib. -# -# The macro defines the symbol HAVE_LIBZ if the library is found. You -# should use autoheader to include a definition for this symbol in a -# config.h file. Sample usage in a C/C++ source is as follows: -# -# #ifdef HAVE_LIBZ -# #include -# #endif /* HAVE_LIBZ */ -# -# LICENSE -# -# Copyright (c) 2008 Loic Dachary -# Copyright (c) 2010 Bastien Chevreux -# -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation; either version 2 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. - -#serial 8 - -AU_ALIAS([CHECK_ZLIB], [AX_CHECK_ZLIB]) -AC_DEFUN([AX_CHECK_ZLIB], -# -# Handle user hints -# -[AC_MSG_CHECKING(if zlib is wanted) -AC_ARG_WITH(zlib, -[ --with-zlib=DIR root directory path of zlib installation [defaults to - /usr/local or /usr if not found in /usr/local] - --without-zlib to disable zlib usage completely], -[if test "$withval" != no ; then - zlib_places="/usr/local /usr /opt/local /sw" - AC_MSG_RESULT(yes) - if test -d "$withval" - then - zlib_places="$withval $zlib_places" - else - AC_MSG_WARN([Sorry, $withval does not exist, checking usual places]) - fi -else - AC_MSG_RESULT(no) -fi], -[AC_MSG_RESULT(yes)]) - -# -# Locate zlib, if wanted -# -if test -n "${zlib_places}" -then - # check the user supplied or any other more or less 'standard' place: - # Most UNIX systems : /usr/local and /usr - # MacPorts / Fink on OSX : /opt/local respectively /sw - for ZLIB_HOME in ${zlib_places} ; do - if test -f "${ZLIB_HOME}/include/zlib.h"; then break; fi - ZLIB_HOME="" - done - - # if zlib.h was nowhere to be found, give a notice and bail out - if test ! -n "${ZLIB_HOME}"; then - AC_MSG_ERROR(No zlib.h in any include directory of ${zlib_places}: either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib) - fi - - ZLIB_OLD_LDFLAGS=$LDFLAGS - ZLIB_OLD_CPPFLAGS=$LDFLAGS - LDFLAGS="$LDFLAGS -L${ZLIB_HOME}/lib" - CPPFLAGS="$CPPFLAGS -I${ZLIB_HOME}/include" - AC_LANG_SAVE - AC_LANG_C - AC_CHECK_LIB(z, inflateEnd, [zlib_cv_libz=yes], [zlib_cv_libz=no]) - AC_CHECK_HEADER(zlib.h, [zlib_cv_zlib_h=yes], [zlib_cv_zlib_h=no]) - AC_LANG_RESTORE - if test "$zlib_cv_libz" = "yes" -a "$zlib_cv_zlib_h" = "yes" - then - # - # If both library and header were found, use them - # - AC_CHECK_LIB(z, inflateEnd) - AC_MSG_CHECKING(zlib in ${ZLIB_HOME}) - AC_MSG_RESULT(ok) - else - # - # If either header or library was not found, revert and bomb - # - AC_MSG_CHECKING(zlib in ${ZLIB_HOME}) - LDFLAGS="$ZLIB_OLD_LDFLAGS" - CPPFLAGS="$ZLIB_OLD_CPPFLAGS" - AC_MSG_RESULT(failed) - AC_MSG_ERROR(either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib) - fi -fi - -]) diff --git a/src/Makefile.am b/src/Makefile.am index 84707f2..aeb9c85 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -8,22 +8,16 @@ fastq_hash_src=hash.h hash.c fastq_rng_src=rng.h rng.c fastq_grep_SOURCES = fastq-grep.c $(fastq_common_src) $(fastq_parse_src) -fastq_grep_LDADD = $(PCRE_LIBS) -lz +fastq_grep_LDADD = $(PCRE_LIBS) fastq_kmers_SOURCES = fastq-kmers.c $(fastq_common_src) $(fastq_parse_src) -fastq_kmers_LDADD = -lz fastq_match_SOURCES = fastq-match.c $(fastq_common_src) $(fastq_parse_src) $(fastq_sw_src) -fastq_match_LDADD = -lz fastq_uniq_SOURCES = fastq-uniq.c $(fastq_common_src) $(fastq_parse_src) $(fastq_hash_src) -fastq_uniq_LDADD = -lz fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) -fastq_qual_LDADD = -lz fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src) -fastq_sample_LDADD = -lz fastq_qualadj_SOURCES = fastq-qualadj.c $(fastq_common_src) $(fastq_parse_src) -fastq_qualadj_LDADD = -lz diff --git a/src/common.c b/src/common.c index b3f47ed..9a11aec 100644 --- a/src/common.c +++ b/src/common.c @@ -6,10 +6,16 @@ * */ +#include +#include +#include #include "common.h" #include "version.h" -#include + +#ifndef O_BINARY +#define O_BINARY 0 +#endif void print_version(FILE *f, const char* prog_name) @@ -61,4 +67,31 @@ FILE* fopen_or_die(const char* path, const char* mode) } +/* Open a file for writing, creating it if it doesn't exist, and complaining if + * it does. */ +FILE* open_without_clobber(const char* filename) +{ + int fd = open(filename, O_WRONLY | O_CREAT | O_BINARY | O_EXCL, + S_IRUSR | S_IWUSR); + + if (fd == -1) { + if (errno == EEXIST) { + fprintf(stderr, "Refusing to overwrite %s.\n", filename); + exit(EXIT_FAILURE); + } + else { + fprintf(stderr, "Cannot open %s for writing.\n", filename); + exit(EXIT_FAILURE); + } + } + + FILE* f = fdopen(fd, "wb"); + if (f == NULL) { + fprintf(stderr, "Cannot open %s for writing.\n", filename); + exit(EXIT_FAILURE); + } + + return f; +} + diff --git a/src/common.h b/src/common.h index 163755b..5cc34fb 100644 --- a/src/common.h +++ b/src/common.h @@ -21,5 +21,9 @@ void* malloc_or_die(size_t); void* realloc_or_die(void*, size_t); FILE* fopen_or_die(const char*, const char*); +/* Open a file for reading, creating it if it doesn't exist, and complaining if + * it does. */ +FILE* open_without_clobber(const char* filename); + #endif diff --git a/src/fastq-sample.c b/src/fastq-sample.c index d3c2b55..0adf02d 100644 --- a/src/fastq-sample.c +++ b/src/fastq-sample.c @@ -8,23 +8,16 @@ * Sample reads with or without replacement from a FASTQ file. * */ +#include +#include +#include +#include +#include #include "common.h" #include "parse.h" #include "rng.h" -#include -#include -#include -#include -#include -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) -# include -# include -# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) -#else -# define SET_BINARY_MODE(file) -#endif static const char* prog_name = "fastq-sample"; @@ -39,7 +32,7 @@ void print_help() "Options:\n" " -n N the number of reads to sample (default: 10000)\n" " -p N the proportion of the total reads to sample\n" -" -o, --output=PREFIX output file prefix\n" +" -o, --output=PREFIX output file prefix\n (Default: \"sample\")" " -c, --complement-output=PREFIX\n" " output reads not included in the random sample to\n" " a file (or files) with the given prefix (by default,\n" @@ -48,7 +41,7 @@ void print_help() " -s, --seed=SEED a manual seed to the random number generator\n" " -h, --help print this message\n" " -V, --version output version information and exit\n" - ); +); } @@ -103,25 +96,25 @@ unsigned long* index_without_replacement(rng_t* rng, unsigned long n) void fastq_sample(unsigned long rng_seed, - const char* prefix, const char* cprefix, - FILE* file1, FILE* file2, unsigned long k, double p) + const char* prefix, const char* cprefix, + FILE* file1, FILE* file2, unsigned long k, double p) { - /* - * The basic idea is this: - * - * 1. Count the number of lines in the file, n. - * - * 2a. If sampling with replacement, generate k random integers in [0, n-1]. - * - * 2b. If sampling without replacement, generate a list of integers 0..(n-1), - * shuffle with fisher-yates, then consider the first k. - * - * 3. Sort the integer list. - * - * 3. Read through the file again, when the number at the front of the integer - * list matches the index of the fastq etry, print the entry, and pop the - * number. - */ + /* + * The basic idea is this: + * + * 1. Count the number of lines in the file, n. + * + * 2a. If sampling with replacement, generate k random integers in [0, n-1]. + * + * 2b. If sampling without replacement, generate a list of integers 0..(n-1), + * shuffle with fisher-yates, then consider the first k. + * + * 3. Sort the integer list. + * + * 3. Read through the file again, when the number at the front of the integer + * list matches the index of the fastq etry, print the entry, and pop the + * number. + */ unsigned long n, n2; @@ -142,7 +135,7 @@ void fastq_sample(unsigned long rng_seed, if (f2 != NULL) fastq_rewind(f2); if (p > 0.0) { - k = (unsigned long) (p * (double) n); + k = (unsigned long) round(p * (double) n); if (!replacement_flag && k > n) k = n; } @@ -155,7 +148,6 @@ void fastq_sample(unsigned long rng_seed, qsort(xs, k, sizeof(unsigned long), cmpul); - /* open output */ FILE* fout1; FILE* fout2; @@ -167,7 +159,7 @@ void fastq_sample(unsigned long rng_seed, output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.fastq", prefix); - fout1 = fopen(output_name, "wb"); + fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); @@ -182,14 +174,14 @@ void fastq_sample(unsigned long rng_seed, output_name = malloc_or_die((output_len + 1) * sizeof(char)); snprintf(output_name, output_len, "%s.1.fastq", prefix); - fout1 = fopen(output_name, "wb"); + fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); } snprintf(output_name, output_len, "%s.2.fastq", prefix); - fout2 = fopen(output_name, "wb"); + fout1 = open_without_clobber(output_name); if (fout1 == NULL) { fprintf(stderr, "Cannot open file %s for writing.\n", output_name); exit(1); @@ -198,7 +190,6 @@ void fastq_sample(unsigned long rng_seed, free(output_name); } - /* open complement output */ FILE* cfout1 = NULL; FILE* cfout2 = NULL; @@ -239,8 +230,6 @@ void fastq_sample(unsigned long rng_seed, free(output_name); } - - unsigned long i = 0; // read number unsigned long j = 0; // index into xs @@ -290,21 +279,17 @@ void fastq_sample(unsigned long rng_seed, int main(int argc, char* argv[]) { - SET_BINARY_MODE(stdin); - SET_BINARY_MODE(stdout); - int opt; int opt_idx; - const char* prefix = NULL; - const char* cprefix = NULL; + const char* prefix = "sample"; + const char* cprefix = NULL; unsigned long rng_seed = 4357; unsigned long k = 10000; // number of reads to sample double p = -1; // proportion of reads to sample - static struct option long_options[] = - { + { {"with-replacement", no_argument, NULL, 'r'}, {"complement-output", required_argument, NULL, 'c'}, {"seed", required_argument, NULL, 's'}, @@ -375,60 +360,28 @@ int main(int argc, char* argv[]) FILE* file1 = NULL; FILE* file2 = NULL; - char* prefix_alloc = NULL; - if (optind >= argc) { fputs("An input file must be given.\n", stderr); print_help(); - exit(1); + return EXIT_FAILURE; } - else { - file1 = fopen(argv[optind], "rb"); - if (file1 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } - if (prefix == NULL) { - /* guess at a reasonable output refix by trimming the - * trailing file extension, if any. */ - char* tmp; - - /* base name */ - tmp = strrchr(argv[optind], '/'); - if (tmp != NULL) argv[optind] = tmp + 1; - - /* exclude file suffixes */ - tmp = strchr(argv[optind], '.'); - if (tmp == NULL) prefix = argv[optind]; - else { - prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); - memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); - prefix_alloc[tmp - argv[optind]] = '\0'; - prefix = prefix_alloc; - } - } - ++optind; + file1 = fopen(argv[optind], "rb"); + if (file1 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; + } - if (optind < argc) { - file2 = fopen(argv[optind], "rb"); - if (file2 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } + if (++optind < argc) { + file2 = fopen(argv[optind], "rb"); + if (file2 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; } } fastq_sample(rng_seed, prefix, cprefix, file1, file2, k, p); - free(prefix_alloc); - return 0; + return EXIT_SUCCESS; } - - - - - - - diff --git a/src/parse.c b/src/parse.c index e27a385..8a6123e 100644 --- a/src/parse.c +++ b/src/parse.c @@ -166,6 +166,14 @@ bool fastq_read(fastq_t* f, seq_t* seq) } +void fastq_rewind(fastq_t* f) +{ + rewind(f->file); + f->next = f->buf; + f->readlen = 0; +} + + void fastq_print(FILE* fout, const seq_t* seq) { fprintf(fout, "@%s\n%s\n+%s\n%s\n", diff --git a/src/parse.h b/src/parse.h index 8500f0c..6224ab8 100644 --- a/src/parse.h +++ b/src/parse.h @@ -69,6 +69,13 @@ void fastq_free(fastq_t*); bool fastq_read(fastq_t* f, seq_t* seq); +/* Rewind the fastq file. + * + * The FILE passed to fastq_create must be seekable for this to work. + */ +void fastq_rewind(fastq_t* f); + + /* Print a fastq entry. */ void fastq_print(FILE* fout, const seq_t* seq); diff --git a/src/rng.c b/src/rng.c index b503699..ac375d0 100644 --- a/src/rng.c +++ b/src/rng.c @@ -41,7 +41,7 @@ your work". Makoto Matsumoto has a web page with more information about the - generator, http://www.math.keio.ac.jp/~matumoto/emt.html. + generator, http://www.math.keio.ac.jp/~matumoto/emt.html. The paper below has details of the algorithm. -- 2.39.2