From 4e791549f24e86c3d98632c0215ea6ad6436e6ad Mon Sep 17 00:00:00 2001 From: quentin jones Date: Fri, 25 Feb 2011 12:46:17 -0800 Subject: [PATCH] initial work on some useful fastq tools --- Makefile.am | 4 + configure.ac | 48 ++++++++++ m4/ax_check_zlib.m4 | 133 ++++++++++++++++++++++++++ src/Makefile.am | 6 ++ src/fastq-grep.c | 84 +++++++++++++++++ src/kseq.h | 223 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 498 insertions(+) create mode 100644 Makefile.am create mode 100644 configure.ac create mode 100644 m4/ax_check_zlib.m4 create mode 100644 src/Makefile.am create mode 100644 src/fastq-grep.c create mode 100644 src/kseq.h diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..0576973 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,4 @@ + +ACLOCAL_AMFLAGS = -I m4 +SUBDIRS = src + diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..cdde05e --- /dev/null +++ b/configure.ac @@ -0,0 +1,48 @@ + +AC_INIT( [fastq-tools], [0.1], [dcjones@cs.washington.ed] ) +AM_INIT_AUTOMAKE( [foreign -Wall -Werror] ) +m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) + +AC_CONFIG_MACRO_DIR([m4]) + +AC_PROG_CC +AM_PROG_CC_C_O + +opt_CFLAGS="-g -O3" +dbg_CFLAGS="-g -O0" + +AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [enable debugging info (default is no)])], + [], [enable_debug=no]) + +AS_IF([test "x$enable_debug" = xyes], + [CFLAGS="$dbg_CFLAGS"], + [CFLAGS="$opt_CFLAGS"]) + +AC_DEFINE(_FILE_OFFSET_BITS, 64) + + +# check zlib +AX_CHECK_ZLIB + +# check pcre +AC_CHECK_PROG(HAVE_PCRE, pcre-config, yes, no) +if test "x$HAVE_PCRE" = "xno" +then + AC_MSG_ERROR([The PCRE library is needed. See http://www.pcre.org.]) +fi +CFLAGS="$CFLAGS $(pcre-config --cflags)" +PCRE_LIBS=$(pcre-config --libs) + +# check getopt +AC_CHECK_HEADER(getopt.h, , + AC_MSG_ERROR([The posix getopt.h header is needed.])) + + +CXXFLAGS=$CFLAGS + +AC_CONFIG_FILES( [Makefile + src/Makefile] ) + +AC_OUTPUT diff --git a/m4/ax_check_zlib.m4 b/m4/ax_check_zlib.m4 new file mode 100644 index 0000000..d308183 --- /dev/null +++ b/m4/ax_check_zlib.m4 @@ -0,0 +1,133 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_check_zlib.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_ZLIB() +# +# DESCRIPTION +# +# This macro searches for an installed zlib library. If nothing was +# specified when calling configure, it searches first in /usr/local and +# then in /usr, /opt/local and /sw. If the --with-zlib=DIR is specified, +# it will try to find it in DIR/include/zlib.h and DIR/lib/libz.a. If +# --without-zlib is specified, the library is not searched at all. +# +# If either the header file (zlib.h) or the library (libz) is not found, +# the configuration exits on error, asking for a valid zlib installation +# directory or --without-zlib. +# +# The macro defines the symbol HAVE_LIBZ if the library is found. You +# should use autoheader to include a definition for this symbol in a +# config.h file. Sample usage in a C/C++ source is as follows: +# +# #ifdef HAVE_LIBZ +# #include +# #endif /* HAVE_LIBZ */ +# +# LICENSE +# +# Copyright (c) 2008 Loic Dachary +# Copyright (c) 2010 Bastien Chevreux +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see . +# +# As a special exception, the respective Autoconf Macro's copyright owner +# gives unlimited permission to copy, distribute and modify the configure +# scripts that are the output of Autoconf when processing the Macro. You +# need not follow the terms of the GNU General Public License when using +# or distributing such scripts, even though portions of the text of the +# Macro appear in them. The GNU General Public License (GPL) does govern +# all other use of the material that constitutes the Autoconf Macro. +# +# This special exception to the GPL applies to versions of the Autoconf +# Macro released by the Autoconf Archive. When you make and distribute a +# modified version of the Autoconf Macro, you may extend this special +# exception to the GPL to apply to your modified version as well. + +#serial 8 + +AU_ALIAS([CHECK_ZLIB], [AX_CHECK_ZLIB]) +AC_DEFUN([AX_CHECK_ZLIB], +# +# Handle user hints +# +[AC_MSG_CHECKING(if zlib is wanted) +AC_ARG_WITH(zlib, +[ --with-zlib=DIR root directory path of zlib installation [defaults to + /usr/local or /usr if not found in /usr/local] + --without-zlib to disable zlib usage completely], +[if test "$withval" != no ; then + zlib_places="/usr/local /usr /opt/local /sw" + AC_MSG_RESULT(yes) + if test -d "$withval" + then + zlib_places="$withval $zlib_places" + else + AC_MSG_WARN([Sorry, $withval does not exist, checking usual places]) + fi +else + AC_MSG_RESULT(no) +fi], +[AC_MSG_RESULT(yes)]) + +# +# Locate zlib, if wanted +# +if test -n "${zlib_places}" +then + # check the user supplied or any other more or less 'standard' place: + # Most UNIX systems : /usr/local and /usr + # MacPorts / Fink on OSX : /opt/local respectively /sw + for ZLIB_HOME in ${zlib_places} ; do + if test -f "${ZLIB_HOME}/include/zlib.h"; then break; fi + ZLIB_HOME="" + done + + # if zlib.h was nowhere to be found, give a notice and bail out + if test ! -n "${ZLIB_HOME}"; then + AC_MSG_ERROR(No zlib.h in any include directory of ${zlib_places}: either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib) + fi + + ZLIB_OLD_LDFLAGS=$LDFLAGS + ZLIB_OLD_CPPFLAGS=$LDFLAGS + LDFLAGS="$LDFLAGS -L${ZLIB_HOME}/lib" + CPPFLAGS="$CPPFLAGS -I${ZLIB_HOME}/include" + AC_LANG_SAVE + AC_LANG_C + AC_CHECK_LIB(z, inflateEnd, [zlib_cv_libz=yes], [zlib_cv_libz=no]) + AC_CHECK_HEADER(zlib.h, [zlib_cv_zlib_h=yes], [zlib_cv_zlib_h=no]) + AC_LANG_RESTORE + if test "$zlib_cv_libz" = "yes" -a "$zlib_cv_zlib_h" = "yes" + then + # + # If both library and header were found, use them + # + AC_CHECK_LIB(z, inflateEnd) + AC_MSG_CHECKING(zlib in ${ZLIB_HOME}) + AC_MSG_RESULT(ok) + else + # + # If either header or library was not found, revert and bomb + # + AC_MSG_CHECKING(zlib in ${ZLIB_HOME}) + LDFLAGS="$ZLIB_OLD_LDFLAGS" + CPPFLAGS="$ZLIB_OLD_CPPFLAGS" + AC_MSG_RESULT(failed) + AC_MSG_ERROR(either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib) + fi +fi + +]) diff --git a/src/Makefile.am b/src/Makefile.am new file mode 100644 index 0000000..9496a48 --- /dev/null +++ b/src/Makefile.am @@ -0,0 +1,6 @@ + +bin_PROGRAMS = fastq-grep +fastq_grep_SOURCES = kseq.h fastq-grep.c +fastq_grep_LDADD = $(PCRE_LIBS) -lz + + diff --git a/src/fastq-grep.c b/src/fastq-grep.c new file mode 100644 index 0000000..08402b4 --- /dev/null +++ b/src/fastq-grep.c @@ -0,0 +1,84 @@ + + +/* + * fastq-grep: regular expression searches of the sequences within a fastq file + * + * Febuary 2011 / Daniel Jones + * + */ + + +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + + +void print_help() +{ + fprintf( stderr, +"fastq-grep [OPTION]... PATTERN [FILE]...\n" +"Search for PATTERN in the read sequence in each FILE or standard input.\n" +"PATTERN, by default, is a perl compatible regular expression." + ); + +} + +static int invert_flag; +static int help_flag; + +int main(int argc, char* argv[]) +{ + + int opt; + int opt_idx; + + static struct option long_options[] = + { + {"help", no_argument, &help_flag, 1}, + {"invert-match", no_argument, &invert_flag, 1}, + {0, 0, 0, 0} + }; + + while (1) { + opt = getopt_long(argc, argv, "hv", long_options, &opt_idx); + + if( opt == -1 ) break; + + switch (opt) { + case 0: + if (long_options[opt_idx].flag != 0) break; + if (optarg) { + /* TODO */ + } + break; + + case 'h': + help_flag = 1; + break; + + case 'v': + invert_flag = 1; + break; + + default: + abort(); + } + } + + if (help_flag) { + print_help(); + return 0; + } + + /* TODO */ + + return 0; +} + + + diff --git a/src/kseq.h b/src/kseq.h new file mode 100644 index 0000000..bbe0125 --- /dev/null +++ b/src/kseq.h @@ -0,0 +1,223 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +/* Last Modified: 12APR2009 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_MAX 1 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin >= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (str->l == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif -- 2.39.2