]> git.donarmstrong.com Git - fastq-tools.git/commitdiff
initial work on some useful fastq tools
authorquentin jones <dcjones@datahorse.(none)>
Fri, 25 Feb 2011 20:46:17 +0000 (12:46 -0800)
committerquentin jones <dcjones@datahorse.(none)>
Fri, 25 Feb 2011 20:46:17 +0000 (12:46 -0800)
Makefile.am [new file with mode: 0644]
configure.ac [new file with mode: 0644]
m4/ax_check_zlib.m4 [new file with mode: 0644]
src/Makefile.am [new file with mode: 0644]
src/fastq-grep.c [new file with mode: 0644]
src/kseq.h [new file with mode: 0644]

diff --git a/Makefile.am b/Makefile.am
new file mode 100644 (file)
index 0000000..0576973
--- /dev/null
@@ -0,0 +1,4 @@
+
+ACLOCAL_AMFLAGS = -I m4
+SUBDIRS = src
+
diff --git a/configure.ac b/configure.ac
new file mode 100644 (file)
index 0000000..cdde05e
--- /dev/null
@@ -0,0 +1,48 @@
+
+AC_INIT( [fastq-tools], [0.1], [dcjones@cs.washington.ed] )
+AM_INIT_AUTOMAKE( [foreign -Wall -Werror] )
+m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
+
+AC_CONFIG_MACRO_DIR([m4])
+
+AC_PROG_CC
+AM_PROG_CC_C_O
+
+opt_CFLAGS="-g -O3"
+dbg_CFLAGS="-g -O0"
+
+AC_ARG_ENABLE([debug],
+              [AS_HELP_STRING([--enable-debug],
+                              [enable debugging info (default is no)])],
+              [], [enable_debug=no])
+
+AS_IF([test "x$enable_debug" = xyes],
+      [CFLAGS="$dbg_CFLAGS"],
+      [CFLAGS="$opt_CFLAGS"])
+
+AC_DEFINE(_FILE_OFFSET_BITS, 64)
+
+
+# check zlib
+AX_CHECK_ZLIB
+
+# check pcre
+AC_CHECK_PROG(HAVE_PCRE, pcre-config, yes, no)
+if test "x$HAVE_PCRE" = "xno"
+then
+    AC_MSG_ERROR([The PCRE library is needed. See http://www.pcre.org.])
+fi
+CFLAGS="$CFLAGS $(pcre-config --cflags)"
+PCRE_LIBS=$(pcre-config --libs)
+
+# check getopt
+AC_CHECK_HEADER(getopt.h, ,
+                AC_MSG_ERROR([The posix getopt.h header is needed.]))
+
+
+CXXFLAGS=$CFLAGS
+
+AC_CONFIG_FILES( [Makefile
+                  src/Makefile] )
+
+AC_OUTPUT
diff --git a/m4/ax_check_zlib.m4 b/m4/ax_check_zlib.m4
new file mode 100644 (file)
index 0000000..d308183
--- /dev/null
@@ -0,0 +1,133 @@
+# ===========================================================================
+#       http://www.gnu.org/software/autoconf-archive/ax_check_zlib.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_ZLIB()
+#
+# DESCRIPTION
+#
+#   This macro searches for an installed zlib library. If nothing was
+#   specified when calling configure, it searches first in /usr/local and
+#   then in /usr, /opt/local and /sw. If the --with-zlib=DIR is specified,
+#   it will try to find it in DIR/include/zlib.h and DIR/lib/libz.a. If
+#   --without-zlib is specified, the library is not searched at all.
+#
+#   If either the header file (zlib.h) or the library (libz) is not found,
+#   the configuration exits on error, asking for a valid zlib installation
+#   directory or --without-zlib.
+#
+#   The macro defines the symbol HAVE_LIBZ if the library is found. You
+#   should use autoheader to include a definition for this symbol in a
+#   config.h file. Sample usage in a C/C++ source is as follows:
+#
+#     #ifdef HAVE_LIBZ
+#     #include <zlib.h>
+#     #endif /* HAVE_LIBZ */
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Loic Dachary <loic@senga.org>
+#   Copyright (c) 2010 Bastien Chevreux <bach@chevreux.org>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([CHECK_ZLIB], [AX_CHECK_ZLIB])
+AC_DEFUN([AX_CHECK_ZLIB],
+#
+# Handle user hints
+#
+[AC_MSG_CHECKING(if zlib is wanted)
+AC_ARG_WITH(zlib,
+[  --with-zlib=DIR root directory path of zlib installation [defaults to
+                    /usr/local or /usr if not found in /usr/local]
+  --without-zlib to disable zlib usage completely],
+[if test "$withval" != no ; then
+  zlib_places="/usr/local /usr /opt/local /sw"
+  AC_MSG_RESULT(yes)
+  if test -d "$withval"
+  then
+    zlib_places="$withval $zlib_places"
+  else
+    AC_MSG_WARN([Sorry, $withval does not exist, checking usual places])
+  fi
+else
+  AC_MSG_RESULT(no)
+fi],
+[AC_MSG_RESULT(yes)])
+
+#
+# Locate zlib, if wanted
+#
+if test -n "${zlib_places}"
+then
+       # check the user supplied or any other more or less 'standard' place:
+       #   Most UNIX systems      : /usr/local and /usr
+       #   MacPorts / Fink on OSX : /opt/local respectively /sw
+       for ZLIB_HOME in ${zlib_places} ; do
+         if test -f "${ZLIB_HOME}/include/zlib.h"; then break; fi
+         ZLIB_HOME=""
+       done
+
+       # if zlib.h was nowhere to be found, give a notice and bail out
+       if test ! -n "${ZLIB_HOME}"; then
+          AC_MSG_ERROR(No zlib.h in any include directory of ${zlib_places}: either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib)
+       fi
+
+        ZLIB_OLD_LDFLAGS=$LDFLAGS
+        ZLIB_OLD_CPPFLAGS=$LDFLAGS
+        LDFLAGS="$LDFLAGS -L${ZLIB_HOME}/lib"
+        CPPFLAGS="$CPPFLAGS -I${ZLIB_HOME}/include"
+        AC_LANG_SAVE
+        AC_LANG_C
+        AC_CHECK_LIB(z, inflateEnd, [zlib_cv_libz=yes], [zlib_cv_libz=no])
+        AC_CHECK_HEADER(zlib.h, [zlib_cv_zlib_h=yes], [zlib_cv_zlib_h=no])
+        AC_LANG_RESTORE
+        if test "$zlib_cv_libz" = "yes" -a "$zlib_cv_zlib_h" = "yes"
+        then
+                #
+                # If both library and header were found, use them
+                #
+                AC_CHECK_LIB(z, inflateEnd)
+                AC_MSG_CHECKING(zlib in ${ZLIB_HOME})
+                AC_MSG_RESULT(ok)
+        else
+                #
+                # If either header or library was not found, revert and bomb
+                #
+                AC_MSG_CHECKING(zlib in ${ZLIB_HOME})
+                LDFLAGS="$ZLIB_OLD_LDFLAGS"
+                CPPFLAGS="$ZLIB_OLD_CPPFLAGS"
+                AC_MSG_RESULT(failed)
+                AC_MSG_ERROR(either specify a valid zlib installation with --with-zlib=DIR or disable zlib usage with --without-zlib)
+        fi
+fi
+
+])
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644 (file)
index 0000000..9496a48
--- /dev/null
@@ -0,0 +1,6 @@
+
+bin_PROGRAMS = fastq-grep
+fastq_grep_SOURCES = kseq.h fastq-grep.c
+fastq_grep_LDADD = $(PCRE_LIBS) -lz
+
+
diff --git a/src/fastq-grep.c b/src/fastq-grep.c
new file mode 100644 (file)
index 0000000..08402b4
--- /dev/null
@@ -0,0 +1,84 @@
+
+
+/* 
+ * fastq-grep: regular expression searches of the sequences within a fastq file
+ *
+ * Febuary 2011 / Daniel Jones <dcjones@cs.washington.edu> 
+ *
+ */
+
+
+#include <stdio.h>
+#include <string.h>
+#include <getopt.h>
+#include <zlib.h>
+#include <pcre.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+
+void print_help()
+{
+    fprintf( stderr, 
+"fastq-grep [OPTION]... PATTERN [FILE]...\n"
+"Search for PATTERN in the read sequence in each FILE or standard input.\n"
+"PATTERN, by default, is a perl compatible regular expression."
+    );
+
+}
+
+static int invert_flag;
+static int help_flag;
+
+int main(int argc, char* argv[])
+{
+
+    int opt;
+    int opt_idx;
+
+    static struct option long_options[] =
+        { 
+          {"help", no_argument, &help_flag, 1},
+          {"invert-match", no_argument, &invert_flag, 1},
+          {0, 0, 0, 0}
+        };
+
+    while (1) {
+        opt = getopt_long(argc, argv, "hv", long_options, &opt_idx);
+
+        if( opt == -1 ) break;
+
+        switch (opt) {
+            case 0:
+                if (long_options[opt_idx].flag != 0) break;
+                if (optarg) {
+                    /* TODO */
+                }
+                break;
+
+            case 'h':
+                help_flag = 1;
+                break;
+
+            case 'v':
+                invert_flag = 1;
+                break;
+
+            default:
+                abort();
+        }
+    }
+
+    if (help_flag) {
+        print_help();
+        return 0;
+    }
+
+    /* TODO */
+
+    return 0;
+}
+
+
+
diff --git a/src/kseq.h b/src/kseq.h
new file mode 100644 (file)
index 0000000..bbe0125
--- /dev/null
@@ -0,0 +1,223 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/* Last Modified: 12APR2009 */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
+#define KS_SEP_TAB   1 // isspace() && !' '
+#define KS_SEP_MAX   1
+
+#define __KS_TYPE(type_t)                                              \
+       typedef struct __kstream_t {                            \
+               char *buf;                                                              \
+               int begin, end, is_eof;                                 \
+               type_t f;                                                               \
+       } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
+
+#define __KS_BASIC(type_t, __bufsize)                                                          \
+       static inline kstream_t *ks_init(type_t f)                                              \
+       {                                                                                                                               \
+               kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
+               ks->f = f;                                                                                                      \
+               ks->buf = (char*)malloc(__bufsize);                                                     \
+               return ks;                                                                                                      \
+       }                                                                                                                               \
+       static inline void ks_destroy(kstream_t *ks)                                    \
+       {                                                                                                                               \
+               if (ks) {                                                                                                       \
+                       free(ks->buf);                                                                                  \
+                       free(ks);                                                                                               \
+               }                                                                                                                       \
+       }
+
+#define __KS_GETC(__read, __bufsize)                                           \
+       static inline int ks_getc(kstream_t *ks)                                \
+       {                                                                                                               \
+               if (ks->is_eof && ks->begin >= ks->end) return -1;      \
+               if (ks->begin >= ks->end) {                                                     \
+                       ks->begin = 0;                                                                  \
+                       ks->end = __read(ks->f, ks->buf, __bufsize);    \
+                       if (ks->end < __bufsize) ks->is_eof = 1;                \
+                       if (ks->end == 0) return -1;                                    \
+               }                                                                                                       \
+               return (int)ks->buf[ks->begin++];                                       \
+       }
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+       size_t l, m;
+       char *s;
+} kstring_t;
+#endif
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#define __KS_GETUNTIL(__read, __bufsize)                                                               \
+       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       {                                                                                                                                       \
+               if (dret) *dret = 0;                                                                                    \
+               str->l = 0;                                                                                                             \
+               if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
+               for (;;) {                                                                                                              \
+                       int i;                                                                                                          \
+                       if (ks->begin >= ks->end) {                                                                     \
+                               if (!ks->is_eof) {                                                                              \
+                                       ks->begin = 0;                                                                          \
+                                       ks->end = __read(ks->f, ks->buf, __bufsize);            \
+                                       if (ks->end < __bufsize) ks->is_eof = 1;                        \
+                                       if (ks->end == 0) break;                                                        \
+                               } else break;                                                                                   \
+                       }                                                                                                                       \
+                       if (delimiter > KS_SEP_MAX) {                                                           \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (ks->buf[i] == delimiter) break;                                     \
+                       } else if (delimiter == KS_SEP_SPACE) {                                         \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (isspace(ks->buf[i])) break;                                         \
+                       } else if (delimiter == KS_SEP_TAB) {                                           \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
+                       } else i = 0; /* never come to here! */                                         \
+                       if (str->m - str->l < i - ks->begin + 1) {                                      \
+                               str->m = str->l + (i - ks->begin) + 1;                                  \
+                               kroundup32(str->m);                                                                             \
+                               str->s = (char*)realloc(str->s, str->m);                                \
+                       }                                                                                                                       \
+                       memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+                       str->l = str->l + (i - ks->begin);                                                      \
+                       ks->begin = i + 1;                                                                                      \
+                       if (i < ks->end) {                                                                                      \
+                               if (dret) *dret = ks->buf[i];                                                   \
+                               break;                                                                                                  \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (str->l == 0) {                                                                                              \
+                       str->m = 1;                                                                                                     \
+                       str->s = (char*)calloc(1, 1);                                                           \
+               }                                                                                                                               \
+               str->s[str->l] = '\0';                                                                                  \
+               return str->l;                                                                                                  \
+       }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) \
+       __KS_TYPE(type_t)                                                       \
+       __KS_BASIC(type_t, __bufsize)                           \
+       __KS_GETC(__read, __bufsize)                            \
+       __KS_GETUNTIL(__read, __bufsize)
+
+#define __KSEQ_BASIC(type_t)                                                                                   \
+       static inline kseq_t *kseq_init(type_t fd)                                                      \
+       {                                                                                                                                       \
+               kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
+               s->f = ks_init(fd);                                                                                             \
+               return s;                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kseq_rewind(kseq_t *ks)                                                      \
+       {                                                                                                                                       \
+               ks->last_char = 0;                                                                                              \
+               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
+       }                                                                                                                                       \
+       static inline void kseq_destroy(kseq_t *ks)                                                     \
+       {                                                                                                                                       \
+               if (!ks) return;                                                                                                \
+               free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+               ks_destroy(ks->f);                                                                                              \
+               free(ks);                                                                                                               \
+       }
+
+/* Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ                                                                                                            \
+       static int kseq_read(kseq_t *seq)                                                                       \
+       {                                                                                                                                       \
+               int c;                                                                                                                  \
+               kstream_t *ks = seq->f;                                                                                 \
+               if (seq->last_char == 0) { /* then jump to the next header line */ \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
+                       if (c == -1) return -1; /* end of file */                                       \
+                       seq->last_char = c;                                                                                     \
+               } /* the first header char has been read */                                             \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
+               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
+               while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                       if (isgraph(c)) { /* printable non-space character */           \
+                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+                                       seq->seq.m = seq->seq.l + 2;                                            \
+                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+                               }                                                                                                               \
+                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
+               if (c != '+') return seq->seq.l; /* FASTA */                                    \
+               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
+                       seq->qual.m = seq->seq.m;                                                                       \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
+               }                                                                                                                               \
+               while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+               if (c == -1) return -2; /* we should not stop here */                   \
+               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
+                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
+               seq->last_char = 0;     /* we have not come to the next header line */ \
+               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+               return seq->seq.l;                                                                                              \
+       }
+
+#define __KSEQ_TYPE(type_t)                                            \
+       typedef struct {                                                        \
+               kstring_t name, comment, seq, qual;             \
+               int last_char;                                                  \
+               kstream_t *f;                                                   \
+       } kseq_t;
+
+#define KSEQ_INIT(type_t, __read)                              \
+       KSTREAM_INIT(type_t, __read, 4096)                      \
+       __KSEQ_TYPE(type_t)                                                     \
+       __KSEQ_BASIC(type_t)                                            \
+       __KSEQ_READ
+
+#endif