]> git.donarmstrong.com Git - fastq-tools.git/commitdiff
fastq-grep works
authorquentin jones <dcjones@datahorse.(none)>
Fri, 25 Feb 2011 22:48:23 +0000 (14:48 -0800)
committerquentin jones <dcjones@datahorse.(none)>
Fri, 25 Feb 2011 22:48:23 +0000 (14:48 -0800)
configure.ac
src/fastq-grep.c
src/kseq.h

index cdde05e4736e4f3c1a1ec64cc23fd07010331c40..44fda3e630e888369e588adad57507c407a1c199 100644 (file)
@@ -33,7 +33,8 @@ then
     AC_MSG_ERROR([The PCRE library is needed. See http://www.pcre.org.])
 fi
 CFLAGS="$CFLAGS $(pcre-config --cflags)"
-PCRE_LIBS=$(pcre-config --libs)
+PCRE_LIBS="$(pcre-config --libs)"
+AC_SUBST(PCRE_LIBS)
 
 # check getopt
 AC_CHECK_HEADER(getopt.h, ,
index 08402b40b5876f086badb53373fe5c9dcd9d3b22..ca072ebc05fd234cab16e1190aa9a2ac1c9e7716 100644 (file)
 KSEQ_INIT(gzFile, gzread)
 
 
+#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
+#  include <fcntl.h>
+#  include <io.h>
+#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
+#else
+#  define SET_BINARY_MODE(file)
+#endif
+
+
 void print_help()
 {
     fprintf( stderr, 
 "fastq-grep [OPTION]... PATTERN [FILE]...\n"
-"Search for PATTERN in the read sequence in each FILE or standard input.\n"
-"PATTERN, by default, is a perl compatible regular expression."
+"Search for PATTERN in the read sequences in each FILE or standard input.\n"
+"PATTERN, by default, is a perl compatible regular expression.\n\n"
+"Options:\n"
+"  -h, --help              print this message\n"
+"  -v, --invert-match      select nonmatching entries\n"
     );
-
 }
 
 static int invert_flag;
 static int help_flag;
+static int zout_flag;
+
+
+
+void print_fastq_entry( FILE* fout, kseq_t* seq )
+{
+    fprintf(fout, "@%s\n%s\n+%s\n%s\n",
+                  seq->name.s,
+                  seq->seq.s,
+                  seq->comment.s,
+                  seq->qual.s );
+}
+
+
+void fastq_grep( gzFile fin, FILE* fout, pcre* re )
+{
+    int rc;
+    int ovector[3];
+
+    kseq_t* seq;
+    seq = kseq_init(fin);
+
+    while (kseq_read(seq) >= 0) {
+        rc = pcre_exec(re,          /* pattern */
+                       NULL,        /* extre data */
+                       seq->seq.s,  /* subject */
+                       seq->seq.l,  /* subject length */
+                       0,           /* subject offset */
+                       0,           /* options */
+                       ovector,     /* output vector */
+                       3         ); /* output vector length */
+
+        if ((invert_flag && rc == PCRE_ERROR_NOMATCH) || rc >= 0) {
+            print_fastq_entry( fout, seq );
+        }
+    }
+
+    kseq_destroy(seq);
+}
+
+
 
 int main(int argc, char* argv[])
 {
+    const char* pat;
+    pcre* re;
+    const char* pat_error;
+    int pat_error_offset;
+
+    FILE*  fin;
+    gzFile gzfin;
+
+
+    invert_flag = 0;
+    help_flag   = 0;
+    zout_flag   = 0;
 
     int opt;
     int opt_idx;
 
+
     static struct option long_options[] =
         { 
           {"help", no_argument, &help_flag, 1},
@@ -75,7 +140,54 @@ int main(int argc, char* argv[])
         return 0;
     }
 
-    /* TODO */
+    if (optind >= argc) {
+        fprintf(stderr, "A pattern must be specified.\n");
+        return 1;
+    }
+
+    pat = argv[optind++];
+    re = pcre_compile( pat, PCRE_CASELESS, &pat_error, &pat_error_offset, NULL );
+
+
+    if (re == NULL) {
+        fprintf(stderr, "Syntax error in PCRE pattern at offset: %d: %s\n",
+                pat_error_offset, pat_error );
+        return 1;
+    }
+
+
+    if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
+        gzfin = gzdopen( fileno(stdin), "rb" );
+        if (gzfin == NULL) {
+            fprintf(stderr, "Malformed file 'stdin'.\n");
+            return 1;
+        }
+
+        fastq_grep(gzfin, stdout, re);
+
+        gzclose(gzfin);
+    }
+    else {
+        for (; optind < argc; optind++) {
+            fin = fopen(argv[optind], "rb");
+            if (fin == NULL) {
+                fprintf(stderr, "No such file '%s'.\n", argv[optind]);
+                continue;
+            }
+
+            gzfin = gzdopen(fileno(fin), "rb");
+            if (gzfin == NULL) {
+                fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
+                continue;
+            }
+
+            fastq_grep(gzfin, stdout, re);
+
+            gzclose(gzfin);
+        }
+    }
+
+    pcre_free(re);
 
     return 0;
 }
index bbe012552ff3b2ac047ddd10655e80827bbb4008..4fafd8546408269ca1f794de3e759e0218256e0b 100644 (file)
@@ -197,7 +197,7 @@ typedef struct __kstring_t {
                        seq->qual.m = seq->seq.m;                                                                       \
                        seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
                }                                                                                                                               \
-               while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+        ks_getuntil(ks, '\n', &seq->comment, &c);                        \
                if (c == -1) return -2; /* we should not stop here */                   \
                while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
                        if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \