From 9537fe433ae5b59162a2f0a0834398c9fb4b7907 Mon Sep 17 00:00:00 2001 From: quentin jones Date: Fri, 25 Feb 2011 14:48:23 -0800 Subject: [PATCH] fastq-grep works --- configure.ac | 3 +- src/fastq-grep.c | 120 +++++++++++++++++++++++++++++++++++++++++++++-- src/kseq.h | 2 +- 3 files changed, 119 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index cdde05e..44fda3e 100644 --- a/configure.ac +++ b/configure.ac @@ -33,7 +33,8 @@ then AC_MSG_ERROR([The PCRE library is needed. See http://www.pcre.org.]) fi CFLAGS="$CFLAGS $(pcre-config --cflags)" -PCRE_LIBS=$(pcre-config --libs) +PCRE_LIBS="$(pcre-config --libs)" +AC_SUBST(PCRE_LIBS) # check getopt AC_CHECK_HEADER(getopt.h, , diff --git a/src/fastq-grep.c b/src/fastq-grep.c index 08402b4..ca072eb 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -18,25 +18,90 @@ KSEQ_INIT(gzFile, gzread) +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) +# include +# include +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +#else +# define SET_BINARY_MODE(file) +#endif + + void print_help() { fprintf( stderr, "fastq-grep [OPTION]... PATTERN [FILE]...\n" -"Search for PATTERN in the read sequence in each FILE or standard input.\n" -"PATTERN, by default, is a perl compatible regular expression." +"Search for PATTERN in the read sequences in each FILE or standard input.\n" +"PATTERN, by default, is a perl compatible regular expression.\n\n" +"Options:\n" +" -h, --help print this message\n" +" -v, --invert-match select nonmatching entries\n" ); - } static int invert_flag; static int help_flag; +static int zout_flag; + + + +void print_fastq_entry( FILE* fout, kseq_t* seq ) +{ + fprintf(fout, "@%s\n%s\n+%s\n%s\n", + seq->name.s, + seq->seq.s, + seq->comment.s, + seq->qual.s ); +} + + +void fastq_grep( gzFile fin, FILE* fout, pcre* re ) +{ + int rc; + int ovector[3]; + + kseq_t* seq; + seq = kseq_init(fin); + + while (kseq_read(seq) >= 0) { + rc = pcre_exec(re, /* pattern */ + NULL, /* extre data */ + seq->seq.s, /* subject */ + seq->seq.l, /* subject length */ + 0, /* subject offset */ + 0, /* options */ + ovector, /* output vector */ + 3 ); /* output vector length */ + + if ((invert_flag && rc == PCRE_ERROR_NOMATCH) || rc >= 0) { + print_fastq_entry( fout, seq ); + } + } + + kseq_destroy(seq); +} + + int main(int argc, char* argv[]) { + const char* pat; + pcre* re; + const char* pat_error; + int pat_error_offset; + + FILE* fin; + gzFile gzfin; + + + invert_flag = 0; + help_flag = 0; + zout_flag = 0; int opt; int opt_idx; + static struct option long_options[] = { {"help", no_argument, &help_flag, 1}, @@ -75,7 +140,54 @@ int main(int argc, char* argv[]) return 0; } - /* TODO */ + if (optind >= argc) { + fprintf(stderr, "A pattern must be specified.\n"); + return 1; + } + + pat = argv[optind++]; + re = pcre_compile( pat, PCRE_CASELESS, &pat_error, &pat_error_offset, NULL ); + + + if (re == NULL) { + fprintf(stderr, "Syntax error in PCRE pattern at offset: %d: %s\n", + pat_error_offset, pat_error ); + return 1; + } + + + if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { + gzfin = gzdopen( fileno(stdin), "rb" ); + if (gzfin == NULL) { + fprintf(stderr, "Malformed file 'stdin'.\n"); + return 1; + } + + fastq_grep(gzfin, stdout, re); + + gzclose(gzfin); + } + else { + for (; optind < argc; optind++) { + fin = fopen(argv[optind], "rb"); + if (fin == NULL) { + fprintf(stderr, "No such file '%s'.\n", argv[optind]); + continue; + } + + gzfin = gzdopen(fileno(fin), "rb"); + if (gzfin == NULL) { + fprintf(stderr, "Malformed file '%s'.\n", argv[optind]); + continue; + } + + fastq_grep(gzfin, stdout, re); + + gzclose(gzfin); + } + } + + pcre_free(re); return 0; } diff --git a/src/kseq.h b/src/kseq.h index bbe0125..4fafd85 100644 --- a/src/kseq.h +++ b/src/kseq.h @@ -197,7 +197,7 @@ typedef struct __kstring_t { seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + ks_getuntil(ks, '\n', &seq->comment, &c); \ if (c == -1) return -2; /* we should not stop here */ \ while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ -- 2.39.2