4 * fastq-grep: regular expression searches of the sequences within a fastq file
6 * February 2011 / Daniel Jones <dcjones@cs.washington.edu>
/* Invoke the KSEQ_INIT macro with gzFile as the stream type and gzread as the
 * read function.  NOTE(review): the macro itself is defined outside this
 * excerpt; presumably it generates the kseq_t reader used by fastq_grep
 * below -- confirm against kseq.h. */
18 KSEQ_INIT(gzFile, gzread)
/* SET_BINARY_MODE(file): when any of MSDOS, OS2, WIN32 or __CYGWIN__ is
 * defined, switch the given stdio stream to binary mode via
 * setmode(fileno(file), O_BINARY). */
21 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
24 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
/* Empty definition: the macro expands to nothing.  NOTE(review): presumably
 * this sits under the #else branch for other platforms; the directive is not
 * visible in this excerpt. */
26 # define SET_BINARY_MODE(file)
33 "fastq-grep [OPTION]... PATTERN [FILE]...\n"
34 "Search for PATTERN in the read sequences in each FILE or standard input.\n"
35 "PATTERN, by default, is a perl compatible regular expression.\n\n"
37 " -h, --help print this message\n"
38 " -v, --invert-match select nonmatching entries\n"
39 " -c, --count output only the number of matching sequences\n"
/* Set to 1 by the --invert-match long option (see long_options in main);
 * read by fastq_grep when deciding which entries to select. */
43 static int invert_flag;
/* Set to 1 by the --count long option (see long_options in main); when set,
 * fastq_grep counts selected entries and prints the total instead of
 * printing the entries themselves. */
45 static int count_flag;
/*
 * Write one entry to fout as four newline-terminated lines: a line starting
 * with '@', a plain line, a line starting with '+', and a final plain line.
 *
 * NOTE(review): the arguments passed to fprintf are not visible in this
 * excerpt; presumably they are the name, sequence, comment/name and quality
 * fields of seq -- confirm.
 */
49 void print_fastq_entry( FILE* fout, kseq_t* seq )
51 fprintf(fout, "@%s\n%s\n+%s\n%s\n",
/*
 * Read entries from fin one at a time and test each entry's sequence against
 * the compiled pattern re.
 *
 * An entry is selected when the pattern matches, or, if invert_flag is set,
 * when the pattern does NOT match.  Selected entries are printed to fout via
 * print_fastq_entry, unless count_flag is set, in which case they are only
 * counted and the total is written to fout after the last entry.
 *
 * fin  - input stream the entries are read from
 * fout - destination for the selected entries or the final count
 * re   - compiled PCRE pattern
 */
59 void fastq_grep( gzFile fin, FILE* fout, pcre* re )
68 while (kseq_read(seq) >= 0) {
69 rc = pcre_exec(re, /* pattern */
70 NULL, /* extra data */
71 seq->seq.s, /* subject */
72 seq->seq.l, /* subject length */
73 0, /* subject offset */
75 ovector, /* output vector */
76 3 ); /* output vector length */
/* pcre_exec returns >= 0 on a match and PCRE_ERROR_NOMATCH when nothing
 * matched.  The match arm must be gated on !invert_flag: previously a bare
 * "rc >= 0" selected matching entries even with -v, so inversion selected
 * every entry.  Other negative (error) codes select nothing either way. */
78 if ((invert_flag && rc == PCRE_ERROR_NOMATCH) || (!invert_flag && rc >= 0)) {
79 if (count_flag) count++;
80 else print_fastq_entry(fout, seq);
/* In count mode the only output is the total number of selected entries. */
86 if (count_flag) fprintf(fout, "%zu\n", count);
/*
 * Entry point: parse options, compile the PATTERN argument as a PCRE regular
 * expression, then run fastq_grep over standard input or over each file
 * named on the command line, writing results to stdout.
 */
91 int main(int argc, char* argv[])
/* Put stdin/stdout into binary mode where SET_BINARY_MODE is non-empty. */
93 SET_BINARY_MODE(stdin);
94 SET_BINARY_MODE(stdout);
/* Receives the error message from pcre_compile when the pattern is invalid. */
98 const char* pat_error;
/* Long options: each one stores 1 into its flag variable when seen. */
113 static struct option long_options[] =
115 {"help", no_argument, &help_flag, 1},
116 {"invert-match", no_argument, &invert_flag, 1},
117 {"count", no_argument, &count_flag, 1},
/* Short options accepted: -h, -v, -c (none take an argument). */
122 opt = getopt_long(argc, argv, "hvc", long_options, &opt_idx);
/* getopt_long returns -1 once all options have been consumed. */
124 if( opt == -1 ) break;
/* A long option with a non-null flag pointer has already stored its value,
 * so there is nothing further to do for it. */
128 if (long_options[opt_idx].flag != 0) break;
/* The first non-option argument is the pattern and is mandatory. */
158 if (optind >= argc) {
159 fprintf(stderr, "A pattern must be specified.\n");
/* Compile the pattern with PCRE_CASELESS, i.e. matching ignores case. */
163 pat = argv[optind++];
164 re = pcre_compile( pat, PCRE_CASELESS, &pat_error, &pat_error_offset, NULL );
168 fprintf(stderr, "Syntax error in PCRE pattern at offset: %d: %s\n",
169 pat_error_offset, pat_error );
/* No file arguments, or a single "-": read from standard input. */
174 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
/* Wrap the stdin file descriptor in a gzFile handle for reading. */
175 gzfin = gzdopen( fileno(stdin), "rb" );
177 fprintf(stderr, "Malformed file 'stdin'.\n");
181 fastq_grep(gzfin, stdout, re);
/* Otherwise process every remaining argument as an input file, in order. */
186 for (; optind < argc; optind++) {
187 fin = fopen(argv[optind], "rb");
/* NOTE(review): the condition guarding this message is not visible here;
 * presumably it is printed for any fopen failure, in which case causes other
 * than a missing file (e.g. permissions) get the same text -- confirm. */
189 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
/* Wrap the opened file's descriptor in a gzFile handle for reading. */
193 gzfin = gzdopen(fileno(fin), "rb");
195 fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
199 fastq_grep(gzfin, stdout, re);