3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Regular expression searches of the sequences within a FASTQ file.
/*
 * SET_BINARY_MODE(file): on the DOS-family targets tested below, switch the
 * stream's descriptor to binary mode with setmode(); the second definition
 * expands to nothing.
 * NOTE(review): the directives that separate and close the two definitions
 * (presumably #else / #endif) and any platform includes are not visible in
 * this excerpt.
 */
22 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
25 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
27 # define SET_BINARY_MODE(file)
/* Program name; main() passes it to print_version(). */
31 static const char* prog_name = "fastq-grep";
/*
 * Usage text: synopsis, a short description, then one entry per option.
 * NOTE(review): the statement that prints these literals is not visible in
 * this excerpt.
 */
37 "fastq-grep [OPTION]... PATTERN [FILE]...\n"
38 "Search for PATTERN in the read sequences in each FILE or standard input.\n"
39 "PATTERN, by default, is a perl compatible regular expression.\n\n"
41 " -i, --id match the read id (by default, sequence is matched)\n"
42 " -v, --invert-match select nonmatching entries\n"
43 " -m, --mismatches=FILE output mismatching entries to the given file\n"
44 " -c, --count output only the number of matching sequences\n"
45 " -h, --help print this message\n"
46 " -V, --version output version information and exit\n"
/*
 * Option flags. main() hands their addresses to getopt_long() through the
 * long_options table, and fastq_grep() reads them for every record.
 * NOTE(review): id_flag is used the same way but its declaration is not
 * visible in this excerpt.
 */
/* Set by --invert-match: select records the pattern does NOT match. */
50 static int invert_flag;
/* Set by --count: print only the number of selected records. */
51 static int count_flag;
/*
 * Filter the FASTQ records read from `fin` with the compiled pattern `re`.
 *
 * For each record the pattern is run against the read id (id_flag set) or
 * the read sequence (id_flag clear). A record is "selected" when the pattern
 * matches or, with invert_flag, when pcre_exec() reports PCRE_ERROR_NOMATCH.
 * Selected records are printed to `fout`, or only counted when count_flag is
 * set. Every other record is printed to `mismatch_file` when that is
 * non-NULL.
 *
 *   fin            input stream handed to fastq_create()
 *   fout           receives the selected records, or the count
 *   mismatch_file  optional sink for non-selected records (may be NULL)
 *   re             compiled PCRE pattern
 *
 * NOTE(review): this excerpt omits lines of the function: the declarations
 * of rc, ovector and count, a line of the pcre_exec() call between the
 * subject offset and the output vector, and whatever follows the loop. The
 * release of fqf and seq therefore cannot be confirmed from here.
 */
56 void fastq_grep(FILE* fin, FILE* fout, FILE* mismatch_file, pcre* re)
/* Reader over `fin` plus one record object that is refilled on each iteration. */
62 fastq_t* fqf = fastq_create(fin);
63 seq_t* seq = seq_create();
/* One iteration per record; the loop ends when fastq_read() returns 0. */
65 while (fastq_read(fqf, seq)) {
67 rc = pcre_exec(re, /* pattern */
68 NULL, /* extra data */
/* Subject text and its length: the read id with id_flag, else the sequence. */
69 id_flag ? seq->id1.s : seq->seq.s,
70 id_flag ? seq->id1.n : seq->seq.n,
71 0, /* subject offset */
73 ovector, /* output vector */
74 3 ); /* output vector length */
/*
 * Selection test. Negative results other than PCRE_ERROR_NOMATCH satisfy
 * neither side, so such records are treated as non-selected in both modes.
 */
76 if ((invert_flag && rc == PCRE_ERROR_NOMATCH) || (!invert_flag && rc >= 0)) {
77 if (count_flag) count++;
78 else fastq_print(fout, seq);
/* Non-selected records are kept only when a mismatch file was supplied. */
80 else if (mismatch_file) {
81 fastq_print(mismatch_file, seq);
/* In count mode the only output on fout is this single total. */
88 if (count_flag) fprintf(fout, "%zu\n", count);
/*
 * Entry point: parse the options, compile PATTERN, then run fastq_grep()
 * over standard input or over each FILE operand.
 * NOTE(review): many lines of main() (other locals, the option switch,
 * return statements, the closing brace) are not visible in this excerpt.
 */
93 int main(int argc, char* argv[])
/* Expands to nothing except on the DOS-family targets, where it selects binary mode. */
95 SET_BINARY_MODE(stdin);
96 SET_BINARY_MODE(stdout);
/* Filled in by pcre_compile(); printed when the pattern is rejected. */
100 const char* pat_error;
101 int pat_error_offset;
/* Sink for non-selected records; assigned only where the option's file is opened. */
113 FILE* mismatch_file = NULL;
/*
 * Long-option table for getopt_long(). Entries with a flag pointer make
 * getopt_long() store the last field (1) in that variable and return 0.
 * Entries with a NULL flag make it return the listed character, the same
 * value as the matching short option.
 * NOTE(review): getopt_long() requires an all-zero terminating entry; it is
 * not visible in this excerpt. Confirm it is present.
 */
115 static struct option long_options[] =
117 {"id", no_argument, &id_flag, 1},
118 {"invert-match", no_argument, &invert_flag, 1},
119 {"mismatches", required_argument, NULL, 'm'},
120 {"count", no_argument, &count_flag, 1},
121 {"help", no_argument, NULL, 'h'},
122 {"version", no_argument, NULL, 'V'},
/*
 * Fetch the next option. In the short-option string 'm' is followed by ':'
 * because -m takes a FILE argument, as --mismatches does (required_argument).
 * Without the colon, "-m FILE" is parsed as an option with no argument, so
 * optarg does not carry FILE when fopen() is called below.
 */
127 opt = getopt_long(argc, argv, "ivm:chV", long_options, &opt_idx);
/* -1: every option has been consumed. */
129 if (opt == -1) break;
/* A long option that already stored its value through its flag pointer needs no further work. */
133 if (long_options[opt_idx].flag != 0) break;
/*
 * Open the file that receives non-selected records, truncating it ("w").
 * NOTE(review): presumably the 'm' branch; the case label and what follows
 * the error message are not visible in this excerpt.
 */
147 mismatch_file = fopen(optarg, "w");
148 if (mismatch_file == NULL) {
149 fprintf(stderr, "No such file '%s'.\n", optarg);
/* NOTE(review): presumably the 'V' branch; the case label is not visible. */
163 print_version(stdout, prog_name);
/* After option parsing, the first remaining argument is the mandatory pattern. */
174 if (optind >= argc) {
175 fprintf(stderr, "A pattern must be specified.\n");
/*
 * Take the pattern, advance optind past it, and compile it
 * case-insensitively (PCRE_CASELESS). The final NULL is pcre_compile()'s
 * character-table argument.
 */
179 pat = argv[optind++];
180 re = pcre_compile( pat, PCRE_CASELESS, &pat_error, &pat_error_offset, NULL );
/*
 * Report the offset and message produced by pcre_compile().
 * NOTE(review): the test guarding this message is not visible in this excerpt.
 */
184 fprintf(stderr, "Syntax error in PCRE pattern at offset: %d: %s\n",
185 pat_error_offset, pat_error );
/*
 * With no FILE operands, or with "-" as the only operand, filter standard
 * input. A "-" among several operands is not special-cased: it reaches the
 * fopen() in the loop below like any other name.
 */
190 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
191 fastq_grep(stdin, stdout, mismatch_file, re);
/*
 * Otherwise filter each named file in turn. All output goes to stdout, and
 * because fastq_grep() prints its own count, --count yields one number per
 * file processed.
 */
194 for (; optind < argc; optind++) {
195 fin = fopen(argv[optind], "rb");
/*
 * Reported when the file could not be opened.
 * NOTE(review): the guarding test and the statement that follows the
 * message are not visible in this excerpt.
 */
197 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
201 fastq_grep(fin, stdout, mismatch_file, re);
/* The mismatch file is shared by every fastq_grep() call, so it is closed once here. */
208 if (mismatch_file) fclose(mismatch_file);