3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Collapsing a fastq file into only unique read sequences.
/*
 * SET_BINARY_MODE(file): when any of MSDOS, OS2, WIN32 or __CYGWIN__ is
 * defined, switch the descriptor behind `file` to O_BINARY via setmode().
 * The second definition expands to nothing.
 * NOTE(review): the #else/#endif lines and the headers that declare
 * setmode()/O_BINARY are not visible in this excerpt -- confirm against the
 * full file.
 */
20 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
23 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
25 # define SET_BINARY_MODE(file)
/* Program name; passed to print_version() when version output is requested. */
28 static const char* prog_name = "fastq-uniq";
33 "fastq-uniq [OPTION] [FILE]...\n"
34 "Output a non-redundant FASTQ file, in which there are no duplicate reads.\n"
35 "(Warning: this program can be somewhat memory intensive.)\n\n"
37 " -v, --verbose print status along the way\n"
38 " -h, --help print this message\n"
39 " -V, --version output version information and exit\n"
/*
 * Non-zero enables the progress messages written to stderr by fastq_hash().
 * The "verbose" long option stores 1 here through getopt_long's flag pointer.
 */
44 static int verbose_flag;
/*
 * Read counter used for progress reporting in fastq_hash().
 * NOTE(review): the statement that increments it is not visible in this
 * excerpt.
 */
45 static size_t total_reads;
/*
 * Read every record from `fin` and tally its sequence in hash table `T`.
 *
 * fin: open input stream, wrapped by fastq_open().
 * T:   table updated through inc_hash_table() with the sequence bytes
 *      (seq->seq.s) and their length (seq->seq.n) of each record.
 *
 * NOTE(review): the total_reads increment and the release of `fqf`/`seq` are
 * not visible in this excerpt -- confirm they exist in the full function.
 */
49 void fastq_hash(FILE* fin, hash_table* T)
51 fastq_t* fqf = fastq_open(fin);
/* One record buffer, reused for every call to fastq_next(). */
52 seq_t* seq = fastq_alloc_seq();
/* Loop for as long as fastq_next() returns non-zero. */
54 while (fastq_next(fqf, seq)) {
55 inc_hash_table(T, seq->seq.s, seq->seq.n);
/* In verbose mode, report progress whenever the count is a multiple of 100000. */
58 if (verbose_flag && total_reads % 100000 == 0) {
59 fprintf(stderr, "%zu reads processed ...\n", total_reads);
/*
 * qsort() comparator over an array of hashed_value pointers.
 * Returns -1 when the first element has the larger count and 1 when it has
 * the smaller one, so larger counts sort first.
 * NOTE(review): the return for equal counts is not visible in this excerpt.
 */
68 int compare_hashed_value_count(const void* x, const void* y)
/* qsort passes pointers to the array elements, which are themselves pointers. */
70 hashed_value* const * a = x;
71 hashed_value* const * b = y;
73 if( (*a)->count > (*b)->count ) return -1;
74 if( (*a)->count < (*b)->count ) return 1;
/*
 * Write every entry of `T` to `fout`, ordered by compare_hashed_value_count().
 *
 * For each entry a header line ">unique-read-<index> (<count> copies)" is
 * printed, followed by the entry's stored bytes.
 *
 * NOTE(review): the help text describes the output as FASTQ, but the header
 * written here starts with '>' and no quality line is visible in this
 * excerpt -- confirm the intended output format.
 * NOTE(review): whether `S` is released afterwards is not visible here.
 */
80 void print_hash_table(FILE* fout, hash_table* T)
/* Array of entry pointers obtained from the table; sorted in place below. */
82 hashed_value** S = dump_hash_table(T);
/* T->m is used as the number of elements in S. */
83 qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count);
/* NOTE(review): the declaration of `i` is not visible; %zu below requires size_t. */
86 for (i = 0; i < T->m; i++) {
87 fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count);
/*
 * NOTE(review): size and count are in the unconventional order here (one
 * item of `len` bytes), so fwrite's return value would be 0 or 1 rather
 * than a byte count. The return value is not checked on this line.
 */
88 fwrite(S[i]->value, S[i]->len, sizeof(char), fout);
/*
 * Entry point: parse options, hash the reads from stdin or from each named
 * file into one table, print the table to stdout, then destroy it.
 * NOTE(review): several lines of this function (the option switch, the
 * fopen() check, fclose(), the return) are not visible in this excerpt.
 */
96 int main(int argc, char* argv[])
/* Apply SET_BINARY_MODE to both standard streams (a no-op where the macro is empty). */
98 SET_BINARY_MODE(stdin);
99 SET_BINARY_MODE(stdout);
/* Single table shared by all inputs. */
101 hash_table* T = create_hash_table();
/*
 * Long options. "verbose" has a non-NULL flag pointer, so getopt_long stores
 * 1 into verbose_flag and returns 0; "help" and "version" return 'h' and 'V'.
 */
108 static struct option long_options[] =
110 {"verbose", no_argument, &verbose_flag, 1},
111 {"help", no_argument, NULL, 'h'},
112 {"version", no_argument, NULL, 'V'},
/* Short options -v, -h and -V; none of them takes an argument. */
117 opt = getopt_long(argc, argv, "vhV", long_options, &opt_idx);
/* -1 signals that option parsing is finished. */
119 if (opt == -1) break;
/*
 * A long option that stores through its flag pointer needs no further
 * handling. NOTE(review): the enclosing switch/case is not visible here.
 */
123 if (long_options[opt_idx].flag != 0) break;
140 print_version(stdout, prog_name);
/* Read stdin when no operands remain, or when the only operand is "-". */
149 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
150 fastq_hash(stdin, T);
/* Otherwise process each remaining operand as a file opened in binary mode. */
153 for (; optind < argc; optind++) {
154 fin = fopen(argv[optind], "rb");
/*
 * NOTE(review): the condition guarding this message is not visible. If it is
 * a plain fopen() failure check, "No such file" is also reported for other
 * causes such as a permission error -- confirm.
 */
156 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
/* Write the collected entries to stdout, then release the table. */
164 print_hash_table(stdout, T);
166 destroy_hash_table(T);