3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Collapsing a fastq file into only unique read sequences.
/*
 * SET_BINARY_MODE(file): when any of MSDOS, OS2, WIN32 or __CYGWIN__ is
 * defined, expands to setmode(fileno(file), O_BINARY); the second definition
 * below expands to nothing, making the macro a no-op.
 */
21 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
24 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
26 # define SET_BINARY_MODE(file)
/* Program name; passed to print_version() from main(). */
29 static const char* prog_name = "fastq-uniq";
34 "fastq-uniq [OPTION] [FILE]...\n"
35 "Output a non-redundant FASTQ file, in which there are no duplicate reads.\n"
36 "(Warning: this program can be somewhat memory intensive.)\n\n"
38 " -v, --verbose print status along the way\n"
39 " -h, --help print this message\n"
40 " -V, --version output version information and exit\n"
/* Set to 1 by getopt_long() when --verbose is given (see long_options in
 * main()); gates the progress messages printed by fastq_hash(). */
45 static int verbose_flag;
/* Read counter reported in the progress message of fastq_hash(). */
46 static size_t total_reads;
/*
 * Tally the sequences read from `fin` into the hash table `T`.
 *
 * fin: input stream, wrapped with fastq_open().
 * T:   hash table updated through inc_hash_table() on every loop iteration.
 *
 * When verbose_flag is set, a progress line is written to stderr whenever
 * total_reads is a multiple of 100000.
 */
50 void fastq_hash(FILE* fin, hash_table* T)
52 fastq_t* fqf = fastq_open(fin);
53 seq_t* seq = fastq_alloc_seq();
/* Iterate for as long as fastq_next() returns nonzero; `seq` is reused as
 * the buffer on every call. */
55 while (fastq_next(fqf, seq)) {
/* The key is the buffer seq->seq.s with length seq->seq.n. */
56 inc_hash_table(T, seq->seq.s, seq->seq.n);
/* Progress report: only in verbose mode, once per 100000 reads. */
59 if (verbose_flag && total_reads % 100000 == 0) {
60 fprintf(stderr, "%zu reads processed ...\n", total_reads);
/*
 * qsort() comparator used by print_hash_table().
 *
 * x, y: pointers to array elements, each element being a hashed_value*.
 *
 * Returns -1 when the first entry has the larger count and 1 when it has the
 * smaller count, so entries are ordered by descending count.
 */
69 int compare_hashed_value_count(const void* x, const void* y)
/* qsort() passes pointers to the elements, hence the double indirection. */
71 hashed_value* const * a = x;
72 hashed_value* const * b = y;
74 if( (*a)->count > (*b)->count ) return -1;
75 if( (*a)->count < (*b)->count ) return 1;
/*
 * Write the entries of `T` to `fout`, highest count first.
 *
 * fout: output stream.
 * T:    hash table to print; T->m is used as the number of entries.
 *
 * For each entry a header line of the form
 * ">unique-read-<index> (<count> copies)" is printed, followed by the raw
 * bytes of the stored value.
 */
81 void print_hash_table(FILE* fout, hash_table* T)
/* Obtain the entries as an array of pointers and sort it by descending count. */
83 hashed_value** S = dump_hash_table(T);
84 qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count);
87 for (i = 0; i < T->m; i++) {
/* The index is zero-padded to at least 7 digits; count is printed as uint32. */
88 fprintf(fout, ">unique-read-%07zu (%"PRIu32" copies)\n", i, S[i]->count);
/* fwrite(ptr, size, nmemb, stream): here size is the value length and nmemb
 * is sizeof(char) (1), i.e. one item of S[i]->len bytes is written. */
89 fwrite(S[i]->value, S[i]->len, sizeof(char), fout);
/*
 * Entry point: parse command-line options, hash the reads from stdin or from
 * the files named on the command line, print the resulting table to stdout,
 * then destroy the table.
 */
97 int main(int argc, char* argv[])
/* Apply SET_BINARY_MODE to both standard streams (a no-op where the macro
 * is defined empty). */
99 SET_BINARY_MODE(stdin);
100 SET_BINARY_MODE(stdout);
/* Single table shared by every input processed below. */
102 hash_table* T = create_hash_table();
109 static struct option long_options[] =
/* --verbose stores 1 into verbose_flag through the option's flag pointer;
 * --help and --version have a NULL flag and the values 'h' and 'V'. */
111 {"verbose", no_argument, &verbose_flag, 1},
112 {"help", no_argument, NULL, 'h'},
113 {"version", no_argument, NULL, 'V'},
/* Short options accepted: -v, -h, -V (none takes an argument). */
118 opt = getopt_long(argc, argv, "vhV", long_options, &opt_idx);
/* getopt_long() returns -1 once all options have been consumed. */
120 if (opt == -1) break;
/* The matched long option has a non-NULL flag pointer, so getopt_long()
 * has already stored its value. */
124 if (long_options[opt_idx].flag != 0) break;
141 print_version(stdout, prog_name);
/* No file operands, or a single "-" operand: read from stdin. */
150 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
151 fastq_hash(stdin, T);
/* Walk the remaining operands, opening each one for binary reading. */
154 for (; optind < argc; optind++) {
155 fin = fopen(argv[optind], "rb");
/* NOTE(review): presumably reported when fopen() fails; the guarding
 * condition is not visible here - confirm. */
157 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
/* Emit the collected table to stdout, then destroy it. */
165 print_hash_table(stdout, T);
167 destroy_hash_table(T);