3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Collapsing a fastq file into only unique read sequences.
/*
 * SET_BINARY_MODE(file): on the DOS-like platforms tested below (MSDOS, OS2,
 * WIN32, Cygwin) it expands to setmode(fileno(file), O_BINARY); the second
 * definition expands to nothing.
 * NOTE(review): the directives separating the two definitions and the headers
 * that supply setmode/O_BINARY are not visible in this excerpt; presumably the
 * empty definition is the fallback for all other platforms.
 */
20 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
23 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
25 # define SET_BINARY_MODE(file)
/*
 * Target of the "verbose" long option in main(); when non-zero, fastq_hash()
 * prints periodic progress messages to stderr.
 */
30 static int verbose_flag;
/*
 * Usage text: synopsis, one-line description, a memory-use warning, and the
 * -h/--help and -v/--verbose options.
 * NOTE(review): the call that prints this text is not visible in this excerpt.
 */
36 "fastq-uniq [OPTION] [FILE]...\n"
37 "Output a non-redundant FASTQ file, in which there are no duplicate reads.\n"
38 "(Warning: this program can be somewhat memory intensive.)\n\n"
40 " -h, --help print this message\n"
41 " -v, --verbose print status along the way\n"
/*
 * Add every read found in the stream `fin` to the hash table `T`.
 *
 * A reader is opened on `fin` with fastq_open() and a record buffer is obtained
 * from fastq_alloc_seq(). For each record that fastq_next() yields,
 * seq->seq.s and seq->seq.n (presumably the sequence buffer and its length)
 * are passed to inc_hash_table().
 *
 * NOTE(review): the declaration and increment of total_reads, and any release
 * of fqf and seq, are not visible in this excerpt -- confirm against the full
 * file.
 */
46 void fastq_hash(FILE* fin, hash_table* T)
48 fastq_t* fqf = fastq_open(fin);
49 seq_t* seq = fastq_alloc_seq();
51 while (fastq_next(fqf, seq)) {
52 inc_hash_table(T, seq->seq.s, seq->seq.n);
/* In verbose mode, report progress on stderr whenever total_reads is a
 * multiple of 100000. */
55 if (verbose_flag && total_reads % 100000 == 0) {
56 fprintf(stderr, "%zu reads processed ...\n", total_reads);
/*
 * qsort() comparator for an array of hashed_value pointers.
 *
 * x and y each point at an array element (a hashed_value*). Returns -1 when
 * the first entry has the larger count and 1 when it has the smaller one, so
 * entries sort by descending count.
 * NOTE(review): the result for equal counts is not visible in this excerpt.
 */
65 int compare_hashed_value_count(const void* x, const void* y)
67 hashed_value* const * a = x;
68 hashed_value* const * b = y;
70 if( (*a)->count > (*b)->count ) return -1;
71 if( (*a)->count < (*b)->count ) return 1;
/*
 * Write the entries of `T` to `fout`, ordered by compare_hashed_value_count.
 *
 * The entries are obtained as an array from dump_hash_table() and sorted in
 * place; T->m is used as the number of array elements. Each entry is written
 * as a header line ">unique-read-<index> (<count> copies)" followed by the
 * entry's value bytes.
 *
 * NOTE(review): the declaration of i, any terminating newline after the value,
 * and any release of S are not visible in this excerpt.
 */
77 void print_hash_table(FILE* fout, hash_table* T)
79 hashed_value** S = dump_hash_table(T);
80 qsort(S, T->m, sizeof(hashed_value*), compare_hashed_value_count);
/* i is the position in the sorted array and is also printed, zero-padded to
 * seven digits, as the read number. */
83 for (i = 0; i < T->m; i++) {
84 fprintf(fout, ">unique-read-%07zu (%zu copies)\n", i, S[i]->count);
/* Written as one item of S[i]->len bytes (size = len, count = sizeof(char)).
 * NOTE(review): the fwrite return value is not checked. */
85 fwrite(S[i]->value, S[i]->len, sizeof(char), fout);
/*
 * Program entry point: parse the command-line options, hash the reads from
 * stdin or from each named file into one table, print the table to stdout,
 * and destroy it.
 */
93 int main(int argc, char* argv[])
/* Switch stdin/stdout to binary mode on platforms where SET_BINARY_MODE is
 * non-empty. */
95 SET_BINARY_MODE(stdin);
96 SET_BINARY_MODE(stdout);
98 hash_table* T = create_hash_table();
/* Both long options carry a flag pointer, so getopt_long() stores 1 in the
 * named variable instead of returning an option character. */
107 static struct option long_options[] =
109 {"help", no_argument, &help_flag, 1},
110 {"verbose", no_argument, &verbose_flag, 1},
115 opt = getopt_long(argc, argv, "hv", long_options, &opt_idx);
/* getopt_long() returns -1 once every option has been consumed. */
117 if (opt == -1) break;
/* A flag-style long option was already recorded by getopt_long(); nothing
 * further to do. NOTE(review): the enclosing switch/case is not visible in
 * this excerpt. */
121 if (long_options[opt_idx].flag != 0) break;
/* Read from stdin when there are no file operands, or when the only operand
 * is "-". */
147 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
148 fastq_hash(stdin, T);
/* Hash each remaining operand as a file name.
 * NOTE(review): a "-" given alongside other operands is passed to fopen() as
 * a literal name here -- confirm this is intended. */
151 for (; optind < argc; optind++) {
152 fin = fopen(argv[optind], "rb");
/* NOTE(review): the condition guarding this message is not visible;
 * presumably it is a failed fopen(), which can also fail for reasons other
 * than a missing file (e.g. permissions). */
154 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
/* Emit the collected unique reads, then release the table. */
162 print_hash_table(stdout, T);
164 destroy_hash_table(T);