3 * This file is part of fastq-tools.
5 * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
8 * Tabulate k-mer frequencies with FASTQ files.
22 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
25 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
27 # define SET_BINARY_MODE(file)
30 static const char* prog_name = "fastq-kmers";
35 "fastq-kmers [OPTION]... [FILE]...\n"
36 "Print kmer counts for the given kmer size.\n"
37 "Output is in two tab-seperated columns for kmer and frequency.\n\n"
39 " -k NUM, --size=NUM kmer size (default: 1)\n"
40 " -h, --help print this message\n"
41 " -V, --version output version information and exit\n"
47 int packkmer( const char* s, uint32_t* kmer, int k )
74 *kmer = (*kmer << 2) | nt;
81 void unpackkmer( uint32_t kmer, char* s, int k )
85 for (i = 0; i < k; i++, s++) {
110 void count_fastq_kmers(FILE* fin, uint32_t* cs)
112 seq_t* seq = fastq_alloc_seq();
113 fastq_t* fqf = fastq_open(fin);
118 while (fastq_next(fqf, seq)) {
119 n = (int)seq->seq.n - k + 1;
120 for (i = 0; i < n; i++) {
121 if( packkmer(seq->seq.s + i, &kmer, k) ) {
132 void print_kmer_freqs(FILE* fout, uint32_t* cs)
134 uint32_t n = 1 << (2*k); /* 4^k */
136 char* kmer_str = malloc((k+1)*sizeof(char));
139 fprintf(fout, "kmer\tfrequency\n");
140 for (kmer = 0; kmer < n; kmer++) {
141 unpackkmer(kmer, kmer_str, k);
142 fprintf(fout, "%s\t%u\n", kmer_str, cs[kmer]);
149 int main(int argc, char* argv[])
151 SET_BINARY_MODE(stdin);
152 SET_BINARY_MODE(stdout);
156 uint32_t n; /* number of kmers: 4^k */
157 uint32_t* cs; /* counts */
163 static struct option long_options[] =
165 {"size", no_argument, 0, 0},
166 {"help", no_argument, 0, 'h'},
167 {"version", no_argument, 0, 'V'},
172 opt = getopt_long(argc, argv, "k:hV", long_options, &opt_idx);
174 if( opt == -1 ) break;
178 if (long_options[opt_idx].flag != 0) break;
195 print_version(stdout, prog_name);
207 fprintf(stderr, "Kmer size must be at least 1.");
212 fprintf(stderr, "Kmer size must be at most 16.");
217 n = 1 << (2*k); /* i.e. 4^k */
218 cs = malloc(n * sizeof(uint32_t));
219 memset(cs, 0, n * sizeof(uint32_t));
222 fprintf(stderr, "Insufficient memory to tally kmers of size %d\n", k );
226 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
227 count_fastq_kmers(stdin, cs);
230 for (; optind < argc; optind++) {
231 fin = fopen(argv[optind], "rb");
233 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
237 count_fastq_kmers(fin, cs);
241 print_kmer_freqs( stdout, cs );