3 * fastq-kmers: kmer frequences within fastq files
5 * Febuary 2011 / Daniel Jones <dcjones@cs.washington.edu>
18 KSEQ_INIT(gzFile, gzread)
21 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
24 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
26 # define SET_BINARY_MODE(file)
33 "fastq-kmers [OPTION]... [FILE]...\n"
34 "Print kmer counts for the given kmer size.\n"
35 "Output is in two tab-seperated columns for kmer and frequency.\n\n"
37 " -h, --help print this message\n"
38 " -k, --size kmer size (default: 1)\n"
45 int packkmer( const char* s, uint32_t* kmer, int k )
72 *kmer = (*kmer << 2) | nt;
79 void unpackkmer( uint32_t kmer, char* s, int k )
83 for (i = 0; i < k; i++, s++) {
108 void count_fastq_kmers(gzFile* fin, uint32_t* cs)
110 kseq_t* seq = kseq_init(fin);
115 while (kseq_read(seq) >= 0) {
116 n = (int)seq->seq.l - k + 1;
117 for (i = 0; i < n; i++) {
118 if( packkmer(seq->seq.s + i, &kmer, k) ) {
128 void print_kmer_freqs(FILE* fout, uint32_t* cs)
130 uint32_t n = 1 << (2*k); /* 4^k */
132 char* kmer_str = malloc((k+1)*sizeof(char));
135 fprintf(fout, "kmer\tfrequency\n");
136 for (kmer = 0; kmer < n; kmer++) {
137 unpackkmer(kmer, kmer_str, k);
138 fprintf(fout, "%s\t%u\n", kmer_str, cs[kmer]);
145 int main(int argc, char* argv[])
150 uint32_t n; /* number of kmers: 4^k */
151 uint32_t* cs; /* counts */
158 static struct option long_options[] =
160 {"help", no_argument, &help_flag, 1},
161 {"size", no_argument, 0, 0},
166 opt = getopt_long(argc, argv, "hk:", long_options, &opt_idx);
168 if( opt == -1 ) break;
172 if (long_options[opt_idx].flag != 0) break;
202 fprintf(stderr, "Kmer size must be at least 1.");
207 fprintf(stderr, "Kmer size must be at most 16.");
212 n = 1 << (2*k); /* i.e. 4^k */
213 cs = malloc(n * sizeof(uint32_t));
214 memset(cs, 0, n * sizeof(uint32_t));
217 fprintf(stderr, "Insufficient memory to tally kmers of size %d\n", k );
221 if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
222 gzfin = gzdopen( fileno(stdin), "rb" );
224 fprintf(stderr, "Malformed file 'stdin'.\n");
228 count_fastq_kmers(gzfin, cs);
233 for (; optind < argc; optind++) {
234 fin = fopen(argv[optind], "rb");
236 fprintf(stderr, "No such file '%s'.\n", argv[optind]);
240 gzfin = gzdopen(fileno(fin), "rb");
242 fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
246 count_fastq_kmers(gzfin, cs);
252 print_kmer_freqs( stdout, cs );