From: quentin jones Date: Fri, 25 Feb 2011 23:33:03 +0000 (-0800) Subject: a program to tally kmer frequencies from fastq files X-Git-Url: https://git.donarmstrong.com/?p=fastq-tools.git;a=commitdiff_plain;h=1a97afb4a336f49f700eae050908c3e68f415ee6 a program to tally kmer frequencies from fastq files --- diff --git a/configure.ac b/configure.ac index 44fda3e..35c8877 100644 --- a/configure.ac +++ b/configure.ac @@ -8,8 +8,8 @@ AC_CONFIG_MACRO_DIR([m4]) AC_PROG_CC AM_PROG_CC_C_O -opt_CFLAGS="-g -O3" -dbg_CFLAGS="-g -O0" +opt_CFLAGS="-Wall -g -O3" +dbg_CFLAGS="-Wall -g -O0" AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], diff --git a/src/Makefile.am b/src/Makefile.am index 9496a48..202e6c2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,6 +1,10 @@ -bin_PROGRAMS = fastq-grep +bin_PROGRAMS = fastq-grep fastq-kmers + fastq_grep_SOURCES = kseq.h fastq-grep.c fastq_grep_LDADD = $(PCRE_LIBS) -lz +fastq_kmers_SOURCES = kseq.h fastq-kmers.c +fastq_kmers_LDADD = -lz + diff --git a/src/fastq-grep.c b/src/fastq-grep.c index ca072eb..dea097f 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -118,7 +118,6 @@ int main(int argc, char* argv[]) case 0: if (long_options[opt_idx].flag != 0) break; if (optarg) { - /* TODO */ } break; @@ -130,6 +129,9 @@ int main(int argc, char* argv[]) invert_flag = 1; break; + case '?': + return 1; + default: abort(); } diff --git a/src/fastq-kmers.c b/src/fastq-kmers.c new file mode 100644 index 0000000..551ff88 --- /dev/null +++ b/src/fastq-kmers.c @@ -0,0 +1,262 @@ + +/* + * fastq-kmers: kmer frequences within fastq files + * + * Febuary 2011 / Daniel Jones + * + */ + + + +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + + +#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__) +# include +# include +# define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) +#else +# define SET_BINARY_MODE(file) +#endif + + +void print_help() +{ + fprintf( stderr, +"fastq-kmers [OPTION]... [FILE]...\n" +"Print kmer counts for the given kmer size.\n" +"Output is in two tab-seperated columns for kmer and frequency.\n\n" +"Options:\n" +" -h, --help print this message\n" +" -k, --size kmer size (default: 1)\n" + ); +} + +static int help_flag; +static int k; + +int packkmer( const char* s, uint32_t* kmer, int k ) +{ + *kmer = 0; + uint32_t nt; + while (k--) { + switch (s[k]) { + case 'a': + case 'A': + nt = 0; + break; + case 'c': + case 'C': + nt = 1; + break; + case 'g': + case 'G': + nt = 2; + break; + case 't': + case 'T': + nt = 3; + break; + + default: + return 0; + } + + *kmer = (*kmer << 2) | nt; + } + + return 1; +} + + +void unpackkmer( uint32_t kmer, char* s, int k ) +{ + int i; + uint32_t nt; + for (i = 0; i < k; i++, s++) { + nt = kmer & 0x3; + kmer = kmer >> 2; + + switch (nt) { + case 0: + *s = 'A'; + break; + case 1: + *s = 'C'; + break; + case 2: + *s = 'G'; + break; + case 3: + *s = 'T'; + break; + default: break; + } + } + + *s = '\0'; +} + + +void count_fastq_kmers(gzFile* fin, uint32_t* cs) +{ + kseq_t* seq = kseq_init(fin); + int i; + int n; + uint32_t kmer; + + while (kseq_read(seq) >= 0) { + n = (int)seq->seq.l - k + 1; + for (i = 0; i < n; i++) { + if( packkmer(seq->seq.s + i, &kmer, k) ) { + cs[kmer]++; + } + } + } + + kseq_destroy(seq); +} + + +void print_kmer_freqs(FILE* fout, uint32_t* cs) +{ + uint32_t n = 1 << (2*k); /* 4^k */ + + char* kmer_str = malloc((k+1)*sizeof(char)); + uint32_t kmer; + + fprintf(fout, "kmer\tfrequency\n"); + for (kmer = 0; kmer < n; kmer++) { + unpackkmer(kmer, kmer_str, k); + fprintf(fout, "%s\t%u\n", kmer_str, cs[kmer]); + } + + free(kmer_str); +} + + +int main(int argc, char* argv[]) +{ + help_flag = 0; + k = 1; + + uint32_t n; /* number of kmers: 4^k */ + uint32_t* cs; /* counts */ + + FILE* fin; + gzFile gzfin; + + int opt; + int opt_idx; + static struct option long_options[] = + { + {"help", no_argument, &help_flag, 1}, + {"size", no_argument, 0, 0}, + {0, 0, 0, 0} + }; + + while (1) { + opt = getopt_long(argc, argv, "hk:", long_options, &opt_idx); + + if( opt == -1 ) break; + + switch (opt) { + case 0: + if (long_options[opt_idx].flag != 0) break; + if (optarg) { + if( opt_idx == 1 ) { + k = atoi(optarg); + } + } + break; + + case 'h': + help_flag = 1; + break; + + case 'k': + k = atoi(optarg); + break; + + case '?': + return 1; + + default: + abort(); + } + } + + if (help_flag) { + print_help(); + return 0; + } + + if (k < 1) { + fprintf(stderr, "Kmer size must be at least 1."); + return 1; + } + + if (k > 16) { + fprintf(stderr, "Kmer size must be at most 16."); + return 1; + } + + + n = 1 << (2*k); /* i.e. 4^k */ + cs = malloc(n * sizeof(uint32_t)); + memset(cs, 0, n * sizeof(uint32_t)); + + if (cs == NULL) { + fprintf(stderr, "Insufficient memory to tally kmers of size %d\n", k ); + return 1; + } + + if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) { + gzfin = gzdopen( fileno(stdin), "rb" ); + if (gzfin == NULL) { + fprintf(stderr, "Malformed file 'stdin'.\n"); + return 1; + } + + count_fastq_kmers(gzfin, cs); + + gzclose(gzfin); + } + else { + for (; optind < argc; optind++) { + fin = fopen(argv[optind], "rb"); + if (fin == NULL) { + fprintf(stderr, "No such file '%s'.\n", argv[optind]); + continue; + } + + gzfin = gzdopen(fileno(fin), "rb"); + if (gzfin == NULL) { + fprintf(stderr, "Malformed file '%s'.\n", argv[optind]); + continue; + } + + count_fastq_kmers(gzfin, cs); + + gzclose(gzfin); + } + } + + print_kmer_freqs( stdout, cs ); + free(cs); + + return 0; +} + + + + + +