From: Daniel Jones Date: Tue, 9 Oct 2012 17:37:19 +0000 (-0700) Subject: WIP on a fastq-sort program. X-Git-Url: https://git.donarmstrong.com/?p=fastq-tools.git;a=commitdiff_plain;h=229848ff462383b1fba2a073e5f7139856d2873d WIP on a fastq-sort program. --- diff --git a/src/Makefile.am b/src/Makefile.am index aeb9c85..1fc7d3a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,5 @@ -bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq-qualadj +bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq-qualadj fastq-sort fastq_common_src=common.h common.c fastq_parse_src=parse.h parse.c @@ -21,3 +21,6 @@ fastq_qual_SOURCES = fastq-qual.c $(fastq_common_src) $(fastq_parse_src) fastq_sample_SOURCES = fastq-sample.c $(fastq_common_src) $(fastq_parse_src) $(fastq_rng_src) fastq_qualadj_SOURCES = fastq-qualadj.c $(fastq_common_src) $(fastq_parse_src) + +fastq_sort_SOURCES = fastq-sort.c $(fastq_common_src) $(fastq_parse_src) + diff --git a/src/fastq-grep.c b/src/fastq-grep.c index 4d1603f..b09b2bf 100644 --- a/src/fastq-grep.c +++ b/src/fastq-grep.c @@ -33,7 +33,7 @@ static const char* prog_name = "fastq-grep"; void print_help() { - fprintf(stdout, + fprintf(stdout, "fastq-grep [OPTION]... PATTERN [FILE]...\n" "Search for PATTERN in the read sequences in each FILE or standard input.\n" "PATTERN, by default, is a perl compatible regular expression.\n\n" diff --git a/src/fastq-sample.c b/src/fastq-sample.c index 0adf02d..f12620b 100644 --- a/src/fastq-sample.c +++ b/src/fastq-sample.c @@ -1,4 +1,3 @@ - /* * This file is part of fastq-tools. * @@ -8,6 +7,7 @@ * Sample reads with or without replacement from a FASTQ file. * */ + #include #include #include diff --git a/src/fastq-sort.c b/src/fastq-sort.c new file mode 100644 index 0000000..522c0ce --- /dev/null +++ b/src/fastq-sort.c @@ -0,0 +1,245 @@ +/* + * This file is part of fastq-tools. + * + * Copyright (c) 2012 by Daniel C. Jones + * + * fastq-sample : + * Sample reads with or without replacement from a FASTQ file. + * + */ + +#include +#include +#include + +#include "common.h" +#include "parse.h" + + +typedef struct seq_array_t_ +{ + seq_t* seqs; + + /* Number of seq objects. */ + size_t n; + + /* Size reserved in seqs. */ + size_t size; + + /* Space reserved for strings. */ + char* data; + + /* Data used. */ + size_t data_used; + + /* Total data size. */ + size_t data_size; +} seq_array_t; + + +seq_array_t* seq_array_create(size_t data_size) +{ + seq_array_t* a = malloc_or_die(sizeof(seq_array_t)); + a->size = 1024; + a->n = 0; + a->seqs = malloc_or_die(sizeof(seq_t)); + + a->data_size = data_size; + a->data_used = 0; + a->data = malloc_or_die(data_size); + + return a; +} + + +void seq_array_free(seq_array_t* a) +{ + free(a->seqs); + free(a->data); + free(a); +} + + +/* Push a fastq entry to back of the array. Return false if there was not enough + * space. */ +bool seq_array_push(seq_array_t* a, const seq_t* seq) +{ + size_t size_needed = (seq->id1.n + 1) + (seq->seq.n + 1) + + (seq->id2.n + 1) + (seq->qual.n + 1); + + if (size_needed > a->data_size - a->data_used) return false; + + if (a->n == a->size) { + a->size *= 2; + a->seqs = realloc_or_die(a->seqs, a->size * sizeof(seq_t)); + } + + memcpy(seq->id1.s, &a->data[a->data_used], seq->id1.n + 1); + a->seqs[a->n].id1.s = &a->data[a->data_used]; + a->seqs[a->n].id1.n = seq->id1.n + 1; + a->data_used += seq->id1.n + 1; + + memcpy(seq->seq.s, &a->data[a->data_used], seq->seq.n + 1); + a->seqs[a->n].seq.s = &a->data[a->data_used]; + a->seqs[a->n].seq.n = seq->seq.n + 1; + a->data_used += seq->seq.n + 1; + + memcpy(seq->id2.s, &a->data[a->data_used], seq->id2.n + 1); + a->seqs[a->n].id2.s = &a->data[a->data_used]; + a->seqs[a->n].id2.n = seq->id2.n + 1; + a->data_used += seq->id2.n + 1; + + memcpy(seq->qual.s, &a->data[a->data_used], seq->qual.n + 1); + a->seqs[a->n].qual.s = &a->data[a->data_used]; + a->seqs[a->n].qual.n = seq->qual.n + 1; + a->data_used += seq->qual.n + 1; + + ++a->n; + + return true; +} + + +void seq_array_clear(seq_array_t* a) +{ + a->n = 0; + a->data_used = 0; +} + + +void seq_array_sort(seq_array_t* a, int (*cmp)(const void*, const void*)) +{ + qsort(a->seqs, a->n, sizeof(seq_t), cmp); +} + + +int seq_cmp_hash(const void* a_, const void* b_) +{ + const seq_t* a = (seq_t*) a_; + const seq_t* b = (seq_t*) b_; + /* TODO: hash and compare. */ + return 0; +} + + +static const char* prog_name = "fastq-sort"; + + +void print_help() +{ + fprintf(stdout, +"fastq-sort [OPTION]... [FILE]...\n" +"Concatenate and sort FASTQ files and write to standard output.\n" +"Options:\n" +" -h, --help print this message\n" +" -V, --version output version information and exit\n" + ); +} + + + +int main(int argc, char* argv[]) +{ + int opt, opt_idx; + size_t buffer_size = 100000000; + int (*cmp)(const void*, const void*) = seq_cmp_hash;; + + static struct option long_options[] = + { + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + {0, 0, 0, 0} + }; + + while (true) { + opt = getopt_long(argc, argv, "hV", long_options, &opt_idx); + if (opt == -1) break; + + switch (opt) { + case 'h': + print_help(); + return 0; + + case 'V': + print_version(stdout, prog_name); + return 0; + + case '?': + return 1; + + default: + abort(); + } + } + + seq_array_t* a = seq_array_create(buffer_size); + seq_t* seq = seq_create(); + + fastq_t* f; + if (optind >= argc) { + f = fastq_create(stdin); + while (fastq_read(f, seq)) { + if (!seq_array_push(a, seq)) { + seq_array_sort(a, cmp); + + /* TODO: dump a to a temporary file. Push that file name to an + * array somewhere. + * */ + + seq_array_clear(a); + if (seq_array_push(a, seq)) { + fprintf(stderr, "The buffer size is to small.\n"); + return EXIT_FAILURE; + } + } + } + fastq_free(f); + } + else { + FILE* file; + for (; optind < argc; ++optind) { + file = fopen(argv[optind], "rb"); + if (file == NULL) { + fprintf(stderr, "Cannot open %s for reading.\n", argv[optind]); + return EXIT_FAILURE; + } + f = fastq_create(file); + + while (fastq_read(f, seq)) { + if (!seq_array_push(a, seq)) { + seq_array_sort(a, cmp); + + /* TODO: dump a to a temporary file. Push that file name to + * an array somewhere. */ + + seq_array_clear(a); + if (seq_array_push(a, seq)) { + fprintf(stderr, "The buffer size is to small.\n"); + return EXIT_FAILURE; + } + } + } + + fastq_free(f); + fclose(file); + } + } + + if (a->n > 0) { + seq_array_sort(a, cmp); + + /* TODO: special case if everything fit into a. Just dump it to output. + */ + + /* TODO: dump to a temp file, push file name to the stack. */ + } + + /* TODO: Merge sort on the external files. */ + + seq_free(seq); + seq_array_free(a); + + return EXIT_SUCCESS; +} + + diff --git a/src/parse.c b/src/parse.c index 8a6123e..fd13921 100644 --- a/src/parse.c +++ b/src/parse.c @@ -66,6 +66,88 @@ void seq_free(seq_t* seq) } +/* This is MurmurHash3. The original C++ code was placed in the public domain + * by its author, Austin Appleby. */ + +static inline uint32_t fmix(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + + +static inline uint32_t rotl32(uint32_t x, int8_t r) +{ + return (x << r) | (x >> (32 - r)); +} + + +uint32_t murmurhash3(const uint8_t* data, size_t len_) +{ + const int len = (int) len_; + const int nblocks = len / 4; + + uint32_t h1 = 0xc062fb4a; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t*) (data + nblocks * 4); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = blocks[i]; + + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1; + } + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + return h1; +} + + +uint32_t seq_hash(const seq_t* seq) +{ + /* TODO */ + return 0; +} + static const size_t parser_buf_size = 1000000; diff --git a/src/parse.h b/src/parse.h index 6224ab8..4a0a0dd 100644 --- a/src/parse.h +++ b/src/parse.h @@ -12,6 +12,7 @@ #define FASTQ_TOOLS_PARSE_H #include +#include #include /* A string structure to keep-track of a reserved space. */ @@ -41,6 +42,10 @@ seq_t* seq_create(); void seq_free(seq_t* seq); +/* Hash a fastq entry. */ +uint32_t seq_hash(const seq_t* seq); + + /* Internal data for the fastq parser. */ typedef struct fastq_t_ fastq_t;