From f9b7915e0b6aaa00882c111dffa760e7b455da94 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Wed, 26 Oct 2011 12:24:06 -0700 Subject: [PATCH] fastq-sample: options to set the rng seed and output the complement of the sample --- doc/fastq-sample.1 | 10 ++- src/fastq-sample.c | 163 ++++++++++++++++++++++++++++++--------------- 2 files changed, 119 insertions(+), 54 deletions(-) diff --git a/doc/fastq-sample.1 b/doc/fastq-sample.1 index 53434e0..4ce0cb4 100644 --- a/doc/fastq-sample.1 +++ b/doc/fastq-sample.1 @@ -4,7 +4,7 @@ fastq-sample - sample random reads from a fastq file .SH SYNOPSIS -.B fastq-sample [OPTION]... [[FILE] [FILE2]] +.B fastq-sample [OPTION]... FILE [FILE2] .SH DESCRIPTION Given a FASTQ file, random reads are sampled and output, with or without @@ -33,6 +33,14 @@ being sampled, the output file is [PREFIX].fastq, and with paired-end, \fB\-r\fR, \fB\-\-with\-replacement\fR Sample with replacement. (By default, sampling is done without replacement.) .TP +\fB\-c\fR, \fB\-\-complement-output=PREFIX\fR +Output reads not included in the random sample to a file (or files) with the +given prefix. By default, these reads are not output. +.TP +\fB\-s\fR, \fB\-\-seed\fR +Seed the random number generator. Using the same seed on the same data set will +produce the same random sample. +.TP \fB\-h\fR, \fB\-\-help\fR Output a help message and exit. .TP diff --git a/src/fastq-sample.c b/src/fastq-sample.c index 279b6cf..4074df9 100644 --- a/src/fastq-sample.c +++ b/src/fastq-sample.c @@ -34,13 +34,18 @@ static int replacement_flag; void print_help() { fprintf(stdout, -"fastq-sample [OPTION]... [FILE [FILE2]]\n" +"fastq-sample [OPTION]... FILE [FILE2]\n" "Sample random reads from a FASTQ file." "Options:\n" " -n N the number of reads to sample (default: 10000)\n" " -p N the proportion of the total reads to sample\n" " -o, --output=PREFIX output file prefix\n" +" -c, --complement-output=PREFIX\n" +" output reads not included in the random sample to\n" +" a file (or files) with the given prefix (by default,\n" +" they are not output).\n" " -r, --with-replacement sample with relpacement\n" +" -s, --seed=SEED a manual seed to the random number generator\n" " -h, --help print this message\n" " -V, --version output version information and exit\n" ); @@ -97,7 +102,9 @@ unsigned long* index_without_replacement(rng_t* rng, unsigned long n) } -void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, double p) +void fastq_sample(unsigned long rng_seed, + const char* prefix, const char* cprefix, + FILE* file1, FILE* file2, unsigned long k, double p) { /* * The basic idea is this: @@ -140,6 +147,7 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, } rng_t* rng = fastq_rng_alloc(); + fastq_rng_seed(rng, rng_seed); unsigned long* xs; if (replacement_flag) xs = index_with_replacement(rng, n, k); @@ -191,6 +199,47 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, } + /* open complement output */ + FILE* cfout1 = NULL; + FILE* cfout2 = NULL; + + if (cprefix != NULL && file2 == NULL) { + output_len = strlen(cprefix) + 7; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.fastq", cprefix); + cfout1 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + cfout2 = NULL; + + free(output_name); + } + else if (cprefix != NULL) { + output_len = strlen(cprefix) + 9; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.1.fastq", cprefix); + cfout1 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + snprintf(output_name, output_len, "%s.2.fastq", cprefix); + cfout2 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + free(output_name); + } + + unsigned long i = 0; // read number unsigned long j = 0; // index into xs @@ -208,10 +257,16 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, } } - while (j < k && xs[j] == i) { - fastq_print(fout1, seq1); - if (f2 != NULL) fastq_print(fout2, seq2); - ++j; + if (xs[j] == i) { + while (j < k && xs[j] == i) { + fastq_print(fout1, seq1); + if (f2 != NULL) fastq_print(fout2, seq2); + ++j; + } + } + else if (cfout1 != NULL) { + fastq_print(cfout1, seq1); + if (f2 != NULL) fastq_print(cfout2, seq2); } ++i; @@ -225,6 +280,9 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, fclose(fout1); if (fout2 != NULL) fclose(fout2); + if (cfout1 != NULL) fclose(cfout1); + if (cfout2 != NULL) fclose(cfout2); + fastq_rng_free(rng); free(xs); } @@ -239,23 +297,27 @@ int main(int argc, char* argv[]) int opt_idx; const char* prefix = NULL; + const char* cprefix = NULL; + unsigned long rng_seed = 4357; unsigned long k = 10000; // number of reads to sample double p = -1; // proportion of reads to sample static struct option long_options[] = { - {"with-replacement", no_argument, NULL, 'r'}, - {"output", no_argument, NULL, 'o'}, - {"help", no_argument, NULL, 'h'}, - {"version", no_argument, NULL, 'V'}, + {"with-replacement", no_argument, NULL, 'r'}, + {"complement-output", required_argument, NULL, 'c'}, + {"seed", required_argument, NULL, 's'}, + {"output", no_argument, NULL, 'o'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, {0, 0, 0, 0} }; replacement_flag = 0; while (1) { - opt = getopt_long(argc, argv, "n:p:o:rhV", long_options, &opt_idx); + opt = getopt_long(argc, argv, "n:p:o:rs:hV", long_options, &opt_idx); if( opt == -1 ) break; @@ -282,10 +344,18 @@ int main(int argc, char* argv[]) replacement_flag = 1; break; + case 's': + rng_seed = strtoul(optarg, NULL, 10); + break; + case 'o': prefix = optarg; break; + case 'c': + cprefix = optarg; + break; + case 'h': print_help(); return 0; @@ -308,61 +378,48 @@ int main(int argc, char* argv[]) char* prefix_alloc = NULL; if (optind >= argc) { - file1 = stdin; + fputs("An input file must be given.\n", stderr); + print_help(); + exit(1); } else { - if (strcmp(argv[optind], "-") == 0) { - file1 = stdin; - if (prefix == NULL) prefix = "stdin.sample"; + file1 = fopen(argv[optind], "rb"); + if (file1 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; } - else { - file1 = fopen(argv[optind], "rb"); - if (file1 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } - if (prefix == NULL) { - /* guess at a reasonable output refix by trimming the - * trailing file extension, if any. */ - char* tmp; - - /* base name */ - tmp = strrchr(argv[optind], '/'); - if (tmp != NULL) argv[optind] = tmp + 1; - - /* exclude file suffixes */ - tmp = strchr(argv[optind], '.'); - if (tmp == NULL) prefix = argv[optind]; - else { - prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); - memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); - prefix_alloc[tmp - argv[optind]] = '\0'; - prefix = prefix_alloc; - } + if (prefix == NULL) { + /* guess at a reasonable output refix by trimming the + * trailing file extension, if any. */ + char* tmp; + + /* base name */ + tmp = strrchr(argv[optind], '/'); + if (tmp != NULL) argv[optind] = tmp + 1; + + /* exclude file suffixes */ + tmp = strchr(argv[optind], '.'); + if (tmp == NULL) prefix = argv[optind]; + else { + prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); + memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); + prefix_alloc[tmp - argv[optind]] = '\0'; + prefix = prefix_alloc; } } ++optind; if (optind < argc) { - if (strcmp(argv[optind], "-") == 0) { - if (file1 == stdin) { - fprintf(stderr, "Both input files cannot be standard input.\n"); - return 1; - } - file2 = stdin; - } - else { - file2 = fopen(argv[optind], "rb"); - if (file2 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } + file2 = fopen(argv[optind], "rb"); + if (file2 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; } } } - fastq_sample(prefix, file1, file2, k, p); + fastq_sample(rng_seed, prefix, cprefix, file1, file2, k, p); free(prefix_alloc); return 0; -- 2.39.2