X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2Ffastq-sample.c;h=d3c2b550e3c128844d7df678764242a3d49c1be6;hb=d6b81119e55cf77c1e4c17b9e9026abc01e22b96;hp=279b6cfbac08e93601d6f40eaa16f634d64407ca;hpb=55b423636a17f6477e090dc670064cb8263e119c;p=fastq-tools.git diff --git a/src/fastq-sample.c b/src/fastq-sample.c index 279b6cf..d3c2b55 100644 --- a/src/fastq-sample.c +++ b/src/fastq-sample.c @@ -34,13 +34,18 @@ static int replacement_flag; void print_help() { fprintf(stdout, -"fastq-sample [OPTION]... [FILE [FILE2]]\n" +"fastq-sample [OPTION]... FILE [FILE2]\n" "Sample random reads from a FASTQ file." "Options:\n" " -n N the number of reads to sample (default: 10000)\n" " -p N the proportion of the total reads to sample\n" -" -o, --output=PREFIX output file prefix\n" +" -o, --output=PREFIX output file prefix\n" +" -c, --complement-output=PREFIX\n" +" output reads not included in the random sample to\n" +" a file (or files) with the given prefix (by default,\n" +" they are not output).\n" " -r, --with-replacement sample with relpacement\n" +" -s, --seed=SEED a manual seed to the random number generator\n" " -h, --help print this message\n" " -V, --version output version information and exit\n" ); @@ -50,10 +55,10 @@ void print_help() /* count the number of entries in a fastq file */ unsigned long count_entries(fastq_t* fqf) { - seq_t* seq = fastq_alloc_seq(); + seq_t* seq = seq_create(); unsigned long n = 0; - while (fastq_next(fqf, seq)) ++n; - fastq_free_seq(seq); + while (fastq_read(fqf, seq)) ++n; + seq_free(seq); return n; } @@ -97,7 +102,9 @@ unsigned long* index_without_replacement(rng_t* rng, unsigned long n) } -void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, double p) +void fastq_sample(unsigned long rng_seed, + const char* prefix, const char* cprefix, + FILE* file1, FILE* file2, unsigned long k, double p) { /* * The basic idea is this: @@ -119,8 +126,8 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, unsigned long n, n2; - fastq_t* f1 = fastq_open(file1); - fastq_t* f2 = file2 == NULL ? NULL : fastq_open(file2); + fastq_t* f1 = fastq_create(file1); + fastq_t* f2 = file2 == NULL ? NULL : fastq_create(file2); n = count_entries(f1); if (f2 != NULL) { @@ -140,6 +147,7 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, } rng_t* rng = fastq_rng_alloc(); + fastq_rng_seed(rng, rng_seed); unsigned long* xs; if (replacement_flag) xs = index_with_replacement(rng, n, k); @@ -191,40 +199,90 @@ void fastq_sample(const char* prefix, FILE* file1, FILE* file2, unsigned long k, } + /* open complement output */ + FILE* cfout1 = NULL; + FILE* cfout2 = NULL; + + if (cprefix != NULL && file2 == NULL) { + output_len = strlen(cprefix) + 7; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.fastq", cprefix); + cfout1 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + cfout2 = NULL; + + free(output_name); + } + else if (cprefix != NULL) { + output_len = strlen(cprefix) + 9; + output_name = malloc_or_die((output_len + 1) * sizeof(char)); + + snprintf(output_name, output_len, "%s.1.fastq", cprefix); + cfout1 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + snprintf(output_name, output_len, "%s.2.fastq", cprefix); + cfout2 = fopen(output_name, "wb"); + if (cfout1 == NULL) { + fprintf(stderr, "Cannot open file %s for writing.\n", output_name); + exit(1); + } + + free(output_name); + } + + unsigned long i = 0; // read number unsigned long j = 0; // index into xs int ret; - seq_t* seq1 = fastq_alloc_seq(); - seq_t* seq2 = fastq_alloc_seq(); + seq_t* seq1 = seq_create(); + seq_t* seq2 = seq_create(); - while (j < k && fastq_next(f1, seq1)) { + while (j < k && fastq_read(f1, seq1)) { if (f2 != NULL){ - ret = fastq_next(f2, seq2); + ret = fastq_read(f2, seq2); if (ret == 0) { fputs("Input files have differing numbers of entries.\n", stderr); exit(1); } } - while (j < k && xs[j] == i) { - fastq_print(fout1, seq1); - if (f2 != NULL) fastq_print(fout2, seq2); - ++j; + if (xs[j] == i) { + while (j < k && xs[j] == i) { + fastq_print(fout1, seq1); + if (f2 != NULL) fastq_print(fout2, seq2); + ++j; + } + } + else if (cfout1 != NULL) { + fastq_print(cfout1, seq1); + if (f2 != NULL) fastq_print(cfout2, seq2); } ++i; } - fastq_free_seq(seq1); - fastq_free_seq(seq2); - fastq_close(f1); - if (f2 != NULL) fastq_close(f2); + seq_free(seq1); + seq_free(seq2); + fastq_free(f1); + if (f2 != NULL) fastq_free(f2); fclose(fout1); if (fout2 != NULL) fclose(fout2); + if (cfout1 != NULL) fclose(cfout1); + if (cfout2 != NULL) fclose(cfout2); + fastq_rng_free(rng); free(xs); } @@ -239,23 +297,27 @@ int main(int argc, char* argv[]) int opt_idx; const char* prefix = NULL; + const char* cprefix = NULL; + unsigned long rng_seed = 4357; unsigned long k = 10000; // number of reads to sample double p = -1; // proportion of reads to sample static struct option long_options[] = { - {"with-replacement", no_argument, NULL, 'r'}, - {"output", no_argument, NULL, 'o'}, - {"help", no_argument, NULL, 'h'}, - {"version", no_argument, NULL, 'V'}, + {"with-replacement", no_argument, NULL, 'r'}, + {"complement-output", required_argument, NULL, 'c'}, + {"seed", required_argument, NULL, 's'}, + {"output", no_argument, NULL, 'o'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, {0, 0, 0, 0} }; replacement_flag = 0; while (1) { - opt = getopt_long(argc, argv, "n:p:o:rhV", long_options, &opt_idx); + opt = getopt_long(argc, argv, "n:p:o:rs:hV", long_options, &opt_idx); if( opt == -1 ) break; @@ -282,10 +344,18 @@ int main(int argc, char* argv[]) replacement_flag = 1; break; + case 's': + rng_seed = strtoul(optarg, NULL, 10); + break; + case 'o': prefix = optarg; break; + case 'c': + cprefix = optarg; + break; + case 'h': print_help(); return 0; @@ -308,61 +378,48 @@ int main(int argc, char* argv[]) char* prefix_alloc = NULL; if (optind >= argc) { - file1 = stdin; + fputs("An input file must be given.\n", stderr); + print_help(); + exit(1); } else { - if (strcmp(argv[optind], "-") == 0) { - file1 = stdin; - if (prefix == NULL) prefix = "stdin.sample"; + file1 = fopen(argv[optind], "rb"); + if (file1 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; } - else { - file1 = fopen(argv[optind], "rb"); - if (file1 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } - if (prefix == NULL) { - /* guess at a reasonable output refix by trimming the - * trailing file extension, if any. */ - char* tmp; - - /* base name */ - tmp = strrchr(argv[optind], '/'); - if (tmp != NULL) argv[optind] = tmp + 1; - - /* exclude file suffixes */ - tmp = strchr(argv[optind], '.'); - if (tmp == NULL) prefix = argv[optind]; - else { - prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); - memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); - prefix_alloc[tmp - argv[optind]] = '\0'; - prefix = prefix_alloc; - } + if (prefix == NULL) { + /* guess at a reasonable output refix by trimming the + * trailing file extension, if any. */ + char* tmp; + + /* base name */ + tmp = strrchr(argv[optind], '/'); + if (tmp != NULL) argv[optind] = tmp + 1; + + /* exclude file suffixes */ + tmp = strchr(argv[optind], '.'); + if (tmp == NULL) prefix = argv[optind]; + else { + prefix_alloc = malloc_or_die((tmp - argv[optind] + 1) * sizeof(char)); + memcpy(prefix_alloc, argv[optind], (tmp - argv[optind]) * sizeof(char)); + prefix_alloc[tmp - argv[optind]] = '\0'; + prefix = prefix_alloc; } } ++optind; if (optind < argc) { - if (strcmp(argv[optind], "-") == 0) { - if (file1 == stdin) { - fprintf(stderr, "Both input files cannot be standard input.\n"); - return 1; - } - file2 = stdin; - } - else { - file2 = fopen(argv[optind], "rb"); - if (file2 == NULL) { - fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); - return 1; - } + file2 = fopen(argv[optind], "rb"); + if (file2 == NULL) { + fprintf(stderr, "Cannot open '%s' for reading.\n", argv[optind]); + return 1; } } } - fastq_sample(prefix, file1, file2, k, p); + fastq_sample(rng_seed, prefix, cprefix, file1, file2, k, p); free(prefix_alloc); return 0;