Much simpler faster code for parsing fastq files.

[fastq-tools.git] / tests / random_fastq.c
diff --git a/tests/random_fastq.c b/tests/random_fastq.c

new file mode 100644 (file)

index 0000000..b4853ce
--- /dev/null
+++ b/tests/random_fastq.c
@@ -0,0 +1,146 @@
+
+/*
+    random_fastq
+    ------------
+    Generate random data in FASTQ format.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <getopt.h>
+
+static void print_help()
+{
+    printf(
+"Usage: random_fastq [option]...\n"
+"Generate an endless stream of random FASTQ data to standard out.\n\n"
+"Options:\n"
+"    TODO\n\n"
+"Beware: the only purpose of this program is test quip.\n"
+"No particular guarantees are made.\n\n");
+}
+
+
+/* Draw n samples from a categorial distribution of size k with cumulative
+ * distribution given by cs and element us. The sample is stored in xs which
+ * is assumed to be of appropriate length. */
+void randcat(const char* us, const double* cs, size_t k, char* xs, size_t n)
+{
+    size_t i;
+    double r;
+    for (i = 0; i < n; ++i) {
+        r = drand48();
+        int a = 0, b = (int) k, c;
+        do {
+            c = (a + b) / 2;
+            if (r <= cs[c]) b = c;
+            else            a = c + 1;
+        } while (a < b);
+
+        xs[i] = us[a];
+    }
+}
+
+
+int main(int argc, char* argv[])
+{
+    static struct option long_options[] =
+    {
+        {"min-length", required_argument, NULL, 'm'},
+        {"max-length", required_argument, NULL, 'M'},
+        {"length",     required_argument, NULL, 'l'},
+        {"id-length",  required_argument, NULL, 'i'},
+        {"help",       no_argument,       NULL, 'h'},
+        {NULL, 0, NULL, 0}
+    };
+
+    size_t min_len = 100, max_len = 100;
+    size_t id_len = 50;
+
+    int opt, opt_idx;
+    while (1) {
+        opt = getopt_long(argc, argv, "h", long_options, &opt_idx);
+
+        if (opt == -1) break;
+
+        switch (opt) {
+            case 'm':
+                min_len = (size_t) strtoul(optarg, NULL, 10);
+                break;
+
+            case 'M':
+                max_len = (size_t) strtoul(optarg, NULL, 10);
+                break;
+
+            case 'l':
+                min_len = max_len = (size_t) strtoul(optarg, NULL, 10);
+                break;
+
+            case 'i':
+                id_len = (size_t) strtoul(optarg, NULL, 10);
+                break;
+
+            case 'h':
+                print_help();
+                return EXIT_SUCCESS;
+
+            case '?':
+                return EXIT_FAILURE;
+
+            default:
+                abort();
+        }
+    }
+
+    char nucleotides[5] = {'A', 'C', 'G', 'T', 'N'};
+    double nuc_cs[5] = {0.28, 0.49, 0.70, 0.90, 1.00};
+
+    char qualities[64];
+    double qual_cs[64];
+    size_t i;
+    double last_c = 0.0;
+    for (i = 0; i < 64; ++i) {
+        qualities[i] = '!' + i;
+        last_c = qual_cs[i] = last_c + 1.0 / 64.0;
+    }
+
+    char id_chars[94];
+    double id_cs[94];
+    last_c = 0.0;
+    for (i = 0; i < 94; ++i) {
+        id_chars[i] = '!' + i;
+        last_c = id_cs[i] = last_c + 1.0 / 94.0;
+    }
+
+    char* id   = malloc(id_len + 1);
+    char* seq  = malloc(max_len + 1);
+    char* qual = malloc(max_len + 1);
+    size_t len = min_len;
+
+    while (1) {
+        if (max_len > min_len) {
+            len = min_len + (size_t) (drand48() * (double) (1 + max_len - min_len));
+        }
+
+        randcat(id_chars, id_cs, 94, id, id_len);
+        id[id_len] = '\0';
+
+        randcat(nucleotides, nuc_cs, 5, seq, len);
+        seq[len] = '\0';
+
+        randcat(qualities, qual_cs, 64, qual, len);
+        qual[len] = '\0';
+
+        printf(
+            "@%s\n%s\n+\n%s\n",
+            id, seq, qual);
+    }
+
+    /* Yeah, right. As if we'll ever get here. */
+    free(id);
+    free(seq);
+    free(qual);
+
+    return EXIT_SUCCESS;
+}
+