]> git.donarmstrong.com Git - fastq-tools.git/commitdiff
Add a program to detect quality score encoding (fastq-qscale).
authorDaniel Jones <dcjones@cs.washington.edu>
Fri, 14 Dec 2012 06:10:49 +0000 (22:10 -0800)
committerDaniel Jones <dcjones@cs.washington.edu>
Fri, 14 Dec 2012 06:10:49 +0000 (22:10 -0800)
doc/Makefile.am
src/Makefile.am
src/fastq-qscale.c [new file with mode: 0644]

index 17a098204ec8768e41a1da227645c582c166e0c2..e8726b1f9717418f293ebfe2d4385a397dba4bf6 100644 (file)
@@ -1,9 +1,11 @@
 
-dist_man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 fastq-sample.1 fastq-sort.1
+dist_man_MANS = fastq-grep.1 fastq-kmers.1 fastq-match.1 fastq-uniq.1 \
+                fastq-sample.1 fastq-sort.1 fastq-qscale.1
 
 .1.html :
        groff -man -Thtml < $< \
                | perl -pe 's/<head>/<head>\n<link rel="stylesheet" href="stylesheets\/man.css">/' \
                > $@
 
-html-local : fastq-grep.html fastq-kmers.html fastq-match.html fastq-uniq.html fastq-sample.html fastq-sort.html
+html-local : fastq-grep.html fastq-kmers.html fastq-match.html fastq-uniq.html \
+                fastq-sample.html fastq-sort.html fastq-qscale.html
index e3cbee3e404df3b8801d888ef6c946b899e646e9..8f55c45e0f791ffa94bb5b4bb9d361512d8fe923 100644 (file)
@@ -1,5 +1,7 @@
 
-bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq fastq-qual fastq-sample fastq-qualadj fastq-sort
+bin_PROGRAMS = fastq-grep fastq-kmers fastq-match fastq-uniq \
+               fastq-qual fastq-sample fastq-qualadj fastq-sort \
+               fastq-qscale
 
 fastq_common_src=common.h common.c
 fastq_parse_src=parse.h parse.c
@@ -24,3 +26,5 @@ fastq_qualadj_SOURCES = fastq-qualadj.c $(fastq_common_src) $(fastq_parse_src)
 
 fastq_sort_SOURCES = fastq-sort.c $(fastq_common_src) $(fastq_parse_src)
 
+fastq_qscale_SOURCES = fastq-qscale.c $(fastq_common_src) $(fastq_parse_src)
+
diff --git a/src/fastq-qscale.c b/src/fastq-qscale.c
new file mode 100644 (file)
index 0000000..7704a50
--- /dev/null
@@ -0,0 +1,184 @@
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2012 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-qualscale:
+ * Determine the quality score encoding used by a fastq file.
+ *
+ */
+
+#include <getopt.h>
+
+#include "common.h"
+#include "parse.h"
+
+/* The scales here are taken from the wikipedia article for FASTQ, the relavent
+ * part is reproduced here:
+ *
+ *  SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS.....................................................
+ *  ..........................XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX......................
+ *  ...............................IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII......................
+ *  .................................JJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ......................
+ *  LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL....................................................
+ *  !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+ *  |                         |    |        |                              |                     |
+ * 33                        59   64       73                            104                   126
+ *
+ *    S - Sanger        Phred+33,  raw reads typically (0, 40)
+ *    X - Solexa        Solexa+64, raw reads typically (-5, 40)
+ *    I - Illumina 1.3+ Phred+64,  raw reads typically (0, 40)
+ *    J - Illumina 1.5+ Phred+64,  raw reads typically (3, 40)
+ *       with 0=unused, 1=unused, 2=Read Segment Quality Control Indicator (bold)
+ *       (Note: See discussion above).
+ *        L - Illumina 1.8+ Phred+33,  raw reads typically (0, 41)
+ */
+
+
+typedef struct qual_scale_t_
+{
+    const char* description;
+    char min_qual, max_qual;
+} qual_scale_t;
+
+
+/* When the scale is ambiguous, we choose the first compatible one. Hence these
+ * are ordered roughly by increasing exclusivity. */
+#define NUM_SCALES 5
+static qual_scale_t scales[NUM_SCALES] =
+{
+    {"Sanger/Phred+33",       '!', 'I'},
+    {"Illumina 1.8/Phred+33", '!', 'J'},
+    {"Illumina 1.5/Phred+64", 'B', 'h'},
+    {"Illumina 1.3/Phred+64", '@', 'h'},
+    {"Solexa/Solexa+64",      ';', 'h'}
+};
+
+
+/* Return true if x has excatly one 1 bit. */
+bool single_bit(uint32_t x)
+{
+    return x && !(x & (x - 1));
+}
+
+
+/* Make a bitset of compatible scales. */
+uint32_t make_bitset(char min_qual, char max_qual)
+{
+    uint32_t s = 0;
+    uint32_t i;
+    for (i = 0; i < NUM_SCALES; ++i) {
+        if (scales[i].min_qual <= min_qual &&
+            scales[i].max_qual >= max_qual) {
+            s |= 1 << i;
+        }
+    }
+
+    return s;
+}
+
+
+void fastq_qualscale(const char* fn, FILE* fin)
+{
+    char min_qual = '~', max_qual = '!';
+
+    fastq_t* fqf = fastq_create(fin);
+    seq_t* seq = seq_create();
+
+    /* Scales compatible with the data so far. */
+    uint32_t compat_scales;
+
+    size_t n = 100000;
+
+    while (n-- && fastq_read(fqf, seq) && !single_bit(compat_scales)) {
+        char* q = seq->qual.s;
+        while (*q) {
+            if (*q < min_qual) min_qual = *q;
+            if (*q > max_qual) max_qual = *q;
+            ++q;
+        }
+
+        compat_scales = make_bitset(min_qual, max_qual);
+        if (compat_scales == 0 || single_bit(compat_scales)) break;
+    }
+
+    seq_free(seq);
+    fastq_free(fqf);
+
+    if (compat_scales == 0) {
+        printf("%s: Unknown scale ['%c', '%c']\n", fn, min_qual, max_qual);
+    }
+    else {
+        /* low order bit */
+        unsigned int i;
+        for (i = 0; !(compat_scales & (1 << i)); ++i) {}
+        printf("%s: %s\n", fn, scales[i].description);
+    }
+}
+
+
+static const char* prog_name = "fastq-qscale";
+
+
+void print_help()
+{
+    fprintf(stderr,
+"fastq-qscale [OPTION] [FILE]...\n"
+"Detect and output the quality score scale for each file given as an argument.\n\n"
+"Options:\n"
+"  -h, --help              print this message\n"
+"  -V, --version           output version information and exit\n"
+    );
+}
+
+
+int main(int argc, char* argv[])
+{
+    static struct option long_options[] =
+    {
+        {"help",    no_argument, NULL, 'h'},
+        {"version", no_argument, NULL, 'V'},
+        {0, 0, 0, 0}
+    };
+
+    int opt, opt_idx;
+    while (true) {
+        opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
+        if (opt == -1) break;
+
+        switch (opt) {
+            case 'h':
+                print_help();
+                return 0;
+
+            case 'V':
+                print_version(stdout, prog_name);
+                return 0;
+
+            case '?':
+                return 1;
+
+            default:
+                abort();
+        }
+    }
+
+    if (optind >= argc) {
+        fastq_qualscale("stdin", stdin);
+    }
+    else {
+        for (; optind < argc; optind++) {
+            FILE* fin = fopen(argv[optind], "rb");
+            if (fin == NULL) {
+                fprintf(stderr, "No such file '%s'.\n", argv[optind]);
+                continue;
+            }
+
+            fastq_qualscale(argv[optind], fin);
+            fclose(fin);
+        }
+    }
+
+    return EXIT_SUCCESS;
+}
+