]> git.donarmstrong.com Git - samtools.git/commitdiff
added the phase command
authorHeng Li <lh3@live.co.uk>
Fri, 25 Feb 2011 05:59:27 +0000 (05:59 +0000)
committerHeng Li <lh3@live.co.uk>
Fri, 25 Feb 2011 05:59:27 +0000 (05:59 +0000)
13 files changed:
Makefile
bam_md.c
bam_plcmd.c
bam_sort.c
bamtk.c
cut_target.c [new file with mode: 0644]
errmod.h
faidx.c
khash.h
kstring.c
kstring.h
phase.c [new file with mode: 0644]
samtools.1

index e28822eabfa7499e5cb8d5a1a5198360e73630ee..80f7a31b94f5d2a30909a0cac65a3db8c0f8462f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,8 @@ LOBJS=          bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
                        $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o
 AOBJS=         bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
                        bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o     \
-                       bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o
+                       bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
+                       cut_target.o phase.o
 PROG=          samtools
 INCLUDES=      -I.
 SUBDIRS=       . bcftools misc
@@ -66,6 +67,7 @@ bcf.o:bcftools/bcf.h
 bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
 bam2bcf_indel.o:bam2bcf.h
 errmod.o:errmod.h
+phase.o:bam.h khash.h ksort.h
 
 faidx.o:faidx.h razf.h khash.h
 faidx_main.o:faidx.h razf.h
index 02d6c5bf7ea5b595709a5840bca387fef4d66add..20b1913efa09ce796392885b0f4b831c8c759f4a 100644 (file)
--- a/bam_md.c
+++ b/bam_md.c
@@ -315,7 +315,8 @@ int bam_fillmd(int argc, char *argv[])
                fprintf(stderr, "         -b       compressed BAM output\n");
                fprintf(stderr, "         -S       the input is SAM with header\n");
                fprintf(stderr, "         -A       modify the quality string\n");
-               fprintf(stderr, "         -r       read-independent local realignment\n\n");
+               fprintf(stderr, "         -r       compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n");
+               fprintf(stderr, "         -E       extended BAQ for better sensitivity but lower specificity\n\n");
                return 1;
        }
        fp = samopen(argv[optind], mode_r, 0);
index d6e8213e51c2d9938d15ced14b4eee0ff3859610..2c333e8c668cb1a48464637d2e56e2e3984d7269 100644 (file)
@@ -934,6 +934,7 @@ int bam_mpileup(int argc, char *argv[])
                fprintf(stderr, "         -A          use anomalous read pairs in SNP/INDEL calling\n");
                fprintf(stderr, "         -g          generate BCF output\n");
                fprintf(stderr, "         -u          do not compress BCF output\n");
+               fprintf(stderr, "         -E          extended BAQ for higher sensitivity but lower specificity\n");
                fprintf(stderr, "         -B          disable BAQ computation\n");
                fprintf(stderr, "         -D          output per-sample DP\n");
                fprintf(stderr, "         -S          output per-sample SP (strand bias P-value, slow)\n");
index 01f7016dd9a35ef27cf4cc232d7529b25d67880b..38f15d655c253dc1f7e8cd50dd39c178c448f584 100644 (file)
@@ -222,8 +222,11 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch
        ks_heapmake(heap, n, heap);
        while (heap->pos != HEAP_EMPTY) {
                bam1_t *b = heap->b;
-               if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0)
+               if (flag & MERGE_RG) {
+                       uint8_t *rg = bam_aux_get(b, "RG");
+                       if (rg) bam_aux_del(b, rg);
                        bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
+               }
                bam_write1_core(fpout, &b->core, b->data_len, b->data);
                if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
                        heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
diff --git a/bamtk.c b/bamtk.c
index f2b577cca629a9a761045337828b460282f437bf..096cd95c18359904ca3867af2f414959fd2bfbf0 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.12-10 (r896)"
+#define PACKAGE_VERSION "0.1.12-11 (r896:110)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
@@ -27,6 +27,8 @@ int bam_idxstats(int argc, char *argv[]);
 int main_samview(int argc, char *argv[]);
 int main_import(int argc, char *argv[]);
 int main_reheader(int argc, char *argv[]);
+int main_cut_target(int argc, char *argv[]);
+int main_phase(int argc, char *argv[]);
 
 int faidx_main(int argc, char *argv[]);
 int glf3_view_main(int argc, char *argv[]);
@@ -94,6 +96,8 @@ static int usage()
        fprintf(stderr, "         merge       merge sorted alignments\n");
        fprintf(stderr, "         rmdup       remove PCR duplicates\n");
        fprintf(stderr, "         reheader    replace BAM header\n");
+       fprintf(stderr, "         targetcut   cut fosmid regions (for fosmid pool only)\n");
+       fprintf(stderr, "         phase       phase heterozygotes\n");
        fprintf(stderr, "\n");
 #ifdef _WIN32
        fprintf(stderr, "\
@@ -131,6 +135,8 @@ int main(int argc, char *argv[])
        else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1);
        else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
        else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1);
+       else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1);
+       else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1);
 #if _CURSES_LIB != 0
        else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
 #endif
diff --git a/cut_target.c b/cut_target.c
new file mode 100644 (file)
index 0000000..ee687fb
--- /dev/null
@@ -0,0 +1,191 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+#include "errmod.h"
+#include "faidx.h"
+
+#define ERR_DEP 0.83f
+
+typedef struct {
+       int e[2][3], p[2][2];
+} score_param_t;
+
+/* Note that although the two matrics have 10 parameters in total, only 4
+ * (probably 3) are free.  Changing the scoring matrices in a sort of symmetric
+ * way will not change the result. */
+static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
+
+typedef struct {
+       int min_baseQ, tid, max_bases;
+       uint16_t *bases;
+       bamFile fp;
+       bam_header_t *h;
+       char *ref;
+       faidx_t *fai;
+       errmod_t *em;
+} ct_t;
+
+static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
+{
+       int i, j, ret, tmp, k, sum[4], qual;
+       float q[16];
+       if (n > g->max_bases) { // enlarge g->bases
+               g->max_bases = n;
+               kroundup32(g->max_bases);
+               g->bases = realloc(g->bases, g->max_bases * 2);
+       }
+       for (i = k = 0; i < n; ++i) {
+               const bam_pileup1_t *p = plp + i;
+               uint8_t *seq;
+               int q, baseQ, b;
+               if (p->is_refskip || p->is_del) continue;
+               baseQ = bam1_qual(p->b)[p->qpos];
+               if (baseQ < g->min_baseQ) continue;
+               seq = bam1_seq(p->b);
+               b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+               if (b > 3) continue;
+               q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+               g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b;
+       }
+       if (k == 0) return 0;
+       errmod_cal(g->em, k, 4, g->bases, q);
+       for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i;
+       for (i = 1; i < 4; ++i) // insertion sort
+               for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
+                       tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
+       qual = (sum[1]>>2) - (sum[0]>>2);
+       k = k < 256? k : 255;
+       ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
+       return ret<<8|k;
+}
+
+static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+{
+       int i, f[2][2], *prev, *curr, *swap_tmp, s;
+       uint8_t *b; // backtrack array
+       b = calloc(l, 1);
+       f[0][0] = f[0][1] = 0;
+       prev = f[0]; curr = f[1];
+       // fill the backtrack matrix
+       for (i = 0; i < l; ++i) {
+               int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
+               int tmp0, tmp1;
+               // compute f[0]
+               tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
+               tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
+               if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
+               else curr[0] = tmp1, b[i] = 1;
+               // compute f[1]
+               tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
+               tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
+               if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
+               else curr[1] = tmp1, b[i] |= 1<<1;
+               // swap
+               swap_tmp = prev; prev = curr; curr = swap_tmp;
+       }
+       // backtrack
+       s = prev[0] > prev[1]? 0 : 1;
+       for (i = l - 1; i > 0; --i) {
+               b[i] |= s<<2;
+               s = b[i]>>s&1;
+       }
+       // print
+       for (i = 0, s = -1; i <= l; ++i) {
+               if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
+                       if (s >= 0) {
+                               int j;
+                               printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
+                               for (j = s; j < i; ++j) {
+                                       int c = cns[j]>>8;
+                                       if (c == 0) putchar('N');
+                                       else putchar("ACGT"[c&3]);
+                               }
+                               putchar('\t');
+                               for (j = s; j < i; ++j)
+                                       putchar(33 + (cns[j]>>8>>2));
+                               putchar('\n');
+                       }
+                       //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
+                       s = -1;
+               } else if ((b[i]>>2&3) && s < 0) s = i;
+       }
+       free(b);
+}
+
+static int read_aln(void *data, bam1_t *b)
+{
+       extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+       ct_t *g = (ct_t*)data;
+       int ret, len;
+       ret = bam_read1(g->fp, b);
+       if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
+               if (b->core.tid != g->tid) { // then load the sequence
+                       free(g->ref);
+                       g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+                       g->tid = b->core.tid;
+               }
+               bam_prob_realn_core(b, g->ref, 1<<1|1);
+       }
+       return ret;
+}
+
+int main_cut_target(int argc, char *argv[])
+{
+       int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
+       const bam_pileup1_t *p;
+       bam_plp_t plp;
+       uint16_t *cns;
+       ct_t g;
+
+       memset(&g, 0, sizeof(ct_t));
+       g.min_baseQ = 13; g.tid = -1;
+       while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+               switch (c) {
+                       case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
+                       case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
+                       case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
+                       case '1': g_param.e[1][1] = atoi(optarg); break;
+                       case '2': g_param.e[1][2] = atoi(optarg); break;
+                       case 'f': g.fai = fai_load(optarg);
+                               if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
+                               break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+               return 1;
+       }
+       l = max_l = 0; cns = 0;
+       g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       g.h = bam_header_read(g.fp);
+       g.em = errmod_init(1 - ERR_DEP);
+       plp = bam_plp_init(read_aln, &g);
+       while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
+               if (tid < 0) break;
+               if (tid != lasttid) { // change of chromosome
+                       if (cns) process_cns(g.h, lasttid, l, cns);
+                       if (max_l < g.h->target_len[tid]) {
+                               max_l = g.h->target_len[tid];
+                               kroundup32(max_l);
+                               cns = realloc(cns, max_l * 2);
+                       }
+                       l = g.h->target_len[tid];
+                       memset(cns, 0, max_l * 2);
+                       lasttid = tid;
+               }
+               cns[pos] = gencns(&g, n, p);
+               lastpos = pos;
+       }
+       process_cns(g.h, lasttid, l, cns);
+       free(cns);
+       bam_header_destroy(g.h);
+       bam_plp_destroy(plp);
+       bam_close(g.fp);
+       if (g.fai) {
+               fai_destroy(g.fai); free(g.ref);
+       }
+       errmod_destroy(g.em);
+       free(g.bases);
+       return 0;
+}
index e3e9a90540cee5a78bb603956db20e7861555c2d..32c07b67b625924230689fca1d16da4f49e5938b 100644 (file)
--- a/errmod.h
+++ b/errmod.h
@@ -12,6 +12,13 @@ typedef struct {
 
 errmod_t *errmod_init(float depcorr);
 void errmod_destroy(errmod_t *em);
+
+/*
+       n: number of bases
+       m: maximum base
+       bases[i]: qual:6, strand:1, base:4
+       q[i*m+j]: phred-scaled likelihood of (i,j)
+ */
 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
 
 #endif
diff --git a/faidx.c b/faidx.c
index dbd8b3e48db4c7a041304b7419fd7ab868badd90..38292a13d73f33e75b12ad02baa3af2f66242bab 100644 (file)
--- a/faidx.c
+++ b/faidx.c
@@ -2,6 +2,7 @@
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
+#include <stdint.h>
 #include "faidx.h"
 #include "khash.h"
 
diff --git a/khash.h b/khash.h
index 1d583efa3b2ff7c91d980383ded2db4aea97f849..a7e80568e0c33bb16c007954a5f25934e306f7df 100644 (file)
--- a/khash.h
+++ b/khash.h
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2008 Genome Research Ltd (GRL).
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
 
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
@@ -23,8 +23,6 @@
    SOFTWARE.
 */
 
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
 /*
   An example:
 
@@ -49,6 +47,14 @@ int main() {
 */
 
 /*
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
   2008-09-19 (0.2.3):
 
        * Corrected the example
@@ -88,17 +94,35 @@ int main() {
   @copyright Heng Li
  */
 
-#define AC_VERSION_KHASH_H "0.2.2"
+#define AC_VERSION_KHASH_H "0.2.5"
 
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
+
+/* compipler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
 
-typedef uint32_t khint_t;
+typedef khint32_t khint_t;
 typedef khint_t khiter_t;
 
 #define __ac_HASH_PRIME_SIZE 32
-static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
+static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
 {
   0ul,          3ul,          11ul,         23ul,         53ul,
   97ul,         193ul,        389ul,        769ul,        1543ul,
@@ -119,17 +143,32 @@ static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
 
 static const double __ac_HASH_UPPER = 0.77;
 
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+#define KHASH_DECLARE(name, khkey_t, khval_t)                                                  \
+       typedef struct {                                                                                                        \
+               khint_t n_buckets, size, n_occupied, upper_bound;                               \
+               khint32_t *flags;                                                                                               \
+               khkey_t *keys;                                                                                                  \
+               khval_t *vals;                                                                                                  \
+       } kh_##name##_t;                                                                                                        \
+       extern kh_##name##_t *kh_init_##name();                                                         \
+       extern void kh_destroy_##name(kh_##name##_t *h);                                        \
+       extern void kh_clear_##name(kh_##name##_t *h);                                          \
+       extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key);      \
+       extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+       extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+       extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
        typedef struct {                                                                                                        \
                khint_t n_buckets, size, n_occupied, upper_bound;                               \
-               uint32_t *flags;                                                                                                \
+               khint32_t *flags;                                                                                               \
                khkey_t *keys;                                                                                                  \
                khval_t *vals;                                                                                                  \
        } kh_##name##_t;                                                                                                        \
-       static inline kh_##name##_t *kh_init_##name() {                                         \
+       SCOPE kh_##name##_t *kh_init_##name() {                                                         \
                return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));                \
        }                                                                                                                                       \
-       static inline void kh_destroy_##name(kh_##name##_t *h)                          \
+       SCOPE void kh_destroy_##name(kh_##name##_t *h)                                          \
        {                                                                                                                                       \
                if (h) {                                                                                                                \
                        free(h->keys); free(h->flags);                                                          \
@@ -137,14 +176,14 @@ static const double __ac_HASH_UPPER = 0.77;
                        free(h);                                                                                                        \
                }                                                                                                                               \
        }                                                                                                                                       \
-       static inline void kh_clear_##name(kh_##name##_t *h)                            \
+       SCOPE void kh_clear_##name(kh_##name##_t *h)                                            \
        {                                                                                                                                       \
                if (h && h->flags) {                                                                                    \
-                       memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
+                       memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
                        h->size = h->n_occupied = 0;                                                            \
                }                                                                                                                               \
        }                                                                                                                                       \
-       static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+       SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key)        \
        {                                                                                                                                       \
                if (h->n_buckets) {                                                                                             \
                        khint_t inc, k, i, last;                                                                        \
@@ -158,9 +197,9 @@ static const double __ac_HASH_UPPER = 0.77;
                        return __ac_iseither(h->flags, i)? h->n_buckets : i;            \
                } else return 0;                                                                                                \
        }                                                                                                                                       \
-       static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+       SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
        {                                                                                                                                       \
-               uint32_t *new_flags = 0;                                                                                \
+               khint32_t *new_flags = 0;                                                                               \
                khint_t j = 1;                                                                                                  \
                {                                                                                                                               \
                        khint_t t = __ac_HASH_PRIME_SIZE - 1;                                           \
@@ -168,8 +207,8 @@ static const double __ac_HASH_UPPER = 0.77;
                        new_n_buckets = __ac_prime_list[t+1];                                           \
                        if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
                        else {                                                                                                          \
-                               new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t));     \
-                               memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+                               new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t));   \
+                               memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
                                if (h->n_buckets < new_n_buckets) {                                             \
                                        h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
                                        if (kh_is_map)                                                                          \
@@ -218,7 +257,7 @@ static const double __ac_HASH_UPPER = 0.77;
                        h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
                }                                                                                                                               \
        }                                                                                                                                       \
-       static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+       SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
        {                                                                                                                                       \
                khint_t x;                                                                                                              \
                if (h->n_occupied >= h->upper_bound) {                                                  \
@@ -256,7 +295,7 @@ static const double __ac_HASH_UPPER = 0.77;
                } else *ret = 0;                                                                                                \
                return x;                                                                                                               \
        }                                                                                                                                       \
-       static inline void kh_del_##name(kh_##name##_t *h, khint_t x)           \
+       SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)                           \
        {                                                                                                                                       \
                if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {                 \
                        __ac_set_isdel_true(h->flags, x);                                                       \
@@ -264,24 +303,27 @@ static const double __ac_HASH_UPPER = 0.77;
                }                                                                                                                               \
        }
 
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+       KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
 /* --- BEGIN OF HASH FUNCTIONS --- */
 
 /*! @function
   @abstract     Integer hash function
-  @param  key   The integer [uint32_t]
+  @param  key   The integer [khint32_t]
   @return       The hash value [khint_t]
  */
-#define kh_int_hash_func(key) (uint32_t)(key)
+#define kh_int_hash_func(key) (khint32_t)(key)
 /*! @function
   @abstract     Integer comparison function
  */
 #define kh_int_hash_equal(a, b) ((a) == (b))
 /*! @function
   @abstract     64-bit integer hash function
-  @param  key   The integer [uint64_t]
+  @param  key   The integer [khint64_t]
   @return       The hash value [khint_t]
  */
-#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
 /*! @function
   @abstract     64-bit integer comparison function
  */
@@ -442,7 +484,7 @@ static inline khint_t __ac_X31_hash_string(const char *s)
   @param  name  Name of the hash table [symbol]
  */
 #define KHASH_SET_INIT_INT(name)                                                                               \
-       KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+       KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
 
 /*! @function
   @abstract     Instantiate a hash map containing integer keys
@@ -450,14 +492,14 @@ static inline khint_t __ac_X31_hash_string(const char *s)
   @param  khval_t  Type of values [type]
  */
 #define KHASH_MAP_INIT_INT(name, khval_t)                                                              \
-       KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+       KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
 
 /*! @function
   @abstract     Instantiate a hash map containing 64-bit integer keys
   @param  name  Name of the hash table [symbol]
  */
 #define KHASH_SET_INIT_INT64(name)                                                                             \
-       KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+       KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
 
 /*! @function
   @abstract     Instantiate a hash map containing 64-bit integer keys
@@ -465,7 +507,7 @@ static inline khint_t __ac_X31_hash_string(const char *s)
   @param  khval_t  Type of values [type]
  */
 #define KHASH_MAP_INIT_INT64(name, khval_t)                                                            \
-       KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+       KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
 
 typedef const char *kh_cstr_t;
 /*! @function
index 43d524c92efc38fb7d417759aa4caaabf402aa05..b2a0dab03c36f616c50aa6e5b79709134855db8e 100644 (file)
--- a/kstring.c
+++ b/kstring.c
@@ -29,16 +29,24 @@ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
        const char *p, *start;
        if (sep) { // set up the table
                if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished
-               aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
-               for (p = sep; *p; ++p)
-                       aux->tab[*p/64] |= 1ull<<(*p%64);
+               aux->finished = 0;
+               if (sep[1]) {
+                       aux->sep = -1;
+                       aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+                       for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f);
+               } else aux->sep = sep[0];
+       }
+       if (aux->finished) return 0;
+       else if (str) aux->p = str - 1, aux->finished = 0;
+       if (aux->sep < 0) {
+               for (p = start = aux->p + 1; *p; ++p)
+                       if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
+       } else {
+               for (p = start = aux->p + 1; *p; ++p)
+                       if (*p == aux->sep) break;
        }
-       if (str) aux->p = str - 1, aux->tab[0] &= ~1ull;
-       else if (aux->tab[0]&1) return 0;
-       for (p = start = aux->p + 1; *p; ++p)
-               if (aux->tab[*p/64]>>(*p%64)&1) break;
        aux->p = p; // end of token
-       if (*p == 0) aux->tab[0] |= 1; // no more tokens
+       if (*p == 0) aux->finished = 1; // no more tokens
        return (char*)start;
 }
 
index c46a62bc8bd0284826c1c7dd78ff1698d54f54f7..ec5775b485316c4faaa190a2c501a57fc188985c 100644 (file)
--- a/kstring.h
+++ b/kstring.h
@@ -19,6 +19,7 @@ typedef struct __kstring_t {
 
 typedef struct {
        uint64_t tab[4];
+       int sep, finished;
        const char *p; // end of the current token
 } ks_tokaux_t;
 
diff --git a/phase.c b/phase.c
new file mode 100644 (file)
index 0000000..7f06958
--- /dev/null
+++ b/phase.c
@@ -0,0 +1,678 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL   0x4
+
+typedef struct {
+       // configurations, initialized in the main function
+       int flag, k, min_baseQ, min_varLOD, max_depth;
+       // other global variables
+       int vpos_shift;
+       bamFile fp;
+       char *pre;
+       bamFile out[3];
+       // alignment queue
+       int n, m;
+       bam1_t **b;
+} phaseg_t;
+
+typedef struct {
+       int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
+       int vpos, beg, end;
+       uint32_t vlen:16, single:1, flip:1, phase:1, phased:1;
+} frag_t, *frag_p;
+
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+static inline uint64_t X31_hash_string(const char *s)
+{
+       uint64_t h = *s;
+       if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+       return h;
+}
+
+static void count1(int l, const uint8_t *seq, int *cnt)
+{
+       int i, j, n_ambi;
+       uint32_t z, x;
+       if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous
+       for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases
+               if (seq[i] == 0) ++n_ambi;
+       if (l - n_ambi <= 1) return; // only one SNP
+       for (x = 0; x < 1u<<n_ambi; ++x) { // count
+               for (i = j = 0, z = 0; i < l; ++i) {
+                       int c;
+                       if (seq[i]) c = seq[i] - 1;
+                       else {
+                               c = x>>j&1;
+                               ++j;
+                       }
+                       z = z<<1 | c;
+               }
+               ++cnt[z];
+       }
+}
+
+static int **count_all(int l, int vpos, nseq_t *hash)
+{
+       khint_t k;
+       int i, j, **cnt;
+       uint8_t *seq;
+       seq = calloc(l, 1);
+       cnt = calloc(vpos, sizeof(void*));
+       for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
+                       if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
+                               f->single = 1;
+                               continue;
+                       }
+                       for (j = 1; j < f->vlen; ++j) {
+                               for (i = 0; i < l; ++i)
+                                       seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
+                               count1(l, seq, cnt[f->vpos + j]);
+                       }
+               }
+       }
+       free(seq);
+       return cnt;
+}
+
+// phasing
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+       int *f[2], *curr, *prev, max, i;
+       int8_t **b, *h = 0;
+       uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
+       f[0] = calloc(z, sizeof(int));
+       f[1] = calloc(z, sizeof(int));
+       b = calloc(vpos, sizeof(void*));
+       prev = f[0]; curr = f[1];
+       // fill the backtrack matrix
+       for (i = 0; i < vpos; ++i) {
+               int *wi = w[i], *tmp;
+               int8_t *bi;
+               bi = b[i] = calloc(z, 1);
+               /* In the following, x is the current state, which is the
+                * lexicographically smaller local haplotype. xc is the complement of
+                * x, or the larger local haplotype; y0 and y1 are the two predecessors
+                * of x. */
+               for (x = 0; x < z; ++x) { // x0 is the smaller 
+                       uint32_t y0, y1, xc;
+                       int c0, c1;
+                       xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
+                       c0 = prev[y0] + wi[x] + wi[xc];
+                       c1 = prev[y1] + wi[x] + wi[xc];
+                       if (c0 > c1) bi[x] = 0, curr[x] = c0;
+                       else bi[x] = 1, curr[x] = c1;
+               }
+               tmp = prev; prev = curr; curr = tmp; // swap
+       }
+       { // backtrack
+               uint32_t max_x = 0;
+               int which = 0;
+               h = calloc(vpos, 1);
+               for (x = 0, max = 0, max_x = 0; x < z; ++x)
+                       if (prev[x] > max) max = prev[x], max_x = x;
+               for (i = vpos - 1, x = max_x; i >= 0; --i) {
+                       h[i] = which? (~x&1) : (x&1);
+                       which = b[i][x]? !which : which;
+                       x = b[i][x]? (~x&mask)>>1 : x>>1;
+               }
+       }
+       // free
+       for (i = 0; i < vpos; ++i) free(b[i]);
+       free(f[0]); free(f[1]); free(b);
+       return h;
+}
+
+// phase each fragment
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+       khint_t k;
+       uint64_t *pcnt;
+       uint32_t *left, *rght, max;
+       left = rght = 0; max = 0;
+       pcnt = calloc(vpos, 8);
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       int i, c[2];
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos >= vpos) continue;
+                       // get the phase
+                       c[0] = c[1] = 0;
+                       for (i = 0; i < f->vlen; ++i) {
+                               if (f->seq[i] == 0) continue;
+                               ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+                       }
+                       f->phase = c[0] > c[1]? 0 : 1;
+                       f->phased = c[0] == c[1]? 0 : 1;
+                       // fix chimera
+                       f->flip = 0;
+                       if (flip && c[0] >= 3 && c[1] >= 3) {
+                               int sum[2], m, mi, md;
+                               if (f->vlen > max) { // enlarge the array
+                                       max = f->vlen;
+                                       kroundup32(max);
+                                       left = realloc(left, max * 4);
+                                       rght = realloc(rght, max * 4);
+                               }
+                               for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+                                       if (f->seq[i]) {
+                                               int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                               ++sum[c == path[f->vpos + i]? 0 : 1];
+                                       }
+                                       left[i] = sum[1]<<16 | sum[0];
+                               }
+                               for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+                                       if (f->seq[i]) {
+                                               int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                               ++sum[c == path[f->vpos + i]? 0 : 1];
+                                       }
+                                       rght[i] = sum[1]<<16 | sum[0];
+                               }
+                               // find the best flip point
+                               for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+                                       int a[2];
+                                       a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+                                       a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+                                       if (a[0] > a[1]) {
+                                               if (a[0] > m) m = a[0], md = 0, mi = i;
+                                       } else {
+                                               if (a[1] > m) m = a[1], md = 1, mi = i;
+                                       }
+                               }
+                               if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+                                       f->flip = 1;
+                                       if (md == 0) { // flip the tail
+                                               for (i = mi + 1; i < f->vlen; ++i)
+                                                       if (f->seq[i] == 1) f->seq[i] = 2;
+                                                       else if (f->seq[i] == 2) f->seq[i] = 1;
+                                       } else { // flip the head
+                                               for (i = 0; i <= mi; ++i)
+                                                       if (f->seq[i] == 1) f->seq[i] = 2;
+                                                       else if (f->seq[i] == 2) f->seq[i] = 1;
+                                       }
+                               }
+                       }
+                       // update pcnt[]
+                       if (!f->single) {
+                               for (i = 0; i < f->vlen; ++i) {
+                                       int c;
+                                       if (f->seq[i] == 0) continue;
+                                       c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                       if (c == path[f->vpos + i]) {
+                                               if (f->phase == 0) ++pcnt[f->vpos + i];
+                                               else pcnt[f->vpos + i] += 1ull<<32;
+                                       } else {
+                                               if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+                                               else pcnt[f->vpos + i] += 1ull<<48;
+                                       }
+                               }
+                       }
+               }
+       }
+       free(left); free(rght);
+       return pcnt;
+}
+
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{
+       int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
+       uint64_t *list = 0;
+       for (i = 0; i < vpos; ++i) {
+               uint64_t x = pcnt[i];
+               int c[4], pre = score, s;
+               c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
+               s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1);
+               if (c[3] > c[2]) s += c[3] - c[2];
+               if (c[1] > c[0]) s += c[1] - c[0];
+               score += s;
+               if (score < 0) score = 0;
+               if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+               if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
+                       if (n == m) {
+                               m = m? m<<1 : 4;
+                               list = realloc(list, m * 8);
+                       }
+                       list[n++] = (uint64_t)beg<<32 | max_i;
+                       i = max_i; // reset i to max_i
+                       score = 0;
+               } else if (score > max) max = score, max_i = i;
+               if (score == 0) max = 0;
+       }
+       *_n = n;
+       return list;
+}
+
+// trim heading and tailing ambiguous bases; mark deleted and remove sequence
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+       khint_t k;
+       int ret = 0;
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       int beg, end, i;
+                       if (f->vpos >= vpos) {
+                               ret = 1;
+                               continue;
+                       }
+                       for (i = 0; i < f->vlen; ++i)
+                               if (f->seq[i] != 0) break;
+                       beg = i;
+                       for (i = f->vlen - 1; i >= 0; --i)
+                               if (f->seq[i] != 0) break;
+                       end = i + 1;
+                       if (end - beg <= 0) kh_del(64, hash, k);
+                       else {
+                               if (beg != 0) memmove(f->seq, f->seq + beg, end - beg);
+                               f->vpos += beg; f->vlen = end - beg;
+                               f->single = f->vlen == 1? 1 : 0;
+                       }
+               }
+       }
+       return ret;
+}
+
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{
+       int i, is_flip;
+       is_flip = (drand48() < 0.5);
+       for (i = 0; i < g->n; ++i) {
+               int end, which;
+               uint64_t key;
+               khint_t k;
+               bam1_t *b = g->b[i];
+               key = X31_hash_string(bam1_qname(b));
+               end = bam_calend(&b->core, bam1_cigar(b));
+               if (end > min_pos) break;
+               k = kh_get(64, hash, key);
+               if (k == kh_end(hash)) which = 3;
+               else {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->phased && f->flip) which = 2;
+                       else if (f->phased == 0) which = 3;
+                       else { // phased and not flipped
+                               char c = 'Y';
+                               which = f->phase;
+                               bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
+                       }
+                       if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+               }
+               if (which == 3) which = (drand48() < 0.5);
+               bam_write1(g->out[which], b);
+               bam_destroy1(b);
+               g->b[i] = 0;
+       }
+       memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
+       g->n -= i;
+}
+
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{
+       int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+       khint_t k;
+       frag_t **seqs;
+       int8_t *path, *sitemask;
+       uint64_t *pcnt, *regmask;
+
+       if (vpos == 0) return 0;
+       i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+       min_pos = i? cns[vpos]>>32 : 0x7fffffff;
+       if (vpos == 1) {
+               printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+               printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+                       "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+               for (k = 0; k < kh_end(hash); ++k) {
+                       if (kh_exist(hash, k)) {
+                               frag_t *f = &kh_val(hash, k);
+                               if (f->vpos) continue;
+                               f->flip = 0;
+                               if (f->seq[0] == 0) f->phased = 0;
+                               else f->phased = 1, f->phase = f->seq[0] - 1;
+                       }
+               }
+               dump_aln(g, min_pos, hash);
+               ++g->vpos_shift;
+               return 1;
+       }
+       { // phase
+               int **cnt;
+               uint64_t *mask;
+               printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+               sitemask = calloc(vpos, 1);
+               cnt = count_all(g->k, vpos, hash);
+               path = dynaprog(g->k, vpos, cnt);
+               for (i = 0; i < vpos; ++i) free(cnt[i]);
+               free(cnt);
+               pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+               mask = genmask(vpos, pcnt, &n_masked);
+               regmask = calloc(n_masked, 8);
+               for (i = 0; i < n_masked; ++i) {
+                       regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
+                       for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+                               sitemask[j] = 1;
+               }
+               free(mask);
+               if (g->flag & FLAG_FIX_CHIMERA) {
+                       free(pcnt);
+                       pcnt = fragphase(vpos, path, hash, 1);
+               }
+       }
+       for (i = 0; i < n_masked; ++i)
+               printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+       for (i = 0; i < vpos; ++i) {
+               uint64_t x = pcnt[i];
+               int8_t c[2];
+               c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
+               c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+               printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32), (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+                       i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+       }
+       free(path); free(pcnt); free(regmask); free(sitemask);
+       seqs = calloc(n_seqs, sizeof(void*));
+       for (k = 0, i = 0; k < kh_end(hash); ++k) 
+               if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+                       seqs[i++] = &kh_val(hash, k);
+       n_seqs = i;
+       ks_introsort_rseq(n_seqs, seqs);
+       for (i = 0; i < n_seqs; ++i) {
+               frag_t *f = seqs[i];
+               printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+               for (j = 0; j < f->vlen; ++j) {
+                       uint32_t c = cns[f->vpos + j];
+                       if (f->seq[j] == 0) putchar('N');
+                       else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+               }
+               printf("\t*\tYP:i:%d\tYF:i:%d\n", f->phase, f->flip);
+       }
+       free(seqs);
+       printf("//\n");
+       fflush(stdout);
+       g->vpos_shift += vpos;
+       dump_aln(g, min_pos, hash);
+       return vpos;
+}
+
+static void update_vpos(int vpos, nseq_t *hash)
+{
+       khint_t k;
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it
+                       else f->vpos -= vpos;
+               }
+       }
+}
+
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
+{
+       return hash;
+}
+
+static int readaln(void *data, bam1_t *b)
+{
+       phaseg_t *g = (phaseg_t*)data;
+       int ret;
+       ret = bam_read1(g->fp, b);
+       if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
+               if (g->n == g->m) {
+                       g->m = g->m? g->m<<1 : 16;
+                       g->b = realloc(g->b, g->m * sizeof(void*));
+               }
+               g->b[g->n++] = bam_dup1(b);
+       }
+       return ret;
+}
+
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{
+       gzFile fp;
+       kstream_t *ks;
+       int ret, dret;
+       kstring_t *str;
+       khash_t(set64) *hash;
+
+       hash = kh_init(set64);
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               int tid = bam_get_tid(h, str->s);
+               if (tid >= 0 && dret != '\n') {
+                       if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+                               uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
+                               kh_put(set64, hash, x, &ret);
+                       } else break;
+               }
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (dret < 0) break;
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       return hash;
+}
+
+static int gl2cns(float q[16])
+{
+       int i, j, min_ij;
+       float min, min2;
+       min = min2 = 1e30; min_ij = -1;
+       for (i = 0; i < 4; ++i) {
+               for (j = i; j < 4; ++j) {
+                       if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j];
+                       else if (q[i<<2|j] < min2) min2 = q[i<<2|j];
+               }
+       }
+       return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
+}
+
+int main_phase(int argc, char *argv[])
+{
+       extern void bam_init_header_hash(bam_header_t *header);
+       int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+       const bam_pileup1_t *plp;
+       bam_plp_t iter;
+       bam_header_t *h;
+       nseq_t *seqs;
+       uint64_t *cns = 0;
+       phaseg_t g;
+       char *fn_list = 0;
+       khash_t(set64) *set = 0;
+       errmod_t *em;
+       uint16_t *bases;
+
+       memset(&g, 0, sizeof(phaseg_t));
+       g.flag = FLAG_FIX_CHIMERA;
+       g.min_varLOD = 40; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+       while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:")) >= 0) {
+               switch (c) {
+                       case 'D': g.max_depth = atoi(optarg); break;
+                       case 'q': g.min_varLOD = atoi(optarg); break;
+                       case 'Q': g.min_baseQ = atoi(optarg); break;
+                       case 'k': g.k = atoi(optarg); break;
+                       case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+                       case 'e': g.flag |= FLAG_LIST_EXCL; break;
+                       case 'b': g.pre = strdup(optarg); break;
+                       case 'l': fn_list = strdup(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   samtools phase [options] <in.bam>\n\n");
+               fprintf(stderr, "Options: -k INT    block length [%d]\n", g.k);
+               fprintf(stderr, "         -b STR    prefix of BAMs to output [null]\n");
+               fprintf(stderr, "         -q INT    min het phred-LOD [%d]\n", g.min_varLOD);
+               fprintf(stderr, "         -Q INT    min base quality in het calling [%d]\n", g.min_baseQ);
+               fprintf(stderr, "         -D INT    max read depth [%d]\n", g.max_depth);
+//             fprintf(stderr, "         -l FILE   list of sites to phase [null]\n");
+               fprintf(stderr, "         -F        do not attempt to fix chimeras\n");
+//             fprintf(stderr, "         -e        do not discover SNPs (effective with -l)\n");
+               fprintf(stderr, "\n");
+               return 1;
+       }
+       g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       h = bam_header_read(g.fp);
+       if (fn_list) { // read the list of sites to phase
+               bam_init_header_hash(h);
+               set = loadpos(fn_list, h);
+               free(fn_list);
+       } else g.flag &= ~FLAG_LIST_EXCL;
+       if (g.pre) { // open BAMs to write
+               char *s = malloc(strlen(g.pre) + 20);
+               strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+               strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+               strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+               for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+               free(s);
+       }
+
+       iter = bam_plp_init(readaln, &g);
+       g.vpos_shift = 0;
+       seqs = kh_init(64);
+       em = errmod_init(1. - 0.83);
+       bases = calloc(g.max_depth, 2);
+       printf("CC\n");
+       printf("CC\tDescriptions:\nCC\n");
+       printf("CC\t  CC      comments\n");
+       printf("CC\t  PS      start of a phase set\n");
+       printf("CC\t  FL      filtered region\n");
+       printf("CC\t  M[012]  markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+       printf("CC\t  EV      supporting reads; SAM format\n");
+       printf("CC\t  //      end of a phase set\nCC\n");
+       printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+       printf("CC\t  PS  chr  phaseSetStart  phaseSetEnd\n");
+       printf("CC\t  FL  chr  filterStart    filterEnd\n");
+       printf("CC\t  M?  chr  PS  pos  allele0  allele1  hetIndex  #supports0  #errors0  #supp1  #err1\n");
+       printf("CC\nCC\n");
+       fflush(stdout);
+       while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+               int i, k, c, tmp, dophase = 1, in_set = 0;
+               float q[16];
+               if (tid < 0) break;
+               if (tid != lasttid) { // change of chromosome
+                       g.vpos_shift = 0;
+                       if (lasttid >= 0) {
+                               seqs = shrink_hash(seqs);
+                               phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+                               update_vpos(0x7fffffff, seqs);
+                       }
+                       lasttid = tid;
+                       vpos = 0;
+               }
+               if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+               if (n > g.max_depth) continue; // do not proceed if the depth is too high
+               // fill the bases array and check if there is a variant
+               for (i = k = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = plp + i;
+                       uint8_t *seq;
+                       int q, baseQ, b;
+                       if (p->is_del || p->is_refskip) continue;
+                       baseQ = bam1_qual(p->b)[p->qpos];
+                       if (baseQ < g.min_baseQ) continue;
+                       seq = bam1_seq(p->b);
+                       b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+                       if (b > 3) continue;
+                       q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+                       if (q < 4) q = 4;
+                       if (q > 63) q = 63;
+                       bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+               }
+               if (k == 0) continue;
+               errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+               c = gl2cns(q); // get the consensus
+               // tell if to proceed
+               if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+               if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+               // add the variant
+               if (vpos == max_vpos) {
+                       max_vpos = max_vpos? max_vpos<<1 : 128;
+                       cns = realloc(cns, max_vpos * 8);
+               }
+               cns[vpos] = (uint64_t)pos<<32 | c;
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = plp + i;
+                       uint64_t key;
+                       khint_t k;
+                       uint8_t *seq = bam1_seq(p->b);
+                       frag_t *f;
+                       if (p->is_del || p->is_refskip) continue;
+                       if (p->b->core.qual == 0) continue;
+                       // get the base code
+                       c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+                       if (c == (cns[vpos]&3)) c = 1;
+                       else if (c == (cns[vpos]>>16&3)) c = 2;
+                       else c = 0;
+                       // write to seqs
+                       key = X31_hash_string(bam1_qname(p->b));
+                       k = kh_put(64, seqs, key, &tmp);
+                       f = &kh_val(seqs, k);
+                       if (tmp == 0) { // present in the hash table
+                               if (vpos - f->vpos + 1 < MAX_VARS) {
+                                       f->vlen = vpos - f->vpos + 1;
+                                       f->seq[f->vlen-1] = c;
+                                       f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                               }
+                               dophase = 0;
+                       } else { // absent
+                               memset(f->seq, 0, MAX_VARS);
+                               f->beg = p->b->core.pos;
+                               f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                               f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = 0;
+                       }
+               }
+               if (dophase) {
+                       seqs = shrink_hash(seqs);
+                       phase(&g, h->target_name[tid], vpos, cns, seqs);
+                       update_vpos(vpos, seqs);
+                       cns[0] = cns[vpos];
+                       vpos = 0;
+               }
+               ++vpos;
+       }
+       if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
+       bam_header_destroy(h);
+       bam_plp_destroy(iter);
+       bam_close(g.fp);
+       kh_destroy(64, seqs);
+       kh_destroy(set64, set);
+       free(cns);
+       errmod_destroy(em);
+       free(bases);
+       if (g.pre) {
+               for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+               free(g.pre); free(g.b);
+       }
+       return 0;
+}
index 57f1aff50065ef54c470436108a5c69da8c73d84..8d299b78dc079b6961e14137324ecad32e4b66d0 100644 (file)
@@ -137,7 +137,7 @@ viewing the same reference sequence.
 
 .TP
 .B mpileup
-samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
+samtools mpileup [-EBug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
 
 Generate BCF or pileup for one or multiple BAM files. Alignment records
 are grouped by sample identifiers in @RG header lines. If sample
@@ -164,6 +164,10 @@ Phred-scaled gap extension sequencing error probability. Reducing
 .I INT
 leads to longer indels. [20]
 .TP
+.B -E
+Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt
+specificity a little bit.
+.TP
 .BI -f \ FILE
 The reference file [null]
 .TP
@@ -355,7 +359,7 @@ Treat paired-end reads and single-end reads.
 
 .TP
 .B calmd
-samtools calmd [-eubSr] [-C capQcoef] <aln.bam> <ref.fasta>
+samtools calmd [-EeubSr] [-C capQcoef] <aln.bam> <ref.fasta>
 
 Generate the MD tag. If the MD tag is already present, this command will
 give a warning if the MD tag generated is different from the existing
@@ -388,7 +392,55 @@ Coefficient to cap mapping quality of poorly mapped reads. See the
 command for details. [0]
 .TP
 .B -r
-Compute the BQ tag without changing the base quality.
+Compute the BQ tag (without -A) or cap base quality by BAQ (with -A).
+.TP
+.B -E
+Extended BAQ calculation. This option trades specificity for sensitivity, though the
+effect is minor.
+.RE
+
+.TP
+.B targetcut
+samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>
+
+This command identifies target regions by examining the continuity of read depth, computes
+haploid consensus sequences of targets and outputs a SAM with each sequence corresponding
+to a target. When option
+.B -f
+is in use, BAQ will be applied. This command is
+.B only
+designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)].
+.RE
+
+.TP
+.B phase
+samtools phase [-F] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] <in.bam>
+
+Call and phase heterozygous SNPs.
+.B OPTIONS:
+.RS
+.TP 8
+.BI -b \ STR
+Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file
+.BR STR .0.bam
+and phase-1 reads in
+.BR STR .1.bam.
+Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads
+with switch errors will be saved in
+.BR STR .chimeric.bam.
+[null]
+.TP
+.B -F
+Do not attempt to fix chimeric reads.
+.TP
+.BI -k \ INT
+Maximum length for local phasing. [13]
+.TP
+.BI -q \ INT
+Minimum Phred-scaled LOD to call a heterozygote. [40]
+.TP
+.BI -Q \ INT
+Minimum base quality to be used in het calling. [13]
 .RE
 
 .TP
@@ -629,6 +681,20 @@ mismatches. Applying this option usually helps
 .B BWA-short
 but may not other mappers.
 
+.IP o 2
+Generate the consensus sequence for one diploid individual:
+
+  samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq
+
+.IP o 2
+Phase one individual:
+
+  samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out
+
+The
+.B calmd
+command is used to reduce false heterozygotes around INDELs.
+
 .IP o 2
 Call SNPs and short indels for multiple diploid individuals: