--- /dev/null
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+#include "errmod.h"
+#include "faidx.h"
+
+#define ERR_DEP 0.83f
+
+/* Scoring parameters for the 2-state target-cut HMM: e[state][obs] are
+ * emission scores, p[from][to] are transition scores (log-odds-like ints). */
+typedef struct {
+ int e[2][3], p[2][2];
+} score_param_t;
+
+/* Note that although the two matrices have 10 parameters in total, only 4
+ * (probably 3) are free. Changing the scoring matrices in a sort of symmetric
+ * way will not change the result. */
+static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
+
+/* Shared state for the targetcut pileup pass. */
+typedef struct {
+ int min_baseQ, tid, max_bases; // base-quality cutoff; current ref tid; capacity of bases[]
+ uint16_t *bases; // packed base/strand/qual codes for one pileup column
+ bamFile fp;
+ bam_header_t *h;
+ char *ref; // cached reference sequence for tid (NULL unless -f given)
+ faidx_t *fai;
+ errmod_t *em; // error model used by errmod_cal()
+} ct_t;
+
+/* Compute a consensus code for one pileup column of n reads.
+ * Returns 0 when no read passes the filters; otherwise the low 8 bits are
+ * the number of used reads (capped at 255) and the high 8 bits are
+ * qual<<2|base, where base is the 2-bit consensus base and qual (<=63) is
+ * the score gap between the best and second-best homozygous genotypes
+ * reported by errmod_cal() (see errmod.h). */
+static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
+{
+ int i, j, ret, tmp, k, sum[4], qual;
+ float q[16]; // genotype likelihood matrix filled by errmod_cal()
+ if (n > g->max_bases) { // enlarge g->bases
+ g->max_bases = n;
+ kroundup32(g->max_bases);
+ g->bases = realloc(g->bases, g->max_bases * 2);
+ }
+ for (i = k = 0; i < n; ++i) {
+ const bam_pileup1_t *p = plp + i;
+ uint8_t *seq;
+ int q, baseQ, b; // NOTE: this q deliberately shadows the outer float q[16] inside the loop
+ if (p->is_refskip || p->is_del) continue;
+ baseQ = bam1_qual(p->b)[p->qpos];
+ if (baseQ < g->min_baseQ) continue;
+ seq = bam1_seq(p->b);
+ b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+ if (b > 3) continue; // skip ambiguous bases
+ q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; // min(baseQ, mapQ)
+ g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; // packed code expected by errmod_cal()
+ }
+ if (k == 0) return 0;
+ errmod_cal(g->em, k, 4, g->bases, q);
+ for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; // homozygous scores, base kept in low 2 bits
+ for (i = 1; i < 4; ++i) // insertion sort
+ for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
+ tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
+ qual = (sum[1]>>2) - (sum[0]>>2); // gap between best and 2nd-best
+ k = k < 256? k : 255;
+ ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
+ return ret<<8|k;
+}
+
+/* Viterbi-decode a 2-state HMM over the consensus array cns[0..l-1]
+ * (observation classes: 0 = no data, 1 = depth only, 2 = consensus call)
+ * and print each maximal state-1 segment as one SAM-like line with the
+ * consensus sequence and per-base qualities. */
+static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+{
+ int i, f[2][2], *prev, *curr, *swap_tmp, s;
+ uint8_t *b; // backtrack array
+ b = calloc(l, 1);
+ f[0][0] = f[0][1] = 0;
+ prev = f[0]; curr = f[1];
+ // fill the backtrack matrix
+ for (i = 0; i < l; ++i) {
+ int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
+ int tmp0, tmp1;
+ // compute f[0]
+ tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
+ tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
+ if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
+ else curr[0] = tmp1, b[i] = 1;
+ // compute f[1]
+ tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
+ tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
+ if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
+ else curr[1] = tmp1, b[i] |= 1<<1;
+ // swap
+ swap_tmp = prev; prev = curr; curr = swap_tmp;
+ }
+ // backtrack; the decoded state is stashed in bits 2-3 of b[i]
+ s = prev[0] > prev[1]? 0 : 1;
+ for (i = l - 1; i > 0; --i) {
+ b[i] |= s<<2;
+ s = b[i]>>s&1;
+ }
+ // print; s is reused as the (0-based) start of the current segment, -1 = none open
+ for (i = 0, s = -1; i <= l; ++i) {
+ if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { // b[i] is not read when i == l (short-circuit)
+ if (s >= 0) {
+ int j;
+ printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
+ for (j = s; j < i; ++j) {
+ int c = cns[j]>>8;
+ if (c == 0) putchar('N');
+ else putchar("ACGT"[c&3]);
+ }
+ putchar('\t');
+ for (j = s; j < i; ++j)
+ putchar(33 + (cns[j]>>8>>2)); // consensus qual, Phred+33
+ putchar('\n');
+ }
+ //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
+ s = -1;
+ } else if ((b[i]>>2&3) && s < 0) s = i; // open a new segment
+ }
+ free(b);
+}
+
+/* bam_plp read callback: fetch the next alignment and, when a reference
+ * is available (-f), apply BAQ realignment via bam_prob_realn_core().
+ * The reference sequence is (re)loaded whenever the target id changes. */
+static int read_aln(void *data, bam1_t *b)
+{
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+ ct_t *g = (ct_t*)data;
+ int ret, len;
+ ret = bam_read1(g->fp, b);
+ if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
+ if (b->core.tid != g->tid) { // then load the sequence
+ free(g->ref);
+ g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+ g->tid = b->core.tid;
+ }
+ bam_prob_realn_core(b, g->ref, 1<<1|1); // flag=3; presumably "extended BAQ + apply" — confirm against bam_prob_realn_core
+ }
+ return ret;
+}
+
+/* samtools targetcut entry point: parse options, pile up the input BAM,
+ * store a per-position consensus code per chromosome, and emit target
+ * segments via process_cns() at each chromosome change and at EOF. */
+int main_cut_target(int argc, char *argv[])
+{
+ int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
+ const bam_pileup1_t *p;
+ bam_plp_t plp;
+ uint16_t *cns;
+ ct_t g;
+
+ memset(&g, 0, sizeof(ct_t));
+ g.min_baseQ = 13; g.tid = -1;
+ while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+ switch (c) {
+ case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
+ case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
+ case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
+ case '1': g_param.e[1][1] = atoi(optarg); break;
+ case '2': g_param.e[1][2] = atoi(optarg); break;
+ case 'f': g.fai = fai_load(optarg);
+ if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
+ break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+ return 1;
+ }
+ l = max_l = 0; cns = 0;
+ g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+ g.h = bam_header_read(g.fp);
+ g.em = errmod_init(1 - ERR_DEP);
+ plp = bam_plp_init(read_aln, &g);
+ while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
+ if (tid < 0) break;
+ if (tid != lasttid) { // change of chromosome
+ if (cns) process_cns(g.h, lasttid, l, cns); // flush the previous chromosome
+ if (max_l < g.h->target_len[tid]) {
+ max_l = g.h->target_len[tid];
+ kroundup32(max_l);
+ cns = realloc(cns, max_l * 2); // uint16_t per position
+ }
+ l = g.h->target_len[tid];
+ memset(cns, 0, max_l * 2);
+ lasttid = tid;
+ }
+ cns[pos] = gencns(&g, n, p);
+ lastpos = pos;
+ }
+ process_cns(g.h, lasttid, l, cns); // flush the last chromosome
+ free(cns);
+ bam_header_destroy(g.h);
+ bam_plp_destroy(plp);
+ bam_close(g.fp);
+ if (g.fai) {
+ fai_destroy(g.fai); free(g.ref);
+ }
+ errmod_destroy(g.em);
+ free(g.bases);
+ return 0;
+}
/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
SOFTWARE.
*/
-/* Contact: Heng Li <lh3@sanger.ac.uk> */
-
/*
An example:
*/
/*
+ 2011-02-14 (0.2.5):
+
+ * Allow to declare global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
2008-09-19 (0.2.3):
* Corrected the example
@copyright Heng Li
*/
-#define AC_VERSION_KHASH_H "0.2.2"
+#define AC_VERSION_KHASH_H "0.2.5"
-#include <stdint.h>
#include <stdlib.h>
#include <string.h>
+#include <limits.h>
+
+/* compiler-specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifdef _MSC_VER
+#define inline __inline
+#endif
-typedef uint32_t khint_t;
+typedef khint32_t khint_t;
typedef khint_t khiter_t;
#define __ac_HASH_PRIME_SIZE 32
-static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
+static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
0ul, 3ul, 11ul, 23ul, 53ul,
97ul, 193ul, 389ul, 769ul, 1543ul,
static const double __ac_HASH_UPPER = 0.77;
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ typedef struct { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t; \
+ extern kh_##name##_t *kh_init_##name(); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
- uint32_t *flags; \
+ khint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
- static inline kh_##name##_t *kh_init_##name() { \
+ SCOPE kh_##name##_t *kh_init_##name() { \
return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \
} \
- static inline void kh_destroy_##name(kh_##name##_t *h) \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
free(h->keys); free(h->flags); \
free(h); \
} \
} \
- static inline void kh_clear_##name(kh_##name##_t *h) \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
- memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
+ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
- static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t inc, k, i, last; \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
- static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ \
- uint32_t *new_flags = 0; \
+ khint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
new_n_buckets = __ac_prime_list[t+1]; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
else { \
- new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
- memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+ new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
+ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \
if (h->n_buckets < new_n_buckets) { \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
} \
- static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { \
} else *ret = 0; \
return x; \
} \
- static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
} \
}
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
/* --- BEGIN OF HASH FUNCTIONS --- */
/*! @function
@abstract Integer hash function
- @param key The integer [uint32_t]
+ @param key The integer [khint32_t]
@return The hash value [khint_t]
*/
-#define kh_int_hash_func(key) (uint32_t)(key)
+#define kh_int_hash_func(key) (khint32_t)(key)
/*! @function
@abstract Integer comparison function
*/
#define kh_int_hash_equal(a, b) ((a) == (b))
/*! @function
@abstract 64-bit integer hash function
- @param key The integer [uint64_t]
+ @param key The integer [khint64_t]
@return The hash value [khint_t]
*/
-#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
/*! @function
@abstract 64-bit integer comparison function
*/
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing integer keys
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param name Name of the hash table [symbol]
*/
#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
/*! @function
@abstract Instantiate a hash map containing 64-bit integer keys
@param khval_t Type of values [type]
*/
#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
/*! @function
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL 0x4
+
+/* Global state for samtools phase. */
+typedef struct {
+ // configurations, initialized in the main function
+ int flag, k, min_baseQ, min_varLOD, max_depth;
+ // other global variables
+ int vpos_shift; // number of het sites already consumed (for 1-based het indices)
+ bamFile fp;
+ char *pre; // -b prefix for the output BAMs, NULL if not writing
+ bamFile out[3]; // phase-0, phase-1 and chimera output BAMs
+ // alignment queue
+ int n, m; // used/allocated size of b[]
+ bam1_t **b;
+} phaseg_t;
+
+/* One read (fragment) reduced to its het-site alleles. */
+typedef struct {
+ int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
+ int vpos, beg, end; // first het index; reference begin/end of the read
+ uint32_t vlen:16, single:1, flip:1, phase:1, phased:1;
+} frag_t, *frag_p;
+
+// sort fragments by their first het-site index
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+/* X31 (Kernighan/Ritchie-style) string hash, used to key reads by name. */
+static inline uint64_t X31_hash_string(const char *s)
+{
+ uint64_t h = *s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; // h = h*31 + *s
+ return h;
+}
+
+/* Accumulate into cnt[] the counts of all l-site local haplotypes
+ * consistent with seq[] (codes: 0 = ambiguous, 1/2 = the two alleles),
+ * enumerating every assignment of the ambiguous positions. */
+static void count1(int l, const uint8_t *seq, int *cnt)
+{
+ int i, j, n_ambi;
+ uint32_t z, x;
+ if (seq[l-1] == 0) return; // do nothing if the last base is ambiguous
+ for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases
+ if (seq[i] == 0) ++n_ambi;
+ if (l - n_ambi <= 1) return; // only one SNP
+ for (x = 0; x < 1u<<n_ambi; ++x) { // count; x enumerates assignments of the ambiguous sites
+ for (i = j = 0, z = 0; i < l; ++i) {
+ int c;
+ if (seq[i]) c = seq[i] - 1;
+ else {
+ c = x>>j&1;
+ ++j;
+ }
+ z = z<<1 | c; // z is the l-bit haplotype code
+ }
+ ++cnt[z];
+ }
+}
+
+/* Build, for every het site in [0,vpos), the counts of the l-site local
+ * haplotypes ending at that site, summed over all multi-site fragments in
+ * hash. Returns a vpos-element array of (1<<l)-element count vectors;
+ * the caller frees each row and the array. */
+static int **count_all(int l, int vpos, nseq_t *hash)
+{
+ khint_t k;
+ int i, j, **cnt;
+ uint8_t *seq;
+ seq = calloc(l, 1);
+ cnt = calloc(vpos, sizeof(void*));
+ for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
+ if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
+ f->single = 1;
+ continue;
+ }
+ for (j = 1; j < f->vlen; ++j) {
+ for (i = 0; i < l; ++i) // window of l sites ending at fragment offset j, 0-padded on the left
+ seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
+ count1(l, seq, cnt[f->vpos + j]);
+ }
+ }
+ }
+ free(seq);
+ return cnt;
+}
+
+// phasing
+/* Dynamic programming over local haplotypes of length l: states are the
+ * 2^(l-1) lexicographically-smaller haplotypes; w[i] is the count vector
+ * for site i from count_all(). Returns a vpos-element 0/1 path (the phase
+ * of each site); the caller frees it. */
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+ int *f[2], *curr, *prev, max, i;
+ int8_t **b, *h = 0;
+ uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
+ f[0] = calloc(z, sizeof(int));
+ f[1] = calloc(z, sizeof(int));
+ b = calloc(vpos, sizeof(void*));
+ prev = f[0]; curr = f[1];
+ // fill the backtrack matrix
+ for (i = 0; i < vpos; ++i) {
+ int *wi = w[i], *tmp;
+ int8_t *bi;
+ bi = b[i] = calloc(z, 1);
+ /* In the following, x is the current state, which is the
+ * lexicographically smaller local haplotype. xc is the complement of
+ * x, or the larger local haplotype; y0 and y1 are the two predecessors
+ * of x. */
+ for (x = 0; x < z; ++x) { // x0 is the smaller
+ uint32_t y0, y1, xc;
+ int c0, c1;
+ xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
+ c0 = prev[y0] + wi[x] + wi[xc];
+ c1 = prev[y1] + wi[x] + wi[xc];
+ if (c0 > c1) bi[x] = 0, curr[x] = c0;
+ else bi[x] = 1, curr[x] = c1;
+ }
+ tmp = prev; prev = curr; curr = tmp; // swap
+ }
+ { // backtrack
+ uint32_t max_x = 0;
+ int which = 0;
+ h = calloc(vpos, 1);
+ for (x = 0, max = 0, max_x = 0; x < z; ++x)
+ if (prev[x] > max) max = prev[x], max_x = x;
+ for (i = vpos - 1, x = max_x; i >= 0; --i) {
+ h[i] = which? (~x&1) : (x&1);
+ which = b[i][x]? !which : which; // b[i][x]==1 means we came from the complement strand
+ x = b[i][x]? (~x&mask)>>1 : x>>1;
+ }
+ }
+ // free
+ for (i = 0; i < vpos; ++i) free(b[i]);
+ free(f[0]); free(f[1]); free(b);
+ return h;
+}
+
+// phase each fragment
+/* Assign a phase to every fragment given the site path from dynaprog();
+ * optionally (flip!=0) detect and fix chimeric fragments by flipping the
+ * head or tail at the best breakpoint. Returns a vpos-element array of
+ * packed per-site support/error counts for the two phases
+ * (bits 0-15: #support phase0, 16-31: #errors phase0, 32-47: #support
+ * phase1, 48-63: #errors phase1); the caller frees it. */
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+ khint_t k;
+ uint64_t *pcnt;
+ uint32_t *left, *rght, max;
+ left = rght = 0; max = 0;
+ pcnt = calloc(vpos, 8);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ int i, c[2];
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos >= vpos) continue;
+ // get the phase
+ c[0] = c[1] = 0;
+ for (i = 0; i < f->vlen; ++i) {
+ if (f->seq[i] == 0) continue;
+ ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+ }
+ f->phase = c[0] > c[1]? 0 : 1;
+ f->phased = c[0] == c[1]? 0 : 1; // a tie leaves the fragment unphased
+ // fix chimera
+ f->flip = 0;
+ if (flip && c[0] >= 3 && c[1] >= 3) { // substantial support for both phases
+ int sum[2], m, mi, md;
+ if (f->vlen > max) { // enlarge the array
+ max = f->vlen;
+ kroundup32(max);
+ left = realloc(left, max * 4);
+ rght = realloc(rght, max * 4);
+ }
+ for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ left[i] = sum[1]<<16 | sum[0];
+ }
+ for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+ if (f->seq[i]) {
+ int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ ++sum[c == path[f->vpos + i]? 0 : 1];
+ }
+ rght[i] = sum[1]<<16 | sum[0];
+ }
+ // find the best flip point
+ for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+ int a[2];
+ a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+ a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+ if (a[0] > a[1]) {
+ if (a[0] > m) m = a[0], md = 0, mi = i;
+ } else {
+ if (a[1] > m) m = a[1], md = 1, mi = i;
+ }
+ }
+ if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+ f->flip = 1;
+ if (md == 0) { // flip the tail
+ for (i = mi + 1; i < f->vlen; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ } else { // flip the head
+ for (i = 0; i <= mi; ++i)
+ if (f->seq[i] == 1) f->seq[i] = 2;
+ else if (f->seq[i] == 2) f->seq[i] = 1;
+ }
+ }
+ }
+ // update pcnt[]
+ if (!f->single) {
+ for (i = 0; i < f->vlen; ++i) {
+ int c;
+ if (f->seq[i] == 0) continue;
+ c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+ if (c == path[f->vpos + i]) { // supporting the phased haplotype
+ if (f->phase == 0) ++pcnt[f->vpos + i];
+ else pcnt[f->vpos + i] += 1ull<<32;
+ } else { // contradicting it
+ if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+ else pcnt[f->vpos + i] += 1ull<<48;
+ }
+ }
+ }
+ }
+ }
+ free(left); free(rght);
+ return pcnt;
+}
+
+/* Scan the per-site counts from fragphase() and collect regions with an
+ * accumulated error score >= MASK_THRES to be masked/filtered. Returns an
+ * array of n (stored in *_n) packed intervals (beg<<32|end, het indices);
+ * the caller frees it. */
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{
+ int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
+ uint64_t *list = 0;
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int c[4], pre = score, s;
+ c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
+ s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); // error-heavy sites raise the score
+ if (c[3] > c[2]) s += c[3] - c[2];
+ if (c[1] > c[0]) s += c[1] - c[0];
+ score += s;
+ if (score < 0) score = 0; // Kadane-style reset
+ if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+ if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
+ if (n == m) {
+ m = m? m<<1 : 4;
+ list = realloc(list, m * 8);
+ }
+ list[n++] = (uint64_t)beg<<32 | max_i;
+ i = max_i; // reset i to max_i
+ score = 0;
+ } else if (score > max) max = score, max_i = i;
+ if (score == 0) max = 0;
+ }
+ *_n = n;
+ return list;
+}
+
+// trim leading and trailing ambiguous bases; mark deleted and remove sequence
+/* Returns 1 iff at least one fragment has vpos beyond the current block
+ * (i.e. more variants remain to be processed). */
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+ khint_t k;
+ int ret = 0;
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ int beg, end, i;
+ if (f->vpos >= vpos) {
+ ret = 1;
+ continue;
+ }
+ for (i = 0; i < f->vlen; ++i) // first informative site
+ if (f->seq[i] != 0) break;
+ beg = i;
+ for (i = f->vlen - 1; i >= 0; --i) // last informative site
+ if (f->seq[i] != 0) break;
+ end = i + 1;
+ if (end - beg <= 0) kh_del(64, hash, k); // all sites ambiguous: drop the fragment
+ else {
+ if (beg != 0) memmove(f->seq, f->seq + beg, end - beg);
+ f->vpos += beg; f->vlen = end - beg;
+ f->single = f->vlen == 1? 1 : 0;
+ }
+ }
+ }
+ return ret;
+}
+
+/* Flush queued alignments ending at or before min_pos into the three
+ * output BAMs (phase 0, phase 1, chimera), choosing the file from the
+ * fragment's phasing state; unphased reads go to a random phase file.
+ * Consumed bam1_t records are destroyed and removed from the queue. */
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{
+ int i, is_flip;
+ is_flip = (drand48() < 0.5); // randomize 0/1 assignment across phase sets
+ for (i = 0; i < g->n; ++i) {
+ int end, which;
+ uint64_t key;
+ khint_t k;
+ bam1_t *b = g->b[i];
+ key = X31_hash_string(bam1_qname(b));
+ end = bam_calend(&b->core, bam1_cigar(b));
+ if (end > min_pos) break; // the queue is position-ordered; keep the rest
+ k = kh_get(64, hash, key);
+ if (k == kh_end(hash)) which = 3; // read not seen at any het site
+ else {
+ frag_t *f = &kh_val(hash, k);
+ if (f->phased && f->flip) which = 2; // chimera file
+ else if (f->phased == 0) which = 3;
+ else { // phased and not flipped
+ char c = 'Y';
+ which = f->phase;
+ bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
+ }
+ if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+ }
+ if (which == 3) which = (drand48() < 0.5); // unphased: random file
+ bam_write1(g->out[which], b);
+ bam_destroy1(b);
+ g->b[i] = 0;
+ }
+ memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
+ g->n -= i;
+}
+
+/* Phase one block of vpos het sites on chromosome chr: run the DP, mask
+ * unreliable regions, optionally fix chimeras, print the PS/FL/M/EV
+ * records for this phase set and flush finished alignments. Returns the
+ * number of sites consumed (0 if vpos==0). */
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{
+ int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+ khint_t k;
+ frag_t **seqs;
+ int8_t *path, *sitemask;
+ uint64_t *pcnt, *regmask;
+
+ if (vpos == 0) return 0;
+ i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+ min_pos = i? cns[vpos]>>32 : 0x7fffffff;
+ if (vpos == 1) { // a single het: nothing to phase, emit a singleton record
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+ printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+ "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos) continue;
+ f->flip = 0;
+ if (f->seq[0] == 0) f->phased = 0;
+ else f->phased = 1, f->phase = f->seq[0] - 1;
+ }
+ }
+ dump_aln(g, min_pos, hash);
+ ++g->vpos_shift;
+ return 1;
+ }
+ { // phase
+ int **cnt;
+ uint64_t *mask;
+ printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+ sitemask = calloc(vpos, 1);
+ cnt = count_all(g->k, vpos, hash);
+ path = dynaprog(g->k, vpos, cnt);
+ for (i = 0; i < vpos; ++i) free(cnt[i]);
+ free(cnt);
+ pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+ mask = genmask(vpos, pcnt, &n_masked);
+ regmask = calloc(n_masked, 8);
+ for (i = 0; i < n_masked; ++i) {
+ regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; // het indices -> reference positions
+ for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+ sitemask[j] = 1;
+ }
+ free(mask);
+ if (g->flag & FLAG_FIX_CHIMERA) {
+ free(pcnt);
+ pcnt = fragphase(vpos, path, hash, 1); // rerun with chimera fixing enabled
+ }
+ }
+ for (i = 0; i < n_masked; ++i)
+ printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+ for (i = 0; i < vpos; ++i) {
+ uint64_t x = pcnt[i];
+ int8_t c[2];
+ c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
+ c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+ printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32), (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+ i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+ }
+ free(path); free(pcnt); free(regmask); free(sitemask);
+ seqs = calloc(n_seqs, sizeof(void*));
+ for (k = 0, i = 0; k < kh_end(hash); ++k)
+ if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+ seqs[i++] = &kh_val(hash, k);
+ n_seqs = i;
+ ks_introsort_rseq(n_seqs, seqs); // sort supporting fragments by first het index
+ for (i = 0; i < n_seqs; ++i) {
+ frag_t *f = seqs[i];
+ printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+ for (j = 0; j < f->vlen; ++j) {
+ uint32_t c = cns[f->vpos + j];
+ if (f->seq[j] == 0) putchar('N');
+ else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+ }
+ printf("\t*\tYP:i:%d\tYF:i:%d\n", f->phase, f->flip);
+ }
+ free(seqs);
+ printf("//\n");
+ fflush(stdout);
+ g->vpos_shift += vpos;
+ dump_aln(g, min_pos, hash);
+ return vpos;
+}
+
+/* Drop fragments entirely before vpos and rebase the remaining ones so
+ * their vpos is relative to the new block start. */
+static void update_vpos(int vpos, nseq_t *hash)
+{
+ khint_t k;
+ for (k = 0; k < kh_end(hash); ++k) {
+ if (kh_exist(hash, k)) {
+ frag_t *f = &kh_val(hash, k);
+ if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it
+ else f->vpos -= vpos;
+ }
+ }
+}
+
+/* Placeholder: would compact the fragment hash to reclaim memory. */
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
+{
+ return hash;
+}
+
+/* bam_plp read callback: fetch the next alignment and, when writing BAMs
+ * (-b), keep a duplicated copy of every primary mapped record in the
+ * queue for later phased output by dump_aln(). */
+static int readaln(void *data, bam1_t *b)
+{
+ phaseg_t *g = (phaseg_t*)data;
+ int ret;
+ ret = bam_read1(g->fp, b);
+ // NOTE(review): b->core.flag is inspected without checking ret >= 0; if
+ // bam_read1() failed, b may hold stale data here — TODO confirm/guard.
+ if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
+ if (g->n == g->m) {
+ g->m = g->m? g->m<<1 : 16;
+ g->b = realloc(g->b, g->m * sizeof(void*));
+ }
+ g->b[g->n++] = bam_dup1(b);
+ }
+ return ret;
+}
+
+/* Read a whitespace-separated "chr pos" site list (1-based positions)
+ * from fn ("-" = stdin) into a set keyed by tid<<32|(pos-1). The caller
+ * destroys the returned hash. */
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{
+ gzFile fp;
+ kstream_t *ks;
+ int ret, dret;
+ kstring_t *str;
+ khash_t(set64) *hash;
+
+ hash = kh_init(set64);
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ int tid = bam_get_tid(h, str->s);
+ if (tid >= 0 && dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
+ kh_put(set64, hash, x, &ret);
+ } else break;
+ }
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+ if (dret < 0) break;
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ return hash;
+}
+
+/* Pick the best and second-best genotype from the 16 likelihoods in q
+ * (smaller is better). Returns 0 for a homozygous best genotype;
+ * otherwise a packed het call: bit 18 set, bits 16-17 allele0,
+ * bits 0-1 allele1, bits 2-15 the quality (gap to the 2nd-best). */
+static int gl2cns(float q[16])
+{
+ int i, j, min_ij;
+ float min, min2;
+ min = min2 = 1e30; min_ij = -1;
+ for (i = 0; i < 4; ++i) {
+ for (j = i; j < 4; ++j) { // upper triangle only: unordered genotype (i,j)
+ if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j];
+ else if (q[i<<2|j] < min2) min2 = q[i<<2|j];
+ }
+ }
+ return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
+}
+
+/* samtools phase entry point: pile up the input BAM, call het sites with
+ * the error model, accumulate fragments over the hets, and trigger
+ * phase() whenever no read spans the current site boundary (dophase) or
+ * the chromosome changes. */
+int main_phase(int argc, char *argv[])
+{
+ extern void bam_init_header_hash(bam_header_t *header);
+ int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+ const bam_pileup1_t *plp;
+ bam_plp_t iter;
+ bam_header_t *h;
+ nseq_t *seqs;
+ uint64_t *cns = 0; // per-het record: pos<<32 | gl2cns() code
+ phaseg_t g;
+ char *fn_list = 0;
+ khash_t(set64) *set = 0;
+ errmod_t *em;
+ uint16_t *bases;
+
+ memset(&g, 0, sizeof(phaseg_t));
+ g.flag = FLAG_FIX_CHIMERA;
+ g.min_varLOD = 40; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+ while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:")) >= 0) {
+ switch (c) {
+ case 'D': g.max_depth = atoi(optarg); break;
+ case 'q': g.min_varLOD = atoi(optarg); break;
+ case 'Q': g.min_baseQ = atoi(optarg); break;
+ case 'k': g.k = atoi(optarg); break;
+ case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+ case 'e': g.flag |= FLAG_LIST_EXCL; break;
+ case 'b': g.pre = strdup(optarg); break;
+ case 'l': fn_list = strdup(optarg); break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools phase [options] <in.bam>\n\n");
+ fprintf(stderr, "Options: -k INT block length [%d]\n", g.k);
+ fprintf(stderr, " -b STR prefix of BAMs to output [null]\n");
+ fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD);
+ fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ);
+ fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth);
+// fprintf(stderr, " -l FILE list of sites to phase [null]\n");
+ fprintf(stderr, " -F do not attempt to fix chimeras\n");
+// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+ h = bam_header_read(g.fp);
+ if (fn_list) { // read the list of sites to phase
+ bam_init_header_hash(h);
+ set = loadpos(fn_list, h);
+ free(fn_list);
+ } else g.flag &= ~FLAG_LIST_EXCL;
+ if (g.pre) { // open BAMs to write
+ char *s = malloc(strlen(g.pre) + 20);
+ strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+ strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+ strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+ for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+ free(s);
+ }
+
+ iter = bam_plp_init(readaln, &g);
+ g.vpos_shift = 0;
+ seqs = kh_init(64);
+ em = errmod_init(1. - 0.83);
+ bases = calloc(g.max_depth, 2);
+ printf("CC\n");
+ printf("CC\tDescriptions:\nCC\n");
+ printf("CC\t CC comments\n");
+ printf("CC\t PS start of a phase set\n");
+ printf("CC\t FL filtered region\n");
+ printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+ printf("CC\t EV supporting reads; SAM format\n");
+ printf("CC\t // end of a phase set\nCC\n");
+ printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+ printf("CC\t PS chr phaseSetStart phaseSetEnd\n");
+ printf("CC\t FL chr filterStart filterEnd\n");
+ printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n");
+ printf("CC\nCC\n");
+ fflush(stdout);
+ while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+ int i, k, c, tmp, dophase = 1, in_set = 0;
+ float q[16];
+ if (tid < 0) break;
+ if (tid != lasttid) { // change of chromosome
+ g.vpos_shift = 0;
+ if (lasttid >= 0) { // flush the previous chromosome
+ seqs = shrink_hash(seqs);
+ phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+ update_vpos(0x7fffffff, seqs);
+ }
+ lasttid = tid;
+ vpos = 0;
+ }
+ if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+ if (n > g.max_depth) continue; // do not proceed if the depth is too high
+ // fill the bases array and check if there is a variant
+ for (i = k = 0; i < n; ++i) {
+ const bam_pileup1_t *p = plp + i;
+ uint8_t *seq;
+ int q, baseQ, b; // NOTE: this q shadows the outer float q[16] inside the loop
+ if (p->is_del || p->is_refskip) continue;
+ baseQ = bam1_qual(p->b)[p->qpos];
+ if (baseQ < g.min_baseQ) continue;
+ seq = bam1_seq(p->b);
+ b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+ if (b > 3) continue;
+ q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; // min(baseQ, mapQ), clamped to [4,63]
+ if (q < 4) q = 4;
+ if (q > 63) q = 63;
+ bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+ }
+ if (k == 0) continue;
+ errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+ c = gl2cns(q); // get the consensus
+ // tell if to proceed
+ if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+ if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+ // add the variant
+ if (vpos == max_vpos) {
+ max_vpos = max_vpos? max_vpos<<1 : 128;
+ cns = realloc(cns, max_vpos * 8);
+ }
+ cns[vpos] = (uint64_t)pos<<32 | c;
+ for (i = 0; i < n; ++i) { // record this het in every covering read's fragment
+ const bam_pileup1_t *p = plp + i;
+ uint64_t key;
+ khint_t k;
+ uint8_t *seq = bam1_seq(p->b);
+ frag_t *f;
+ if (p->is_del || p->is_refskip) continue;
+ if (p->b->core.qual == 0) continue;
+ // get the base code
+ c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+ if (c == (cns[vpos]&3)) c = 1;
+ else if (c == (cns[vpos]>>16&3)) c = 2;
+ else c = 0;
+ // write to seqs
+ key = X31_hash_string(bam1_qname(p->b));
+ k = kh_put(64, seqs, key, &tmp);
+ f = &kh_val(seqs, k);
+ if (tmp == 0) { // present in the hash table
+ if (vpos - f->vpos + 1 < MAX_VARS) {
+ f->vlen = vpos - f->vpos + 1;
+ f->seq[f->vlen-1] = c;
+ f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+ }
+ dophase = 0; // an existing read spans into this site: keep extending the block
+ } else { // absent
+ memset(f->seq, 0, MAX_VARS);
+ f->beg = p->b->core.pos;
+ f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+ f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = 0;
+ }
+ }
+ if (dophase) { // no read links this site to the previous ones: close the block
+ seqs = shrink_hash(seqs);
+ phase(&g, h->target_name[tid], vpos, cns, seqs);
+ update_vpos(vpos, seqs);
+ cns[0] = cns[vpos];
+ vpos = 0;
+ }
+ ++vpos;
+ }
+ if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); // flush the last block
+ bam_header_destroy(h);
+ bam_plp_destroy(iter);
+ bam_close(g.fp);
+ kh_destroy(64, seqs);
+ kh_destroy(set64, set);
+ free(cns);
+ errmod_destroy(em);
+ free(bases);
+ if (g.pre) {
+ for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+ free(g.pre); free(g.b);
+ }
+ return 0;
+}