From: Heng Li Date: Fri, 25 Feb 2011 05:59:27 +0000 (+0000) Subject: added the phase command X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=7251efa37992752a8cf62ff363da0ad5099937bd added the phase command --- diff --git a/Makefile b/Makefile index e28822e..80f7a31 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,8 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc @@ -66,6 +67,7 @@ bcf.o:bcftools/bcf.h bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h bam2bcf_indel.o:bam2bcf.h errmod.o:errmod.h +phase.o:bam.h khash.h ksort.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/bam_md.c b/bam_md.c index 02d6c5b..20b1913 100644 --- a/bam_md.c +++ b/bam_md.c @@ -315,7 +315,8 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, " -b compressed BAM output\n"); fprintf(stderr, " -S the input is SAM with header\n"); fprintf(stderr, " -A modify the quality string\n"); - fprintf(stderr, " -r read-independent local realignment\n\n"); + fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"); + fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n"); return 1; } fp = samopen(argv[optind], mode_r, 0); diff --git a/bam_plcmd.c b/bam_plcmd.c index d6e8213..2c333e8 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -934,6 +934,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n"); fprintf(stderr, " -g generate BCF output\n"); fprintf(stderr, " -u do not compress BCF output\n"); + fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); fprintf(stderr, " -B disable BAQ computation\n"); fprintf(stderr, " -D output per-sample DP\n"); fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); diff --git a/bam_sort.c b/bam_sort.c index 01f7016..38f15d6 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -222,8 +222,11 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; - if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0) + if (flag & MERGE_RG) { + uint8_t *rg = bam_aux_get(b, "RG"); + if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); + } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); diff --git a/bamtk.c b/bamtk.c index f2b577c..096cd95 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.12-10 (r896)" +#define PACKAGE_VERSION "0.1.12-11 (r896:110)" #endif int bam_taf2baf(int argc, char *argv[]); @@ -27,6 +27,8 @@ int bam_idxstats(int argc, char *argv[]); int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); +int main_cut_target(int argc, char *argv[]); +int main_phase(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -94,6 +96,8 @@ static int usage() fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); + fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); + fprintf(stderr, " phase phase heterozygotes\n"); fprintf(stderr, "\n"); #ifdef _WIN32 fprintf(stderr, "\ @@ -131,6 +135,8 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); + else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --git a/cut_target.c b/cut_target.c new file mode 100644 index 0000000..ee687fb --- /dev/null +++ b/cut_target.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include "bam.h" +#include "errmod.h" +#include "faidx.h" + +#define ERR_DEP 0.83f + +typedef struct { + int e[2][3], p[2][2]; +} score_param_t; + +/* Note that although the two matrics have 10 parameters in total, only 4 + * (probably 3) are free. Changing the scoring matrices in a sort of symmetric + * way will not change the result. */ +static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; + +typedef struct { + int min_baseQ, tid, max_bases; + uint16_t *bases; + bamFile fp; + bam_header_t *h; + char *ref; + faidx_t *fai; + errmod_t *em; +} ct_t; + +static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) +{ + int i, j, ret, tmp, k, sum[4], qual; + float q[16]; + if (n > g->max_bases) { // enlarge g->bases + g->max_bases = n; + kroundup32(g->max_bases); + g->bases = realloc(g->bases, g->max_bases * 2); + } + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_refskip || p->is_del) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g->min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; + g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; + } + if (k == 0) return 0; + errmod_cal(g->em, k, 4, g->bases, q); + for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && sum[j] < sum[j-1]; --j) + tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; + qual = (sum[1]>>2) - (sum[0]>>2); + k = k < 256? k : 255; + ret = (qual < 63? qual : 63) << 2 | (sum[0]&3); + return ret<<8|k; +} + +static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) +{ + int i, f[2][2], *prev, *curr, *swap_tmp, s; + uint8_t *b; // backtrack array + b = calloc(l, 1); + f[0][0] = f[0][1] = 0; + prev = f[0]; curr = f[1]; + // fill the backtrack matrix + for (i = 0; i < l; ++i) { + int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2; + int tmp0, tmp1; + // compute f[0] + tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) + tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) + if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; + else curr[0] = tmp1, b[i] = 1; + // compute f[1] + tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) + tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) + if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; + else curr[1] = tmp1, b[i] |= 1<<1; + // swap + swap_tmp = prev; prev = curr; curr = swap_tmp; + } + // backtrack + s = prev[0] > prev[1]? 0 : 1; + for (i = l - 1; i > 0; --i) { + b[i] |= s<<2; + s = b[i]>>s&1; + } + // print + for (i = 0, s = -1; i <= l; ++i) { + if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { + if (s >= 0) { + int j; + printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); + for (j = s; j < i; ++j) { + int c = cns[j]>>8; + if (c == 0) putchar('N'); + else putchar("ACGT"[c&3]); + } + putchar('\t'); + for (j = s; j < i; ++j) + putchar(33 + (cns[j]>>8>>2)); + putchar('\n'); + } + //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); + s = -1; + } else if ((b[i]>>2&3) && s < 0) s = i; + } + free(b); +} + +static int read_aln(void *data, bam1_t *b) +{ + extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); + ct_t *g = (ct_t*)data; + int ret, len; + ret = bam_read1(g->fp, b); + if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { + if (b->core.tid != g->tid) { // then load the sequence + free(g->ref); + g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); + g->tid = b->core.tid; + } + bam_prob_realn_core(b, g->ref, 1<<1|1); + } + return ret; +} + +int main_cut_target(int argc, char *argv[]) +{ + int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; + const bam_pileup1_t *p; + bam_plp_t plp; + uint16_t *cns; + ct_t g; + + memset(&g, 0, sizeof(ct_t)); + g.min_baseQ = 13; g.tid = -1; + while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { + switch (c) { + case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff + case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY + case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE + case '1': g_param.e[1][1] = atoi(optarg); break; + case '2': g_param.e[1][2] = atoi(optarg); break; + case 'f': g.fai = fai_load(optarg); + if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); + break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] \n"); + return 1; + } + l = max_l = 0; cns = 0; + g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + g.h = bam_header_read(g.fp); + g.em = errmod_init(1 - ERR_DEP); + plp = bam_plp_init(read_aln, &g); + while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + if (cns) process_cns(g.h, lasttid, l, cns); + if (max_l < g.h->target_len[tid]) { + max_l = g.h->target_len[tid]; + kroundup32(max_l); + cns = realloc(cns, max_l * 2); + } + l = g.h->target_len[tid]; + memset(cns, 0, max_l * 2); + lasttid = tid; + } + cns[pos] = gencns(&g, n, p); + lastpos = pos; + } + process_cns(g.h, lasttid, l, cns); + free(cns); + bam_header_destroy(g.h); + bam_plp_destroy(plp); + bam_close(g.fp); + if (g.fai) { + fai_destroy(g.fai); free(g.ref); + } + errmod_destroy(g.em); + free(g.bases); + return 0; +} diff --git a/errmod.h b/errmod.h index e3e9a90..32c07b6 100644 --- a/errmod.h +++ b/errmod.h @@ -12,6 +12,13 @@ typedef struct { errmod_t *errmod_init(float depcorr); void errmod_destroy(errmod_t *em); + +/* + n: number of bases + m: maximum base + bases[i]: qual:6, strand:1, base:4 + q[i*m+j]: phred-scaled likelihood of (i,j) + */ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); #endif diff --git a/faidx.c b/faidx.c index dbd8b3e..38292a1 100644 --- a/faidx.c +++ b/faidx.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "faidx.h" #include "khash.h" diff --git a/khash.h b/khash.h index 1d583ef..a7e8056 100644 --- a/khash.h +++ b/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,8 +23,6 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* An example: @@ -49,6 +47,14 @@ int main() { */ /* + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + 2008-09-19 (0.2.3): * Corrected the example @@ -88,17 +94,35 @@ int main() { @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.2" +#define AC_VERSION_KHASH_H "0.2.5" -#include #include #include +#include + +/* compipler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif -typedef uint32_t khint_t; +typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_HASH_PRIME_SIZE 32 -static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = { 0ul, 3ul, 11ul, 23ul, 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, @@ -119,17 +143,32 @@ static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ - uint32_t *flags; \ + khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ + SCOPE kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ @@ -137,14 +176,14 @@ static const double __ac_HASH_UPPER = 0.77; free(h); \ } \ } \ - static inline void kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last; \ @@ -158,9 +197,9 @@ static const double __ac_HASH_UPPER = 0.77; return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { \ - uint32_t *new_flags = 0; \ + khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ khint_t t = __ac_HASH_PRIME_SIZE - 1; \ @@ -168,8 +207,8 @@ static const double __ac_HASH_UPPER = 0.77; new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { \ - new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ @@ -218,7 +257,7 @@ static const double __ac_HASH_UPPER = 0.77; h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { \ @@ -256,7 +295,7 @@ static const double __ac_HASH_UPPER = 0.77; } else *ret = 0; \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ @@ -264,24 +303,27 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function - @param key The integer [uint32_t] + @param key The integer [khint32_t] @return The hash value [khint_t] */ -#define kh_int_hash_func(key) (uint32_t)(key) +#define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function - @param key The integer [uint64_t] + @param key The integer [khint64_t] @return The hash value [khint_t] */ -#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ @@ -442,7 +484,7 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @@ -450,14 +492,14 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @@ -465,7 +507,7 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function diff --git a/kstring.c b/kstring.c index 43d524c..b2a0dab 100644 --- a/kstring.c +++ b/kstring.c @@ -29,16 +29,24 @@ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) const char *p, *start; if (sep) { // set up the table if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished - aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; - for (p = sep; *p; ++p) - aux->tab[*p/64] |= 1ull<<(*p%64); + aux->finished = 0; + if (sep[1]) { + aux->sep = -1; + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); + } else aux->sep = sep[0]; + } + if (aux->finished) return 0; + else if (str) aux->p = str - 1, aux->finished = 0; + if (aux->sep < 0) { + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; + } else { + for (p = start = aux->p + 1; *p; ++p) + if (*p == aux->sep) break; } - if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; - else if (aux->tab[0]&1) return 0; - for (p = start = aux->p + 1; *p; ++p) - if (aux->tab[*p/64]>>(*p%64)&1) break; aux->p = p; // end of token - if (*p == 0) aux->tab[0] |= 1; // no more tokens + if (*p == 0) aux->finished = 1; // no more tokens return (char*)start; } diff --git a/kstring.h b/kstring.h index c46a62b..ec5775b 100644 --- a/kstring.h +++ b/kstring.h @@ -19,6 +19,7 @@ typedef struct __kstring_t { typedef struct { uint64_t tab[4]; + int sep, finished; const char *p; // end of the current token } ks_tokaux_t; diff --git a/phase.c b/phase.c new file mode 100644 index 0000000..7f06958 --- /dev/null +++ b/phase.c @@ -0,0 +1,678 @@ +#include +#include +#include +#include +#include +#include +#include "bam.h" +#include "errmod.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define MAX_VARS 256 +#define FLIP_PENALTY 2 +#define FLIP_THRES 4 +#define MASK_THRES 3 + +#define FLAG_FIX_CHIMERA 0x1 +#define FLAG_LIST_EXCL 0x4 + +typedef struct { + // configurations, initialized in the main function + int flag, k, min_baseQ, min_varLOD, max_depth; + // other global variables + int vpos_shift; + bamFile fp; + char *pre; + bamFile out[3]; + // alignment queue + int n, m; + bam1_t **b; +} phaseg_t; + +typedef struct { + int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! + int vpos, beg, end; + uint32_t vlen:16, single:1, flip:1, phase:1, phased:1; +} frag_t, *frag_p; + +#define rseq_lt(a,b) ((a)->vpos < (b)->vpos) + +#include "khash.h" +KHASH_SET_INIT_INT64(set64) +KHASH_MAP_INIT_INT64(64, frag_t) + +typedef khash_t(64) nseq_t; + +#include "ksort.h" +KSORT_INIT(rseq, frag_p, rseq_lt) + +static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +static inline uint64_t X31_hash_string(const char *s) +{ + uint64_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} + +static void count1(int l, const uint8_t *seq, int *cnt) +{ + int i, j, n_ambi; + uint32_t z, x; + if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous + for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases + if (seq[i] == 0) ++n_ambi; + if (l - n_ambi <= 1) return; // only one SNP + for (x = 0; x < 1u<>j&1; + ++j; + } + z = z<<1 | c; + } + ++cnt[z]; + } +} + +static int **count_all(int l, int vpos, nseq_t *hash) +{ + khint_t k; + int i, j, **cnt; + uint8_t *seq; + seq = calloc(l, 1); + cnt = calloc(vpos, sizeof(void*)); + for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<vpos >= vpos || f->single) continue; // out of region; or singleton + if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right + f->single = 1; + continue; + } + for (j = 1; j < f->vlen; ++j) { + for (i = 0; i < l; ++i) + seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)]; + count1(l, seq, cnt[f->vpos + j]); + } + } + } + free(seq); + return cnt; +} + +// phasing +static int8_t *dynaprog(int l, int vpos, int **w) +{ + int *f[2], *curr, *prev, max, i; + int8_t **b, *h = 0; + uint32_t x, z = 1u<<(l-1), mask = (1u<>1; y1 = xc>>1; + c0 = prev[y0] + wi[x] + wi[xc]; + c1 = prev[y1] + wi[x] + wi[xc]; + if (c0 > c1) bi[x] = 0, curr[x] = c0; + else bi[x] = 1, curr[x] = c1; + } + tmp = prev; prev = curr; curr = tmp; // swap + } + { // backtrack + uint32_t max_x = 0; + int which = 0; + h = calloc(vpos, 1); + for (x = 0, max = 0, max_x = 0; x < z; ++x) + if (prev[x] > max) max = prev[x], max_x = x; + for (i = vpos - 1, x = max_x; i >= 0; --i) { + h[i] = which? (~x&1) : (x&1); + which = b[i][x]? !which : which; + x = b[i][x]? (~x&mask)>>1 : x>>1; + } + } + // free + for (i = 0; i < vpos; ++i) free(b[i]); + free(f[0]); free(f[1]); free(b); + return h; +} + +// phase each fragment +static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) +{ + khint_t k; + uint64_t *pcnt; + uint32_t *left, *rght, max; + left = rght = 0; max = 0; + pcnt = calloc(vpos, 8); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i, c[2]; + frag_t *f = &kh_val(hash, k); + if (f->vpos >= vpos) continue; + // get the phase + c[0] = c[1] = 0; + for (i = 0; i < f->vlen; ++i) { + if (f->seq[i] == 0) continue; + ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; + } + f->phase = c[0] > c[1]? 0 : 1; + f->phased = c[0] == c[1]? 0 : 1; + // fix chimera + f->flip = 0; + if (flip && c[0] >= 3 && c[1] >= 3) { + int sum[2], m, mi, md; + if (f->vlen > max) { // enlarge the array + max = f->vlen; + kroundup32(max); + left = realloc(left, max * 4); + rght = realloc(rght, max * 4); + } + for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts + if (f->seq[i]) { + int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 0 : 1]; + } + left[i] = sum[1]<<16 | sum[0]; + } + for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts + if (f->seq[i]) { + int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 0 : 1]; + } + rght[i] = sum[1]<<16 | sum[0]; + } + // find the best flip point + for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) { + int a[2]; + a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY; + a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY; + if (a[0] > a[1]) { + if (a[0] > m) m = a[0], md = 0, mi = i; + } else { + if (a[1] > m) m = a[1], md = 1, mi = i; + } + } + if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip + f->flip = 1; + if (md == 0) { // flip the tail + for (i = mi + 1; i < f->vlen; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } else { // flip the head + for (i = 0; i <= mi; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } + } + } + // update pcnt[] + if (!f->single) { + for (i = 0; i < f->vlen; ++i) { + int c; + if (f->seq[i] == 0) continue; + c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + if (c == path[f->vpos + i]) { + if (f->phase == 0) ++pcnt[f->vpos + i]; + else pcnt[f->vpos + i] += 1ull<<32; + } else { + if (f->phase == 0) pcnt[f->vpos + i] += 1<<16; + else pcnt[f->vpos + i] += 1ull<<48; + } + } + } + } + } + free(left); free(rght); + return pcnt; +} + +static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n) +{ + int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; + uint64_t *list = 0; + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int c[4], pre = score, s; + c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; + s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); + if (c[3] > c[2]) s += c[3] - c[2]; + if (c[1] > c[0]) s += c[1] - c[0]; + score += s; + if (score < 0) score = 0; + if (pre == 0 && score > 0) beg = i; // change from zero to non-zero + if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { + if (n == m) { + m = m? m<<1 : 4; + list = realloc(list, m * 8); + } + list[n++] = (uint64_t)beg<<32 | max_i; + i = max_i; // reset i to max_i + score = 0; + } else if (score > max) max = score, max_i = i; + if (score == 0) max = 0; + } + *_n = n; + return list; +} + +// trim heading and tailing ambiguous bases; mark deleted and remove sequence +static int clean_seqs(int vpos, nseq_t *hash) +{ + khint_t k; + int ret = 0; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + int beg, end, i; + if (f->vpos >= vpos) { + ret = 1; + continue; + } + for (i = 0; i < f->vlen; ++i) + if (f->seq[i] != 0) break; + beg = i; + for (i = f->vlen - 1; i >= 0; --i) + if (f->seq[i] != 0) break; + end = i + 1; + if (end - beg <= 0) kh_del(64, hash, k); + else { + if (beg != 0) memmove(f->seq, f->seq + beg, end - beg); + f->vpos += beg; f->vlen = end - beg; + f->single = f->vlen == 1? 1 : 0; + } + } + } + return ret; +} + +static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) +{ + int i, is_flip; + is_flip = (drand48() < 0.5); + for (i = 0; i < g->n; ++i) { + int end, which; + uint64_t key; + khint_t k; + bam1_t *b = g->b[i]; + key = X31_hash_string(bam1_qname(b)); + end = bam_calend(&b->core, bam1_cigar(b)); + if (end > min_pos) break; + k = kh_get(64, hash, key); + if (k == kh_end(hash)) which = 3; + else { + frag_t *f = &kh_val(hash, k); + if (f->phased && f->flip) which = 2; + else if (f->phased == 0) which = 3; + else { // phased and not flipped + char c = 'Y'; + which = f->phase; + bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c); + } + if (which < 2 && is_flip) which = 1 - which; // increase the randomness + } + if (which == 3) which = (drand48() < 0.5); + bam_write1(g->out[which], b); + bam_destroy1(b); + g->b[i] = 0; + } + memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); + g->n -= i; +} + +static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) +{ + int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos; + khint_t k; + frag_t **seqs; + int8_t *path, *sitemask; + uint64_t *pcnt, *regmask; + + if (vpos == 0) return 0; + i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos + min_pos = i? cns[vpos]>>32 : 0x7fffffff; + if (vpos == 1) { + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); + printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, + "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos) continue; + f->flip = 0; + if (f->seq[0] == 0) f->phased = 0; + else f->phased = 1, f->phase = f->seq[0] - 1; + } + } + dump_aln(g, min_pos, hash); + ++g->vpos_shift; + return 1; + } + { // phase + int **cnt; + uint64_t *mask; + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); + sitemask = calloc(vpos, 1); + cnt = count_all(g->k, vpos, hash); + path = dynaprog(g->k, vpos, cnt); + for (i = 0; i < vpos; ++i) free(cnt[i]); + free(cnt); + pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking + mask = genmask(vpos, pcnt, &n_masked); + regmask = calloc(n_masked, 8); + for (i = 0; i < n_masked; ++i) { + regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; + for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j) + sitemask[j] = 1; + } + free(mask); + if (g->flag & FLAG_FIX_CHIMERA) { + free(pcnt); + pcnt = fragphase(vpos, path, hash, 1); + } + } + for (i = 0; i < n_masked; ++i) + printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int8_t c[2]; + c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); + c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); + printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32), (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); + } + free(path); free(pcnt); free(regmask); free(sitemask); + seqs = calloc(n_seqs, sizeof(void*)); + for (k = 0, i = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single) + seqs[i++] = &kh_val(hash, k); + n_seqs = i; + ks_introsort_rseq(n_seqs, seqs); + for (i = 0; i < n_seqs; ++i) { + frag_t *f = seqs[i]; + printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); + for (j = 0; j < f->vlen; ++j) { + uint32_t c = cns[f->vpos + j]; + if (f->seq[j] == 0) putchar('N'); + else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]); + } + printf("\t*\tYP:i:%d\tYF:i:%d\n", f->phase, f->flip); + } + free(seqs); + printf("//\n"); + fflush(stdout); + g->vpos_shift += vpos; + dump_aln(g, min_pos, hash); + return vpos; +} + +static void update_vpos(int vpos, nseq_t *hash) +{ + khint_t k; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it + else f->vpos -= vpos; + } + } +} + +static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement +{ + return hash; +} + +static int readaln(void *data, bam1_t *b) +{ + phaseg_t *g = (phaseg_t*)data; + int ret; + ret = bam_read1(g->fp, b); + if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { + if (g->n == g->m) { + g->m = g->m? g->m<<1 : 16; + g->b = realloc(g->b, g->m * sizeof(void*)); + } + g->b[g->n++] = bam_dup1(b); + } + return ret; +} + +static khash_t(set64) *loadpos(const char *fn, bam_header_t *h) +{ + gzFile fp; + kstream_t *ks; + int ret, dret; + kstring_t *str; + khash_t(set64) *hash; + + hash = kh_init(set64); + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int tid = bam_get_tid(h, str->s); + if (tid >= 0 && dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) >= 0) { + uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); + kh_put(set64, hash, x, &ret); + } else break; + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (dret < 0) break; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return hash; +} + +static int gl2cns(float q[16]) +{ + int i, j, min_ij; + float min, min2; + min = min2 = 1e30; min_ij = -1; + for (i = 0; i < 4; ++i) { + for (j = i; j < 4; ++j) { + if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j]; + else if (q[i<<2|j] < min2) min2 = q[i<<2|j]; + } + } + return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; +} + +int main_phase(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t *header); + int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0; + const bam_pileup1_t *plp; + bam_plp_t iter; + bam_header_t *h; + nseq_t *seqs; + uint64_t *cns = 0; + phaseg_t g; + char *fn_list = 0; + khash_t(set64) *set = 0; + errmod_t *em; + uint16_t *bases; + + memset(&g, 0, sizeof(phaseg_t)); + g.flag = FLAG_FIX_CHIMERA; + g.min_varLOD = 40; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; + while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:")) >= 0) { + switch (c) { + case 'D': g.max_depth = atoi(optarg); break; + case 'q': g.min_varLOD = atoi(optarg); break; + case 'Q': g.min_baseQ = atoi(optarg); break; + case 'k': g.k = atoi(optarg); break; + case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; + case 'e': g.flag |= FLAG_LIST_EXCL; break; + case 'b': g.pre = strdup(optarg); break; + case 'l': fn_list = strdup(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools phase [options] \n\n"); + fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); + fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); + fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); + fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); +// fprintf(stderr, " -l FILE list of sites to phase [null]\n"); + fprintf(stderr, " -F do not attempt to fix chimeras\n"); +// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); + fprintf(stderr, "\n"); + return 1; + } + g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + h = bam_header_read(g.fp); + if (fn_list) { // read the list of sites to phase + bam_init_header_hash(h); + set = loadpos(fn_list, h); + free(fn_list); + } else g.flag &= ~FLAG_LIST_EXCL; + if (g.pre) { // open BAMs to write + char *s = malloc(strlen(g.pre) + 20); + strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w"); + for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h); + free(s); + } + + iter = bam_plp_init(readaln, &g); + g.vpos_shift = 0; + seqs = kh_init(64); + em = errmod_init(1. - 0.83); + bases = calloc(g.max_depth, 2); + printf("CC\n"); + printf("CC\tDescriptions:\nCC\n"); + printf("CC\t CC comments\n"); + printf("CC\t PS start of a phase set\n"); + printf("CC\t FL filtered region\n"); + printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); + printf("CC\t EV supporting reads; SAM format\n"); + printf("CC\t // end of a phase set\nCC\n"); + printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); + printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); + printf("CC\t FL chr filterStart filterEnd\n"); + printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); + printf("CC\nCC\n"); + fflush(stdout); + while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { + int i, k, c, tmp, dophase = 1, in_set = 0; + float q[16]; + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + g.vpos_shift = 0; + if (lasttid >= 0) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[lasttid], vpos, cns, seqs); + update_vpos(0x7fffffff, seqs); + } + lasttid = tid; + vpos = 0; + } + if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1; + if (n > g.max_depth) continue; // do not proceed if the depth is too high + // fill the bases array and check if there is a variant + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_del || p->is_refskip) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g.min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; + if (q < 4) q = 4; + if (q > 63) q = 63; + bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; + } + if (k == 0) continue; + errmod_cal(em, k, 4, bases, q); // compute genotype likelihood + c = gl2cns(q); // get the consensus + // tell if to proceed + if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list + if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant + // add the variant + if (vpos == max_vpos) { + max_vpos = max_vpos? max_vpos<<1 : 128; + cns = realloc(cns, max_vpos * 8); + } + cns[vpos] = (uint64_t)pos<<32 | c; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint64_t key; + khint_t k; + uint8_t *seq = bam1_seq(p->b); + frag_t *f; + if (p->is_del || p->is_refskip) continue; + if (p->b->core.qual == 0) continue; + // get the base code + c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)]; + if (c == (cns[vpos]&3)) c = 1; + else if (c == (cns[vpos]>>16&3)) c = 2; + else c = 0; + // write to seqs + key = X31_hash_string(bam1_qname(p->b)); + k = kh_put(64, seqs, key, &tmp); + f = &kh_val(seqs, k); + if (tmp == 0) { // present in the hash table + if (vpos - f->vpos + 1 < MAX_VARS) { + f->vlen = vpos - f->vpos + 1; + f->seq[f->vlen-1] = c; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + } + dophase = 0; + } else { // absent + memset(f->seq, 0, MAX_VARS); + f->beg = p->b->core.pos; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = 0; + } + } + if (dophase) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[tid], vpos, cns, seqs); + update_vpos(vpos, seqs); + cns[0] = cns[vpos]; + vpos = 0; + } + ++vpos; + } + if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); + bam_header_destroy(h); + bam_plp_destroy(iter); + bam_close(g.fp); + kh_destroy(64, seqs); + kh_destroy(set64, set); + free(cns); + errmod_destroy(em); + free(bases); + if (g.pre) { + for (c = 0; c <= 2; ++c) bam_close(g.out[c]); + free(g.pre); free(g.b); + } + return 0; +} diff --git a/samtools.1 b/samtools.1 index 57f1aff..8d299b7 100644 --- a/samtools.1 +++ b/samtools.1 @@ -137,7 +137,7 @@ viewing the same reference sequence. .TP .B mpileup -samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]] +samtools mpileup [-EBug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]] Generate BCF or pileup for one or multiple BAM files. Alignment records are grouped by sample identifiers in @RG header lines. If sample @@ -164,6 +164,10 @@ Phred-scaled gap extension sequencing error probability. Reducing .I INT leads to longer indels. [20] .TP +.B -E +Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt +specificity a little bit. +.TP .BI -f \ FILE The reference file [null] .TP @@ -355,7 +359,7 @@ Treat paired-end reads and single-end reads. .TP .B calmd -samtools calmd [-eubSr] [-C capQcoef] +samtools calmd [-EeubSr] [-C capQcoef] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing @@ -388,7 +392,55 @@ Coefficient to cap mapping quality of poorly mapped reads. See the command for details. [0] .TP .B -r -Compute the BQ tag without changing the base quality. +Compute the BQ tag (without -A) or cap base quality by BAQ (with -A). +.TP +.B -E +Extended BAQ calculation. This option trades specificity for sensitivity, though the +effect is minor. +.RE + +.TP +.B targetcut +samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] + +This command identifies target regions by examining the continuity of read depth, computes +haploid consensus sequences of targets and outputs a SAM with each sequence corresponding +to a target. When option +.B -f +is in use, BAQ will be applied. This command is +.B only +designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)]. +.RE + +.TP +.B phase +samtools phase [-F] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] + +Call and phase heterozygous SNPs. +.B OPTIONS: +.RS +.TP 8 +.BI -b \ STR +Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file +.BR STR .0.bam +and phase-1 reads in +.BR STR .1.bam. +Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads +with switch errors will be saved in +.BR STR .chimeric.bam. +[null] +.TP +.B -F +Do not attempt to fix chimeric reads. +.TP +.BI -k \ INT +Maximum length for local phasing. [13] +.TP +.BI -q \ INT +Minimum Phred-scaled LOD to call a heterozygote. [40] +.TP +.BI -Q \ INT +Minimum base quality to be used in het calling. [13] .RE .TP @@ -629,6 +681,20 @@ mismatches. Applying this option usually helps .B BWA-short but may not other mappers. +.IP o 2 +Generate the consensus sequence for one diploid individual: + + samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq + +.IP o 2 +Phase one individual: + + samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out + +The +.B calmd +command is used to reduce false heterozygotes around INDELs. + .IP o 2 Call SNPs and short indels for multiple diploid individuals: