From 543a9fc2055575dcc17a7bb9c83ecca17a32cec6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 8 Feb 2012 16:37:40 +0000 Subject: [PATCH] changes from git --- bam.c | 136 ++++++++++++++++++++++++++++++++++++++++++++----- bam.h | 24 ++++++--- bam2bcf.c | 28 +++++----- bam2bcf.h | 9 +++- bam_import.c | 16 +++--- bam_mate.c | 2 +- bam_plcmd.c | 46 +++++++++-------- bamtk.c | 3 +- bcftools/bcf.c | 4 +- bcftools/vcf.c | 4 +- padding.c | 2 +- sam_view.c | 23 ++++++--- 12 files changed, 224 insertions(+), 73 deletions(-) diff --git a/bam.c b/bam.c index 0055e84..b00d6a6 100644 --- a/bam.c +++ b/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0, bam_verbose = 2; +int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** @@ -16,12 +16,26 @@ char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) { - uint32_t k, end; - end = c->pos; + int k, end = c->pos; for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) - end += cigar[k] >> BAM_CIGAR_SHIFT; + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // move backward + int l, u, v; + if (k == c->n_cigar - 1) break; // skip trailing 'B' + for (l = k - 1, u = v = 0; l >= 0; --l) { + int op1 = bam_cigar_op(cigar[l]); + int len1 = bam_cigar_oplen(cigar[l]); + if (bam_cigar_type(op1)&1) { // consume query + if (u + len1 >= len) { // stop + if (bam_cigar_type(op1)&2) v += len - u; + break; + } else u += len1; + } + if (bam_cigar_type(op1)&2) v += len1; + } + end = l < 0? c->pos : end - v; + } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]); } return end; } @@ -30,11 +44,9 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) { uint32_t k; int32_t l = 0; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) - l += cigar[k] >> BAM_CIGAR_SHIFT; - } + for (k = 0; k < c->n_cigar; ++k) + if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) + l += bam_cigar_oplen(cigar[k]); return l; } @@ -206,6 +218,7 @@ int bam_read1(bamFile fp, bam1_t *b) if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + if (bam_no_B) bam_remove_B(b); return 4 + block_len; } @@ -266,9 +279,10 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { + uint32_t *cigar = bam1_cigar(b); for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + kputc(bam_cigar_opchr(cigar[i]), &str); } } kputc('\t', &str); @@ -360,3 +374,101 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) rg = bam_aux_get(b, "RG"); return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1)); } + +/************ + * Remove B * + ************/ + +int bam_remove_B(bam1_t *b) +{ + int i, j, end_j, k, l, no_qual; + uint32_t *cigar, *new_cigar; + uint8_t *seq, *qual, *p; + // test if removal is necessary + if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing + cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if (bam_cigar_op(cigar[k]) == BAM_CBACK) break; + if (k == b->core.n_cigar) return 0; // no 'B' + if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed + // allocate memory for the new CIGAR + if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory + b->m_data = b->data_len + b->core.n_cigar * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + cigar = bam1_cigar(b); // after realloc, cigar may be changed + } + new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data + // the core loop + seq = bam1_seq(b); qual = bam1_qual(b); + no_qual = (qual[0] == 0xff); // test whether base quality is available + i = j = 0; end_j = -1; + for (k = l = 0; k < b->core.n_cigar; ++k) { + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // the backward operation + int t, u; + if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR + if (len > j) goto rmB_err; // an excessively long backward + for (t = l - 1, u = 0; t >= 0; --t) { // look back + int op1 = bam_cigar_op(new_cigar[t]); + int len1 = bam_cigar_oplen(new_cigar[t]); + if (bam_cigar_type(op1)&1) { // consume the query + if (u + len1 >= len) { // stop + new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT; + break; + } else u += len1; + } + } + if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation + l = t + 1; + end_j = j; j -= len; + } else { // other CIGAR operations + new_cigar[l++] = cigar[k]; + if (bam_cigar_type(op)&1) { // consume the query + if (i != j) { // no need to copy if i == j + int u, c, c0; + for (u = 0; u < len; ++u) { // construct the consensus + c = bam1_seqi(seq, i+u); + if (j + u < end_j) { // in an overlap + c0 = bam1_seqi(seq, j+u); + if (c != c0) { // a mismatch; choose the better base + if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u] - qual[j+u]; + } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality + } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u]; + } else { // not in an overlap; copy over + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u]; + } + } + } + i += len, j += len; + } + } + } + if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified + // merge adjacent operations if possible + for (k = 1; k < l; ++k) + if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1])) + new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf; + // kill zero length operations + for (k = i = 0; k < l; ++k) + if (new_cigar[k] >> BAM_CIGAR_SHIFT) + new_cigar[i++] = new_cigar[k]; + l = i; + // update b + memcpy(cigar, new_cigar, l * 4); // set CIGAR + p = b->data + b->core.l_qname + l * 4; + memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ + memmove(p, qual, j); p += j; // set QUAL + memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields + b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length + b->data_len = p - b->data; // update record length + return 0; + +rmB_err: + b->core.flag |= BAM_FUNMAP; + return -1; +} diff --git a/bam.h b/bam.h index 376e324..df345b6 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.18 (r982:295)" +#define BAM_VERSION "0.1.18-dev (r982:310)" #include #include @@ -89,7 +89,7 @@ typedef struct { char **target_name; uint32_t *target_len; void *dict, *hash, *rg2lib; - size_t l_text, n_text; + uint32_t l_text, n_text; char *text; } bam_header_t; @@ -150,16 +150,19 @@ typedef struct { /*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 /*! @abstract CIGAR: equals = match */ -#define BAM_CEQUAL 7 +#define BAM_CEQUAL 7 /*! @abstract CIGAR: X = mismatch */ -#define BAM_CDIFF 8 +#define BAM_CDIFF 8 +#define BAM_CBACK 9 -#define BAM_CIGAR_STR "MIDNSHP=X" +#define BAM_CIGAR_STR "MIDNSHP=XB" +#define BAM_CIGAR_TYPE 0x3C1A7 #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) #define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) #define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) -#define bam_cigar_gen(o, l) ((o)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference /*! @typedef @abstract Structure for core alignment information. @@ -252,7 +255,10 @@ typedef struct __bam_iter_t *bam_iter_t; @param i The i-th position, 0-based @return 4-bit integer representing the base. */ -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) + +#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) /*! @function @abstract Get query sequence and quality @@ -282,6 +288,8 @@ extern int bam_is_be; */ extern int bam_verbose; +extern int bam_no_B; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@ -427,6 +435,8 @@ extern "C" { */ int bam_read1(bamFile fp, bam1_t *b); + int bam_remove_B(bam1_t *b); + /*! @abstract Write an alignment to BAM. @param fp BAM file handler diff --git a/bam2bcf.c b/bam2bcf.c index dec3305..83f2e56 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -55,7 +55,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } // fill the bases array memset(r, 0, sizeof(bcf_callret1_t)); - for (i = n = 0; i < _n; ++i) { + for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; // set base @@ -78,6 +78,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t b = p->aux>>16&0x3f; is_diff = (b != 0); } + if (is_diff) ++r->n_supp; bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; // collect annotations if (b < 4) r->qsum[b] += q; @@ -260,7 +261,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, return 0; } -int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, +int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) { extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); @@ -310,28 +311,29 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } - if ( bc->vdb!=1 ) - { + if (bc->vdb != 1) ksprintf(&s, ";VDB=%.4f", bc->vdb); - } kputc('\0', &s); // FMT kputs("PL", &s); - if (bcr) { - kputs(":DP", &s); - if (is_SP) kputs(":SP", &s); + if (bcr && fmt_flag) { + if (fmt_flag & B2B_FMT_DP) kputs(":DP", &s); + if (fmt_flag & B2B_FMT_DV) kputs(":DV", &s); + if (fmt_flag & B2B_FMT_SP) kputs(":SP", &s); } kputc('\0', &s); b->m_str = s.m; b->str = s.s; b->l_str = s.l; bcf_sync(b); memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); - if (bcr) { - uint16_t *dp = (uint16_t*)b->gi[1].data; - int32_t *sp = is_SP? b->gi[2].data : 0; + if (bcr && fmt_flag) { + uint16_t *dp = (fmt_flag & B2B_FMT_DP)? b->gi[1].data : 0; + uint16_t *dv = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0)].data : 0; + int32_t *sp = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0) + ((fmt_flag & B2B_FMT_DV) != 0)].data : 0; for (i = 0; i < bc->n; ++i) { bcf_callret1_t *p = bcr + i; - dp[i] = p->depth < 0xffff? p->depth : 0xffff; - if (is_SP) { + if (dp) dp[i] = p->depth < 0xffff? p->depth : 0xffff; + if (dv) dv[i] = p->n_supp < 0xffff? p->n_supp : 0xffff; + if (sp) { if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2 || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2) { diff --git a/bam2bcf.h b/bam2bcf.h index 4af080c..a4d8ca5 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -7,6 +7,10 @@ #define B2B_INDEL_NULL 10000 +#define B2B_FMT_DP 0x1 +#define B2B_FMT_SP 0x2 +#define B2B_FMT_DV 0x4 + typedef struct __bcf_callaux_t { int capQ, min_baseQ; int openQ, extQ, tandemQ; // for indels @@ -23,7 +27,7 @@ typedef struct __bcf_callaux_t { } bcf_callaux_t; typedef struct { - int depth, ori_depth, qsum[4]; + int depth, n_supp, ori_depth, qsum[4]; int anno[16]; float p[25]; int mvd[3]; // mean variant distance, number of variant reads, average read length @@ -32,6 +36,7 @@ typedef struct { typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 int n, n_alleles, shift, ori_ref, unseen; + int n_supp; // number of supporting non-reference reads int anno[16], depth, ori_depth; uint8_t *PL; float vdb; // variant distance bias @@ -45,7 +50,7 @@ extern "C" { void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); - int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, + int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash); diff --git a/bam_import.c b/bam_import.c index 5518a9c..da2bf94 100644 --- a/bam_import.c +++ b/bam_import.c @@ -183,7 +183,7 @@ static inline void append_text(bam_header_t *header, kstring_t *str) // Sanity check if ( header->l_text+str->l+1 >= header->n_text ) { - fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,header->n_text,x,y); + fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y); abort(); } strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. @@ -291,11 +291,13 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; z += str->l + 1; if (str->s[0] != '*') { + uint32_t *cigar; for (s = str->s; *s; ++s) { if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); + cigar = bam1_cigar(b); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); @@ -308,12 +310,13 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) else if (op == 'P') op = BAM_CPAD; else if (op == '=') op = BAM_CEQUAL; else if (op == 'X') op = BAM_CDIFF; + else if (op == 'B') op = BAM_CBACK; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; - bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; + cigar[i] = bam_cigar_gen(x, op); } if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); - c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar)); doff += c->n_cigar * 4; } else { if (!(c->flag&BAM_FUNMAP)) { @@ -340,9 +343,9 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { - fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", - (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); @@ -459,6 +462,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) } b->l_aux = doff - doff0; b->data_len = doff; + if (bam_no_B) bam_remove_B(b); return z; } diff --git a/bam_mate.c b/bam_mate.c index 642ded6..056c723 100644 --- a/bam_mate.c +++ b/bam_mate.c @@ -34,7 +34,7 @@ void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; - int curr, has_prev, pre_end, cur_end; + int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; diff --git a/bam_plcmd.c b/bam_plcmd.c index cbf6ae8..be2c87d 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -66,8 +66,6 @@ static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, cons #define MPLP_NO_COMP 0x20 #define MPLP_NO_ORPHAN 0x40 #define MPLP_REALN 0x80 -#define MPLP_FMT_DP 0x100 -#define MPLP_FMT_SP 0x200 #define MPLP_NO_INDEL 0x400 #define MPLP_EXT_BAQ 0x800 #define MPLP_ILLUMINA13 0x1000 @@ -80,7 +78,7 @@ void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels char *reg, *pl_list; @@ -308,8 +306,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), 0, 0); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels @@ -318,8 +315,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), bca, ref); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } @@ -327,20 +323,29 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { - int j; - printf("\t%d\t", n_plp[i]); + int j, cnt; + for (j = cnt = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; + } + printf("\t%d\t", cnt); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { - for (j = 0; j < n_plp[i]; ++j) - pileup_seq(plp[i] + j, pos, ref_len, ref); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) + pileup_seq(plp[i] + j, pos, ref_len, ref); + } putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - int c = bam1_qual(p->b)[p->qpos] + 33; - if (c > 126) c = 126; - putchar(c); + int c = bam1_qual(p->b)[p->qpos]; + if (c >= conf->min_baseQ) { + c = c + 33 < 126? c + 33 : 126; + putchar(c); + } } if (conf->flag & MPLP_PRINT_MAPQ) { putchar('\t'); @@ -437,15 +442,14 @@ int bam_mpileup(int argc, char *argv[]) int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - #define MPLP_PRINT_POS 0x4000 mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; - mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_EXT_BAQ; + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6OsV")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -459,8 +463,9 @@ int bam_mpileup(int argc, char *argv[]) case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; case 'B': mplp.flag &= ~MPLP_REALN; break; - case 'D': mplp.flag |= MPLP_FMT_DP; break; - case 'S': mplp.flag |= MPLP_FMT_SP; break; + case 'D': mplp.fmt_flag |= B2B_FMT_DP; break; + case 'S': mplp.fmt_flag |= B2B_FMT_SP; break; + case 'V': mplp.fmt_flag |= B2B_FMT_DV; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_EXT_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; @@ -503,7 +508,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -b FILE list of input BAM files [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); - fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); +// fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); @@ -532,6 +537,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } + bam_no_B = 1; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); diff --git a/bamtk.c b/bamtk.c index 6700b55..2e7fcb0 100644 --- a/bamtk.c +++ b/bamtk.c @@ -56,7 +56,7 @@ static int usage() fprintf(stderr, " cat concatenate BAMs\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); - fprintf(stderr, " pad2unpad convert padded BAM to unpadded BAM\n"); +// fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable fprintf(stderr, "\n"); #ifdef _WIN32 fprintf(stderr, "\ @@ -97,6 +97,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 84a8e76..de7bc11 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -140,7 +140,7 @@ int bcf_sync(bcf1_t *b) for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { b->gi[i].len = 2; } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; @@ -244,7 +244,7 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) if (k > 0) kputc(',', s); kputw(d[k], s); } - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { kputw(((uint8_t*)b->gi[i].data)[j], s); diff --git a/bcftools/vcf.c b/bcftools/vcf.c index 9daa845..bc11084 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -186,7 +186,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) ((uint8_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ((int32_t*)b->gi[i].data)[k-9] = 0; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { int y = b->n_alleles * (b->n_alleles + 1) / 2; @@ -210,7 +210,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint32_t*)b->gi[i].data)[k-9] = x; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint16_t*)b->gi[i].data)[k-9] = x; diff --git a/padding.c b/padding.c index d76cb31..fb3bc13 100644 --- a/padding.c +++ b/padding.c @@ -109,7 +109,7 @@ int main_pad2unpad(int argc, char *argv[]) { bamFile in, out; if (argc == 1) { - fprintf(stderr, "Usage: samtools pad2unpad \n"); + fprintf(stderr, "Usage: samtools depad \n"); return 1; } in = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); diff --git a/sam_view.c b/sam_view.c index efda4e8..18b1282 100644 --- a/sam_view.c +++ b/sam_view.c @@ -21,7 +21,7 @@ typedef khash_t(rg) *rghash_t; // FIXME: we'd better use no global variables... static rghash_t g_rghash = 0; -static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0; static float g_subsam = -1; static char *g_library, *g_rg; static void *g_bed; @@ -30,8 +30,16 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +static int process_aln(const bam_header_t *h, bam1_t *b) { + if (g_qual_scale > 1) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) { + int c = qual[i] * g_qual_scale; + qual[i] = c < 93? c : 93; + } + } if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) @@ -92,7 +100,7 @@ static char *drop_rg(char *hdtxt, rghash_t h, int *len) // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { - if (!__g_skip_aln(((samfile_t*)data)->header, b)) + if (!process_aln(((samfile_t*)data)->header, (bam1_t*)b)) samwrite((samfile_t*)data, b); return 0; } @@ -100,7 +108,7 @@ static int view_func(const bam1_t *b, void *data) // callback function for bam_fetch() that counts nonskipped records static int count_func(const bam1_t *b, void *data) { - if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) { + if (!process_aln(((count_func_data_t*)data)->header, (bam1_t*)b)) { (*((count_func_data_t*)data)->count)++; } return 0; @@ -118,7 +126,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { + while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:")) >= 0) { switch (c) { case 's': g_subsam = atof(optarg); break; case 'c': is_count = 1; break; @@ -141,6 +149,8 @@ int main_samview(int argc, char *argv[]) case 'X': of_type = BAM_OFSTR; break; case '?': is_long_help = 1; break; case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; + case 'B': bam_no_B = 1; break; + case 'Q': g_qual_scale = atoi(optarg); break; default: return usage(is_long_help); } } @@ -204,7 +214,7 @@ int main_samview(int argc, char *argv[]) bam1_t *b = bam_init1(); int r; while ((r = samread(in, b)) >= 0) { // read one alignment from `in' - if (!__g_skip_aln(in->header, b)) { + if (!process_aln(in->header, b)) { if (!is_count) samwrite(out, b); // write the alignment to `out' count++; } @@ -277,6 +287,7 @@ static int usage(int is_long_help) fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -B collapse the backward CIGAR operation\n"); fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); -- 2.39.2