From: Petr Danecek Date: Fri, 15 Mar 2013 08:40:35 +0000 (+0000) Subject: mpileup: New HWE, QBD, RPS annotations. Modified VDB (beware: this version of VDB... X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=17e151305384832457424bde6fac93e2f022cc8a mpileup: New HWE, QBD, RPS annotations. Modified VDB (beware: this version of VDB may be broken). Fix in indel calling: Ignore insertions with N's. Output contigs in the VCF header. New sam_header2tbl_n() API call. --- diff --git a/bam2bcf.c b/bam2bcf.c index 0b41c6f..340b10b 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -130,7 +130,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // collect read positions for ReadPosBias int len, pos = get_position(p, &len); - int epos = (double)pos/len * bca->npos; + int epos = (double)pos/(len+1) * bca->npos; if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) bca->ref_pos[epos]++; else @@ -158,7 +158,11 @@ void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) nref += bca->ref_pos[i]; nalt += bca->alt_pos[i]; U += nref*bca->alt_pos[i]; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; } +#if 0 +//todo double var = 0, avg = (double)(nref+nalt)/bca->npos; for (i=0; inpos; i++) { @@ -170,6 +174,7 @@ void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) call->read_pos.avg = avg; call->read_pos.var = sqrt(var/bca->npos); call->read_pos.dp = nref+nalt; +#endif if ( !nref || !nalt ) { call->read_pos_bias = -1; @@ -416,7 +421,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } - ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); + //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); if (bc->vdb != -1) ksprintf(&s, ";VDB=%e", bc->vdb); diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 7ccd1cd..30b3f46 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -218,6 +218,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. */ { // construct per-sample consensus int L = right - left + 1, max_i, max2_i; @@ -261,7 +263,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; -// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); } free(ref0); free(cns); } @@ -278,9 +280,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } } // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 + max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int)); + int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); // count the number of occurrences of each base at each position for each type of insertion for (t = 0; t < n_types; ++t) { if (types[t] > 0) { @@ -291,7 +293,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla uint8_t *seq = bam1_seq(p->b); for (k = 1; k <= p->indel; ++k) { int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; - if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; } } } @@ -302,11 +305,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla inscns = calloc(n_types * max_ins, 1); for (t = 0; t < n_types; ++t) { for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4]; - for (k = 0; k < 4; ++k) + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) if (ia[k] > max) max = ia[k], max_k = k; inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's } } free(inscns_aux); diff --git a/bam_plcmd.c b/bam_plcmd.c index 9c4f273..54a4597 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -9,6 +9,7 @@ #include "sam.h" #include "faidx.h" #include "kstring.h" +#include "sam_header.h" static inline int printw(int c, FILE *fp) { @@ -85,7 +86,7 @@ typedef struct { int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels - char *reg, *pl_list; + char *reg, *pl_list, *fai_fname; faidx_t *fai; void *bed, *rghash; } mplp_conf_t; @@ -275,9 +276,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); - bh->txt = malloc(strlen(BAM_VERSION) + 64); - bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); - free(s.s); + s.l = 0; + ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION); + if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname); + h->dict = sam_header_parse2(h->text); + int nseq; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq); + for (i=0; i\n", &s); + } + if (tbl) free(tbl); + bh->txt = s.s; + bh->l_txt = 1 + s.l; bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); @@ -487,6 +503,7 @@ int bam_mpileup(int argc, char *argv[]) case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; + mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; diff --git a/bcftools/call1.c b/bcftools/call1.c index 18805a0..e6373d3 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -288,10 +288,8 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); - if (!strstr(str.s, "##INFO=\n", &str); + //if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO== 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.bed = bed_read(optarg); if (!vc.bed) fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; break; + case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index f04cf08..3539ee3 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -502,7 +502,7 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_o // Call GTs int isample, gts=0, ac[4] = {0,0,0,0}; - int nRR = 0, nAA = 0, nRA = 0, max_dv = 0, dp_nref = 0; + int nRR = 0, nAA = 0, nRA = 0, max_dv = 0; for (isample = 0; isample < b->n_smpl; isample++) { int ploidy = b->ploidy ? b->ploidy[isample] : 2; @@ -551,7 +551,6 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_o if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) { if ( max_dv < dv ) max_dv = dv; - dp_nref += dp; } // For HWE annotation; multiple ALT alleles treated as one @@ -604,7 +603,6 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_o if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); - if ( dp_nref ) ksprintf(&s, ";QBDNR=%e", b->qual/dp_nref); if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); } if ( nAA+nRA ) diff --git a/sam_header.c b/sam_header.c index ddc2c38..88b6a1c 100644 --- a/sam_header.c +++ b/sam_header.c @@ -770,4 +770,41 @@ void *sam_header_merge(int n, const void **_dicts) return out_dict; } +char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) +{ + int nout = 0; + char **out = NULL; + + *n = 0; + list_t *l = (list_t *)dict; + if ( !l ) return NULL; + + int i, ntags = 0; + while ( tags[ntags] ) ntags++; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); + for (i=0; ivalue; + } + nout++; + l = l->next; + } + *n = nout; + return out; +} diff --git a/sam_header.h b/sam_header.h index ebea12f..4b0cb03 100644 --- a/sam_header.h +++ b/sam_header.h @@ -19,6 +19,23 @@ extern "C" { void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + /* + // Usage example + int i, j, n; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + void *dict = sam_header_parse2(bam->header->text); + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n); + for (i=0; i