From 50706bea83d8a1518485283876333279f4ae7137 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Apr 2011 22:20:06 +0000 Subject: [PATCH] * added bam_verbose global variable * fixed a bug when grouping reads in mpileup (due to recent changes) * do not validate SAM header * estimate genotype frequency * added HWE test back, but with a different method --- bam.c | 2 +- bam.h | 8 +++++++- bam2depth.c | 2 +- bam_plcmd.c | 4 ++-- bcftools/call1.c | 29 +++++++++++++++++++++++++++-- bcftools/prob1.c | 28 ++++++++++++++++++++++++++-- sam.c | 12 ++++++------ sam_header.c | 3 ++- sam_view.c | 40 ++++++++++++++++++++++++++++++++++++++++ sample.c | 4 ++-- 10 files changed, 114 insertions(+), 18 deletions(-) diff --git a/bam.c b/bam.c index 96aace2..a65aed5 100644 --- a/bam.c +++ b/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0; +int bam_is_be = 0, bam_verbose = 2; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** diff --git a/bam.h b/bam.h index f02f9fa..98612dd 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.14 (r933:176)" +#define BAM_VERSION "0.1.14 (r947:199)" #include #include @@ -264,6 +264,12 @@ typedef struct __bam_iter_t *bam_iter_t; */ extern int bam_is_be; +/*! + @abstract Verbose level between 0 and 3; 0 is supposed to disable all + debugging information, though this may not have been implemented. + */ +extern int bam_verbose; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; diff --git a/bam2depth.c b/bam2depth.c index 96fad65..95588f9 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -80,7 +80,7 @@ int main_depth(int argc, char *argv[]) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip - printf("%s\t%d", h->target_name[tid], pos+1); + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); for (i = 0; i < n; ++i) { int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { diff --git a/bam_plcmd.c b/bam_plcmd.c index 547b1e2..a879f47 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -623,7 +623,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); if (id < 0 || id >= m->n) { assert(q); // otherwise a bug - fprintf(stderr, "[%s] Read group %s used in file %s but not defined in the header.\n", __func__, (char*)q+1, fn[i]); + fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); exit(1); } if (m->n_plp[id] == m->m_plp[id]) { @@ -742,7 +742,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; - fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); + fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); diff --git a/bcftools/call1.c b/bcftools/call1.c index 7aca159..cb33e06 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -38,6 +38,23 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); +static double test_hwe(const double g[3]) +{ + extern double kf_gammaq(double p, double x); + double fexp, chi2, f[3], n; + int i; + n = g[0] + g[1] + g[2]; + fexp = (2. * g[2] + g[1]) / (2. * n); + if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; + if (fexp < 1e-10) fexp = 1e-10; + f[0] = n * (1. - fexp) * (1. - fexp); + f[1] = n * 2. * fexp * (1. - fexp); + f[2] = n * fexp * fexp; + for (i = 0, chi2 = 0.; i < 3; ++i) + chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; + return kf_gammaq(.5, chi2 / 2.); +} + typedef struct { double p[4]; int mq, depth, is_tested, d[4]; @@ -117,7 +134,11 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p kputs(b->info, &s); if (b->info[0]) kputc(';', &s); // ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); - ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih); + ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]); + if (n_smpl > 5) { + double hwe = test_hwe(pr->g); + if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe); + } if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); if (fq < -999) fq = -999; @@ -216,6 +237,10 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=info) kputc(';', &s); - ksprintf(&s, "NEIR=%.3f;NEIF=%.3f,%.3f", r2, f[0]+f[2], f[0]+f[1]); + ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); bcf_append_info(b, s.s, s.l); free(s.s); } diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 57f385b..a024d04 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -10,7 +10,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define MC_MAX_EM_ITER 16 -#define MC_EM_EPS 1e-4 +#define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { @@ -224,6 +224,24 @@ static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end) return f; } +static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end) +{ + double err, gg[3]; + int i; + gg[0] = gg[1] = gg[2] = 0.; + for (i = beg; i < end; ++i) { + double *pdg, sum, tmp[3]; + pdg = ma->pdg + i * 3; + tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; + sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); + gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; + } + err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); + err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); + g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; + return err; +} + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; @@ -533,6 +551,13 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) } } } + { // compute g[3] + rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em); + rst->g[1] = 2. * rst->f_em * (1. - rst->f_em); + rst->g[2] = rst->f_em * rst->f_em; + for (i = 0; i < MC_MAX_EM_ITER; ++i) + if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break; + } { // estimate equal-tail credible interval (95% level) int l, h; double p; @@ -546,7 +571,6 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) h = i; rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } - rst->g[0] = rst->g[1] = rst->g[2] = -1.; rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant rst->p_chi2 = contrast2(ma, rst->cmp); diff --git a/sam.c b/sam.c index e195b0f..f026bc8 100644 --- a/sam.c +++ b/sam.c @@ -59,9 +59,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } - if (fp->header->n_targets == 0) + if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); @@ -93,10 +93,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header - fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} - if (alt->n_targets != fp->header->n_targets) - fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) + fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); @@ -168,7 +168,7 @@ char *samfaipath(const char *fn_ref) if (access(fn_ref, R_OK) == -1) { fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { - fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; diff --git a/sam_header.c b/sam_header.c index 05d75de..f49b2d4 100644 --- a/sam_header.c +++ b/sam_header.c @@ -563,6 +563,7 @@ void *sam_header_parse2(const char *headerText) const char *text; char *buf=NULL; size_t nbuf = 0; + int tovalidate = 0; if ( !headerText ) return 0; @@ -571,7 +572,7 @@ void *sam_header_parse2(const char *headerText) while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); - if ( hline && sam_header_line_validate(hline) ) + if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) // With too many (~250,000) reference sequences the header parsing was too slow with list_append. hlines = list_append_to_end(hlines, hline); else diff --git a/sam_view.c b/sam_view.c index fbffdb4..a579614 100644 --- a/sam_view.c +++ b/sam_view.c @@ -6,6 +6,7 @@ #include "sam_header.h" #include "sam.h" #include "faidx.h" +#include "kstring.h" #include "khash.h" KHASH_SET_INIT_STR(rg) @@ -68,6 +69,37 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 0; } +static char *drop_rg(char *hdtxt, rghash_t h, int *len) +{ + char *p = hdtxt, *q, *r, *s; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + while (1) { + int toprint = 0; + q = strchr(p, '\n'); + if (q == 0) q = p + strlen(p); + if (q - p < 3) break; // the line is too short; then stop + if (strncmp(p, "@RG\t", 4) == 0) { + int c; + khint_t k; + if ((r = strstr(p, "\tID:")) != 0) { + r += 4; + for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); + c = *s; *s = '\0'; + k = kh_get(rg, h, r); + *s = c; + if (k != kh_end(h)) toprint = 1; + } + } else toprint = 1; + if (toprint) { + kputsn(p, q - p, &str); kputc('\n', &str); + } + p = q + 1; + } + *len = str.l; + return str.s; +} + // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { @@ -164,6 +196,14 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... + char *tmp; + int l; + tmp = drop_rg(in->header->text, g_rghash, &l); + free(in->header->text); + in->header->text = tmp; + in->header->l_text = l; + } if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); ret = 1; diff --git a/sample.c b/sample.c index 6ce6e6c..3efc559 100644 --- a/sample.c +++ b/sample.c @@ -74,8 +74,8 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) p = q > r? q : r; ++n; } -// if (n == 0) add_pair(sm, sm2id, fn, fn); - add_pair(sm, sm2id, fn, fn); + if (n == 0) add_pair(sm, sm2id, fn, fn); +// add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; } -- 2.39.2