X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=sam%2Fbcftools%2Fcall1.c;h=e6373d367938d52ee38f0395bf2743506cd84bc0;hp=3cc464949eb69e630ee7905c96d4f8050b1b569c;hb=dbcf1cfb8ad1086c21d64e249f012809403e7ddc;hpb=58d504aaf36ae486b1dba6d03e0e9f1c25855037 diff --git a/sam/bcftools/call1.c b/sam/bcftools/call1.c index 3cc4649..e6373d3 100644 --- a/sam/bcftools/call1.c +++ b/sam/bcftools/call1.c @@ -33,13 +33,14 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_EM 0x10000 #define VC_PAIRCALL 0x20000 #define VC_QCNT 0x40000 +#define VC_INDEL_ONLY 0x80000 typedef struct { int flag, prior_type, n1, n_sub, *sublist, n_perm; uint32_t *trio_aux; char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; void *bed; } viewconf_t; @@ -47,11 +48,6 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -typedef struct { - double p[4]; - int mq, depth, is_tested, d[4]; -} anno16_t; - static double ttest(int n1, int n2, int a[4]) { extern double kf_betai(double a, double b, double x); @@ -82,7 +78,7 @@ static int test16_core(int anno[16], anno16_t *a) return 0; } -static int test16(bcf1_t *b, anno16_t *a) +int test16(bcf1_t *b, anno16_t *a) { char *p; int i, anno[16]; @@ -99,17 +95,6 @@ static int test16(bcf1_t *b, anno16_t *a) return test16_core(anno, a); } -static void rm_info(bcf1_t *b, const char *key) -{ - char *p, *q; - if ((p = strstr(b->info, key)) == 0) return; - for (q = p; *q && *q != ';'; ++q); - if (p > b->info && *(p-1) == ';') --p; - memmove(p, q, b->l_str - (q - b->str)); - b->l_str -= q - p; - bcf_sync(b); -} - static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) { kstring_t s; @@ -118,7 +103,7 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, anno16_t a; has_I16 = test16(b, &a) >= 0? 1 : 0; - rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! + //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); @@ -169,6 +154,8 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); + rm_info(&s, "QS="); + rm_info(&s, "I16="); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; @@ -203,7 +190,27 @@ static char **read_samples(const char *fn, int *_n) *_n = 0; s.l = s.m = 0; s.s = 0; fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { int l; @@ -249,6 +256,12 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + //if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT== 0) { + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.bed = bed_read(optarg); break; + case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -335,8 +361,10 @@ int bcfview(int argc, char *argv[]) case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; + case 'w': vc.flag |= VC_INDEL_ONLY; break; case 'M': vc.flag |= VC_ANNO_MAX; break; case 'Y': vc.flag |= VC_QCNT; break; + case 'm': vc.min_ma_lrt = atof(optarg); break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; @@ -346,6 +374,7 @@ int bcfview(int argc, char *argv[]) case 'C': vc.min_lrt = atof(optarg); break; case 'X': vc.min_perm_p = atof(optarg); break; case 'd': vc.min_smpl_frac = atof(optarg); break; + case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break; case 's': vc.subsam = read_samples(optarg, &vc.n_sub); vc.ploidy = calloc(vc.n_sub + 1, 1); for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; @@ -392,6 +421,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n"); fprintf(stderr, " -p FLOAT variant if P(ref|D) 0) { int is_indel, cons_llr = -1; int64_t cons_gt = -1; @@ -482,6 +516,7 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; + if ((vc.flag & VC_INDEL_ONLY) && !is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { int x; if (b->ref[0] == 0 || b->ref[1] != 0) continue; @@ -515,9 +550,19 @@ int bcfview(int argc, char *argv[]) int i; for (i = 0; i < 9; ++i) em[i] = -1.; } - if (vc.flag & VC_CALL) { // call variants + if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 ) + { + bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); + if ( gts<=1 && vc.flag & VC_VARONLY ) continue; + } + else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); + int calret; + gzwrite(bcf_p1_fp_lk, &b->tid, 4); + gzwrite(bcf_p1_fp_lk, &b->pos, 4); + gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double)); + calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); @@ -562,6 +607,8 @@ int bcfview(int argc, char *argv[]) } else bcf_fix_gt(b); vcf_write(bout, hout, b); } + + if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk); if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); if (hin != hout) bcf_hdr_destroy(hout);