From 674ffee7adcfc928f5039777180206fdebc5539b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 30 Mar 2011 18:35:53 +0000 Subject: [PATCH] * added BED support to samtools view and mpileup and bcftools view * ML estimate of allele frequency in two groups * better error message given inconsisten RG labels * compute all-pair r^2 --- Makefile | 2 +- bam.h | 2 +- bam_plcmd.c | 44 ++++++++------ bam_sort.c | 13 ++-- bcftools/Makefile | 2 +- bcftools/call1.c | 61 ++++--------------- bcftools/ld.c | 43 +++++++++++++ bcftools/main.c | 4 +- bcftools/prob1.c | 20 ++++-- bcftools/prob1.h | 2 +- bedidx.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++ sam_view.c | 14 ++++- sample.c | 3 +- 13 files changed, 274 insertions(+), 87 deletions(-) create mode 100644 bedidx.c diff --git a/Makefile b/Makefile index f025942..2cfa085 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ + bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bedidx.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ diff --git a/bam.h b/bam.h index bc8e3f1..f02f9fa 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.14 (r933:170)" +#define BAM_VERSION "0.1.14 (r933:176)" #include #include diff --git a/bam_plcmd.c b/bam_plcmd.c index fd75cec..547b1e2 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -536,19 +536,23 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_EXT_BAQ 0x800 #define MPLP_ILLUMINA13 0x1000 +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); + typedef struct { int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels - char *reg, *fn_pos, *pl_list; + char *reg, *pl_list; faidx_t *fai; - kh_64_t *hash; - void *rghash; + void *bed, *rghash; } mplp_conf_t; typedef struct { bamFile fp; bam_iter_t iter; + bam_header_t *h; int ref_id; char *ref; const mplp_conf_t *conf; @@ -571,6 +575,14 @@ static int mplp_func(void *data, bam1_t *b) int has_ref; ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); if (ret < 0) break; + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads + skip = 1; + continue; + } + if (ma->conf->bed) { // test overlap + skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); + if (skip) continue; + } if (ma->conf->rghash) { // exclude read groups uint8_t *rg = bam_aux_get(b, "RG"); skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); @@ -589,7 +601,7 @@ static int mplp_func(void *data, bam1_t *b) int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; - } else if (b->core.flag&BAM_FUNMAP) skip = 1; + } else if (b->core.qual < ma->conf->min_mq) skip = 1; else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; } while (skip); @@ -609,7 +621,11 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, q = bam_aux_get(p->b, "RG"); if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - assert(id >= 0 && id < m->n); + if (id < 0 || id >= m->n) { + assert(q); // otherwise a bug + fprintf(stderr, "[%s] Read group %s used in file %s but not defined in the header.\n", __func__, (char*)q+1, fn[i]); + exit(1); + } if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@ -629,7 +645,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_t iter; bam_header_t *h = 0; char *ref; - khash_t(64) *hash = 0; void *rghash = 0; bcf_callaux_t *bca = 0; @@ -657,6 +672,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { @@ -687,7 +703,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; @@ -733,11 +748,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (hash) { - khint_t k; - k = kh_get(64, hash, (uint64_t)tid<<32 | pos); - if (k == kh_end(hash)) continue; - } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); @@ -797,12 +808,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); - if (hash) { // free the hash table - khint_t k; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) free(kh_val(hash, k)); - kh_destroy(64, hash); - } bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); @@ -887,7 +892,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; - case 'l': mplp.fn_pos = strdup(optarg); break; + case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; @@ -966,5 +971,6 @@ int bam_mpileup(int argc, char *argv[]) if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); + if (mplp.bed) bed_destroy(mplp.bed); return 0; } diff --git a/bam_sort.c b/bam_sort.c index 7aeccff..94970b5 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -70,6 +70,7 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) #define MERGE_RG 1 #define MERGE_UNCOMP 2 +#define MERGE_LEVEL1 4 /*! @abstract Merge multiple sorted BAM. @@ -207,11 +208,9 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch } else h->pos = HEAP_EMPTY; } - if (flag & MERGE_UNCOMP) { - fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - } else { - fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - } + if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); + else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); + else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); if (fpout == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; @@ -257,11 +256,12 @@ int bam_merge(int argc, char *argv[]) int c, is_by_qname = 0, flag = 0, ret = 0; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nruR:")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; + case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; } @@ -272,6 +272,7 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); diff --git a/bcftools/Makefile b/bcftools/Makefile index e579a9e..b5a1b1b 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o OMISC= .. -AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o +AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o PROG= bcftools INCLUDES= SUBDIRS= . diff --git a/bcftools/call1.c b/bcftools/call1.c index e074fb5..7aca159 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -8,9 +8,6 @@ #include "kstring.h" #include "time.h" -#include "khash.h" -KHASH_SET_INIT_INT64(set64) - #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -31,43 +28,15 @@ KSTREAM_INIT(gzFile, gzread, 16384) typedef struct { int flag, prior_type, n1, n_sub, *sublist, n_perm; - char *fn_list, *prior_file, **subsam, *fn_dict; + char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; double theta, pref, indel_frac, min_perm_p, min_smpl_frac; + void *bed; } viewconf_t; -khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) -{ - void *str2id; - gzFile fp; - kstream_t *ks; - int ret, dret, lineno = 1; - kstring_t *str; - khash_t(set64) *hash = 0; - - hash = kh_init(set64); - str2id = bcf_build_refhash(_h); - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bcf_str2id(str2id, str->s); - if (tid >= 0 && dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) >= 0) { - uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); - kh_put(set64, hash, x, &ret); - } else break; - } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (dret < 0) break; - ++lineno; - } - bcf_str2id_destroy(str2id); - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return hash; -} +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { double p[4]; @@ -154,7 +123,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p if (fq < -999) fq = -999; if (fq > 999) fq = 999; ksprintf(&s, ";FQ=%.3g", fq); - if (pr->cmp[0] >= 0.) { + if (pr->cmp[0] >= 0.) { // two sample groups int i, q[3], pq; for (i = 1; i < 3; ++i) { double x = pr->cmp[i] + pr->cmp[0]/2.; @@ -164,6 +133,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p pq = (int)(-4.343 * log(pr->p_chi2) + .499); if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2); + ksprintf(&s, ";AF2=%.4g,%.4g", 1.-pr->f_em2[0], 1.-pr->f_em2[1]); // ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]); } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); @@ -259,7 +229,7 @@ static void write_header(bcf_hdr_t *h) if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT== 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.fn_list = strdup(optarg); break; + case 'l': vc.bed = bed_read(optarg); break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -411,7 +380,6 @@ int bcfview(int argc, char *argv[]) } if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac } - if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, hin); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { void *str2id = bcf_build_refhash(hout); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { @@ -447,13 +415,7 @@ int bcfview(int argc, char *argv[]) x = toupper(b->ref[0]); if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; } - if (hash) { - uint64_t x = (uint64_t)b->tid<<32 | b->pos; - khint_t k = kh_get(set64, hash, x); - if (kh_size(hash) == 0) break; - if (k == kh_end(hash)) continue; - kh_del(set64, hash, k); - } + if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; if (tid >= 0) { int l = strlen(b->ref); l = b->pos + (l > 0? l : 1); @@ -512,8 +474,6 @@ int bcfview(int argc, char *argv[]) bcf_hdr_destroy(hin); bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); - if (hash) kh_destroy(set64, hash); - if (vc.fn_list) free(vc.fn_list); if (vc.fn_dict) free(vc.fn_dict); if (vc.ploidy) free(vc.ploidy); if (vc.n_sub) { @@ -521,6 +481,7 @@ int bcfview(int argc, char *argv[]) for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); free(vc.subsam); free(vc.sublist); } + if (vc.bed) bed_destroy(vc.bed); if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; diff --git a/bcftools/ld.c b/bcftools/ld.c index 11474ed..0207819 100644 --- a/bcftools/ld.c +++ b/bcftools/ld.c @@ -98,3 +98,46 @@ double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) } return r; } + +int bcf_main_pwld(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t **b, *b0; + int i, j, m, n; + double f[4]; + if (argc == 1) { + fprintf(stderr, "Usage: bcftools pwld \n"); + return 1; + } + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + // read the entire BCF + m = n = 0; b = 0; + b0 = calloc(1, sizeof(bcf1_t)); + while (bcf_read(fp, h, b0) >= 0) { + if (m == n) { + m = m? m<<1 : 16; + b = realloc(b, sizeof(void*) * m); + } + b[n] = calloc(1, sizeof(bcf1_t)); + bcf_cpy(b[n++], b0); + } + bcf_destroy(b0); + // compute pair-wise r^2 + printf("%d\n", n); // the number of loci + for (i = 0; i < n; ++i) { + printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); + for (j = 0; j < i; ++j) { + double r = bcf_ld_freq(b[i], b[j], f); + printf("\t%.3f", r*r); + } + printf("\t1.000\n"); + } + // free + for (i = 0; i < n; ++i) bcf_destroy(b[i]); + free(b); + bcf_hdr_destroy(h); + bcf_close(fp); + return 0; +} diff --git a/bcftools/main.c b/bcftools/main.c index 7ffc2a0..26377e1 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -5,6 +5,7 @@ int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); +int bcf_main_pwld(int argc, char *argv[]); #define BUF_SIZE 0x10000 @@ -55,7 +56,8 @@ int main(int argc, char *argv[]) } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_pwld(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... else { fprintf(stderr, "[main] Unrecognized command.\n"); return 1; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 503a998..57f385b 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -209,18 +209,18 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) return i; } // f0 is the reference allele frequency -static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) +static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end) { double f, f3[3]; int i; f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - for (i = 0, f = 0.; i < ma->n; ++i) { + for (i = beg, f = 0.; i < end; ++i) { double *pdg; pdg = ma->pdg + i * 3; f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); } - f /= ma->n * 2.; + f /= (end - beg) * 2.; return f; } @@ -448,6 +448,8 @@ static double contrast2(bcf_p1aux_t *p1, double ret[3]) for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) for (k2 = 0; k2 <= 2*n2; ++k2) if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y; + if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why... + z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; } return (double)z; } @@ -516,10 +518,20 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { // calculate f_em double flast = rst->f_flat; for (i = 0; i < MC_MAX_EM_ITER; ++i) { - rst->f_em = mc_freq_iter(flast, ma); + rst->f_em = mc_freq_iter(flast, ma, 0, ma->n); if (fabs(rst->f_em - flast) < MC_EM_EPS) break; flast = rst->f_em; } + if (ma->n1 > 0 && ma->n1 < ma->n) { + for (k = 0; k < 2; ++k) { + flast = rst->f_em; + for (i = 0; i < MC_MAX_EM_ITER; ++i) { + rst->f_em2[k] = k? mc_freq_iter(flast, ma, ma->n1, ma->n) : mc_freq_iter(flast, ma, 0, ma->n1); + if (fabs(rst->f_em2[k] - flast) < MC_EM_EPS) break; + flast = rst->f_em2[k]; + } + } + } } { // estimate equal-tail credible interval (95% level) int l, h; diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 3f89dda..b824c3c 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -10,7 +10,7 @@ typedef struct { int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double cmp[3], p_chi2; // used by contrast2() + double cmp[3], p_chi2, f_em2[2]; // used by contrast2() double g[3]; } bcf_p1rst_t; diff --git a/bedidx.c b/bedidx.c new file mode 100644 index 0000000..9b76af7 --- /dev/null +++ b/bedidx.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 8192) + +typedef struct { + int n, m; + uint64_t *a; + int *idx; +} bed_reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, bed_reglist_t) + +#define LIDX_SHIFT 13 + +typedef kh_reg_t reghash_t; + +int *bed_index_core(int n, uint64_t *a, int *n_idx) +{ + int i, j, m, *idx; + m = *n_idx = 0; idx = 0; + for (i = 0; i < n; ++i) { + int beg, end; + beg = a[i]>>32; end = (uint32_t)a[i]; + if (m < end + 1) { + int oldm = m; + m = end + 1; + kroundup32(m); + idx = realloc(idx, m * sizeof(int)); + for (j = oldm; j < m; ++j) idx[j] = -1; + } + if (beg == end) { + if (idx[beg] < 0) idx[beg] = i; + } else { + for (j = beg; j <= end; ++j) + if (idx[j] < 0) idx[j] = i; + } + *n_idx = end + 1; + } + return idx; +} + +void bed_index(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + bed_reglist_t *p = &kh_val(h, k); + if (p->idx) free(p->idx); + p->idx = bed_index_core(p->n, p->a, &p->m); + } + } +} + +int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +{ + int i, min_off; + if (p->n == 0) return 0; + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) break; + min_off = i >= 0? p->idx[i] : 0; + } + for (i = min_off; i < p->n; ++i) { + if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed + if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) + return 1; // find the overlap; return + } + return 0; +} + +int bed_overlap(const void *_h, const char *chr, int beg, int end) +{ + const reghash_t *h = (const reghash_t*)_h; + khint_t k; + if (!h) return 0; + k = kh_get(reg, h, chr); + if (k == kh_end(h)) return 0; + return bed_overlap_core(&kh_val(h, k), beg, end); +} + +void *bed_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name + int beg = -1, end = -1; + bed_reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { // if the lines has other characters + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); // begin + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) + end = atoi(str->s); // end + } + } + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg >= 0 && end > beg) { + if (p->n == p->m) { + p->m = p->m? p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + bed_index(h); + return h; +} + +void bed_destroy(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free(kh_val(h, k).idx); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} diff --git a/sam_view.c b/sam_view.c index 170bd84..fbffdb4 100644 --- a/sam_view.c +++ b/sam_view.c @@ -18,10 +18,15 @@ typedef struct { typedef khash_t(rg) *rghash_t; -rghash_t g_rghash = 0; +static rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; static char *g_library, *g_rg; static int g_sol2sanger_tbl[128]; +static void *g_bed; + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); static void sol2sanger(bam1_t *b) { @@ -44,6 +49,8 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; + if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) + return 1; if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { @@ -90,7 +97,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:CR:L:")) >= 0) { switch (c) { case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; @@ -106,6 +113,7 @@ int main_samview(int argc, char *argv[]) case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; + case 'L': g_bed = bed_read(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; @@ -215,6 +223,7 @@ view_end: } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_bed) bed_destroy(g_bed); if (g_rghash) { khint_t k; for (k = 0; k < kh_end(g_rghash); ++k) @@ -240,6 +249,7 @@ static int usage(int is_long_help) fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); diff --git a/sample.c b/sample.c index b3d2642..6ce6e6c 100644 --- a/sample.c +++ b/sample.c @@ -74,7 +74,8 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) p = q > r? q : r; ++n; } - if (n == 0) add_pair(sm, sm2id, fn, fn); +// if (n == 0) add_pair(sm, sm2id, fn, fn); + add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; } -- 2.39.2