From fa6c5588e0d3845a98ea54435d49ccbe305515d7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 17 Apr 2012 16:03:11 -0400 Subject: [PATCH] r572: added "bedcov" command --- Makefile | 2 +- bam.h | 2 +- bamtk.c | 3 ++ bedcov.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ kstring.h | 17 ++++++++ 5 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 bedcov.c diff --git a/Makefile b/Makefile index 6ddd4dd..4a8a75b 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam2depth.o padding.o + cut_target.o phase.o bam2depth.o padding.o bedcov.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc diff --git a/bam.h b/bam.h index efc0459..82f1ffa 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.18-master-r567" +#define BAM_VERSION "0.1.18-r572" #include #include diff --git a/bamtk.c b/bamtk.c index 2e7fcb0..acdf65f 100644 --- a/bamtk.c +++ b/bamtk.c @@ -28,6 +28,7 @@ int main_cat(int argc, char *argv[]); int main_depth(int argc, char *argv[]); int main_bam2fq(int argc, char *argv[]); int main_pad2unpad(int argc, char *argv[]); +int main_bedcov(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); @@ -54,6 +55,7 @@ static int usage() fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); fprintf(stderr, " cat concatenate BAMs\n"); + fprintf(stderr, " bedcov read depth per BED region\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); // fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable @@ -98,6 +100,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; diff --git a/bedcov.c b/bedcov.c new file mode 100644 index 0000000..2b61a22 --- /dev/null +++ b/bedcov.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include "kstring.h" +#include "bgzf.h" +#include "bam.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +typedef struct { + bamFile fp; + bam_iter_t iter; + int min_mapQ; +} aux_t; + +static int read_bam(void *data, bam1_t *b) +{ + aux_t *aux = (aux_t*)data; + int ret = bam_iter_read(aux->fp, aux->iter, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; +} + +int main_bedcov(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t*); + gzFile fp; + kstring_t str; + kstream_t *ks; + bam_index_t **idx; + bam_header_t *h = 0; + aux_t **aux; + int *n_plp, dret, i, n, c, min_mapQ = 0; + int64_t *cnt; + const bam_pileup1_t **plp; + + while ((c = getopt(argc, argv, "Q:")) >= 0) { + switch (c) { + case 'Q': min_mapQ = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools bedcov [...]\n"); + return 1; + } + memset(&str, 0, sizeof(kstring_t)); + n = argc - optind - 1; + aux = calloc(n, sizeof(void*)); + idx = calloc(n, sizeof(void*)); + for (i = 0; i < n; ++i) { + aux[i] = calloc(1, sizeof(aux_t)); + aux[i]->min_mapQ = min_mapQ; + aux[i]->fp = bam_open(argv[i+optind+1], "r"); + idx[i] = bam_index_load(argv[i+optind+1]); + if (aux[i]->fp == 0 || idx[i] == 0) { + fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); + return 2; + } + bgzf_set_cache_size(aux[i]->fp, 20); + if (i == 0) h = bam_header_read(aux[0]->fp); + } + bam_init_header_hash(h); + cnt = calloc(n, 8); + + fp = gzopen(argv[optind], "rb"); + ks = ks_init(fp); + n_plp = calloc(n, sizeof(int)); + plp = calloc(n, sizeof(void*)); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + char *p, *q; + int tid, beg, end, pos; + bam_mplp_t mplp; + + for (p = q = str.s; *p && *p != '\t'; ++p); + if (*p != '\t') goto bed_error; + *p = 0; tid = bam_get_tid(h, q); *p = '\t'; + if (tid < 0) goto bed_error; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p != '\t') goto bed_error; + *p = 0; beg = atoi(q); *p = '\t'; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p == '\t' || *p == 0) { + int c = *p; + *p = 0; end = atoi(q); *p = c; + } else goto bed_error; + + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); + } + mplp = bam_mplp_init(n, read_bam, (void**)aux); + bam_mplp_set_maxcnt(mplp, 64000); + memset(cnt, 0, 8 * n); + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + if (pos >= beg && pos < end) + for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; + for (i = 0; i < n; ++i) { + kputc('\t', &str); + kputl(cnt[i], &str); + } + puts(str.s); + bam_mplp_destroy(mplp); + continue; + +bed_error: + fprintf(stderr, "Errors in BED line '%s'\n", str.s); + } + free(n_plp); free(plp); + ks_destroy(ks); + gzclose(fp); + + free(cnt); + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + bam_index_destroy(idx[i]); + bam_close(aux[i]->fp); + free(aux[i]); + } + bam_header_destroy(h); + free(aux); free(idx); + free(str.s); + return 0; +} diff --git a/kstring.h b/kstring.h index d490f58..abd8236 100644 --- a/kstring.h +++ b/kstring.h @@ -142,6 +142,23 @@ static inline int kputuw(unsigned c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + static inline int *ksplit(kstring_t *s, int delimiter, int *n) { int max = 0, *offsets = 0; -- 2.39.2