From: Heng Li <lh3@me.com>
Date: Tue, 17 Apr 2012 20:03:11 +0000 (-0400)
Subject: r572: added "bedcov" command
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=fa6c5588e0d3845a98ea54435d49ccbe305515d7;p=samtools.git

r572: added "bedcov" command
---

diff --git a/Makefile b/Makefile
index 6ddd4dd..4a8a75b 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ LOBJS=		bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o	\
 AOBJS=		bam_tview.o bam_plcmd.o sam_view.o \
 			bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
 			bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
-			cut_target.o phase.o bam2depth.o padding.o
+			cut_target.o phase.o bam2depth.o padding.o bedcov.o
 PROG=		samtools
 INCLUDES=	-I.
 SUBDIRS=	. bcftools misc
diff --git a/bam.h b/bam.h
index efc0459..82f1ffa 100644
--- a/bam.h
+++ b/bam.h
@@ -40,7 +40,7 @@
   @copyright Genome Research Ltd.
  */
 
-#define BAM_VERSION "0.1.18-master-r567"
+#define BAM_VERSION "0.1.18-r572"
 
 #include <stdint.h>
 #include <stdlib.h>
diff --git a/bamtk.c b/bamtk.c
index 2e7fcb0..acdf65f 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -28,6 +28,7 @@ int main_cat(int argc, char *argv[]);
 int main_depth(int argc, char *argv[]);
 int main_bam2fq(int argc, char *argv[]);
 int main_pad2unpad(int argc, char *argv[]);
+int main_bedcov(int argc, char *argv[]);
 
 int faidx_main(int argc, char *argv[]);
 
@@ -54,6 +55,7 @@ static int usage()
 	fprintf(stderr, "         rmdup       remove PCR duplicates\n");
 	fprintf(stderr, "         reheader    replace BAM header\n");
 	fprintf(stderr, "         cat         concatenate BAMs\n");
+	fprintf(stderr, "         bedcov      read depth per BED region\n");
 	fprintf(stderr, "         targetcut   cut fosmid regions (for fosmid pool only)\n");
 	fprintf(stderr, "         phase       phase heterozygotes\n");
 //	fprintf(stderr, "         depad       convert padded BAM to unpadded BAM\n"); // not stable
@@ -98,6 +100,7 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1);
 	else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1);
 	else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1);
+	else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1);
 	else if (strcmp(argv[1], "pileup") == 0) {
 		fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
 		return 1;
diff --git a/bedcov.c b/bedcov.c
new file mode 100644
index 0000000..2b61a22
--- /dev/null
+++ b/bedcov.c
@@ -0,0 +1,126 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kstring.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+typedef struct {
+	bamFile fp;
+	bam_iter_t iter;
+	int min_mapQ;
+} aux_t;
+
+static int read_bam(void *data, bam1_t *b)
+{
+	aux_t *aux = (aux_t*)data;
+	int ret = bam_iter_read(aux->fp, aux->iter, b);
+	if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+	return ret;
+}
+
+int main_bedcov(int argc, char *argv[])
+{
+	extern void bam_init_header_hash(bam_header_t*);
+	gzFile fp;
+	kstring_t str;
+	kstream_t *ks;
+	bam_index_t **idx;
+	bam_header_t *h = 0;
+	aux_t **aux;
+	int *n_plp, dret, i, n, c, min_mapQ = 0;
+	int64_t *cnt;
+	const bam_pileup1_t **plp;
+
+	while ((c = getopt(argc, argv, "Q:")) >= 0) {
+		switch (c) {
+		case 'Q': min_mapQ = atoi(optarg); break;
+		}
+	}
+	if (optind + 2 > argc) {
+		fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
+		return 1;
+	}
+	memset(&str, 0, sizeof(kstring_t));
+	n = argc - optind - 1;
+	aux = calloc(n, sizeof(void*));
+	idx = calloc(n, sizeof(void*));
+	for (i = 0; i < n; ++i) {
+		aux[i] = calloc(1, sizeof(aux_t));
+		aux[i]->min_mapQ = min_mapQ;
+		aux[i]->fp = bam_open(argv[i+optind+1], "r");
+		idx[i] = bam_index_load(argv[i+optind+1]);
+		if (aux[i]->fp == 0 || idx[i] == 0) {
+			fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
+			return 2;
+		}
+		bgzf_set_cache_size(aux[i]->fp, 20);
+		if (i == 0) h = bam_header_read(aux[0]->fp);
+	}
+	bam_init_header_hash(h);
+	cnt = calloc(n, 8);
+
+	fp = gzopen(argv[optind], "rb");
+	ks = ks_init(fp);
+	n_plp = calloc(n, sizeof(int));
+	plp = calloc(n, sizeof(void*));
+	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+		char *p, *q;
+		int tid, beg, end, pos;
+		bam_mplp_t mplp;
+
+		for (p = q = str.s; *p && *p != '\t'; ++p);
+		if (*p != '\t') goto bed_error;
+		*p = 0; tid = bam_get_tid(h, q); *p = '\t';
+		if (tid < 0) goto bed_error;
+		for (q = p = p + 1; isdigit(*p); ++p);
+		if (*p != '\t') goto bed_error;
+		*p = 0; beg = atoi(q); *p = '\t';
+		for (q = p = p + 1; isdigit(*p); ++p);
+		if (*p == '\t' || *p == 0) {
+			int c = *p;
+			*p = 0; end = atoi(q); *p = c;
+		} else goto bed_error;
+
+		for (i = 0; i < n; ++i) {
+			if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+			aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
+		}
+		mplp = bam_mplp_init(n, read_bam, (void**)aux);
+		bam_mplp_set_maxcnt(mplp, 64000);
+		memset(cnt, 0, 8 * n);
+		while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+			if (pos >= beg && pos < end)
+				for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
+		for (i = 0; i < n; ++i) {
+			kputc('\t', &str);
+			kputl(cnt[i], &str);
+		}
+		puts(str.s);
+		bam_mplp_destroy(mplp);
+		continue;
+
+bed_error:
+		fprintf(stderr, "Errors in BED line '%s'\n", str.s);
+	}
+	free(n_plp); free(plp);
+	ks_destroy(ks);
+	gzclose(fp);
+
+	free(cnt);
+	for (i = 0; i < n; ++i) {
+		if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+		bam_index_destroy(idx[i]);
+		bam_close(aux[i]->fp);
+		free(aux[i]);
+	}
+	bam_header_destroy(h);
+	free(aux); free(idx);
+	free(str.s);
+	return 0;
+}
diff --git a/kstring.h b/kstring.h
index d490f58..abd8236 100644
--- a/kstring.h
+++ b/kstring.h
@@ -142,6 +142,23 @@ static inline int kputuw(unsigned c, kstring_t *s)
 	return 0;
 }
 
+static inline int kputl(long c, kstring_t *s)
+{
+	char buf[32];
+	long l, x;
+	if (c == 0) return kputc('0', s);
+	for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+	if (c < 0) buf[l++] = '-';
+	if (s->l + l + 1 >= s->m) {
+		s->m = s->l + l + 2;
+		kroundup32(s->m);
+		s->s = (char*)realloc(s->s, s->m);
+	}
+	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+	s->s[s->l] = 0;
+	return 0;
+}
+
 static inline int *ksplit(kstring_t *s, int delimiter, int *n)
 {
 	int max = 0, *offsets = 0;