From 63e4e54189bc9e7f43852edf865f95e713b93dc4 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@live.co.uk>
Date: Fri, 24 Apr 2009 15:40:18 +0000
Subject: [PATCH]  * samtools-0.1.3-3 (r240)  * generate MD tag  * generate "="
 bases  * the plain pileup now support "=" bases, but consensus calling and
 glfgen may fail

---
 Makefile      |  3 +-
 Makefile.lite |  3 +-
 bam.h         |  1 +
 bam_aux.c     | 15 ++++++++
 bam_md.c      | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++
 bam_plcmd.c   |  2 +-
 bamtk.c       |  5 ++-
 7 files changed, 120 insertions(+), 4 deletions(-)
 create mode 100644 bam_md.c

diff --git a/Makefile b/Makefile
index d4a4101..25897db 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CXXFLAGS=	$(CFLAGS)
 DFLAGS=		-D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
 OBJS=		bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
 			razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
-			bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
+			bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o
 PROG=		razip bgzip samtools
 INCLUDES=	
 LIBS=		-lm -lz
@@ -52,6 +52,7 @@ bam_lpileup.o:bam.h ksort.h
 bam_tview.o:bam.h faidx.h bam_maqcns.h
 bam_maqcns.o:bam.h ksort.h bam_maqcns.h
 bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
 razf.o:razf.h
 glf.o:glf.h
 
diff --git a/Makefile.lite b/Makefile.lite
index ac2e8de..5ddb284 100644
--- a/Makefile.lite
+++ b/Makefile.lite
@@ -5,7 +5,7 @@ CXXFLAGS=	$(CFLAGS)
 DFLAGS=		-D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF
 OBJS=		bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
 			bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
-			bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
+			bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o
 PROG=		samtools
 INCLUDES=	
 LIBS=		-lm -lz
@@ -52,6 +52,7 @@ bam_lpileup.o:bam.h ksort.h
 bam_tview.o:bam.h faidx.h bam_maqcns.h
 bam_maqcns.o:bam.h ksort.h bam_maqcns.h
 bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
 razf.o:razf.h
 
 faidx.o:faidx.h razf.h khash.h
diff --git a/bam.h b/bam.h
index c0bde4b..92e7a36 100644
--- a/bam.h
+++ b/bam.h
@@ -600,6 +600,7 @@ extern "C" {
 	 */
 	void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
 
+	void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
 	uint8_t *bam_aux_get(bam1_t *b, const char tag[2]);
 	int32_t bam_aux2i(const uint8_t *s);
 	float bam_aux2f(const uint8_t *s);
diff --git a/bam_aux.c b/bam_aux.c
index f9be398..d6f18ea 100644
--- a/bam_aux.c
+++ b/bam_aux.c
@@ -3,6 +3,21 @@
 #include "khash.h"
 KHASH_MAP_INIT_STR(s, int)
 
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+	int ori_len = b->data_len;
+	b->data_len += 3 + len;
+	b->l_aux += 3 + len;
+	if (b->m_data < b->data_len) {
+		b->m_data = b->data_len;
+		kroundup32(b->m_data);
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
+	}
+	b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
+	b->data[ori_len + 2] = type;
+	memcpy(b->data + ori_len + 3, data, len);
+}
+
 uint8_t *bam_aux_get(bam1_t *b, const char tag[2])
 {
 	uint8_t *s;
diff --git a/bam_md.c b/bam_md.c
new file mode 100644
index 0000000..a998f1b
--- /dev/null
+++ b/bam_md.c
@@ -0,0 +1,95 @@
+#include <unistd.h>
+#include "faidx.h"
+#include "bam.h"
+#include "kstring.h"
+
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{
+	uint8_t *seq = bam1_seq(b);
+	uint32_t *cigar = bam1_cigar(b);
+	bam1_core_t *c = &b->core;
+	int i, x, y, u = 0, has_md = 0;
+	kstring_t *str;
+
+	if (bam_aux_get(b, "MD")) has_md = 1;
+	if (has_md == 1 && !is_equal) return; // no need to add MD
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
+	if (c->flag & BAM_FUNMAP) return;
+	for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+		int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+		if (op == BAM_CMATCH) {
+			for (j = 0; j < l; ++j) {
+				int z = y + j;
+				int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+				if (ref[x+j] == 0) break; // out of boundary
+				if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
+					if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+					++u;
+				} else {
+					ksprintf(str, "%d", u);
+					kputc(ref[x+j], str);
+					u = 0;
+				}
+			}
+			if (j < l) break;
+			x += l; y += l;
+		} else if (op == BAM_CDEL) {
+			ksprintf(str, "%d", u);
+			kputc('^', str);
+			for (j = 0; j < l; ++j) {
+				if (ref[x+j] == 0) break;
+				kputc(ref[x+j], str);
+			}
+			if (j < l) break;
+			x += l;
+		} else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+			y += l;
+		} else if (op == BAM_CREF_SKIP) {
+			x += l;
+		}
+	}
+	ksprintf(str, "%d", u);
+	if (!has_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+	free(str->s); free(str);
+}
+
+int bam_fillmd(int argc, char *argv[])
+{
+	int c, is_equal = 0, tid = -1, ret, len;
+	bamFile fp, fpout = 0;
+	bam_header_t *header;
+	faidx_t *fai;
+	char *ref = 0;
+	bam1_t *b;
+
+	while ((c = getopt(argc, argv, "e")) >= 0) {
+		switch (c) {
+		case 'e': is_equal = 1; break;
+		default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+		}
+	}
+	if (optind + 1 >= argc) {
+		fprintf(stderr, "Usage: bam fillmd [-e] <aln.bam> <ref.fasta>\n");
+		return 1;
+	}
+	fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+	assert(fp);
+	header = bam_header_read(fp);
+	fpout = bam_dopen(fileno(stdout), "w");
+	bam_header_write(fpout, header);
+	fai = fai_load(argv[optind+1]);
+
+	b = bam_init1();
+	while ((ret = bam_read1(fp, b)) >= 0) {
+		if (tid != b->core.tid)
+			ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
+		bam_fillmd1(b, ref, is_equal);
+		bam_write1(fpout, b);
+	}
+	bam_destroy1(b);
+
+	fai_destroy(fai);
+	bam_header_destroy(header);
+	bam_close(fp); bam_close(fpout);
+	return 0;
+}
diff --git a/bam_plcmd.c b/bam_plcmd.c
index 38680df..b1ffdf9 100644
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -195,7 +195,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
 		if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
 		if (!p->is_del) {
 			int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
-			if (toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+			if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
 			else c = bam1_strand(p->b)? tolower(c) : toupper(c);
 			putchar(c);
 			if (p->indel > 0) {
diff --git a/bamtk.c b/bamtk.c
index f12f900..a9daa52 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.3-2"
+#define PACKAGE_VERSION "0.1.3-3"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
@@ -15,6 +15,7 @@ int bam_tview_main(int argc, char *argv[]);
 int bam_mating(int argc, char *argv[]);
 int bam_rmdup(int argc, char *argv[]);
 int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
 
 int faidx_main(int argc, char *argv[]);
 int glf3_view_main(int argc, char *argv[]);
@@ -161,6 +162,7 @@ static int usage()
 	fprintf(stderr, "         rmdup       remove PCR duplicates\n");
 	fprintf(stderr, "         glfview     print GLFv2 file\n");
 	fprintf(stderr, "         flagstat    simple stats\n");
+	fprintf(stderr, "         fillmd      fill the MD tag and change identical base to =\n");
 	fprintf(stderr, "\n");
 	return 1;
 }
@@ -180,6 +182,7 @@ int main(int argc, char *argv[])
 	else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
 	else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
 	else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
+	else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
 #ifndef _NO_CURSES
 	else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
 #endif
-- 
2.39.5