DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
- bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
+ bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o
PROG= razip bgzip samtools
INCLUDES=
LIBS= -lm -lz
bam_tview.o:bam.h faidx.h bam_maqcns.h
bam_maqcns.o:bam.h ksort.h bam_maqcns.h
bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
razf.o:razf.h
glf.o:glf.h
DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF
OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
- bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
+ bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o
PROG= samtools
INCLUDES=
LIBS= -lm -lz
bam_tview.o:bam.h faidx.h bam_maqcns.h
bam_maqcns.o:bam.h ksort.h bam_maqcns.h
bam_sort.o:bam.h ksort.h razf.h
+bam_md.o:bam.h faidx.h
razf.o:razf.h
faidx.o:faidx.h razf.h khash.h
*/
void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
uint8_t *bam_aux_get(bam1_t *b, const char tag[2]);
int32_t bam_aux2i(const uint8_t *s);
float bam_aux2f(const uint8_t *s);
#include "khash.h"
KHASH_MAP_INIT_STR(s, int)
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
+{
+ int ori_len = b->data_len;
+ b->data_len += 3 + len;
+ b->l_aux += 3 + len;
+ if (b->m_data < b->data_len) {
+ b->m_data = b->data_len;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ }
+ b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
+ b->data[ori_len + 2] = type;
+ memcpy(b->data + ori_len + 3, data, len);
+}
+
uint8_t *bam_aux_get(bam1_t *b, const char tag[2])
{
uint8_t *s;
--- /dev/null
+#include <unistd.h>
+#include "faidx.h"
+#include "bam.h"
+#include "kstring.h"
+
+void bam_fillmd1(bam1_t *b, char *ref, int is_equal)
+{
+ uint8_t *seq = bam1_seq(b);
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ int i, x, y, u = 0, has_md = 0;
+ kstring_t *str;
+
+ if (bam_aux_get(b, "MD")) has_md = 1;
+ if (has_md == 1 && !is_equal) return; // no need to add MD
+ str = (kstring_t*)calloc(1, sizeof(kstring_t));
+ if (c->flag & BAM_FUNMAP) return;
+ for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+ int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (op == BAM_CMATCH) {
+ for (j = 0; j < l; ++j) {
+ int z = y + j;
+ int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+ if (ref[x+j] == 0) break; // out of boundary
+ if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
+ if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
+ ++u;
+ } else {
+ ksprintf(str, "%d", u);
+ kputc(ref[x+j], str);
+ u = 0;
+ }
+ }
+ if (j < l) break;
+ x += l; y += l;
+ } else if (op == BAM_CDEL) {
+ ksprintf(str, "%d", u);
+ kputc('^', str);
+ for (j = 0; j < l; ++j) {
+ if (ref[x+j] == 0) break;
+ kputc(ref[x+j], str);
+ }
+ if (j < l) break;
+ x += l;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+ y += l;
+ } else if (op == BAM_CREF_SKIP) {
+ x += l;
+ }
+ }
+ ksprintf(str, "%d", u);
+ if (!has_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
+ free(str->s); free(str);
+}
+
+int bam_fillmd(int argc, char *argv[])
+{
+ int c, is_equal = 0, tid = -1, ret, len;
+ bamFile fp, fpout = 0;
+ bam_header_t *header;
+ faidx_t *fai;
+ char *ref = 0;
+ bam1_t *b;
+
+ while ((c = getopt(argc, argv, "e")) >= 0) {
+ switch (c) {
+ case 'e': is_equal = 1; break;
+ default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
+ }
+ }
+ if (optind + 1 >= argc) {
+ fprintf(stderr, "Usage: bam fillmd [-e] <aln.bam> <ref.fasta>\n");
+ return 1;
+ }
+ fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+ assert(fp);
+ header = bam_header_read(fp);
+ fpout = bam_dopen(fileno(stdout), "w");
+ bam_header_write(fpout, header);
+ fai = fai_load(argv[optind+1]);
+
+ b = bam_init1();
+ while ((ret = bam_read1(fp, b)) >= 0) {
+ if (tid != b->core.tid)
+ ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
+ bam_fillmd1(b, ref, is_equal);
+ bam_write1(fpout, b);
+ }
+ bam_destroy1(b);
+
+ fai_destroy(fai);
+ bam_header_destroy(header);
+ bam_close(fp); bam_close(fpout);
+ return 0;
+}
if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
if (!p->is_del) {
int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
- if (toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+ if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
else c = bam1_strand(p->b)? tolower(c) : toupper(c);
putchar(c);
if (p->indel > 0) {
#include "bam.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.3-2"
+#define PACKAGE_VERSION "0.1.3-3"
#endif
int bam_taf2baf(int argc, char *argv[]);
int bam_mating(int argc, char *argv[]);
int bam_rmdup(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
+int bam_fillmd(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
int glf3_view_main(int argc, char *argv[]);
fprintf(stderr, " rmdup remove PCR duplicates\n");
fprintf(stderr, " glfview print GLFv2 file\n");
fprintf(stderr, " flagstat simple stats\n");
+ fprintf(stderr, " fillmd fill the MD tag and change identical base to =\n");
fprintf(stderr, "\n");
return 1;
}
else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
+ else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1);
#ifndef _NO_CURSES
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
#endif