From f1222861f3824c28d35cd143f5565ca09d80f056 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 May 2009 13:33:22 +0000 Subject: [PATCH] * samtools-0.1.3-11 (r258) * unify the interface to BAM and SAM I/O --- Makefile.generic | 5 +- Makefile.lite | 8 ++- bam.c | 66 ++++++++++++---------- bam.h | 5 +- bam_import.c | 70 ++++++++---------------- bamtk.c | 93 +++---------------------------- sam.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++ sam.h | 30 ++++++++++ sam_view.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 384 insertions(+), 169 deletions(-) create mode 100644 sam.c create mode 100644 sam.h create mode 100644 sam_view.c diff --git a/Makefile.generic b/Makefile.generic index 80bd9e7..1d8e7a5 100644 --- a/Makefile.generic +++ b/Makefile.generic @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o PROG= razip bgzip samtools INCLUDES= LIBS= -lm -lz @@ -43,7 +43,8 @@ bgzip:bgzip.o bgzf.o $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS) razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h +bam.o:bam.h razf.h bam_endian.h kstring.h +sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h diff --git a/Makefile.lite b/Makefile.lite index 90b52e5..229565f 100644 --- a/Makefile.lite +++ b/Makefile.lite @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o PROG= samtools INCLUDES= LIBS= -lm -lz @@ -43,10 +43,11 @@ bgzip:bgzip.o bgzf.o $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS) razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h +bam.o:bam.h razf.h bam_endian.h kstring.h +sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h bam_tview.o:bam.h faidx.h bam_maqcns.h @@ -54,6 +55,7 @@ bam_maqcns.o:bam.h ksort.h bam_maqcns.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h razf.o:razf.h +glf.o:glf.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/bam.c b/bam.c index 1b74c6d..22e09cf 100644 --- a/bam.c +++ b/bam.c @@ -2,6 +2,7 @@ #include #include "bam.h" #include "bam_endian.h" +#include "kstring.h" int bam_is_be = 0; @@ -233,45 +234,54 @@ int bam_write1(bamFile fp, const bam1_t *b) return bam_write1_core(fp, &b->core, b->data_len, b->data); } -void bam_view1(const bam_header_t *header, const bam1_t *b) +char *bam_format1(const bam_header_t *header, const bam1_t *b) { uint8_t *s = bam1_seq(b), *t = bam1_qual(b); int i; const bam1_core_t *c = &b->core; - printf("%s\t%d\t", bam1_qname(b), c->flag); - if (c->tid < 0) printf("*\t"); - else printf("%s\t", header->target_name[c->tid]); - printf("%d\t%d\t", c->pos + 1, c->qual); - if (c->n_cigar == 0) putchar('*'); + kstring_t str; + str.l = str.m = 0; str.s = 0; + + ksprintf(&str, "%s\t%d\t", bam1_qname(b), c->flag); + if (c->tid < 0) kputs("*\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->tid]); + ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); + if (c->n_cigar == 0) kputc('*', &str); else { for (i = 0; i < c->n_cigar; ++i) - printf("%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); + ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); } - putchar('\t'); - if (c->mtid < 0) printf("*\t"); - else if (c->mtid == c->tid) printf("=\t"); - else printf("%s\t", header->target_name[c->mtid]); - printf("%d\t%d\t", c->mpos + 1, c->isize); - for (i = 0; i < c->l_qseq; ++i) putchar(bam_nt16_rev_table[bam1_seqi(s, i)]); - putchar('\t'); - if (t[0] == 0xff) putchar('*'); - else for (i = 0; i < c->l_qseq; ++i) putchar(t[i] + 33); + kputc('\t', &str); + if (c->mtid < 0) kputs("*\t", &str); + else if (c->mtid == c->tid) kputs("=\t", &str); + else ksprintf(&str, "%s\t", header->target_name[c->mtid]); + ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); + for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); + kputc('\t', &str); + if (t[0] == 0xff) kputc('*', &str); + else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; - printf("\t%c%c:", key[0], key[1]); - if (type == 'A') { printf("A:%c", *s); ++s; } - else if (type == 'C') { printf("i:%u", *s); ++s; } - else if (type == 'c') { printf("i:%d", *s); ++s; } - else if (type == 'S') { printf("i:%u", *(uint16_t*)s); s += 2; } - else if (type == 's') { printf("i:%d", *(int16_t*)s); s += 2; } - else if (type == 'I') { printf("i:%u", *(uint32_t*)s); s += 4; } - else if (type == 'i') { printf("i:%d", *(int32_t*)s); s += 4; } - else if (type == 'f') { printf("f:%g", *(float*)s); s += 4; } - else if (type == 'd') { printf("d:%lg", *(double*)s); s += 8; } - else if (type == 'Z' || type == 'H') { printf("%c:", type); while (*s) putchar(*s++); ++s; } + ksprintf(&str, "\t%c%c:", key[0], key[1]); + if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } + else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } + else if (type == 'c') { ksprintf(&str, "i:%d", *s); ++s; } + else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } + else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } + else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } + else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } + else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } + else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } } - putchar('\n'); + return str.s; +} + +void bam_view1(const bam_header_t *header, const bam1_t *b) +{ + char *s = bam_format1(header, b); + printf("%s\n", s); } diff --git a/bam.h b/bam.h index 4045684..1195e75 100644 --- a/bam.h +++ b/bam.h @@ -404,11 +404,12 @@ extern "C" { } while (0) /*! - @abstract Print an alignment to the standard output in TAM format. + @abstract Format a BAM record in the SAM format @param header pointer to the header structure @param b alignment to print + @return a pointer to the SAM string */ - void bam_view1(const bam_header_t *header, const bam1_t *b); + char *bam_format1(const bam_header_t *header, const bam1_t *b); /*! @abstract Merge multiple sorted BAM. diff --git a/bam_import.c b/bam_import.c index 746dc5b..9daa29f 100644 --- a/bam_import.c +++ b/bam_import.c @@ -146,22 +146,27 @@ static inline void append_text(bam_header_t *header, kstring_t *str) } int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) { - int ret, doff, doff0, dret; + int ret, doff, doff0, dret, z = 0; bam1_core_t *c = &b->core; kstring_t *str = fp->str; kstream_t *ks = fp->ks; while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + z += str->l + 1; str->s[str->l] = dret; // note that str->s is NOT null terminated!! append_text(header, str); if (dret != '\n') { ret = ks_getuntil(fp->ks, '\n', str, &dret); + z += str->l + 1; str->s[str->l] = '\n'; // NOT null terminated!! append_text(header, str); } ++fp->n_lines; } - while (ret == 0) ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); // special consideration for "\r\n" + while (ret == 0) { // special consideration for "\r\n" and empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } if (ret < 0) return -1; ++fp->n_lines; doff = 0; @@ -172,10 +177,10 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) doff += c->l_qname; } { // flag, tid, pos, qual - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->flag = atoi(str->s); - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->tid = bam_get_tid(header, str->s); - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->qual = isdigit(str->s[0])? atoi(str->s) : 0; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->flag = atoi(str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0; if (ret < 0) return -2; } { // cigar @@ -184,6 +189,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) long x; c->n_cigar = 0; if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; + z += str->l + 1; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { if (isalpha(*s)) ++c->n_cigar; @@ -210,15 +216,19 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) } else c->bin = bam_reg2bin(c->pos, c->pos + 1); } { // mtid, mpos, isize - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; - ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0; if (ret < 0) return -4; } { // seq and qual int i; uint8_t *p; if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq + z += str->l + 1; c->l_qseq = strlen(str->s); if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); @@ -227,6 +237,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) for (i = 0; i < c->l_qseq; ++i) p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + z += str->l + 1; if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) parse_error(fp->n_lines, "sequence and quality are inconsistent"); p += (c->l_qseq+1)/2; @@ -238,6 +249,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (dret != '\n' && dret != '\r') { // aux while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { uint8_t *s, type, key[2]; + z += str->l + 1; if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') parse_error(fp->n_lines, "missing colon in auxiliary data"); key[0] = str->s[0]; key[1] = str->s[1]; @@ -313,7 +325,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) } b->l_aux = doff - doff0; b->data_len = doff; - return 0; + return z; } tamFile sam_open(const char *fn) @@ -336,41 +348,3 @@ void sam_close(tamFile fp) free(fp); } } - -static void taf2baf_core(const char *fntaf, const char *fnbaf, bam_header_t *header) -{ - bamFile fpbaf; - bam1_t *b; - tamFile fp; - int ret; - - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - fpbaf = (strcmp(fnbaf, "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(fnbaf, "w"); - fp = sam_open(fntaf); - ret = sam_read1(fp, header, b); - bam_header_write(fpbaf, header); - if (ret >= 0) { - bam_write1(fpbaf, b); - while (sam_read1(fp, header, b) >= 0) bam_write1(fpbaf, b); - } - bam_close(fpbaf); - free(b->data); free(b); - sam_close(fp); -} - -int bam_taf2baf(int argc, char *argv[]) -{ - int c; - bam_header_t *header; - - while ((c = getopt(argc, argv, "")) >= 0) { - } - if (optind + 3 > argc) { - fprintf(stderr, "Usage: bamtk import \n"); - return 1; - } - header = sam_header_read2(argv[optind]); - taf2baf_core(argv[optind+1], argv[optind+2], header); - bam_header_destroy(header); - return 0; -} diff --git a/bamtk.c b/bamtk.c index 5987635..a7e75cb 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.3-10 (r257)" +#define PACKAGE_VERSION "0.1.3-11 (r258)" #endif int bam_taf2baf(int argc, char *argv[]); @@ -17,91 +17,12 @@ int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); +int main_samview(int argc, char *argv[]); +int main_import(int argc, char *argv[]); + int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); -static int view_aux(const bam1_t *b, void *data) -{ - bam_view1((bam_header_t*)data, b); - return 0; -} -static int view_auxb(const bam1_t *b, void *data) -{ - bam_write1((bamFile)data, b); - return 0; -} - -int bam_view(int argc, char *argv[]) -{ - bamFile fp, fpout = 0; - bam_header_t *header; - bam1_t *b; - int ret, c, is_bam = 0, is_header = 0, is_headeronly = 0; - while ((c = getopt(argc, argv, "bhH")) >= 0) { - switch (c) { - case 'b': is_bam = 1; break; - case 'h': is_header = 1; break; - case 'H': is_headeronly = 1; break; - default: fprintf(stderr, "Unrecognized option: -%c\n", c); return 1; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: samtools view [-bhH] [ [...]]\n"); - return 1; - } - fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); - header = bam_header_read(fp); - if (header == 0) { - fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); - return 1; - } - if (is_bam) { - assert(fpout = bam_dopen(fileno(stdout), "w")); - bam_header_write(fpout, header); - } - if (is_header || is_headeronly) { - int i, c; - c = header->text[header->l_text-1]; - header->text[header->l_text-1] = 0; - printf("%s", header->text); - if (c) putchar(c); - header->text[header->l_text-1] = c; - for (i = 0; i < header->n_targets; ++i) - printf("@SQ\tSN:%s\tLN:%d\n", header->target_name[i], header->target_len[i]); - if (is_headeronly) { - bam_header_destroy(header); - bam_close(fp); - return 0; - } - } - if (optind + 1 == argc) { - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - while ((ret = bam_read1(fp, b)) >= 0) bam_view1(header, b); - if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret); - free(b->data); free(b); - } else { - int i; - bam_index_t *idx; - idx = bam_index_load(argv[optind]); - for (i = optind + 1; i < argc; ++i) { - int tid, beg, end; - bam_parse_region(header, argv[i], &tid, &beg, &end); - if (tid < 0) { - fprintf(stderr, "[bam_view] fail to get the reference name. Abort!\n"); - return 1; - } - if (is_bam) bam_fetch(fp, idx, tid, beg, end, fpout, view_auxb); - else bam_fetch(fp, idx, tid, beg, end, header, view_aux); - } - bam_index_destroy(idx); - } - bam_header_destroy(header); - bam_close(fp); - if (is_bam) bam_close(fpout); - return 0; -} - int bam_tagview(int argc, char *argv[]) { bamFile fp; @@ -148,7 +69,7 @@ static int usage() fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); fprintf(stderr, "Usage: samtools [options]\n\n"); - fprintf(stderr, "Command: import import from the text format\n"); + fprintf(stderr, "Command: import import from SAM (obsolete; use `view')\n"); fprintf(stderr, " view export to the text format\n"); fprintf(stderr, " sort sort alignment file\n"); fprintf(stderr, " merge merge multiple sorted alignment files\n"); @@ -170,8 +91,8 @@ static int usage() int main(int argc, char *argv[]) { if (argc < 2) return usage(); - if (strcmp(argv[1], "view") == 0) return bam_view(argc-1, argv+1); - else if (strcmp(argv[1], "import") == 0) return bam_taf2baf(argc-1, argv+1); + if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); + else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); diff --git a/sam.c b/sam.c new file mode 100644 index 0000000..a133274 --- /dev/null +++ b/sam.c @@ -0,0 +1,140 @@ +#include +#include "sam.h" + +#define TYPE_BAM 1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + return h; +} + +bam_header_t *bam_header_parse(const char *text) +{ + bam_header_t *h; + int i; + char *s, *p, *q, *r; + + i = strlen(text); + if (i < 3) return 0; // headerless + h = bam_header_init(); + h->l_text = i; + h->text = strdup(text); + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + ++h->n_targets; + s += 3; + } + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + i = 0; + s = h->text; + while ((s = strstr(s, "@SQ")) != 0) { + s += 3; + r = s; + if ((p = strstr(s, "SN:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t'; ++p); + h->target_name[i] = (char*)calloc(p - q + 1, 1); + strncpy(h->target_name[i], q, p - q); + } else goto header_err_ret; + if (r < p) r = p; + if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); + else goto header_err_ret; + if (r < p) r = p; + s = r + 3; + ++i; + } + if (h->n_targets == 0) { + bam_header_destroy(h); + return 0; + } else return h; + +header_err_ret: + fprintf(stderr, "[bam_header_parse] missing SN tag in a @SQ line.\n"); + bam_header_destroy(h); + return 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (mode[0] == 'r') { + const char *fn_list = (const char*)aux; + fp->type |= TYPE_READ; + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + fp->header = bam_header_read(fp->x.bam); + } else { + fp->x.tamr = sam_open(fn); + fp->header = sam_header_read2(fn_list); + } + } else if (mode[0] == 'w') { + fp->header = bam_header_dup((const bam_header_t*)aux); + if (mode[1] == 'b') { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w"); + bam_header_write(fp->x.bam, fp->header); + } else { + int i; + bam_header_t *alt = 0; + alt = bam_header_parse(fp->header->text); + fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; + if (alt) { + if (alt->n_targets != fp->header->n_targets) + fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + bam_header_destroy(alt); + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + } + if (strstr(mode, "h")) // print header + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + } + return fp; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1(fp->header, b); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l + 1; + } +} diff --git a/sam.h b/sam.h new file mode 100644 index 0000000..1e50550 --- /dev/null +++ b/sam.h @@ -0,0 +1,30 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + // mode can be: r/w/rb/wb. On writing, aux points to bam_header_t; on reading, aux points to the name of fn_list for SAM + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + void samclose(samfile_t *fp); + int samread(samfile_t *fp, bam1_t *b); + int samwrite(samfile_t *fp, const bam1_t *b); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sam_view.c b/sam_view.c new file mode 100644 index 0000000..2df0530 --- /dev/null +++ b/sam_view.c @@ -0,0 +1,136 @@ +#include +#include +#include +#include +#include "sam.h" + +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; + +// callback function for bam_fetch() +static int view_func(const bam1_t *b, void *data) +{ + if (b->core.qual < g_min_mapQ) return 0; + if (g_flag_on && (b->core.flag & g_flag_on) == 0) return 0; + if (b->core.flag & g_flag_off) return 0; + samwrite((samfile_t*)data, b); + return 0; +} + +static int usage(void); + +int main_samview(int argc, char *argv[]) +{ + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0; + samfile_t *in, *out; + char in_mode[4], out_mode[4], *fn_out = 0, *fn_list = 0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "bt:hHo:q:f:F:")) >= 0) { + switch (c) { + case 'b': strcat(out_mode, "b"); break; + case 't': fn_list = strdup(optarg); is_bamin = 0; break; + case 'h': is_header = 1; break; + case 'H': is_header_only = 1; break; + case 'o': fn_out = strdup(optarg); break; + case 'f': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_on = strtol(optarg, 0, 0); break; + case 'q': g_min_mapQ = atoi(optarg); break; + default: return usage(); + } + } + if (is_header_only) is_header = 1; + if (is_bamin) strcat(in_mode, "b"); + if (is_header) strcat(out_mode, "h"); + if (argc == optind) return usage(); + + // open file handlers + in = samopen(argv[optind], in_mode, fn_list); + out = samopen(fn_out? fn_out : "-", out_mode, in->header); + if (is_header_only) goto view_end; // no need to print alignments + + if (argc == optind + 1) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = samread(in, b)) >= 0) // read one alignment from `in' + samwrite(out, b); // write the alignment to `out' + if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam_index_t *idx = 0; + if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); + ret = 1; + goto view_end; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end; + bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' + if (tid < 0) { // reference name is not found + fprintf(stderr, "[main_samview] fail to get the reference name. Continue anyway.\n"); + continue; + } + bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments + } + bam_index_destroy(idx); // destroy the BAM index + } + +view_end: + // close files, free and return + free(fn_list); free(fn_out); + samclose(in); + samclose(out); + return ret; +} + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools view [options] | [region1 [...]]\n\n"); + fprintf(stderr, "Options: -b output BAM\n"); + fprintf(stderr, " -h print header for the SAM output\n"); + fprintf(stderr, " -H print header only (no alignments)\n"); + fprintf(stderr, " -t FILE list of reference names and lengths, assuming SAM input [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); + fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); + fprintf(stderr, " -q INT minimum mapping quality [0]\n"); + fprintf(stderr, "\n\ +Notes:\n\ +\n\ + 1. By default, this command assumes the file on the command line is in\n\ + the BAM format and it prints the alignments in SAM. If `-t' is\n\ + applied, the input file is assumed to be in the SAM format. The\n\ + file supplied with `-t' is SPACE/TAB delimited with the first two\n\ + fields of each line consisting of the reference name and the\n\ + corresponding sequence length. The `.fai' file generated by `faidx'\n\ + can be used here. This file may be empty if reads are unaligned.\n\ +\n\ + 2. SAM->BAM conversion: `samtools view -bt ref.fa.fai in.sam.gz'.\n\ +\n\ + 3. BAM->SAM conversion: `samtools view in.bam'.\n\ +\n\ + 4. A region should be presented in one of the following formats:\n\ + `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\ + specified, the input alignment file must be an indexed BAM file.\n\ +\n"); + return 1; +} + +int main_import(int argc, char *argv[]) +{ + int argc2, ret; + char **argv2; + if (argc != 4) { + fprintf(stderr, "Usage: bamtk import \n"); + return 1; + } + argc2 = 6; + argv2 = calloc(6, sizeof(char*)); + argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; + ret = main_samview(argc2, argv2); + free(argv2); + return ret; +} -- 2.39.2