From 42e756837f7acb91bfd1134ef16d423e71c913d7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 24 Mar 2009 10:30:23 +0000 Subject: [PATCH] * samtools-0.1.2-16 * made pileup take a list of proposed indels. An insertion is N at the moment. * added my kstring library for a bit complex parsing of the position list. --- Makefile | 2 +- bam_import.c | 16 ++++----- bam_maqcns.c | 23 +++++++----- bam_maqcns.h | 3 +- bam_plcmd.c | 100 ++++++++++++++++++++++++++++++++++++++------------- bamtk.c | 2 +- kseq.h | 3 ++ kstring.c | 81 +++++++++++++++++++++++++++++++++++++++++ kstring.h | 54 ++++++++++++++++++++++++++++ 9 files changed, 239 insertions(+), 45 deletions(-) create mode 100644 kstring.c create mode 100644 kstring.h diff --git a/Makefile b/Makefile index 6b60a7b..d4a4101 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o PROG= razip bgzip samtools INCLUDES= LIBS= -lm -lz diff --git a/bam_import.c b/bam_import.c index 7742d2f..6697a9f 100644 --- a/bam_import.c +++ b/bam_import.c @@ -44,29 +44,27 @@ struct __tamFile_t { uint64_t n_lines; }; -char **bam_load_pos(const char *fn, int *_n) +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only { char **list = 0, *s; - int n = 0, dret, m = 0, c; + int n = 0, dret, m = 0; gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); kstream_t *ks; kstring_t *str; str = (kstring_t*)calloc(1, sizeof(kstring_t)); ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) > 0) { + while (ks_getuntil(ks, '\n', str, &dret) > 0) { if (n == m) { m = m? m << 1 : 16; list = (char**)realloc(list, m * sizeof(char*)); } - s = list[n++] = (char*)calloc(str->l + 5, 1); + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); strcpy(s, str->s); - s += str->l + 1; - ks_getuntil(ks, 0, str, &dret); - *((uint32_t*)s) = atoi(str->s); - if (dret != '\n') - while ((c = ks_getc(fp)) >= 0 && c != '\n'); } ks_destroy(ks); + gzclose(fp); free(str->s); free(str); *_n = n; return list; diff --git a/bam_maqcns.c b/bam_maqcns.c index ec11f3e..6b2b5e0 100644 --- a/bam_maqcns.c +++ b/bam_maqcns.c @@ -283,19 +283,23 @@ void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) #define MINUS_CONST 0x10000000 -bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref) +bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types) { int i, j, n_types, *types, left, right; bam_maqindel_ret_t *ret = 0; - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + // if there is no proposed indel, check if there is an indel from the alignment + if (_n_types == 0) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; + } + if (i == n) return 0; // no indel } - if (i == n) return 0; // no indel { // calculate how many types of indels are available (set n_types and types) int m; uint32_t *aux; - aux = (uint32_t*)calloc(n+1, 4); + aux = (uint32_t*)calloc(n + _n_types + 1, 4); m = 0; aux[m++] = MINUS_CONST; // zero indel is always a type for (i = 0; i < n; ++i) { @@ -303,9 +307,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) aux[m++] = MINUS_CONST + p->indel; } + if (_n_types) // then also add this to aux[] + for (i = 0; i < _n_types; ++i) + if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; ks_introsort(uint32_t, m, aux); - n_types = 1; - for (i = 1; i < m; ++i) + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; types = (int*)calloc(n_types, sizeof(int)); j = 0; diff --git a/bam_maqcns.h b/bam_maqcns.h index 2c94fec..56fa9db 100644 --- a/bam_maqcns.h +++ b/bam_maqcns.h @@ -44,7 +44,8 @@ extern "C" { uint32_t glf2cns(const glf1_t *g, int q_r); bam_maqindel_opt_t *bam_maqindel_opt_init(); - bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref); + bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, + int _n_types, int *_types); void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); #ifdef __cplusplus diff --git a/bam_plcmd.c b/bam_plcmd.c index 98a1e49..79d0e63 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -6,7 +6,10 @@ #include "bam_maqcns.h" #include "khash.h" #include "glf.h" -KHASH_SET_INIT_INT64(64) +#include "kstring.h" + +typedef int *indel_list_t; +KHASH_MAP_INIT_INT64(64, indel_list_t) #define BAM_PLF_SIMPLE 0x01 #define BAM_PLF_CNS 0x02 @@ -26,28 +29,55 @@ typedef struct { glfFile fp; // for glf output only } pu_data_t; -char **bam_load_pos(const char *fn, int *_n); +char **__bam_get_lines(const char *fn, int *_n); void bam_init_header_hash(bam_header_t *header); int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); static khash_t(64) *load_pos(const char *fn, bam_header_t *h) { - int n, tmp, i; - char **list, *s; - uint64_t x; + char **list; + int i, j, n, *fields, max_fields; khash_t(64) *hash; bam_init_header_hash(h); - list = bam_load_pos(fn, &n); + list = __bam_get_lines(fn, &n); hash = kh_init(64); + max_fields = 0; fields = 0; for (i = 0; i < n; ++i) { - x = (uint64_t)bam_get_tid(h, list[i]) << 32; - s = list[i]; - while (*s++); - x |= *((uint32_t*)s) - 1; - kh_put(64, hash, x, &tmp); - free(list[i]); + char *str = list[i]; + int chr, n_fields, ret; + khint_t k; + uint64_t x; + n_fields = ksplit_core(str, 0, &max_fields, &fields); + if (n_fields < 2) continue; + chr = bam_get_tid(h, str + fields[0]); + if (chr < 0) { + fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]); + continue; + } + x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); + k = kh_put(64, hash, x, &ret); + if (ret == 0) { + fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]); + continue; + } + kh_val(hash, k) = 0; + if (n_fields > 2) { + // count + for (j = 2; j < n_fields; ++j) { + char *s = str + fields[j]; + if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; + } + if (j > 2) { // update kh_val() + int *q, y, z; + q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); + q[0] = j - 2; z = j; y = 1; + for (j = 2; j < z; ++j) + q[y++] = atoi(str + fields[j]); + } + } + free(str); } - free(list); + free(list); free(fields); return hash; } @@ -56,14 +86,19 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, { pu_data_t *d = (pu_data_t*)data; bam_maqindel_ret_t *r = 0; - int rb; + int rb, *proposed_indels = 0; glf1_t *g; glf3_t *g3; + if (d->fai == 0) { fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); exit(1); } - if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; + if (d->hash) { // only output a list of sites + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } g3 = glf3_init1(); if (d->fai && (int)tid != d->tid) { if (d->ref) { // then write the end mark @@ -83,7 +118,9 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, g3->offset = pos - d->last_pos; d->last_pos = pos; glf3_write1(d->fp, g3); - r = bam_maqindel(n, pos, d->ido, pu, d->ref); + if (proposed_indels) + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); if (r) { // then write indel line int het = 3 * n, min; min = het; @@ -114,11 +151,16 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p { pu_data_t *d = (pu_data_t*)data; bam_maqindel_ret_t *r = 0; - int i, j, rb, max_mapq = 0; + int i, j, rb, max_mapq = 0, *proposed_indels = 0; uint32_t x; - if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; + if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); - if (d->fai && (int)tid != d->tid) { + if (d->hash) { // only output a list of sites + khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); + if (k == kh_end(d->hash)) return 0; + proposed_indels = kh_val(d->hash, k); + } + if (d->fai && (int)tid != d->tid) { // then update d->ref free(d->ref); d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); d->tid = tid; @@ -140,9 +182,10 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff); } - if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) { - if (d->ref) // indel calling - r = bam_maqindel(n, pos, d->ido, pu, d->ref); + if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) { + if (proposed_indels) + r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); + else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0); } // pileup strings printf("%d\t", n); @@ -257,7 +300,9 @@ int bam_pileup(int argc, char *argv[]) glf3_header_write(d->fp, h); glf3_header_destroy(h); } - if (fn_list) { + if (d->fai == 0) + fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n"); + if (fn_list) { // the input is SAM tamFile fp; bam1_t *b; int ret; @@ -273,7 +318,7 @@ int bam_pileup(int argc, char *argv[]) bam_plbuf_destroy(buf); bam_destroy1(b); sam_close(fp); - } else { + } else { // the input is BAM bamFile fp; fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r"); d->h = bam_header_read(fp); @@ -282,8 +327,13 @@ int bam_pileup(int argc, char *argv[]) bam_close(fp); } if (d->format & BAM_PLF_GLF) bgzf_close(d->fp); + if (fn_pos) { // free the hash table + khint_t k; + for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k) + if (kh_exist(d->hash, k)) free(kh_val(d->hash, k)); + kh_destroy(64, d->hash); + } free(fn_pos); free(fn_list); - kh_destroy(64, d->hash); bam_header_destroy(d->h); if (d->fai) fai_destroy(d->fai); bam_maqcns_destroy(d->c); diff --git a/bamtk.c b/bamtk.c index a486353..be37b63 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.2-15" +#define PACKAGE_VERSION "0.1.2-16" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/kseq.h b/kseq.h index 25f31a3..2875c77 100644 --- a/kseq.h +++ b/kseq.h @@ -71,10 +71,13 @@ return (int)ks->buf[ks->begin++]; \ } +#ifndef KSTRING_T +#define KSTRING_T kstring_t typedef struct __kstring_t { size_t l, m; char *s; } kstring_t; +#endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..dc20fae --- /dev/null +++ b/kstring.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + int *fields, n, i; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + free(s); + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..47d63f7 --- /dev/null +++ b/kstring.h @@ -0,0 +1,54 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +int ksprintf(kstring_t *s, const char *fmt, ...); +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + +static inline int kputs(const char *p, kstring_t *s) +{ + int l = strlen(p); + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + strcpy(s->s + s->l, p); + s->l += l; + return l; +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +static inline int *ksplit(kstring_t *s, int delimiter, int *n) +{ + int max = 0, *offsets = 0; + *n = ksplit_core(s->s, delimiter, &max, &offsets); + return offsets; +} + +#endif -- 2.39.2