* made pileup take a list of proposed indels. An insertion is N at the moment.
* added my kstring library for a bit complex parsing of the position list.
DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
- bam_mate.o bam_rmdup.o glf.o bam_stat.o
+ bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
PROG= razip bgzip samtools
INCLUDES=
LIBS= -lm -lz
uint64_t n_lines;
};
-char **bam_load_pos(const char *fn, int *_n)
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
{
char **list = 0, *s;
- int n = 0, dret, m = 0, c;
+ int n = 0, dret, m = 0;
gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
kstream_t *ks;
kstring_t *str;
str = (kstring_t*)calloc(1, sizeof(kstring_t));
ks = ks_init(fp);
- while (ks_getuntil(ks, 0, str, &dret) > 0) {
+ while (ks_getuntil(ks, '\n', str, &dret) > 0) {
if (n == m) {
m = m? m << 1 : 16;
list = (char**)realloc(list, m * sizeof(char*));
}
- s = list[n++] = (char*)calloc(str->l + 5, 1);
+ if (str->s[str->l-1] == '\r')
+ str->s[--str->l] = '\0';
+ s = list[n++] = (char*)calloc(str->l + 1, 1);
strcpy(s, str->s);
- s += str->l + 1;
- ks_getuntil(ks, 0, str, &dret);
- *((uint32_t*)s) = atoi(str->s);
- if (dret != '\n')
- while ((c = ks_getc(fp)) >= 0 && c != '\n');
}
ks_destroy(ks);
+ gzclose(fp);
free(str->s); free(str);
*_n = n;
return list;
#define MINUS_CONST 0x10000000
-bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref)
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+ int _n_types, int *_types)
{
int i, j, n_types, *types, left, right;
bam_maqindel_ret_t *ret = 0;
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+ // if there is no proposed indel, check if there is an indel from the alignment
+ if (_n_types == 0) {
+ for (i = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+ }
+ if (i == n) return 0; // no indel
}
- if (i == n) return 0; // no indel
{ // calculate how many types of indels are available (set n_types and types)
int m;
uint32_t *aux;
- aux = (uint32_t*)calloc(n+1, 4);
+ aux = (uint32_t*)calloc(n + _n_types + 1, 4);
m = 0;
aux[m++] = MINUS_CONST; // zero indel is always a type
for (i = 0; i < n; ++i) {
if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
aux[m++] = MINUS_CONST + p->indel;
}
+ if (_n_types) // then also add this to aux[]
+ for (i = 0; i < _n_types; ++i)
+ if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
ks_introsort(uint32_t, m, aux);
- n_types = 1;
- for (i = 1; i < m; ++i)
+ // squeeze out identical types
+ for (i = 1, n_types = 1; i < m; ++i)
if (aux[i] != aux[i-1]) ++n_types;
types = (int*)calloc(n_types, sizeof(int));
j = 0;
uint32_t glf2cns(const glf1_t *g, int q_r);
bam_maqindel_opt_t *bam_maqindel_opt_init();
- bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref);
+ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+ int _n_types, int *_types);
void bam_maqindel_ret_destroy(bam_maqindel_ret_t*);
#ifdef __cplusplus
#include "bam_maqcns.h"
#include "khash.h"
#include "glf.h"
-KHASH_SET_INIT_INT64(64)
+#include "kstring.h"
+
+typedef int *indel_list_t;
+KHASH_MAP_INIT_INT64(64, indel_list_t)
#define BAM_PLF_SIMPLE 0x01
#define BAM_PLF_CNS 0x02
glfFile fp; // for glf output only
} pu_data_t;
-char **bam_load_pos(const char *fn, int *_n);
+char **__bam_get_lines(const char *fn, int *_n);
void bam_init_header_hash(bam_header_t *header);
int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
{
- int n, tmp, i;
- char **list, *s;
- uint64_t x;
+ char **list;
+ int i, j, n, *fields, max_fields;
khash_t(64) *hash;
bam_init_header_hash(h);
- list = bam_load_pos(fn, &n);
+ list = __bam_get_lines(fn, &n);
hash = kh_init(64);
+ max_fields = 0; fields = 0;
for (i = 0; i < n; ++i) {
- x = (uint64_t)bam_get_tid(h, list[i]) << 32;
- s = list[i];
- while (*s++);
- x |= *((uint32_t*)s) - 1;
- kh_put(64, hash, x, &tmp);
- free(list[i]);
+ char *str = list[i];
+ int chr, n_fields, ret;
+ khint_t k;
+ uint64_t x;
+ n_fields = ksplit_core(str, 0, &max_fields, &fields);
+ if (n_fields < 2) continue;
+ chr = bam_get_tid(h, str + fields[0]);
+ if (chr < 0) {
+ fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]);
+ continue;
+ }
+ x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1);
+ k = kh_put(64, hash, x, &ret);
+ if (ret == 0) {
+ fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]);
+ continue;
+ }
+ kh_val(hash, k) = 0;
+ if (n_fields > 2) {
+ // count
+ for (j = 2; j < n_fields; ++j) {
+ char *s = str + fields[j];
+ if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
+ }
+ if (j > 2) { // update kh_val()
+ int *q, y, z;
+ q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
+ q[0] = j - 2; z = j; y = 1;
+ for (j = 2; j < z; ++j)
+ q[y++] = atoi(str + fields[j]);
+ }
+ }
+ free(str);
}
- free(list);
+ free(list); free(fields);
return hash;
}
{
pu_data_t *d = (pu_data_t*)data;
bam_maqindel_ret_t *r = 0;
- int rb;
+ int rb, *proposed_indels = 0;
glf1_t *g;
glf3_t *g3;
+
if (d->fai == 0) {
fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n");
exit(1);
}
- if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+ if (d->hash) { // only output a list of sites
+ khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+ if (k == kh_end(d->hash)) return 0;
+ proposed_indels = kh_val(d->hash, k);
+ }
g3 = glf3_init1();
if (d->fai && (int)tid != d->tid) {
if (d->ref) { // then write the end mark
g3->offset = pos - d->last_pos;
d->last_pos = pos;
glf3_write1(d->fp, g3);
- r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+ if (proposed_indels)
+ r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+ else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
if (r) { // then write indel line
int het = 3 * n, min;
min = het;
{
pu_data_t *d = (pu_data_t*)data;
bam_maqindel_ret_t *r = 0;
- int i, j, rb, max_mapq = 0;
+ int i, j, rb, max_mapq = 0, *proposed_indels = 0;
uint32_t x;
- if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+
if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
- if (d->fai && (int)tid != d->tid) {
+ if (d->hash) { // only output a list of sites
+ khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+ if (k == kh_end(d->hash)) return 0;
+ proposed_indels = kh_val(d->hash, k);
+ }
+ if (d->fai && (int)tid != d->tid) { // then update d->ref
free(d->ref);
d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
d->tid = tid;
}
printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff);
}
- if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) {
- if (d->ref) // indel calling
- r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+ if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) {
+ if (proposed_indels)
+ r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+ else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
}
// pileup strings
printf("%d\t", n);
glf3_header_write(d->fp, h);
glf3_header_destroy(h);
}
- if (fn_list) {
+ if (d->fai == 0)
+ fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+ if (fn_list) { // the input is SAM
tamFile fp;
bam1_t *b;
int ret;
bam_plbuf_destroy(buf);
bam_destroy1(b);
sam_close(fp);
- } else {
+ } else { // the input is BAM
bamFile fp;
fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
d->h = bam_header_read(fp);
bam_close(fp);
}
if (d->format & BAM_PLF_GLF) bgzf_close(d->fp);
+ if (fn_pos) { // free the hash table
+ khint_t k;
+ for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
+ if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
+ kh_destroy(64, d->hash);
+ }
free(fn_pos); free(fn_list);
- kh_destroy(64, d->hash);
bam_header_destroy(d->h);
if (d->fai) fai_destroy(d->fai);
bam_maqcns_destroy(d->c);
#include "bam.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.2-15"
+#define PACKAGE_VERSION "0.1.2-16"
#endif
int bam_taf2baf(int argc, char *argv[]);
return (int)ks->buf[ks->begin++]; \
}
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
typedef struct __kstring_t {
size_t l, m;
char *s;
} kstring_t;
+#endif
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
--- /dev/null
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "kstring.h"
+
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+ va_list ap;
+ int l;
+ va_start(ap, fmt);
+ l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+ va_end(ap);
+ if (l + 1 > s->m - s->l) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ va_start(ap, fmt);
+ l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+ }
+ va_end(ap);
+ s->l += l;
+ return l;
+}
+
+// s MUST BE a null terminated string; l = strlen(s)
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+ int i, n, max, last_char, last_start, *offsets, l;
+ n = 0; max = *_max; offsets = *_offsets;
+ l = strlen(s);
+
+#define __ksplit_aux do { \
+ if (_offsets) { \
+ s[i] = 0; \
+ if (n == max) { \
+ max = max? max<<1 : 2; \
+ offsets = (int*)realloc(offsets, sizeof(int) * max); \
+ } \
+ offsets[n++] = last_start; \
+ } else ++n; \
+ } while (0)
+
+ for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+ if (delimiter == 0) {
+ if (isspace(s[i]) || s[i] == 0) {
+ if (isgraph(last_char)) __ksplit_aux; // the end of a field
+ } else {
+ if (isspace(last_char) || last_char == 0) last_start = i;
+ }
+ } else {
+ if (s[i] == delimiter || s[i] == 0) {
+ if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+ } else {
+ if (last_char == delimiter || last_char == 0) last_start = i;
+ }
+ }
+ last_char = s[i];
+ }
+ *_max = max; *_offsets = offsets;
+ return n;
+}
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main()
+{
+ kstring_t *s;
+ int *fields, n, i;
+ s = (kstring_t*)calloc(1, sizeof(kstring_t));
+ // test ksprintf()
+ ksprintf(s, " abcdefg: %d ", 100);
+ printf("'%s'\n", s->s);
+ // test ksplit()
+ fields = ksplit(s, 0, &n);
+ for (i = 0; i < n; ++i)
+ printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+ free(s);
+ return 0;
+}
+#endif
--- /dev/null
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+ size_t l, m;
+ char *s;
+} kstring_t;
+#endif
+
+int ksprintf(kstring_t *s, const char *fmt, ...);
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+
+static inline int kputs(const char *p, kstring_t *s)
+{
+ int l = strlen(p);
+ if (s->l + l + 1 >= s->m) {
+ s->m = s->l + l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ strcpy(s->s + s->l, p);
+ s->l += l;
+ return l;
+}
+
+static inline int kputc(int c, kstring_t *s)
+{
+ if (s->l + 1 >= s->m) {
+ s->m = s->l + 2;
+ kroundup32(s->m);
+ s->s = (char*)realloc(s->s, s->m);
+ }
+ s->s[s->l++] = c;
+ s->s[s->l] = 0;
+ return c;
+}
+
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+ int max = 0, *offsets = 0;
+ *n = ksplit_core(s->s, delimiter, &max, &offsets);
+ return offsets;
+}
+
+#endif