]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.2-16
authorHeng Li <lh3@live.co.uk>
Tue, 24 Mar 2009 10:30:23 +0000 (10:30 +0000)
committerHeng Li <lh3@live.co.uk>
Tue, 24 Mar 2009 10:30:23 +0000 (10:30 +0000)
 * made pileup take a list of proposed indels. An insertion is N at the moment.
 * added my kstring library for a bit complex parsing of the position list.

Makefile
bam_import.c
bam_maqcns.c
bam_maqcns.h
bam_plcmd.c
bamtk.c
kseq.h
kstring.c [new file with mode: 0644]
kstring.h [new file with mode: 0644]

index 6b60a7b5ab4efaf326bcc8c8e7d5064790d3c8b8..d4a4101db733bf09dee97f237c74d3742b67d72c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CXXFLAGS=       $(CFLAGS)
 DFLAGS=                -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
 OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
                        razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
-                       bam_mate.o bam_rmdup.o glf.o bam_stat.o
+                       bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o
 PROG=          razip bgzip samtools
 INCLUDES=      
 LIBS=          -lm -lz
index 7742d2fbe2c17eacdcc131bcda256a93618e8d01..6697a9ffb4e89e4d67e00e8a5917c1c71ea3b405 100644 (file)
@@ -44,29 +44,27 @@ struct __tamFile_t {
        uint64_t n_lines;
 };
 
-char **bam_load_pos(const char *fn, int *_n)
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
 {
        char **list = 0, *s;
-       int n = 0, dret, m = 0, c;
+       int n = 0, dret, m = 0;
        gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
        kstream_t *ks;
        kstring_t *str;
        str = (kstring_t*)calloc(1, sizeof(kstring_t));
        ks = ks_init(fp);
-       while (ks_getuntil(ks, 0, str, &dret) > 0) {
+       while (ks_getuntil(ks, '\n', str, &dret) > 0) {
                if (n == m) {
                        m = m? m << 1 : 16;
                        list = (char**)realloc(list, m * sizeof(char*));
                }
-               s = list[n++] = (char*)calloc(str->l + 5, 1);
+               if (str->s[str->l-1] == '\r')
+                       str->s[--str->l] = '\0';
+               s = list[n++] = (char*)calloc(str->l + 1, 1);
                strcpy(s, str->s);
-               s += str->l + 1;
-               ks_getuntil(ks, 0, str, &dret);
-               *((uint32_t*)s) = atoi(str->s);
-               if (dret != '\n')
-                       while ((c = ks_getc(fp)) >= 0 && c != '\n');
        }
        ks_destroy(ks);
+       gzclose(fp);
        free(str->s); free(str);
        *_n = n;
        return list;
index ec11f3ede314d025713e9b268136d0172dc5e209..6b2b5e03260da470801ce70b8ff18f788c8fd880 100644 (file)
@@ -283,19 +283,23 @@ void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
 
 #define MINUS_CONST 0x10000000
 
-bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref)
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                int _n_types, int *_types)
 {
        int i, j, n_types, *types, left, right;
        bam_maqindel_ret_t *ret = 0;
-       for (i = 0; i < n; ++i) {
-               const bam_pileup1_t *p = pl + i;
-               if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+       // if there is no proposed indel, check if there is an indel from the alignment
+       if (_n_types == 0) {
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+               }
+               if (i == n) return 0; // no indel
        }
-       if (i == n) return 0; // no indel
        { // calculate how many types of indels are available (set n_types and types)
                int m;
                uint32_t *aux;
-               aux = (uint32_t*)calloc(n+1, 4);
+               aux = (uint32_t*)calloc(n + _n_types + 1, 4);
                m = 0;
                aux[m++] = MINUS_CONST; // zero indel is always a type
                for (i = 0; i < n; ++i) {
@@ -303,9 +307,12 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
                                aux[m++] = MINUS_CONST + p->indel;
                }
+               if (_n_types) // then also add this to aux[]
+                       for (i = 0; i < _n_types; ++i)
+                               if (_types[i]) aux[m++] = MINUS_CONST + _types[i];
                ks_introsort(uint32_t, m, aux);
-               n_types = 1;
-               for (i = 1; i < m; ++i)
+               // squeeze out identical types
+               for (i = 1, n_types = 1; i < m; ++i)
                        if (aux[i] != aux[i-1]) ++n_types;
                types = (int*)calloc(n_types, sizeof(int));
                j = 0;
index 2c94fec066b25552b6008f28b8c129f5672cccc5..56fa9db3a907f3087d6e4d114d54623a87491634 100644 (file)
@@ -44,7 +44,8 @@ extern "C" {
        uint32_t glf2cns(const glf1_t *g, int q_r);
 
        bam_maqindel_opt_t *bam_maqindel_opt_init();
-       bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref);
+       bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref,
+                                                                        int _n_types, int *_types);
        void bam_maqindel_ret_destroy(bam_maqindel_ret_t*);
 
 #ifdef __cplusplus
index 98a1e493e2cd824d55d240f588bd2aaa5b067f6f..79d0e63b8e75a8c6d55c19f6433d9d4c42d159cc 100644 (file)
@@ -6,7 +6,10 @@
 #include "bam_maqcns.h"
 #include "khash.h"
 #include "glf.h"
-KHASH_SET_INIT_INT64(64)
+#include "kstring.h"
+
+typedef int *indel_list_t;
+KHASH_MAP_INIT_INT64(64, indel_list_t)
 
 #define BAM_PLF_SIMPLE     0x01
 #define BAM_PLF_CNS        0x02
@@ -26,28 +29,55 @@ typedef struct {
        glfFile fp; // for glf output only
 } pu_data_t;
 
-char **bam_load_pos(const char *fn, int *_n);
+char **__bam_get_lines(const char *fn, int *_n);
 void bam_init_header_hash(bam_header_t *header);
 int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
 
 static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
 {
-       int n, tmp, i;
-       char **list, *s;
-       uint64_t x;
+       char **list;
+       int i, j, n, *fields, max_fields;
        khash_t(64) *hash;
        bam_init_header_hash(h);
-       list = bam_load_pos(fn, &n);
+       list = __bam_get_lines(fn, &n);
        hash = kh_init(64);
+       max_fields = 0; fields = 0;
        for (i = 0; i < n; ++i) {
-               x = (uint64_t)bam_get_tid(h, list[i]) << 32;
-               s = list[i];
-               while (*s++);
-               x |= *((uint32_t*)s) - 1;
-               kh_put(64, hash, x, &tmp);
-               free(list[i]);
+               char *str = list[i];
+               int chr, n_fields, ret;
+               khint_t k;
+               uint64_t x;
+               n_fields = ksplit_core(str, 0, &max_fields, &fields);
+               if (n_fields < 2) continue;
+               chr = bam_get_tid(h, str + fields[0]);
+               if (chr < 0) {
+                       fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]);
+                       continue;
+               }
+               x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1);
+               k = kh_put(64, hash, x, &ret);
+               if (ret == 0) {
+                       fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], str+fields[1]);
+                       continue;
+               }
+               kh_val(hash, k) = 0;
+               if (n_fields > 2) {
+                       // count
+                       for (j = 2; j < n_fields; ++j) {
+                               char *s = str + fields[j];
+                               if ((*s != '+' && *s != '-') || !isdigit(s[1])) break;
+                       }
+                       if (j > 2) { // update kh_val()
+                               int *q, y, z;
+                               q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int));
+                               q[0] = j - 2; z = j; y = 1;
+                               for (j = 2; j < z; ++j)
+                                       q[y++] = atoi(str + fields[j]);
+                       }
+               }
+               free(str);
        }
-       free(list);
+       free(list); free(fields);
        return hash;
 }
 
@@ -56,14 +86,19 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu,
 {
        pu_data_t *d = (pu_data_t*)data;
        bam_maqindel_ret_t *r = 0;
-       int rb;
+       int rb, *proposed_indels = 0;
        glf1_t *g;
        glf3_t *g3;
+
        if (d->fai == 0) {
                fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n");
                exit(1);
        }
-       if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+       if (d->hash) { // only output a list of sites
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
        g3 = glf3_init1();
        if (d->fai && (int)tid != d->tid) {
                if (d->ref) { // then write the end mark
@@ -83,7 +118,9 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu,
        g3->offset = pos - d->last_pos;
        d->last_pos = pos;
        glf3_write1(d->fp, g3);
-       r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+       if (proposed_indels)
+               r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+       else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
        if (r) { // then write indel line
                int het = 3 * n, min;
                min = het;
@@ -114,11 +151,16 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
 {
        pu_data_t *d = (pu_data_t*)data;
        bam_maqindel_ret_t *r = 0;
-       int i, j, rb, max_mapq = 0;
+       int i, j, rb, max_mapq = 0, *proposed_indels = 0;
        uint32_t x;
-       if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+
        if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data);
-       if (d->fai && (int)tid != d->tid) {
+       if (d->hash) { // only output a list of sites
+               khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos);
+               if (k == kh_end(d->hash)) return 0;
+               proposed_indels = kh_val(d->hash, k);
+       }
+       if (d->fai && (int)tid != d->tid) { // then update d->ref
                free(d->ref);
                d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
                d->tid = tid;
@@ -140,9 +182,10 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
                }
                printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff);
        }
-       if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) {
-               if (d->ref) // indel calling
-                       r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+       if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref) {
+               if (proposed_indels)
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1);
+               else r = bam_maqindel(n, pos, d->ido, pu, d->ref, 0, 0);
        }
        // pileup strings
        printf("%d\t", n);
@@ -257,7 +300,9 @@ int bam_pileup(int argc, char *argv[])
                glf3_header_write(d->fp, h);
                glf3_header_destroy(h);
        }
-       if (fn_list) {
+       if (d->fai == 0)
+               fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n");
+       if (fn_list) { // the input is SAM
                tamFile fp;
                bam1_t *b;
                int ret;
@@ -273,7 +318,7 @@ int bam_pileup(int argc, char *argv[])
                bam_plbuf_destroy(buf);
                bam_destroy1(b);
                sam_close(fp);
-       } else {
+       } else { // the input is BAM
                bamFile fp;
                fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
                d->h = bam_header_read(fp);
@@ -282,8 +327,13 @@ int bam_pileup(int argc, char *argv[])
                bam_close(fp);
        }
        if (d->format & BAM_PLF_GLF) bgzf_close(d->fp);
+       if (fn_pos) { // free the hash table
+               khint_t k;
+               for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k)
+                       if (kh_exist(d->hash, k)) free(kh_val(d->hash, k));
+               kh_destroy(64, d->hash);
+       }
        free(fn_pos); free(fn_list);
-       kh_destroy(64, d->hash);
        bam_header_destroy(d->h);
        if (d->fai) fai_destroy(d->fai);
        bam_maqcns_destroy(d->c);
diff --git a/bamtk.c b/bamtk.c
index a4863530b4e4989cb6f887317b0d6f022f7aa5f5..be37b63a5037bb92ef2de88ad865b301bff98d6f 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.2-15"
+#define PACKAGE_VERSION "0.1.2-16"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/kseq.h b/kseq.h
index 25f31a3c7f12addf8f1ae9e6da508f4d384b8f05..2875c77577152eb9a20b41fc044288f67eb124ab 100644 (file)
--- a/kseq.h
+++ b/kseq.h
                return (int)ks->buf[ks->begin++];                                       \
        }
 
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
 typedef struct __kstring_t {
        size_t l, m;
        char *s;
 } kstring_t;
+#endif
 
 #ifndef kroundup32
 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
diff --git a/kstring.c b/kstring.c
new file mode 100644 (file)
index 0000000..dc20fae
--- /dev/null
+++ b/kstring.c
@@ -0,0 +1,81 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "kstring.h"
+
+int ksprintf(kstring_t *s, const char *fmt, ...)
+{
+       va_list ap;
+       int l;
+       va_start(ap, fmt);
+       l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'.
+       va_end(ap);
+       if (l + 1 > s->m - s->l) {
+               s->m = s->l + l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+               va_start(ap, fmt);
+               l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap);
+       }
+       va_end(ap);
+       s->l += l;
+       return l;
+}
+
+// s MUST BE a null terminated string; l = strlen(s)
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
+{
+       int i, n, max, last_char, last_start, *offsets, l;
+       n = 0; max = *_max; offsets = *_offsets;
+       l = strlen(s);
+       
+#define __ksplit_aux do {                                                                                              \
+               if (_offsets) {                                                                                                 \
+                       s[i] = 0;                                                                                                       \
+                       if (n == max) {                                                                                         \
+                               max = max? max<<1 : 2;                                                                  \
+                               offsets = (int*)realloc(offsets, sizeof(int) * max);    \
+                       }                                                                                                                       \
+                       offsets[n++] = last_start;                                                                      \
+               } else ++n;                                                                                                             \
+       } while (0)
+
+       for (i = 0, last_char = last_start = 0; i <= l; ++i) {
+               if (delimiter == 0) {
+                       if (isspace(s[i]) || s[i] == 0) {
+                               if (isgraph(last_char)) __ksplit_aux; // the end of a field
+                       } else {
+                               if (isspace(last_char) || last_char == 0) last_start = i;
+                       }
+               } else {
+                       if (s[i] == delimiter || s[i] == 0) {
+                               if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field
+                       } else {
+                               if (last_char == delimiter || last_char == 0) last_start = i;
+                       }
+               }
+               last_char = s[i];
+       }
+       *_max = max; *_offsets = offsets;
+       return n;
+}
+
+#ifdef KSTRING_MAIN
+#include <stdio.h>
+int main()
+{
+       kstring_t *s;
+       int *fields, n, i;
+       s = (kstring_t*)calloc(1, sizeof(kstring_t));
+       // test ksprintf()
+       ksprintf(s, " abcdefg:    %d ", 100);
+       printf("'%s'\n", s->s);
+       // test ksplit()
+       fields = ksplit(s, 0, &n);
+       for (i = 0; i < n; ++i)
+               printf("field[%d] = '%s'\n", i, s->s + fields[i]);
+       free(s);
+       return 0;
+}
+#endif
diff --git a/kstring.h b/kstring.h
new file mode 100644 (file)
index 0000000..47d63f7
--- /dev/null
+++ b/kstring.h
@@ -0,0 +1,54 @@
+#ifndef KSTRING_H
+#define KSTRING_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef KSTRING_T
+#define KSTRING_T kstring_t
+typedef struct __kstring_t {
+       size_t l, m;
+       char *s;
+} kstring_t;
+#endif
+
+int ksprintf(kstring_t *s, const char *fmt, ...);
+int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+
+static inline int kputs(const char *p, kstring_t *s)
+{
+       int l = strlen(p);
+       if (s->l + l + 1 >= s->m) {
+               s->m = s->l + l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       strcpy(s->s + s->l, p);
+       s->l += l;
+       return l;
+}
+
+static inline int kputc(int c, kstring_t *s)
+{
+       if (s->l + 1 >= s->m) {
+               s->m = s->l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       s->s[s->l++] = c;
+       s->s[s->l] = 0;
+       return c;
+}
+
+static inline int *ksplit(kstring_t *s, int delimiter, int *n)
+{
+       int max = 0, *offsets = 0;
+       *n = ksplit_core(s->s, delimiter, &max, &offsets);
+       return offsets;
+}
+
+#endif