]> git.donarmstrong.com Git - samtools.git/commitdiff
Resolved merge conflict
authorPetr Danecek <pd3@mac>
Mon, 23 Apr 2012 19:07:13 +0000 (20:07 +0100)
committerPetr Danecek <pd3@mac>
Mon, 23 Apr 2012 19:07:13 +0000 (20:07 +0100)
Makefile
bam.h
bamtk.c
bedcov.c [new file with mode: 0644]
kseq.h
kstring.h
misc/bamcheck.c

index 6ddd4ddbd6aac23bd955ed15253081f19cb5fc83..4a8a75b69347a1a689caf95223a35df68403286e 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ LOBJS=          bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
 AOBJS=         bam_tview.o bam_plcmd.o sam_view.o \
                        bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
                        bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
-                       cut_target.o phase.o bam2depth.o padding.o
+                       cut_target.o phase.o bam2depth.o padding.o bedcov.o
 PROG=          samtools
 INCLUDES=      -I.
 SUBDIRS=       . bcftools misc
diff --git a/bam.h b/bam.h
index efc0459626b1ddb564bb7b4eb368a26fa70fc5ea..82f1ffab1db81ffad94751cc40406af367a00ad3 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -40,7 +40,7 @@
   @copyright Genome Research Ltd.
  */
 
-#define BAM_VERSION "0.1.18-master-r567"
+#define BAM_VERSION "0.1.18-r572"
 
 #include <stdint.h>
 #include <stdlib.h>
diff --git a/bamtk.c b/bamtk.c
index 2e7fcb0a522f2c7ae8d6b190f4026f5fbda00bb1..acdf65fed3868ca1f934081b3af1b01536abe55f 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -28,6 +28,7 @@ int main_cat(int argc, char *argv[]);
 int main_depth(int argc, char *argv[]);
 int main_bam2fq(int argc, char *argv[]);
 int main_pad2unpad(int argc, char *argv[]);
+int main_bedcov(int argc, char *argv[]);
 
 int faidx_main(int argc, char *argv[]);
 
@@ -54,6 +55,7 @@ static int usage()
        fprintf(stderr, "         rmdup       remove PCR duplicates\n");
        fprintf(stderr, "         reheader    replace BAM header\n");
        fprintf(stderr, "         cat         concatenate BAMs\n");
+       fprintf(stderr, "         bedcov      read depth per BED region\n");
        fprintf(stderr, "         targetcut   cut fosmid regions (for fosmid pool only)\n");
        fprintf(stderr, "         phase       phase heterozygotes\n");
 //     fprintf(stderr, "         depad       convert padded BAM to unpadded BAM\n"); // not stable
@@ -98,6 +100,7 @@ int main(int argc, char *argv[])
        else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1);
        else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1);
        else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1);
+       else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1);
        else if (strcmp(argv[1], "pileup") == 0) {
                fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
                return 1;
diff --git a/bedcov.c b/bedcov.c
new file mode 100644 (file)
index 0000000..2b61a22
--- /dev/null
+++ b/bedcov.c
@@ -0,0 +1,126 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kstring.h"
+#include "bgzf.h"
+#include "bam.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+typedef struct {
+       bamFile fp;
+       bam_iter_t iter;
+       int min_mapQ;
+} aux_t;
+
+static int read_bam(void *data, bam1_t *b)
+{
+       aux_t *aux = (aux_t*)data;
+       int ret = bam_iter_read(aux->fp, aux->iter, b);
+       if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+       return ret;
+}
+
+int main_bedcov(int argc, char *argv[])
+{
+       extern void bam_init_header_hash(bam_header_t*);
+       gzFile fp;
+       kstring_t str;
+       kstream_t *ks;
+       bam_index_t **idx;
+       bam_header_t *h = 0;
+       aux_t **aux;
+       int *n_plp, dret, i, n, c, min_mapQ = 0;
+       int64_t *cnt;
+       const bam_pileup1_t **plp;
+
+       while ((c = getopt(argc, argv, "Q:")) >= 0) {
+               switch (c) {
+               case 'Q': min_mapQ = atoi(optarg); break;
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
+               return 1;
+       }
+       memset(&str, 0, sizeof(kstring_t));
+       n = argc - optind - 1;
+       aux = calloc(n, sizeof(void*));
+       idx = calloc(n, sizeof(void*));
+       for (i = 0; i < n; ++i) {
+               aux[i] = calloc(1, sizeof(aux_t));
+               aux[i]->min_mapQ = min_mapQ;
+               aux[i]->fp = bam_open(argv[i+optind+1], "r");
+               idx[i] = bam_index_load(argv[i+optind+1]);
+               if (aux[i]->fp == 0 || idx[i] == 0) {
+                       fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
+                       return 2;
+               }
+               bgzf_set_cache_size(aux[i]->fp, 20);
+               if (i == 0) h = bam_header_read(aux[0]->fp);
+       }
+       bam_init_header_hash(h);
+       cnt = calloc(n, 8);
+
+       fp = gzopen(argv[optind], "rb");
+       ks = ks_init(fp);
+       n_plp = calloc(n, sizeof(int));
+       plp = calloc(n, sizeof(void*));
+       while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
+               char *p, *q;
+               int tid, beg, end, pos;
+               bam_mplp_t mplp;
+
+               for (p = q = str.s; *p && *p != '\t'; ++p);
+               if (*p != '\t') goto bed_error;
+               *p = 0; tid = bam_get_tid(h, q); *p = '\t';
+               if (tid < 0) goto bed_error;
+               for (q = p = p + 1; isdigit(*p); ++p);
+               if (*p != '\t') goto bed_error;
+               *p = 0; beg = atoi(q); *p = '\t';
+               for (q = p = p + 1; isdigit(*p); ++p);
+               if (*p == '\t' || *p == 0) {
+                       int c = *p;
+                       *p = 0; end = atoi(q); *p = c;
+               } else goto bed_error;
+
+               for (i = 0; i < n; ++i) {
+                       if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+                       aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
+               }
+               mplp = bam_mplp_init(n, read_bam, (void**)aux);
+               bam_mplp_set_maxcnt(mplp, 64000);
+               memset(cnt, 0, 8 * n);
+               while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
+                       if (pos >= beg && pos < end)
+                               for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
+               for (i = 0; i < n; ++i) {
+                       kputc('\t', &str);
+                       kputl(cnt[i], &str);
+               }
+               puts(str.s);
+               bam_mplp_destroy(mplp);
+               continue;
+
+bed_error:
+               fprintf(stderr, "Errors in BED line '%s'\n", str.s);
+       }
+       free(n_plp); free(plp);
+       ks_destroy(ks);
+       gzclose(fp);
+
+       free(cnt);
+       for (i = 0; i < n; ++i) {
+               if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
+               bam_index_destroy(idx[i]);
+               bam_close(aux[i]->fp);
+               free(aux[i]);
+       }
+       bam_header_destroy(h);
+       free(aux); free(idx);
+       free(str.s);
+       return 0;
+}
diff --git a/kseq.h b/kseq.h
index 0bbc7dc9370e910b19286167bd0862c64855387c..a5cec7c289aeda5fe36edb2a828a9f669741d3a3 100644 (file)
--- a/kseq.h
+++ b/kseq.h
@@ -23,7 +23,7 @@
    SOFTWARE.
 */
 
-/* Last Modified: 18AUG2011 */
+/* Last Modified: 05MAR2012 */
 
 #ifndef AC_KSEQ_H
 #define AC_KSEQ_H
@@ -34,7 +34,8 @@
 
 #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 #define KS_SEP_TAB   1 // isspace() && !' '
-#define KS_SEP_MAX   1
+#define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
+#define KS_SEP_MAX   2
 
 #define __KS_TYPE(type_t)                                              \
        typedef struct __kstream_t {                            \
@@ -51,7 +52,7 @@
        {                                                                                                                               \
                kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
                ks->f = f;                                                                                                      \
-               ks->buf = malloc(__bufsize);                                                            \
+               ks->buf = (unsigned char*)malloc(__bufsize);                            \
                return ks;                                                                                                      \
        }                                                                                                                               \
        static inline void ks_destroy(kstream_t *ks)                                    \
@@ -103,7 +104,10 @@ typedef struct __kstring_t {
                                        if (ks->end == 0) break;                                                        \
                                } else break;                                                                                   \
                        }                                                                                                                       \
-                       if (delimiter > KS_SEP_MAX) {                                                           \
+                       if (delimiter == KS_SEP_LINE) { \
+                               for (i = ks->begin; i < ks->end; ++i) \
+                                       if (ks->buf[i] == '\n') break; \
+                       } else if (delimiter > KS_SEP_MAX) {                                            \
                                for (i = ks->begin; i < ks->end; ++i)                                   \
                                        if (ks->buf[i] == delimiter) break;                                     \
                        } else if (delimiter == KS_SEP_SPACE) {                                         \
@@ -113,7 +117,7 @@ typedef struct __kstring_t {
                                for (i = ks->begin; i < ks->end; ++i)                                   \
                                        if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
                        } else i = 0; /* never come to here! */                                         \
-                       if (str->m - str->l < i - ks->begin + 1) {                                      \
+                       if (str->m - str->l < (size_t)(i - ks->begin + 1)) {            \
                                str->m = str->l + (i - ks->begin) + 1;                                  \
                                kroundup32(str->m);                                                                             \
                                str->s = (char*)realloc(str->s, str->m);                                \
@@ -129,7 +133,7 @@ typedef struct __kstring_t {
                if (str->s == 0) {                                                                                              \
                        str->m = 1;                                                                                                     \
                        str->s = (char*)calloc(1, 1);                                                           \
-               }                                                                                                                               \
+               } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
                str->s[str->l] = '\0';                                                                                  \
                return str->l;                                                                                                  \
        } \
@@ -142,19 +146,16 @@ typedef struct __kstring_t {
        __KS_GETC(__read, __bufsize)                            \
        __KS_GETUNTIL(__read, __bufsize)
 
-#define __KSEQ_BASIC(type_t)                                                                                   \
-       static inline kseq_t *kseq_init(type_t fd)                                                      \
+#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
+
+#define __KSEQ_BASIC(SCOPE, type_t)                                                                            \
+       SCOPE kseq_t *kseq_init(type_t fd)                                                                      \
        {                                                                                                                                       \
                kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
                s->f = ks_init(fd);                                                                                             \
                return s;                                                                                                               \
        }                                                                                                                                       \
-       static inline void kseq_rewind(kseq_t *ks)                                                      \
-       {                                                                                                                                       \
-               ks->last_char = 0;                                                                                              \
-               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
-       }                                                                                                                                       \
-       static inline void kseq_destroy(kseq_t *ks)                                                     \
+       SCOPE void kseq_destroy(kseq_t *ks)                                                                     \
        {                                                                                                                                       \
                if (!ks) return;                                                                                                \
                free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
@@ -167,8 +168,8 @@ typedef struct __kstring_t {
    -1   end-of-file
    -2   truncated quality string
  */
-#define __KSEQ_READ \
-       static int kseq_read(kseq_t *seq) \
+#define __KSEQ_READ(SCOPE) \
+       SCOPE int kseq_read(kseq_t *seq) \
        { \
                int c; \
                kstream_t *ks = seq->f; \
@@ -179,14 +180,15 @@ typedef struct __kstring_t {
                } /* else: the first header char has been read in the previous call */ \
                seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
                if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
-               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \
+               if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
                if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
                        seq->seq.m = 256; \
                        seq->seq.s = (char*)malloc(seq->seq.m); \
                } \
                while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                       if (c == '\n') continue; /* skip empty lines */ \
                        seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
-                       ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \
+                       ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
                } \
                if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
                if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
@@ -202,7 +204,7 @@ typedef struct __kstring_t {
                } \
                while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
                if (c == -1) return -2; /* error: no quality string */ \
-               while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
+               while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
                seq->last_char = 0;     /* we have not come to the next header line */ \
                if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
                return seq->seq.l; \
@@ -215,10 +217,19 @@ typedef struct __kstring_t {
                kstream_t *f;                                                   \
        } kseq_t;
 
-#define KSEQ_INIT(type_t, __read)                              \
+#define KSEQ_INIT2(SCOPE, type_t, __read)              \
        KSTREAM_INIT(type_t, __read, 16384)                     \
        __KSEQ_TYPE(type_t)                                                     \
-       __KSEQ_BASIC(type_t)                                            \
-       __KSEQ_READ
+       __KSEQ_BASIC(SCOPE, type_t)                                     \
+       __KSEQ_READ(SCOPE)
+
+#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
+
+#define KSEQ_DECLARE(type_t) \
+       __KS_TYPE(type_t) \
+       __KSEQ_TYPE(type_t) \
+       extern kseq_t *kseq_init(type_t fd); \
+       void kseq_destroy(kseq_t *ks); \
+       int kseq_read(kseq_t *seq);
 
 #endif
index d490f58d4f1c93fc722a0d8b3d3f5b967624cfe7..abd82369f607b74fed2f4fbb07d2bf0c83f31647 100644 (file)
--- a/kstring.h
+++ b/kstring.h
@@ -142,6 +142,23 @@ static inline int kputuw(unsigned c, kstring_t *s)
        return 0;
 }
 
+static inline int kputl(long c, kstring_t *s)
+{
+       char buf[32];
+       long l, x;
+       if (c == 0) return kputc('0', s);
+       for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+       if (c < 0) buf[l++] = '-';
+       if (s->l + l + 1 >= s->m) {
+               s->m = s->l + l + 2;
+               kroundup32(s->m);
+               s->s = (char*)realloc(s->s, s->m);
+       }
+       for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
+       s->s[s->l] = 0;
+       return 0;
+}
+
 static inline int *ksplit(kstring_t *s, int delimiter, int *n)
 {
        int max = 0, *offsets = 0;
index f61fac18c73ddfb2074e2b7e246dd4ce606a75fb..e5bfb5d0e1db18a0c182349ae0734294451331fc 100644 (file)
@@ -1018,7 +1018,7 @@ void output_stats(stats_t *stats)
     }
 }
 
-size_t getline(char **line, size_t *n, FILE *fp)
+size_t mygetline(char **line, size_t *n, FILE *fp)
 {
     if (line == NULL || n == NULL || fp == NULL)
     {
@@ -1068,7 +1068,7 @@ void init_regions(stats_t *stats, char *file)
     ssize_t nread;
     int warned = 0;
     int prev_tid=-1, prev_pos=-1;
-    while ((nread = getline(&line, &len, fp)) != -1) 
+    while ((nread = mygetline(&line, &len, fp)) != -1) 
     {
         if ( line[0] == '#' ) continue;