From: Petr Danecek Date: Wed, 4 Apr 2012 09:07:27 +0000 (+0100) Subject: Merge remote branch 'main/master' X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=5941304148f69879e1df9fbbdc749117a5bf2fac;hp=6c564bfee2d3d5ad47691f0c50898b9511010db6;p=samtools.git Merge remote branch 'main/master' Conflicts: misc/Makefile misc/bamcheck.c --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bb605d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.o +.*.swp +*.a +*.dSYM diff --git a/Makefile b/Makefile index fdcddd6..6ddd4dd 100644 --- a/Makefile +++ b/Makefile @@ -43,13 +43,16 @@ libbam.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) samtools:lib-recur $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) libbam.a -Lbcftools -lbcf $(LIBPATH) $(LIBCURSES) -lm -lz + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) libbam.a -Lbcftools -lbcf $(LIBPATH) $(LIBCURSES) -lm -lz -lpthread razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz bgzip:bgzip.o bgzf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz -lpthread + +bgzf.o:bgzf.c bgzf.h + $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_CACHE $(INCLUDES) bgzf.c -o $@ razip.o:razf.h bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h diff --git a/bam.h b/bam.h index 11f8028..efc0459 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.18-dev (r982:313)" +#define BAM_VERSION "0.1.18-master-r567" #include #include diff --git a/bam_cat.c b/bam_cat.c index 0fde045..a7502b9 100644 --- a/bam_cat.c +++ b/bam_cat.c @@ -59,6 +59,7 @@ all:bam_cat #include #include +#include "knetfile.h" #include "bgzf.h" #include "bam.h" @@ -97,7 +98,7 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } - if (in->open_mode != 'r') return -1; + if (in->is_write) return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); @@ -109,10 +110,10 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam j=0; #ifdef _USE_KNETFILE - fp_file=fp->x.fpw; - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { + fp_file = fp->fp; + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) { #else - fp_file=fp->file; + fp_file = fp->fp; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len #include +#include "knetfile.h" #include "bgzf.h" #include "bam.h" @@ -11,7 +12,7 @@ int bam_reheader(BGZF *in, const bam_header_t *h, int fd) bam_header_t *old; int len; uint8_t *buf; - if (in->open_mode != 'r') return -1; + if (in->is_write) return -1; buf = malloc(BUF_SIZE); old = bam_header_read(in); fp = bgzf_fdopen(fd, "w"); @@ -21,8 +22,8 @@ int bam_reheader(BGZF *in, const bam_header_t *h, int fd) bgzf_flush(fp); } #ifdef _USE_KNETFILE - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) - fwrite(buf, 1, len, fp->x.fpw); + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) + fwrite(buf, 1, len, fp->fp); #else while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) fwrite(buf, 1, len, fp->file); diff --git a/bam_sort.c b/bam_sort.c index e3a6c47..7d00cd1 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -10,24 +10,28 @@ static int g_is_by_qname = 0; -static inline int strnum_cmp(const char *a, const char *b) +static int strnum_cmp(const char *_a, const char *_b) { - char *pa, *pb; - pa = (char*)a; pb = (char*)b; + const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; + const unsigned char *pa = a, *pb = b; while (*pa && *pb) { if (isdigit(*pa) && isdigit(*pb)) { - long ai, bi; - ai = strtol(pa, &pa, 10); - bi = strtol(pb, &pb, 10); - if (ai != bi) return aibi? 1 : 0; + while (*pa == '0') ++pa; + while (*pb == '0') ++pb; + while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; + if (isdigit(*pa) && isdigit(*pb)) { + int i = 0; + while (isdigit(pa[i]) && isdigit(pb[i])) ++i; + return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; + } else if (isdigit(*pa)) return 1; + else if (isdigit(*pb)) return -1; + else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; } else { - if (*pa != *pb) break; + if (*pa != *pb) return (int)*pa - (int)*pb; ++pa; ++pb; } } - if (*pa == *pb) - return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; - return *pa<*pb? -1 : *pa>*pb? 1 : 0; + return *pa? 1 : *pb? -1 : 0; } #define HEAP_EMPTY 0xffffffffffffffffull @@ -46,7 +50,7 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) int t; if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); - return (t > 0 || (t == 0 && __pos_cmp(a, b))); + return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); } else return __pos_cmp(a, b); } @@ -85,8 +89,7 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, - int flag, const char *reg) +int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level) { bamFile fpout, *fp; heap1_t *heap; @@ -94,7 +97,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; - char **RG = 0; + char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { @@ -209,15 +212,17 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch } else h->pos = HEAP_EMPTY; } - if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); - else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - if (fpout == 0) { + if (flag & MERGE_UNCOMP) level = 0; + else if (flag & MERGE_LEVEL1) level = 1; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); + if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); + if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { @@ -252,12 +257,17 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch return 0; } +int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg) +{ + return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1); +} + int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0; + int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; @@ -266,6 +276,8 @@ int bam_merge(int argc, char *argv[]) case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; + case 'l': level = atoi(optarg); break; + case '@': n_threads = atoi(optarg); break; } } if (optind + 2 >= argc) { @@ -276,6 +288,8 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, " -u uncompressed BAM output\n"); fprintf(stderr, " -f overwrite the output BAM if exist\n"); fprintf(stderr, " -1 compress level 1\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); @@ -291,51 +305,126 @@ int bam_merge(int argc, char *argv[]) return 1; } } - if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; + if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1; free(reg); free(fn_headers); return ret; } +/*************** + * BAM sorting * + ***************/ + +#include + typedef bam1_t *bam1_p; +static int change_SO(bam_header_t *h, const char *so) +{ + char *p, *q, *beg = 0, *end = 0, *newtext; + if (h->l_text > 3) { + if (strncmp(h->text, "@HD", 3) == 0) { + if ((p = strchr(h->text, '\n')) == 0) return -1; + *p = '\0'; + if ((q = strstr(h->text, "\tSO:")) != 0) { + *p = '\n'; // change back + if (strncmp(q + 4, so, p - q - 4) != 0) { + beg = q; + for (q += 4; *q != '\n' && *q != '\t'; ++q); + end = q; + } else return 0; // no need to change + } else beg = end = p, *p = '\n'; + } + } + if (beg == 0) { // no @HD + h->l_text += strlen(so) + 15; + newtext = malloc(h->l_text + 1); + sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so); + strcat(newtext, h->text); + } else { // has @HD but different or no SO + h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); + newtext = malloc(h->l_text + 1); + strncpy(newtext, h->text, beg - h->text); + sprintf(newtext + (beg - h->text), "\tSO:%s", so); + strcat(newtext, end); + } + free(h->text); + h->text = newtext; + return 0; +} + static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); - return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); - } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); + return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); + } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b))); } KSORT_INIT(sort, bam1_p, bam1_lt) -static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) +typedef struct { + size_t buf_len; + const char *prefix; + bam1_p *buf; + const bam_header_t *h; + int index; +} worker_t; + +static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads) { - char *name, mode[3]; - int i; + size_t i; bamFile fp; - ks_mergesort(sort, k, buf, 0); - name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) { - sprintf(name, "%s.%.4d.bam", prefix, n); - strcpy(mode, "w1"); - } else { - sprintf(name, "%s.bam", prefix); - strcpy(mode, "w"); - } - fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); - if (fp == 0) { - fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); - free(name); - // FIXME: possible memory leak - return; - } - free(name); + fp = strcmp(fn, "-")? bam_open(fn, mode) : bam_dopen(fileno(stdout), mode); + if (fp == 0) return; bam_header_write(fp, h); - for (i = 0; i < k; ++i) + if (n_threads > 1) bgzf_mt(fp, n_threads, 256); + for (i = 0; i < l; ++i) bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); bam_close(fp); } +static void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + char *name; + ks_mergesort(sort, w->buf_len, w->buf, 0); + name = (char*)calloc(strlen(w->prefix) + 20, 1); + sprintf(name, "%s.%.4d.bam", w->prefix, w->index); + write_buffer(name, "w1", w->buf_len, w->buf, w->h, 0); + free(name); + return 0; +} + +static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads) +{ + int i; + size_t rest; + bam1_p *b; + pthread_t *tid; + pthread_attr_t attr; + worker_t *w; + + if (n_threads < 1) n_threads = 1; + if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + w = calloc(n_threads, sizeof(worker_t)); + tid = calloc(n_threads, sizeof(pthread_t)); + b = buf; rest = k; + for (i = 0; i < n_threads; ++i) { + w[i].buf_len = rest / (n_threads - i); + w[i].buf = b; + w[i].prefix = prefix; + w[i].h = h; + w[i].index = n_files + i; + b += w[i].buf_len; rest -= w[i].buf_len; + pthread_create(&tid[i], &attr, worker, &w[i]); + } + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + free(tid); free(w); + return n_files + n_threads; +} + /*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @@ -350,63 +439,86 @@ static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ -void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) +void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level) { - int n, ret, k, i; - size_t mem; + int ret, i, n_files = 0; + size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; + char *fnout = 0; + if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; - n = k = 0; mem = 0; + max_k = k = 0; mem = 0; + max_mem = _max_mem * n_threads; + buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); - buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); + if (is_by_qname) change_SO(header, "queryname"); + else change_SO(header, "coordinate"); // write sub files for (;;) { + if (k == max_k) { + size_t old_max = max_k; + max_k = max_k? max_k<<1 : 0x10000; + buf = realloc(buf, max_k * sizeof(void*)); + memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); + } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; - mem += ret; + if (b->data_len < b->m_data>>2) { // shrink + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = realloc(b->data, b->m_data); + } + mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header, 0); - mem = 0; k = 0; + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); - if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); - else { // then merge - char **fns, *fnout; - fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header, 0); - fnout = (char*)calloc(strlen(prefix) + 20, 1); - if (is_stdout) sprintf(fnout, "-"); - else sprintf(fnout, "%s.bam", prefix); - fns = (char**)calloc(n, sizeof(char*)); - for (i = 0; i < n; ++i) { + // output file name + fnout = calloc(strlen(prefix) + 20, 1); + if (is_stdout) sprintf(fnout, "-"); + else sprintf(fnout, "%s.bam", prefix); + // write the final output + if (n_files == 0) { // a single block + char mode[8]; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); + ks_mergesort(sort, k, buf, 0); + write_buffer(fnout, mode, k, buf, header, n_threads); + } else { // then merge + char **fns; + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); + fns = (char**)calloc(n_files, sizeof(char*)); + for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); - free(fnout); - for (i = 0; i < n; ++i) { + bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); + for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } - for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { - if (buf[k]) { - free(buf[k]->data); - free(buf[k]); - } + free(fnout); + // free + for (k = 0; k < max_k; ++k) { + if (!buf[k]) continue; + free(buf[k]->data); + free(buf[k]); } free(buf); bam_header_destroy(header); @@ -415,49 +527,40 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) { - bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0); -} - - -size_t bam_sort_get_max_mem(char *max_mem_string) -{ - char c; - size_t max_mem; - size_t multiplier=1; - c=max_mem_string[strlen(max_mem_string)-1]; - switch(c) { - case 'G': - multiplier*=1024; - case 'M': - multiplier*=1024; - case 'K': - multiplier*=1024; - case 'B': - max_mem_string[strlen(max_mem_string)-1]='\0'; - break; - default: - break; - } - max_mem = multiplier * atol(max_mem_string); - // max_mem should be checked that it was not zero after atol! - return max_mem; + bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1); } int bam_sort(int argc, char *argv[]) { - size_t max_mem = 500000000; - int c, is_by_qname = 0, is_stdout = 0; - while ((c = getopt(argc, argv, "nom:")) >= 0) { + size_t max_mem = 768<<20; // 512MB + int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1; + while ((c = getopt(argc, argv, "nom:@:l:")) >= 0) { switch (c) { case 'o': is_stdout = 1; break; case 'n': is_by_qname = 1; break; - case 'm': max_mem = bam_sort_get_max_mem(optarg); break; + case 'm': { + char *q; + max_mem = strtol(optarg, &q, 0); + if (*q == 'k' || *q == 'K') max_mem <<= 10; + else if (*q == 'm' || *q == 'M') max_mem <<= 20; + else if (*q == 'g' || *q == 'G') max_mem <<= 30; + break; + } + case '@': n_threads = atoi(optarg); break; + case 'l': level = atoi(optarg); break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: samtools sort [-on] [-m ] \n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools sort [options] \n\n"); + fprintf(stderr, "Options: -n sort by read name\n"); + fprintf(stderr, " -o final output to stdout\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n"); + fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n"); + fprintf(stderr, "\n"); return 1; } - bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout); + bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level); return 0; } diff --git a/bcftools/Makefile b/bcftools/Makefile index 9b6f863..be831de 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -31,7 +31,7 @@ libbcf.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) bcftools:lib $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz + $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz -lpthread bcf.o:bcf.h vcf.o:bcf.h diff --git a/bcftools/bcf.c b/bcftools/bcf.c index a538e6e..0524408 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -13,9 +13,6 @@ bcf_t *bcf_open(const char *fn, const char *mode) } else { b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode); } -#ifndef BCF_LITE - b->fp->owned_file = 1; -#endif return b; } diff --git a/bcftools/main.c b/bcftools/main.c index fcd94b8..eda6217 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -2,6 +2,7 @@ #include #include #include +#include "knetfile.h" #include "bcf.h" #include "kseq.h" @@ -29,12 +30,12 @@ int bcf_cat(int n, char * const *fn) if (i == 0) bcf_hdr_write(out, h); bcf_hdr_destroy(h); #ifdef _USE_KNETFILE - fstat(knet_fileno(in->fp->x.fpr), &s); + fstat(knet_fileno((knetFile*)in->fp->fp), &s); end = s.st_size - 28; - while (knet_tell(in->fp->x.fpr) < end) { - int size = knet_tell(in->fp->x.fpr) + BUF_SIZE < end? BUF_SIZE : end - knet_tell(in->fp->x.fpr); - knet_read(in->fp->x.fpr, buf, size); - fwrite(buf, 1, size, out->fp->x.fpw); + while (knet_tell((knetFile*)in->fp->fp) < end) { + int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp); + knet_read(in->fp->fp, buf, size); + fwrite(buf, 1, size, out->fp->fp); } #else abort(); // FIXME: not implemented diff --git a/bgzf.c b/bgzf.c index 216cd04..7254fa2 100644 --- a/bgzf.c +++ b/bgzf.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -21,400 +22,235 @@ THE SOFTWARE. */ -/* - 2009-06-29 by lh3: cache recent uncompressed blocks. - 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. - 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ - #include #include #include #include -#include +#include +#include #include -#include #include "bgzf.h" -#include "khash.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +typedef knetFile *_bgzf_file_t; +#define _bgzf_open(fn, mode) knet_open(fn, mode) +#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode) +#define _bgzf_close(fp) knet_close(fp) +#define _bgzf_fileno(fp) ((fp)->fd) +#define _bgzf_tell(fp) knet_tell(fp) +#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence) +#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len) +#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len) +#else // ~defined(_USE_KNETFILE) +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else // ~defined(_WIN32) +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif // ~defined(_WIN32) +typedef FILE *_bgzf_file_t; +#define _bgzf_open(fn, mode) fopen(fn, mode) +#define _bgzf_dopen(fp, mode) fdopen(fp, mode) +#define _bgzf_close(fp) fclose(fp) +#define _bgzf_fileno(fp) fileno(fp) +#define _bgzf_tell(fp) ftello(fp) +#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp) +#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp) +#endif // ~define(_USE_KNETFILE) + +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 + + +/* BGZF/GZIP header (speciallized from RFC 1952; little endian): + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +*/ +static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; + +#ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; +#include "khash.h" KHASH_MAP_INIT_INT64(cache, cache_t) - -#if defined(_WIN32) || defined(_MSC_VER) -#define ftello(fp) ftell(fp) -#define fseeko(fp, offset, whence) fseek(fp, offset, whence) -#else -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); #endif -typedef int8_t bgzf_byte_t; - -static const int DEFAULT_BLOCK_SIZE = 64 * 1024; -static const int MAX_BLOCK_SIZE = 64 * 1024; - -static const int BLOCK_HEADER_LENGTH = 18; -static const int BLOCK_FOOTER_LENGTH = 8; - -static const int GZIP_ID1 = 31; -static const int GZIP_ID2 = 139; -static const int CM_DEFLATE = 8; -static const int FLG_FEXTRA = 4; -static const int OS_UNKNOWN = 255; -static const int BGZF_ID1 = 66; // 'B' -static const int BGZF_ID2 = 67; // 'C' -static const int BGZF_LEN = 2; -static const int BGZF_XLEN = 6; // BGZF_LEN+4 - -static const int GZIP_WINDOW_BITS = -15; // no zlib header -static const int Z_DEFAULT_MEM_LEVEL = 8; - - -inline -void -packInt16(uint8_t* buffer, uint16_t value) +static inline void packInt16(uint8_t *buffer, uint16_t value) { - buffer[0] = value; - buffer[1] = value >> 8; + buffer[0] = value; + buffer[1] = value >> 8; } -inline -int -unpackInt16(const uint8_t* buffer) +static inline int unpackInt16(const uint8_t *buffer) { - return (buffer[0] | (buffer[1] << 8)); + return buffer[0] | buffer[1] << 8; } -inline -void -packInt32(uint8_t* buffer, uint32_t value) +static inline void packInt32(uint8_t *buffer, uint32_t value) { - buffer[0] = value; - buffer[1] = value >> 8; - buffer[2] = value >> 16; - buffer[3] = value >> 24; + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; } -static inline -int -bgzf_min(int x, int y) -{ - return (x < y) ? x : y; -} - -static -void -report_error(BGZF* fp, const char* message) { - fp->error = message; -} - -int bgzf_check_bgzf(const char *fn) +static BGZF *bgzf_read_init() { - BGZF *fp; - uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; - int n; - - if ((fp = bgzf_open(fn, "r")) == 0) - { - fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); - return -1; - } - -#ifdef _USE_KNETFILE - n = knet_read(fp->x.fpr, buf, 10); -#else - n = fread(buf, 1, 10, fp->file); + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->is_write = 0; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); +#ifdef BGZF_CACHE + fp->cache = kh_init(cache); #endif - bgzf_close(fp); - - if ( n!=10 ) - return -1; - - if ( !memcmp(magic, buf, 10) ) return 1; - return 0; + return fp; } -static BGZF *bgzf_read_init() +static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level { BGZF *fp; fp = calloc(1, sizeof(BGZF)); - fp->uncompressed_block_size = MAX_BLOCK_SIZE; - fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->cache_size = 0; - fp->cache = kh_init(cache); + fp->is_write = 1; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; return fp; } - -static -BGZF* -open_read(int fd) +// get the compress level from the mode string +static int mode2level(const char *__restrict mode) { -#ifdef _USE_KNETFILE - knetFile *file = knet_dopen(fd, "r"); -#else - FILE* file = fdopen(fd, "r"); -#endif - BGZF* fp; - if (file == 0) return 0; - fp = bgzf_read_init(); - fp->file_descriptor = fd; - fp->open_mode = 'r'; -#ifdef _USE_KNETFILE - fp->x.fpr = file; -#else - fp->file = file; -#endif - return fp; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return compress_level; } -static -BGZF* -open_write(int fd, int compress_level) // compress_level==-1 for the default level +BGZF *bgzf_open(const char *path, const char *mode) { - FILE* file = fdopen(fd, "w"); - BGZF* fp; - if (file == 0) return 0; - fp = malloc(sizeof(BGZF)); - fp->file_descriptor = fd; - fp->open_mode = 'w'; - fp->owned_file = 0; - fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 - if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; -#ifdef _USE_KNETFILE - fp->x.fpw = file; -#else - fp->file = file; -#endif - fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; - fp->uncompressed_block = NULL; - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->block_address = 0; - fp->block_offset = 0; - fp->block_length = 0; - fp->error = NULL; - return fp; -} - -BGZF* -bgzf_open(const char* __restrict path, const char* __restrict mode) -{ - BGZF* fp = NULL; - if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */ -#ifdef _USE_KNETFILE - knetFile *file = knet_open(path, mode); - if (file == 0) return 0; + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_open(path, "r")) == 0) return 0; fp = bgzf_read_init(); - fp->file_descriptor = -1; - fp->open_mode = 'r'; - fp->x.fpr = file; -#else - int fd, oflag = O_RDONLY; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag); - if (fd == -1) return 0; - fp = open_read(fd); -#endif - } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag, 0666); - if (fd == -1) return 0; - { // set compress_level - int i; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - } - fp = open_write(fd, compress_level); - } - if (fp != NULL) fp->owned_file = 1; - return fp; -} - -BGZF* -bgzf_fdopen(int fd, const char * __restrict mode) -{ - if (fd == -1) return 0; - if (mode[0] == 'r' || mode[0] == 'R') { - return open_read(fd); - } else if (mode[0] == 'w' || mode[0] == 'W') { - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - return open_write(fd, compress_level); - } else { - return NULL; - } + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fopen(path, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; } -static -int -deflate_block(BGZF* fp, int block_length) -{ - // Deflate the block in fp->uncompressed_block into fp->compressed_block. - // Also adds an extra field that stores the compressed block length. - - bgzf_byte_t* buffer = fp->compressed_block; - int buffer_size = fp->compressed_block_size; - - // Init gzip header - buffer[0] = GZIP_ID1; - buffer[1] = GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[4] = 0; // mtime - buffer[5] = 0; - buffer[6] = 0; - buffer[7] = 0; - buffer[8] = 0; - buffer[9] = OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[11] = 0; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - buffer[15] = 0; - buffer[16] = 0; // placeholder for block length - buffer[17] = 0; - - // loop to retry for blocks that do not compress enough - int input_length = block_length; - int compressed_length = 0; - while (1) { - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->uncompressed_block; - zs.avail_in = input_length; - zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, - GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (status != Z_OK) { - report_error(fp, "deflate init failed"); - return -1; - } - status = deflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - deflateEnd(&zs); - if (status == Z_OK) { - // Not enough space in buffer. - // Can happen in the rare case the input doesn't compress enough. - // Reduce the amount of input until it fits. - input_length -= 1024; - if (input_length <= 0) { - // should never happen - report_error(fp, "input reduction failed"); - return -1; - } - continue; - } - report_error(fp, "deflate failed"); - return -1; - } - status = deflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "deflate end failed"); - return -1; - } - compressed_length = zs.total_out; - compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if (compressed_length > MAX_BLOCK_SIZE) { - // should never happen - report_error(fp, "deflate overflow"); - return -1; - } - break; - } - - packInt16((uint8_t*)&buffer[16], compressed_length-1); - uint32_t crc = crc32(0L, NULL, 0L); - crc = crc32(crc, fp->uncompressed_block, input_length); - packInt32((uint8_t*)&buffer[compressed_length-8], crc); - packInt32((uint8_t*)&buffer[compressed_length-4], input_length); - - int remaining = block_length - input_length; - if (remaining > 0) { - if (remaining > input_length) { - // should never happen (check so we can use memcpy) - report_error(fp, "remainder too large"); - return -1; - } - memcpy(fp->uncompressed_block, - fp->uncompressed_block + input_length, - remaining); - } - fp->block_offset = remaining; - return compressed_length; +BGZF *bgzf_dopen(int fd, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; + fp = bgzf_read_init(); + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fdopen(fd, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; } -static -int -inflate_block(BGZF* fp, int block_length) +static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) { - // Inflate the block in fp->compressed_block into fp->uncompressed_block + uint32_t crc; + z_stream zs; + uint8_t *dst = (uint8_t*)_dst; + + // compress the body + zs.zalloc = NULL; zs.zfree = NULL; + zs.next_in = src; + zs.avail_in = slen; + zs.next_out = dst + BLOCK_HEADER_LENGTH; + zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer + if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1; + if (deflateEnd(&zs) != Z_OK) return -1; + *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + // write the header + memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block + packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes + // write the footer + crc = crc32(crc32(0L, NULL, 0L), src, slen); + packInt32((uint8_t*)&dst[*dlen - 8], crc); + packInt32((uint8_t*)&dst[*dlen - 4], slen); + return 0; +} - z_stream zs; - int status; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->compressed_block + 18; - zs.avail_in = block_length - 16; - zs.next_out = fp->uncompressed_block; - zs.avail_out = fp->uncompressed_block_size; +// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. +static int deflate_block(BGZF *fp, int block_length) +{ + int comp_size = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_offset = 0; + return comp_size; +} - status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { - report_error(fp, "inflate init failed"); - return -1; - } - status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - inflateEnd(&zs); - report_error(fp, "inflate failed"); - return -1; - } - status = inflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "inflate failed"); - return -1; - } - return zs.total_out; +// Inflate the block in fp->compressed_block into fp->uncompressed_block +static int inflate_block(BGZF* fp, int block_length) +{ + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = BGZF_MAX_BLOCK_SIZE; + + if (inflateInit2(&zs, -15) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { + inflateEnd(&zs); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflateEnd(&zs) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + return zs.total_out; } -static -int -check_header(const bgzf_byte_t* header) +static int check_header(const uint8_t *header) { - return (header[0] == GZIP_ID1 && - header[1] == (bgzf_byte_t) GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); + return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2); } +#ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (fp->open_mode != 'r') return; + if (fp->is_write) return; for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) free(kh_val(h, k).block); kh_destroy(cache, h); @@ -431,12 +267,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; - memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); -#ifdef _USE_KNETFILE - knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); -#else - fseeko(fp->file, p->end_offset, SEEK_SET); -#endif + memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); + _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); return p->size; } @@ -446,8 +278,8 @@ static void cache_block(BGZF *fp, int size) khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (MAX_BLOCK_SIZE >= fp->cache_size) return; - if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) { /* A better way would be to remove the oldest block in the * cache, but here we remove a random one for simplicity. This * should not have a big impact on performance. */ @@ -463,201 +295,300 @@ static void cache_block(BGZF *fp, int size) p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; - p->block = malloc(MAX_BLOCK_SIZE); - memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); + p->block = malloc(BGZF_MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); } +#else +static void free_cache(BGZF *fp) {} +static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} +static void cache_block(BGZF *fp, int size) {} +#endif -int -bgzf_read_block(BGZF* fp) +int bgzf_read_block(BGZF *fp) { - bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; int count, size = 0, block_length, remaining; -#ifdef _USE_KNETFILE - int64_t block_address = knet_tell(fp->x.fpr); - if (load_block_from_cache(fp, block_address)) return 0; - count = knet_read(fp->x.fpr, header, sizeof(header)); -#else - int64_t block_address = ftello(fp->file); - if (load_block_from_cache(fp, block_address)) return 0; - count = fread(header, 1, sizeof(header), fp->file); -#endif - if (count == 0) { - fp->block_length = 0; - return 0; - } + int64_t block_address; + block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; + count = _bgzf_read(fp->fp, header, sizeof(header)); + if (count == 0) { // no data read + fp->block_length = 0; + return 0; + } + if (count != sizeof(header) || !check_header(header)) { + fp->errcode |= BGZF_ERR_HEADER; + return -1; + } size = count; - if (count != sizeof(header)) { - report_error(fp, "read failed"); - return -1; - } - if (!check_header(header)) { - report_error(fp, "invalid block header"); - return -1; - } - block_length = unpackInt16((uint8_t*)&header[16]) + 1; - bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; - memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - remaining = block_length - BLOCK_HEADER_LENGTH; -#ifdef _USE_KNETFILE - count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); -#else - count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); -#endif - if (count != remaining) { - report_error(fp, "read failed"); - return -1; - } + block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" + compressed_block = (uint8_t*)fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + remaining = block_length - BLOCK_HEADER_LENGTH; + count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); + if (count != remaining) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } size += count; - count = inflate_block(fp, block_length); - if (count < 0) return -1; - if (fp->block_length != 0) { - // Do not reset offset if this read follows a seek. - fp->block_offset = 0; - } - fp->block_address = block_address; - fp->block_length = count; + if ((count = inflate_block(fp, block_length)) < 0) return -1; + if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. + fp->block_address = block_address; + fp->block_length = count; cache_block(fp, size); - return 0; + return 0; } -int -bgzf_read(BGZF* fp, void* data, int length) +ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) { - if (length <= 0) { - return 0; - } - if (fp->open_mode != 'r') { - report_error(fp, "file not open for reading"); - return -1; - } + ssize_t bytes_read = 0; + uint8_t *output = data; + if (length <= 0) return 0; + assert(fp->is_write == 0); + while (bytes_read < length) { + int copy_length, available = fp->block_length - fp->block_offset; + uint8_t *buffer; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) return -1; + available = fp->block_length - fp->block_offset; + if (available <= 0) break; + } + copy_length = length - bytes_read < available? length - bytes_read : available; + buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = fp->block_length = 0; + } + return bytes_read; +} - int bytes_read = 0; - bgzf_byte_t* output = data; - while (bytes_read < length) { - int copy_length, available = fp->block_length - fp->block_offset; - bgzf_byte_t *buffer; - if (available <= 0) { - if (bgzf_read_block(fp) != 0) { - return -1; - } - available = fp->block_length - fp->block_offset; - if (available <= 0) { - break; - } - } - copy_length = bgzf_min(length-bytes_read, available); - buffer = fp->uncompressed_block; - memcpy(output, buffer + fp->block_offset, copy_length); - fp->block_offset += copy_length; - output += copy_length; - bytes_read += copy_length; - } - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return bytes_read; +/***** BEGIN: multi-threading *****/ + +typedef struct { + BGZF *fp; + struct mtaux_t *mt; + void *buf; + int i, errcode, toproc; +} worker_t; + +typedef struct mtaux_t { + int n_threads, n_blks, curr, done; + volatile int proc_cnt; + void **blk; + int *len; + worker_t *w; + pthread_t *tid; + pthread_mutex_t lock; + pthread_cond_t cv; +} mtaux_t; + +static int worker_aux(worker_t *w) +{ + int i, tmp, stop = 0; + // wait for condition: to process or all done + pthread_mutex_lock(&w->mt->lock); + while (!w->toproc && !w->mt->done) + pthread_cond_wait(&w->mt->cv, &w->mt->lock); + if (w->mt->done) stop = 1; + w->toproc = 0; + pthread_mutex_unlock(&w->mt->lock); + if (stop) return 1; // to quit the thread + w->errcode = 0; + for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { + int clen = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) + w->errcode |= BGZF_ERR_ZLIB; + memcpy(w->mt->blk[i], w->buf, clen); + w->mt->len[i] = clen; + } + tmp = __sync_fetch_and_add(&w->mt->proc_cnt, 1); + return 0; } -int bgzf_flush(BGZF* fp) +static void *mt_worker(void *data) { - while (fp->block_offset > 0) { - int count, block_length; - block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) return -1; -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif - if (count != block_length) { - report_error(fp, "write failed"); - return -1; - } - fp->block_address += block_length; - } - return 0; + while (worker_aux(data) == 0); + return 0; +} + +int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) +{ + int i; + mtaux_t *mt; + pthread_attr_t attr; + if (!fp->is_write || fp->mt || n_threads <= 1) return -1; + mt = calloc(1, sizeof(mtaux_t)); + mt->n_threads = n_threads; + mt->n_blks = n_threads * n_sub_blks; + mt->len = calloc(mt->n_blks, sizeof(int)); + mt->blk = calloc(mt->n_blks, sizeof(void*)); + for (i = 0; i < mt->n_blks; ++i) + mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); + mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master + mt->w = calloc(mt->n_threads, sizeof(worker_t)); + for (i = 0; i < mt->n_threads; ++i) { + mt->w[i].i = i; + mt->w[i].mt = mt; + mt->w[i].fp = fp; + mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); + } + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_mutex_init(&mt->lock, 0); + pthread_cond_init(&mt->cv, 0); + for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread + pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); + fp->mt = mt; + return 0; +} + +static void mt_destroy(mtaux_t *mt) +{ + int i; + // signal all workers to quit + pthread_mutex_lock(&mt->lock); + mt->done = 1; mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread + // free other data allocated on heap + for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]); + for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); + free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); + pthread_cond_destroy(&mt->cv); + pthread_mutex_destroy(&mt->lock); + free(mt); } -int bgzf_flush_try(BGZF *fp, int size) +static void mt_queue(BGZF *fp) { - if (fp->block_offset + size > fp->uncompressed_block_size) - return bgzf_flush(fp); + mtaux_t *mt = (mtaux_t*)fp->mt; + assert(mt->curr < mt->n_blks); // guaranteed by the caller + memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); + mt->len[mt->curr] = fp->block_offset; + fp->block_offset = 0; + ++mt->curr; +} + +static int mt_flush(BGZF *fp) +{ + int i; + mtaux_t *mt = (mtaux_t*)fp->mt; + if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail + // signal all the workers to compress + pthread_mutex_lock(&mt->lock); + for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; + mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + // worker 0 is doing things here + worker_aux(&mt->w[0]); + // wait for all the threads to complete + while (mt->proc_cnt < mt->n_threads); + // dump data to disk + for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; + for (i = 0; i < mt->curr; ++i) + if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i]) + fp->errcode |= BGZF_ERR_IO; + mt->curr = 0; + return 0; +} + +static int mt_lazy_flush(BGZF *fp) +{ + mtaux_t *mt = (mtaux_t*)fp->mt; + mt_queue(fp); + if (mt->curr == mt->n_blks) + return mt_flush(fp); return -1; } -int bgzf_write(BGZF* fp, const void* data, int length) +static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length) { - const bgzf_byte_t *input = data; - int block_length, bytes_written; - if (fp->open_mode != 'w') { - report_error(fp, "file not open for writing"); - return -1; - } + const uint8_t *input = data; + ssize_t rest = length; + while (rest) { + int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? BGZF_BLOCK_SIZE - fp->block_offset : rest; + memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; input += copy_length; rest -= copy_length; + if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp); + } + return length - rest; +} - if (fp->uncompressed_block == NULL) - fp->uncompressed_block = malloc(fp->uncompressed_block_size); - - input = data; - block_length = fp->uncompressed_block_size; - bytes_written = 0; - while (bytes_written < length) { - int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); - bgzf_byte_t* buffer = fp->uncompressed_block; - memcpy(buffer + fp->block_offset, input, copy_length); - fp->block_offset += copy_length; - input += copy_length; - bytes_written += copy_length; - if (fp->block_offset == block_length) { - if (bgzf_flush(fp) != 0) { - break; - } - } - } - return bytes_written; +/***** END: multi-threading *****/ + +int bgzf_flush(BGZF *fp) +{ + if (!fp->is_write) return 0; + if (fp->mt) return mt_flush(fp); + while (fp->block_offset > 0) { + int block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; + if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { + fp->errcode |= BGZF_ERR_IO; // possibly truncated file + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int bgzf_flush_try(BGZF *fp, ssize_t size) +{ + if (fp->block_offset + size > BGZF_BLOCK_SIZE) { + if (fp->mt) return mt_lazy_flush(fp); + else return bgzf_flush(fp); + } + return -1; +} + +ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) +{ + const uint8_t *input = data; + int block_length = BGZF_BLOCK_SIZE, bytes_written = 0; + assert(fp->is_write); + if (fp->mt) return mt_write(fp, data, length); + while (bytes_written < length) { + uint8_t* buffer = fp->uncompressed_block; + int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length && bgzf_flush(fp)) break; + } + return bytes_written; } int bgzf_close(BGZF* fp) { - if (fp->open_mode == 'w') { - if (bgzf_flush(fp) != 0) return -1; - { // add an empty block - int count, block_length = deflate_block(fp, 0); -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif + int ret, count, block_length; + if (fp == 0) return -1; + if (fp->is_write) { + if (bgzf_flush(fp) != 0) return -1; + fp->compress_level = -1; + block_length = deflate_block(fp, 0); // write an empty block + count = fwrite(fp->compressed_block, 1, block_length, fp->fp); + if (fflush(fp->fp) != 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; } -#ifdef _USE_KNETFILE - if (fflush(fp->x.fpw) != 0) { -#else - if (fflush(fp->file) != 0) { -#endif - report_error(fp, "flush failed"); - return -1; - } - } - if (fp->owned_file) { -#ifdef _USE_KNETFILE - int ret; - if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); - else ret = knet_close(fp->x.fpr); - if (ret != 0) return -1; -#else - if (fclose(fp->file) != 0) return -1; -#endif - } - free(fp->uncompressed_block); - free(fp->compressed_block); + if (fp->mt) mt_destroy(fp->mt); + } + ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp); + if (ret != 0) return -1; + free(fp->uncompressed_block); + free(fp->compressed_block); free_cache(fp); - free(fp); - return 0; + free(fp); + return 0; } void bgzf_set_cache_size(BGZF *fp, int cache_size) @@ -670,17 +601,10 @@ int bgzf_check_EOF(BGZF *fp) static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; uint8_t buf[28]; off_t offset; -#ifdef _USE_KNETFILE - offset = knet_tell(fp->x.fpr); - if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; - knet_read(fp->x.fpr, buf, 28); - knet_seek(fp->x.fpr, offset, SEEK_SET); -#else - offset = ftello(fp->file); - if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; - fread(buf, 1, 28, fp->file); - fseeko(fp->file, offset, SEEK_SET); -#endif + offset = _bgzf_tell((_bgzf_file_t)fp->fp); + if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; + _bgzf_read(fp->fp, buf, 28); + _bgzf_seek(fp->fp, offset, SEEK_SET); return (memcmp(magic, buf, 28) == 0)? 1 : 0; } @@ -689,26 +613,82 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) int block_offset; int64_t block_address; - if (fp->open_mode != 'r') { - report_error(fp, "file not open for read"); - return -1; - } - if (where != SEEK_SET) { - report_error(fp, "unimplemented seek option"); - return -1; + if (fp->is_write || where != SEEK_SET) { + fp->errcode |= BGZF_ERR_MISUSE; + return -1; + } + block_offset = pos & 0xFFFF; + block_address = pos >> 16; + if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + +int bgzf_is_bgzf(const char *fn) +{ + uint8_t buf[16]; + int n; + _bgzf_file_t fp; + if ((fp = _bgzf_open(fn, "r")) == 0) return 0; + n = _bgzf_read(fp, buf, 16); + _bgzf_close(fp); + if (n != 16) return 0; + return memcmp(g_magic, buf, 16) == 0? 1 : 0; +} + +int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; } - block_offset = pos & 0xFFFF; - block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; -#ifdef _USE_KNETFILE - if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { -#else - if (fseeko(fp->file, block_address, SEEK_SET) != 0) { + return c; +} + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif - report_error(fp, "seek failed"); - return -1; - } - fp->block_length = 0; // indicates current block is not loaded - fp->block_address = block_address; - fp->block_offset = block_offset; - return 0; + +int bgzf_getline(BGZF *fp, int delim, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; } diff --git a/bgzf.h b/bgzf.h index 7295f37..cb67681 100644 --- a/bgzf.h +++ b/bgzf.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011, 2012 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -21,137 +22,186 @@ THE SOFTWARE. */ +/* The BGZF library was originally written by Bob Handsaker from the Broad + * Institute. It was later improved by the SAMtools developers. */ + #ifndef __BGZF_H #define __BGZF_H #include #include #include -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif +#include + +#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE +#define BGZF_MAX_BLOCK_SIZE 0x10000 -//typedef int8_t bool; +#define BGZF_ERR_ZLIB 1 +#define BGZF_ERR_HEADER 2 +#define BGZF_ERR_IO 4 +#define BGZF_ERR_MISUSE 8 typedef struct { - int file_descriptor; - char open_mode; // 'r' or 'w' - int16_t owned_file, compress_level; -#ifdef _USE_KNETFILE - union { - knetFile *fpr; - FILE *fpw; - } x; -#else - FILE* file; -#endif - int uncompressed_block_size; - int compressed_block_size; - void* uncompressed_block; - void* compressed_block; - int64_t block_address; - int block_length; - int block_offset; + int errcode:16, is_write:2, compress_level:14; int cache_size; - const char* error; + int block_length, block_offset; + int64_t block_address; + void *uncompressed_block, *compressed_block; void *cache; // a pointer to a hash table + void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading + void *mt; // only used for multi-threading } BGZF; +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + #ifdef __cplusplus extern "C" { #endif -/* - * Open an existing file descriptor for reading or writing. - * Mode must be either "r" or "w". - * A subsequent bgzf_close will not close the file descriptor. - * Returns null on error. - */ -BGZF* bgzf_fdopen(int fd, const char* __restrict mode); - -/* - * Open the specified file for reading or writing. - * Mode must be either "r" or "w". - * Returns null on error. - */ -BGZF* bgzf_open(const char* path, const char* __restrict mode); - -/* - * Close the BGZ file and free all associated resources. - * Does not close the underlying file descriptor if created with bgzf_fdopen. - * Returns zero on success, -1 on error. - */ -int bgzf_close(BGZF* fp); - -/* - * Read up to length bytes from the file storing into data. - * Returns the number of bytes actually read. - * Returns zero on end of file. - * Returns -1 on error. - */ -int bgzf_read(BGZF* fp, void* data, int length); - -/* - * Write length bytes from data to the file. - * Returns the number of bytes written. - * Returns -1 on error. - */ -int bgzf_write(BGZF* fp, const void* data, int length); - -/* - * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent - * call to bgzf_seek can be used to position the file at the same point. - * Return value is non-negative on success. - * Returns -1 on error. - */ -#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) - -/* - * Set the file to read from the location specified by pos, which must - * be a value previously returned by bgzf_tell for this file (but not - * necessarily one returned by this file handle). - * The where argument must be SEEK_SET. - * Seeking on a file opened for write is not supported. - * Returns zero on success, -1 on error. - */ -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); - -/* - * Set the cache size. Zero to disable. By default, caching is - * disabled. The recommended cache size for frequent random access is - * about 8M bytes. - */ -void bgzf_set_cache_size(BGZF *fp, int cache_size); - -int bgzf_check_EOF(BGZF *fp); -int bgzf_read_block(BGZF* fp); -int bgzf_flush(BGZF* fp); -int bgzf_flush_try(BGZF *fp, int size); -int bgzf_check_bgzf(const char *fn); + /****************** + * Basic routines * + ******************/ + + /** + * Open an existing file descriptor for reading or writing. + * + * @param fd file descriptor + * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies + * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. + * @return BGZF file handler; 0 on error + */ + BGZF* bgzf_dopen(int fd, const char *mode); + + #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility + + /** + * Open the specified file for reading or writing. + */ + BGZF* bgzf_open(const char* path, const char *mode); + + /** + * Close the BGZF and free all associated resources. + * + * @param fp BGZF file handler + * @return 0 on success and -1 on error + */ + int bgzf_close(BGZF *fp); + + /** + * Read up to _length_ bytes from the file storing into _data_. + * + * @param fp BGZF file handler + * @param data data array to read into + * @param length size of data to read + * @return number of bytes actually read; 0 on end-of-file and -1 on error + */ + ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); + + /** + * Write _length_ bytes from _data_ to the file. + * + * @param fp BGZF file handler + * @param data data array to write + * @param length size of data to write + * @return number of bytes actually written; -1 on error + */ + ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); + + /** + * Write the data in the buffer to the file. + */ + int bgzf_flush(BGZF *fp); + + /** + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + */ + #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + + /** + * Set the file to read from the location specified by _pos_. + * + * @param fp BGZF file handler + * @param pos virtual file offset returned by bgzf_tell() + * @param whence must be SEEK_SET + * @return 0 on success and -1 on error + */ + int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); + + /** + * Check if the BGZF end-of-file (EOF) marker is present + * + * @param fp BGZF file handler opened for reading + * @return 1 if EOF is present; 0 if not or on I/O error + */ + int bgzf_check_EOF(BGZF *fp); + + /** + * Check if a file is in the BGZF format + * + * @param fn file name + * @return 1 if _fn_ is BGZF; 0 if not or on I/O error + */ + int bgzf_is_bgzf(const char *fn); + + /********************* + * Advanced routines * + *********************/ + + /** + * Set the cache size. Only effective when compiled with -DBGZF_CACHE. + * + * @param fp BGZF file handler + * @param size size of cache in bytes; 0 to disable caching (default) + */ + void bgzf_set_cache_size(BGZF *fp, int size); + + /** + * Flush the file if the remaining buffer size is smaller than _size_ + */ + int bgzf_flush_try(BGZF *fp, ssize_t size); + + /** + * Read one byte from a BGZF file. It is faster than bgzf_read() + * @param fp BGZF file handler + * @return byte read; -1 on end-of-file or error + */ + int bgzf_getc(BGZF *fp); + + /** + * Read one line from a BGZF file. It is faster than bgzf_getc() + * + * @param fp BGZF file handler + * @param delim delimitor + * @param str string to write to; must be initialized + * @return length of the string; 0 on end-of-file; negative on error + */ + int bgzf_getline(BGZF *fp, int delim, kstring_t *str); + + /** + * Read the next BGZF block. + */ + int bgzf_read_block(BGZF *fp); + + /** + * Enable multi-threading (only effective on writing) + * + * @param fp BGZF file handler; must be opened for writing + * @param n_threads #threads used for writing + * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + */ + int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); #ifdef __cplusplus } #endif -static inline int bgzf_getc(BGZF *fp) -{ - int c; - if (fp->block_offset >= fp->block_length) { - if (bgzf_read_block(fp) != 0) return -2; /* error */ - if (fp->block_length == 0) return -1; /* end-of-file */ - } - c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return c; -} - #endif diff --git a/kprobaln.c b/kprobaln.c index 894a2ae..04e526a 100644 --- a/kprobaln.c +++ b/kprobaln.c @@ -77,6 +77,8 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer const uint8_t *ref, *query; int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; + if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault + /*** initialization ***/ is_backward = state && q? 1 : 0; ref = _ref - 1; query = _query - 1; // change to 1-based coordinate @@ -87,7 +89,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] f = calloc(l_query+1, sizeof(void*)); if (is_backward) b = calloc(l_query+1, sizeof(void*)); - for (i = 0; i <= l_query; ++i) { + for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0 f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); } diff --git a/misc/Makefile b/misc/Makefile index 64aa9df..e1a5add 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= -PROG= md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim seqtk bamcheck +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim bamcheck INCLUDES= -I.. SUBDIRS= . @@ -28,7 +28,7 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: bamcheck:bamcheck.o - $(CC) $(CFLAGS) -o $@ bamcheck.o -lm -lz -L.. -lbam + $(CC) $(CFLAGS) -o $@ bamcheck.o -lm -lz -L.. -lbam -lpthread bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h $(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c @@ -36,9 +36,6 @@ bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h ace2sam:ace2sam.o $(CC) $(CFLAGS) -o $@ ace2sam.o -lz -seqtk:seqtk.o - $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz - wgsim:wgsim.o $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz @@ -60,9 +57,6 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c -seqtk.o:seqtk.c ../khash.h ../kseq.h - $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c - wgsim.o:wgsim.c ../kseq.h $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c diff --git a/misc/bamcheck.c b/misc/bamcheck.c index 03cf3b2..76a9f6e 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -819,23 +819,23 @@ void output_stats(stats_t *stats) printf(" %s",stats->argv[i]); printf("\n"); printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); - printf("SN\tsequences:\t%ld\n", stats->nreads_1st+stats->nreads_2nd); + printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); - printf("SN\t1st fragments:\t%ld\n", stats->nreads_1st); - printf("SN\tlast fragments:\t%ld\n", stats->nreads_2nd); - printf("SN\treads mapped:\t%ld\n", stats->nreads_paired+stats->nreads_unpaired); - printf("SN\treads unmapped:\t%ld\n", stats->nreads_unmapped); - printf("SN\treads unpaired:\t%ld\n", stats->nreads_unpaired); - printf("SN\treads paired:\t%ld\n", stats->nreads_paired); - printf("SN\treads duplicated:\t%ld\n", stats->nreads_dup); - printf("SN\treads MQ0:\t%ld\n", stats->nreads_mq0); - printf("SN\ttotal length:\t%ld\n", stats->total_len); - printf("SN\tbases mapped:\t%ld\n", stats->nbases_mapped); - printf("SN\tbases mapped (cigar):\t%ld\n", stats->nbases_mapped_cigar); - printf("SN\tbases trimmed:\t%ld\n", stats->nbases_trimmed); - printf("SN\tbases duplicated:\t%ld\n", stats->total_len_dup); - printf("SN\tmismatches:\t%ld\n", stats->nmismatches); + printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); + printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); + printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); + printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); + printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); + printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); + printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); + printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); + printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); + printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); + printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); + printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); + printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); + printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches); printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; printf("SN\taverage length:\t%.0f\n", avg_read_length); @@ -843,9 +843,9 @@ void output_stats(stats_t *stats) printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); printf("SN\tinsert size average:\t%.1f\n", avg_isize); printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); - printf("SN\tinward oriented pairs:\t%ld\n", nisize_inward); - printf("SN\toutward oriented pairs:\t%ld\n", nisize_outward); - printf("SN\tpairs with other orientation:\t%ld\n", nisize_other); + printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); + printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); + printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); int ibase,iqual; if ( stats->max_lennbases ) stats->max_len++; @@ -857,7 +857,7 @@ void output_stats(stats_t *stats) printf("FFQ\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->quals_1st[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -868,7 +868,7 @@ void output_stats(stats_t *stats) printf("LFQ\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->quals_2nd[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -882,7 +882,7 @@ void output_stats(stats_t *stats) printf("MPC\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->mpc_buf[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -892,7 +892,7 @@ void output_stats(stats_t *stats) for (ibase=0; ibasengc; ibase++) { if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; - printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_1st[ibase_prev]); + printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); ibase_prev = ibase; } printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); @@ -900,7 +900,7 @@ void output_stats(stats_t *stats) for (ibase=0; ibasengc; ibase++) { if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; - printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_2nd[ibase_prev]); + printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); ibase_prev = ibase; } printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); @@ -913,37 +913,37 @@ void output_stats(stats_t *stats) } printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); for (isize=1; isizeisize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), - stats->isize_inward[isize],stats->isize_outward[isize],stats->isize_other[isize]); + printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), + (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); int ilen; for (ilen=0; ilenmax_len; ilen++) { if ( stats->read_lengths[ilen]>0 ) - printf("RL\t%d\t%ld\n", ilen,stats->read_lengths[ilen]); + printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); } printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); for (ilen=0; ilennindels; ilen++) { if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) - printf("ID\t%d\t%ld\t%ld\n", ilen+1,stats->insertions[ilen],stats->deletions[ilen]); + printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); } printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n"); for (ilen=0; ilennbases; ilen++) { if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 ) - printf("IC\t%d\t%ld\t%ld\n", ilen+1,stats->ins_cycles[ilen],stats->del_cycles[ilen]); + printf("IC\t%d\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles[ilen], (long)stats->del_cycles[ilen]); } printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); - printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1,stats->cov[0]); + printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); int icov; for (icov=1; icovncov-1; icov++) - printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1,stats->cov[icov]); - printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov[stats->ncov-1]); + printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); + printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); // Calculate average GC content, then sort by GC and depth diff --git a/misc/seqtk.c b/misc/seqtk.c deleted file mode 100644 index 591ddff..0000000 --- a/misc/seqtk.c +++ /dev/null @@ -1,783 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -typedef struct { - int n, m; - uint64_t *a; -} reglist_t; - -#include "khash.h" -KHASH_MAP_INIT_STR(reg, reglist_t) - -typedef kh_reg_t reghash_t; - -reghash_t *stk_reg_read(const char *fn) -{ - reghash_t *h = kh_init(reg); - gzFile fp; - kstream_t *ks; - int dret; - kstring_t *str; - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int beg = -1, end = -1; - reglist_t *p; - khint_t k = kh_get(reg, h, str->s); - if (k == kh_end(h)) { - int ret; - char *s = strdup(str->s); - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - beg = atoi(str->s); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - end = atoi(str->s); - if (end < 0) end = -1; - } - } - } - } - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column - if (beg < 0) beg = 0, end = INT_MAX; - if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | end; - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return h; -} - -void stk_reg_destroy(reghash_t *h) -{ - khint_t k; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); -} - -/* constant table */ - -unsigned char seq_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; - -char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; -unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - -/* composition */ -int stk_comp(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, upper_only = 0; - reghash_t *h = 0; - reglist_t dummy; - while ((c = getopt(argc, argv, "ur:")) >= 0) { - switch (c) { - case 'u': upper_only = 1; break; - case 'r': h = stk_reg_read(optarg); break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); - fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); - while ((l = kseq_read(seq)) >= 0) { - int i, k; - reglist_t *p = 0; - if (h) { - khint_t k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) p = &kh_val(h, k); - } else { - p = &dummy; - dummy.a[0] = l; - } - for (k = 0; p && k < p->n; ++k) { - int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; - int la, lb, lc, na, nb, nc, cnt[11]; - if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; - else la = 'a', lb = -1, lc = 0; - na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - memset(cnt, 0, 11 * sizeof(int)); - for (i = beg; i < end; ++i) { - int is_CpG = 0, a, b, c; - a = na; b = nb; c = nc; - na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - if (b == 2 || b == 10) { // C or Y - if (nb == 4 || nb == 5) is_CpG = 1; - } else if (b == 4 || b == 5) { // G or R - if (lb == 2 || lb == 10) is_CpG = 1; - } - if (upper_only == 0 || isupper(a)) { - if (c > 1) ++cnt[c+2]; - if (c == 1) ++cnt[seq_nt16to4_table[b]]; - if (b == 10 || b == 5) ++cnt[9]; - else if (c == 2) { - ++cnt[8]; - } - if (is_CpG) { - ++cnt[7]; - if (b == 10 || b == 5) ++cnt[10]; - } - } - la = a; lb = b; lc = c; - } - if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); - else printf("%s\t%d", seq->name.s, l); - for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); - putchar('\n'); - } - fflush(stdout); - } - free(dummy.a); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_randbase(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk randbase \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - int i; - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - int c, b, a, j, k, m; - b = seq->seq.s[i]; - c = seq_nt16_table[b]; - a = bitcnt_table[c]; - if (a == 2) { - m = (drand48() < 0.5); - for (j = k = 0; j < 4; ++j) { - if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; - } - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_hety(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; - char *buf; - uint32_t cnt[3]; - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk hety [options] \n\n"); - fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); - fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); - fprintf(stderr, " -m treat lowercases as masked\n"); - fprintf(stderr, "\n"); - return 1; - } - while ((c = getopt(argc, argv, "w:t:m")) >= 0) { - switch (c) { - case 'w': win_size = atoi(optarg); break; - case 't': n_start = atoi(optarg); break; - case 'm': is_lower_mask = 1; break; - } - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - win_step = win_size / n_start; - buf = calloc(win_size, 1); - while ((l = kseq_read(seq)) >= 0) { - int x, i, y, z, next = 0; - cnt[0] = cnt[1] = cnt[2] = 0; - for (i = 0; i <= l; ++i) { - if ((i >= win_size && i % win_step == 0) || i == l) { - if (i == l && l >= win_size) { - for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; - } - if (cnt[1] + cnt[2] > 0) - printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, - (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); - next = i; - } - if (i < l) { - y = i % win_size; - c = seq->seq.s[i]; - if (is_lower_mask && islower(c)) c = 'N'; - c = seq_nt16_table[c]; - x = bitcnt_table[c]; - if (i >= win_size) --cnt[(int)buf[y]]; - buf[y] = z = x > 2? 0 : x == 2? 2 : 1; - ++cnt[z]; - } - } - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* fq2fa */ -int stk_fq2fa(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - char *buf; - int l, i, c, qual_thres = 0, linelen = 60; - while ((c = getopt(argc, argv, "q:l:")) >= 0) { - switch (c) { - case 'q': qual_thres = atoi(optarg); break; - case 'l': linelen = atoi(optarg); break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] \n"); - return 1; - } - buf = linelen > 0? malloc(linelen + 1) : 0; - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (seq->qual.l && qual_thres > 0) { - for (i = 0; i < l; ++i) - if (seq->qual.s[i] - 33 < qual_thres) - seq->seq.s[i] = tolower(seq->seq.s[i]); - } - putchar('>'); - if (seq->comment.l) { - fputs(seq->name.s, stdout); - putchar(' '); - puts(seq->comment.s); - } else puts(seq->name.s); - if (buf) { // multi-line - for (i = 0; i < l; i += linelen) { - int x = i + linelen < l? linelen : l - i; - memcpy(buf, seq->seq.s + i, x); - buf[x] = 0; - puts(buf); - } - } else puts(seq->seq.s); - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_maskseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_complement = 0, is_lower = 0; - khint_t k; - while ((c = getopt(argc, argv, "cl")) >= 0) { - switch (c) { - case 'c': is_complement = 1; break; - case 'l': is_lower = 1; break; - } - } - if (argc - optind < 2) { - fprintf(stderr, "Usage: seqtk maskseq [-cl] \n\n"); - fprintf(stderr, "Options: -c mask the complement regions\n"); - fprintf(stderr, " -l soft mask (to lower cases)\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // maskseq - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) { // not found in the hash table - if (is_complement) { - for (j = 0; j < l; ++j) - seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; - } - } else { - reglist_t *p = &kh_val(h, k); - if (!is_complement) { - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[maskseq] start position >= the sequence length.\n"); - continue; - } - if (end >= seq->seq.l) end = seq->seq.l; - if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); - else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; - } - } else { - int8_t *mask = calloc(seq->seq.l, 1); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (end >= seq->seq.l) end = seq->seq.l; - for (j = beg; j < end; ++j) mask[j] = 1; - } - for (j = 0; j < l; ++j) - if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; - free(mask); - } - } - printf(">%s", seq->name.s); - for (j = 0; j < seq->seq.l; ++j) { - if (j%60 == 0) putchar('\n'); - putchar(seq->seq.s[j]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* subseq */ - -int stk_subseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_tab = 0; - khint_t k; - while ((c = getopt(argc, argv, "t")) >= 0) { - switch (c) { - case 't': is_tab = 1; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: seqtk subseq [-t] \n\n"); - fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // subseq - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); - continue; - } - if (end > seq->seq.l) end = seq->seq.l; - if (is_tab == 0) { - printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); - if (end == INT_MAX) { - if (beg) printf(":%d", beg+1); - } else printf(":%d-%d", beg+1, end); - } else printf("%s\t%d\t", seq->name.s, beg + 1); - if (end > seq->seq.l) end = seq->seq.l; - for (j = 0; j < end - beg; ++j) { - if (is_tab == 0 && j % 60 == 0) putchar('\n'); - putchar(seq->seq.s[j + beg]); - } - putchar('\n'); - if (seq->qual.l != seq->seq.l || is_tab) continue; - printf("+"); - for (j = 0; j < end - beg; ++j) { - if (j % 60 == 0) putchar('\n'); - putchar(seq->qual.s[j + beg]); - } - putchar('\n'); - } - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* mergefa */ -int stk_mergefa(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0; - while ((c = getopt(argc, argv, "himq:")) >= 0) { - switch (c) { - case 'i': is_intersect = 1; break; - case 'h': is_haploid = 1; break; - case 'm': is_mask = 1; break; - case 'q': qual = atoi(optarg); break; - } - } - if (is_mask && is_intersect) { - fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); - return 1; - } - if (optind + 2 > argc) { - fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); - fprintf(stderr, "Options: -q INT quality threshold [0]\n"); - fprintf(stderr, " -i take intersection\n"); - fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n"); - fprintf(stderr, " -h suppress hets in the input\n\n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2], is_upper; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); - if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); - if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0; - c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; - if (c[0] == 0) c[0] = 15; - if (c[1] == 0) c[1] = 15; - if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0; - if (is_intersect) { - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; - } else if (is_mask) { - if (c[0] == 15 || c[1] == 15) is_upper = 0; - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; - } else c[0] = c[0] | c[1]; - c[0] = seq_nt16_rev_table[c[0]]; - if (!is_upper) c[0] = tolower(c[0]); - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_famask(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk famask \n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2]; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (c[1] == 'x') c[0] = tolower(c[0]); - else if (c[1] != 'X') c[0] = c[1]; - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_mutfa(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - kstream_t *ks; - int l, i, dret; - kstring_t *str; - khint_t k; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mutfa \n\n"); - fprintf(stderr, "Note: contains at least four columns per line which are:\n"); - fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); - return 1; - } - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - char *s = strdup(str->s); - int beg = 0, ret; - reglist_t *p; - k = kh_get(reg, h, s); - if (k == kh_end(h)) { - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col - ks_getuntil(ks, 0, str, &dret); // 3rd col - ks_getuntil(ks, 0, str, &dret); // 4th col - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (isalpha(str->s[0]) && str->l == 1) { - if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; - } - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - // mutfa - fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) { - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32; - if (beg < seq->seq.l) - seq->seq.s[beg] = (int)p->a[i]; - } - } - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); - return 0; -} - -int stk_listhet(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int i, l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk listhet \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - for (i = 0; i < l; ++i) { - int b = seq->seq.s[i]; - if (bitcnt_table[seq_nt16_table[b]] == 2) - printf("%s\t%d\t%c\n", seq->name.s, i+1, b); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* cutN */ -static int cutN_min_N_tract = 1000; -static int cutN_nonN_penalty = 10; - -static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) -{ - int i, b, e; - while (k < ks->seq.l) { - if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { - int score, max; - score = 0; e = max = -1; - for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, e = i; - } - score = 0; b = max = -1; - for (i = e; i >= 0 && score >= 0; --i) { /* backward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, b = i; - } - if (e + 1 - b >= cutN_min_N_tract) { - *begin = b; - *end = e + 1; - return *end; - } - k = e + 1; - } else ++k; - } - return -1; -} -static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) -{ - int i; - if (begin >= end) return; // FIXME: why may this happen? Understand it! - fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end); - for (i = begin; i < end && i < ks->seq.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->seq.s[i], fpout); - } - fputc('\n', fpout); -} -int stk_cutN(int argc, char *argv[]) -{ - int c, l, gap_only = 0; - gzFile fp; - kseq_t *ks; - while ((c = getopt(argc, argv, "n:p:g")) >= 0) { - switch (c) { - case 'n': cutN_min_N_tract = atoi(optarg); break; - case 'p': cutN_nonN_penalty = atoi(optarg); break; - case 'g': gap_only = 1; break; - default: return 1; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); - fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); - fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); - fprintf(stderr, " -g print gaps only, no sequence\n\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while ((l = kseq_read(ks)) >= 0) { - int k = 0, begin = 0, end = 0; - while (find_next_cut(ks, k, &begin, &end) >= 0) { - if (begin != 0) { - if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); - else print_seq(stdout, ks, k, begin); - } - k = end; - } - if (!gap_only) print_seq(stdout, ks, k, l); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* main function */ -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk \n\n"); - fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n"); - fprintf(stderr, " hety regional heterozygosity\n"); - fprintf(stderr, " fq2fa convert FASTQ to FASTA\n"); - fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); - fprintf(stderr, " maskseq mask sequences\n"); - fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); - fprintf(stderr, " mergefa merge two FASTA/Q files\n"); - fprintf(stderr, " randbase choose a random base from hets\n"); - fprintf(stderr, " cutN cut sequence at long N\n"); - fprintf(stderr, " listhet extract the position of each het\n"); - fprintf(stderr, "\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - if (argc == 1) return usage(); - if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); - else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); - else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1); - else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); - else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1); - else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); - else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); - else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); - else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); - else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); - else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); - else { - fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); - return 1; - } - return 0; -} diff --git a/padding.c b/padding.c index fb3bc13..033a791 100644 --- a/padding.c +++ b/padding.c @@ -37,14 +37,17 @@ static void unpad_seq(bam1_t *b, kstring_t *s) int op, ol; op = bam_cigar_op(cigar[k]); ol = bam_cigar_oplen(cigar[k]); - assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP); - if (op == BAM_CMATCH) { - for (i = 0; i < ol; ++i) s->s[s->l++] = bam1_seqi(seq, j); - ++j; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j); } else if (op == BAM_CSOFT_CLIP) { j += ol; - } else { + } else if (op == BAM_CHARD_CLIP) { + /* do nothing */ + } else if (op == BAM_CDEL || op == BAM_CPAD) { for (i = 0; i < ol; ++i) s->s[s->l++] = 0; + } else { + fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b)); + assert(-1); } } } @@ -54,10 +57,12 @@ int bam_pad2unpad(bamFile in, bamFile out) bam_header_t *h; bam1_t *b; kstring_t r, q; + int r_tid = -1; uint32_t *cigar2 = 0; int n2 = 0, m2 = 0, *posmap = 0; h = bam_header_read(in); + /* TODO - The reference sequence lengths in the BAM + SAM headers should be updated */ bam_header_write(out, h); b = bam_init1(); r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; @@ -66,20 +71,46 @@ int bam_pad2unpad(bamFile in, bamFile out) n2 = 0; if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) { int i, k; + /* + fprintf(stderr, "[depad] Found embedded reference %s\n", bam1_qname(b)); + */ + r_tid = b->core.tid; unpad_seq(b, &r); + if (h->target_len[r_tid] != r.l) { + fprintf(stderr, "[depad] ERROR: (Padded) length of %s is %i in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); + return -1; + } write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); replace_cigar(b, n2, cigar2); posmap = realloc(posmap, r.m * sizeof(int)); for (i = k = 0; i < r.l; ++i) { - posmap[i] = k; // note that a read should NOT start at a padding + posmap[i] = k; if (r.s[i]) ++k; } - } else { + } else if (b->core.n_cigar > 0) { int i, k, op; + if (b->core.tid < 0) { + fprintf(stderr, "[depad] ERROR: Read %s has CIGAR but no RNAME\n", bam1_qname(b)); + return -1; + } else if (b->core.tid != r_tid) { + fprintf(stderr, "[depad] ERROR: Missing %s embedded reference sequence\n", h->target_name[b->core.tid]); + return -1; + } unpad_seq(b, &q); if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]); + if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) { + write_cigar(cigar2, n2, m2, cigar[0]); + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[1]); + } + } + /* Include any pads if starts with an insert */ + for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); + if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD)); + /* Determine CIGAR operator for each base in the aligned read */ for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); + /* Count consecutive CIGAR operators to turn into a CIGAR string */ for (i = k = 1, op = q.s[0]; i < q.l; ++i) { if (op != q.s[i]) { write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); @@ -88,6 +119,13 @@ int bam_pad2unpad(bamFile in, bamFile out) } write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) { + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]); + } + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + } + /* Remove redundant P operators between M operators, e.g. 5M2P10M -> 15M */ for (i = 2; i < n2; ++i) if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH) cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0; @@ -108,13 +146,14 @@ int bam_pad2unpad(bamFile in, bamFile out) int main_pad2unpad(int argc, char *argv[]) { bamFile in, out; + int result=0; if (argc == 1) { fprintf(stderr, "Usage: samtools depad \n"); return 1; } in = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); out = bam_dopen(fileno(stdout), "w"); - bam_pad2unpad(in, out); + result = bam_pad2unpad(in, out); bam_close(in); bam_close(out); - return 0; + return result; } diff --git a/sam.c b/sam.c index ba3158f..fa11df6 100644 --- a/sam.c +++ b/sam.c @@ -36,6 +36,13 @@ static void append_header_text(bam_header_t *header, char* text, int len) header->text[header->l_text] = 0; } +int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) +{ + if (!(fp->type&1) || (fp->type&2)) return -1; + bgzf_mt(fp->x.bam, n_threads, n_sub_blks); + return 0; +} + samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; diff --git a/sam.h b/sam.h index 0b87194..0495501 100644 --- a/sam.h +++ b/sam.h @@ -90,6 +90,7 @@ extern "C" { int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); char *samfaipath(const char *fn_ref); + int samthreads(samfile_t *fp, int n_threads, int n_sub_blks); #ifdef __cplusplus } diff --git a/sam_view.c b/sam_view.c index 18b1282..c7a4774 100644 --- a/sam_view.c +++ b/sam_view.c @@ -21,8 +21,9 @@ typedef khash_t(rg) *rghash_t; // FIXME: we'd better use no global variables... static rghash_t g_rghash = 0; -static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0; -static float g_subsam = -1; +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0, g_min_qlen = 0; +static uint32_t g_subsam_seed = 0; +static double g_subsam_frac = -1.; static char *g_library, *g_rg; static void *g_bed; @@ -40,14 +41,21 @@ static int process_aln(const bam_header_t *h, bam1_t *b) qual[i] = c < 93? c : 93; } } + if (g_min_qlen > 0) { + int k, qlen = 0; + uint32_t *cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) + qlen += bam_cigar_oplen(cigar[k]); + if (qlen < g_min_qlen) return 1; + } if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) return 1; - if (g_subsam > 0.) { - int x = (int)(g_subsam + .499); - uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; - if (k%1024 / 1024.0 >= g_subsam - x) return 1; + if (g_subsam_frac > 0.) { + uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + g_subsam_seed; + if ((double)(k&0xffffff) / 0x1000000 >= g_subsam_frac) return 1; } if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); @@ -119,16 +127,23 @@ static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; - int of_type = BAM_OFDEC, is_long_help = 0; + int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0; int count = 0; samfile_t *in = 0, *out = 0; - char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:")) >= 0) { + while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:@:m:")) >= 0) { switch (c) { - case 's': g_subsam = atof(optarg); break; + case 's': + if ((g_subsam_seed = strtol(optarg, &q, 10)) != 0) { + srand(g_subsam_seed); + g_subsam_seed = rand(); + } + g_subsam_frac = strtod(q, &q); + break; + case 'm': g_min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -151,6 +166,7 @@ int main_samview(int argc, char *argv[]) case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; case 'B': bam_no_B = 1; break; case 'Q': g_qual_scale = atoi(optarg); break; + case '@': n_threads = strtol(optarg, 0, 0); break; default: return usage(is_long_help); } } @@ -208,6 +224,7 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + if (n_threads > 1) samthreads(out, n_threads, 256); if (is_header_only) goto view_end; // no need to print alignments if (argc == optind + 1) { // convert/print the entire file @@ -288,6 +305,7 @@ static int usage(int is_long_help) fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); fprintf(stderr, " -B collapse the backward CIGAR operation\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); @@ -358,12 +376,14 @@ int main_bam2fq(int argc, char *argv[]) bam_header_t *h; bam1_t *b; int8_t *buf; - int max_buf; + int max_buf, c, no12 = 0; + while ((c = getopt(argc, argv, "n")) > 0) + if (c == 'n') no12 = 1; if (argc == 1) { fprintf(stderr, "Usage: samtools bam2fq \n"); return 1; } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) return 1; h = bam_header_read(fp); b = bam_init1(); @@ -373,9 +393,12 @@ int main_bam2fq(int argc, char *argv[]) int i, qlen = b->core.l_qseq; uint8_t *seq; putchar('@'); fputs(bam1_qname(b), stdout); - if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); - else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); - else putchar('\n'); + if (no12) putchar('\n'); + else { + if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); + else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); + else putchar('\n'); + } if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf);