From: Petr Danecek Date: Fri, 18 May 2012 09:38:07 +0000 (+0100) Subject: Merge branch 'master' of github.com:samtools/samtools X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1250c05d84fbe5bb90a0936522c1b272ca7df8a2;hp=859feefb171ac2bd02e20e73a3054bd469c69913;p=samtools.git Merge branch 'master' of github.com:samtools/samtools --- diff --git a/Makefile b/Makefile index 4a8a75b..dbd90af 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam2depth.o padding.o bedcov.o + cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o PROG= samtools INCLUDES= -I. SUBDIRS= . bcftools misc diff --git a/bam.h b/bam.h index 82f1ffa..f3f37f3 100644 --- a/bam.h +++ b/bam.h @@ -776,5 +776,17 @@ static inline int bam_aux_type2size(int x) else return 0; } +/********************************* + *** Compatibility with htslib *** + *********************************/ + +typedef bam_header_t bam_hdr_t; + +#define bam_get_qname(b) bam1_qname(b) +#define bam_get_cigar(b) bam1_cigar(b) + +#define bam_hdr_read(fp) bam_header_read(fp) +#define bam_hdr_write(fp, h) bam_header_write(fp, h) +#define bam_hdr_destroy(fp) bam_header_destroy(fp) #endif diff --git a/bam2depth.c b/bam2depth.c index ca36b89..87a4c5b 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -13,7 +13,7 @@ typedef struct { // auxiliary data structure bamFile fp; // the file handler bam_iter_t iter; // NULL if a region not specified - int min_mapQ; // mapQ filter + int min_mapQ, min_len; // mapQ filter; length filter } aux_t; void *bed_read(const char *fn); // read a BED or position list file @@ -25,7 +25,10 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); - if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + if (!(b->core.flag&BAM_FUNMAP)) { + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; + } return ret; } @@ -35,7 +38,7 @@ int main(int argc, char *argv[]) int main_depth(int argc, char *argv[]) #endif { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure @@ -44,8 +47,9 @@ int main_depth(int argc, char *argv[]) bam_mplp_t mplp; // parse the command line - while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { + while ((n = getopt(argc, argv, "r:b:q:Q:l:")) >= 0) { switch (n) { + case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold @@ -53,7 +57,7 @@ int main_depth(int argc, char *argv[]) } } if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); + fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-l minQLen] [-b in.bed] [...]\n"); return 1; } @@ -66,6 +70,7 @@ int main_depth(int argc, char *argv[]) data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter + data[i]->min_len = min_len; // set the qlen filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM diff --git a/bamshuf.c b/bamshuf.c new file mode 100644 index 0000000..33a5238 --- /dev/null +++ b/bamshuf.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include "sam.h" +#include "ksort.h" + +#define DEF_CLEVEL 1 + +static inline unsigned hash_Wang(unsigned key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static inline unsigned hash_X31_Wang(const char *s) +{ + unsigned h = *s; + if (h) { + for (++s ; *s; ++s) h = (h << 5) - h + *s; + return hash_Wang(h); + } else return 0; +} + +typedef struct { + unsigned key; + bam1_t *b; +} elem_t; + +static inline int elem_lt(elem_t x, elem_t y) +{ + if (x.key < y.key) return 1; + if (x.key == y.key) { + int t; + t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b)); + if (t < 0) return 1; + return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3))); + } else return 0; +} + +KSORT_INIT(bamshuf, elem_t, elem_lt) + +static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout) +{ + BGZF *fp, *fpw, **fpt; + char **fnt, modew[8]; + bam1_t *b; + int i, l; + bam_hdr_t *h; + int64_t *cnt; + + // split + fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r"); + assert(fp); + h = bam_hdr_read(fp); + fnt = (char**)calloc(n_files, sizeof(void*)); + fpt = (BGZF**)calloc(n_files, sizeof(void*)); + cnt = (int64_t*)calloc(n_files, 8); + l = strlen(pre); + for (i = 0; i < n_files; ++i) { + fnt[i] = (char*)calloc(l + 10, 1); + sprintf(fnt[i], "%s.%.4d.bam", pre, i); + fpt[i] = bgzf_open(fnt[i], "w1"); + bam_hdr_write(fpt[i], h); + } + b = bam_init1(); + while (bam_read1(fp, b) >= 0) { + uint32_t x; + x = hash_X31_Wang(bam_get_qname(b)) % n_files; + bam_write1(fpt[x], b); + ++cnt[x]; + } + bam_destroy1(b); + for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]); + free(fpt); + bgzf_close(fp); + // merge + sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL); + if (!is_stdout) { // output to a file + char *fnw = (char*)calloc(l + 5, 1); + sprintf(fnw, "%s.bam", pre); + fpw = bgzf_open(fnw, modew); + free(fnw); + } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout + bam_hdr_write(fpw, h); + bam_hdr_destroy(h); + for (i = 0; i < n_files; ++i) { + int64_t j, c = cnt[i]; + elem_t *a; + fp = bgzf_open(fnt[i], "r"); + bam_hdr_destroy(bam_hdr_read(fp)); + a = (elem_t*)calloc(c, sizeof(elem_t)); + for (j = 0; j < c; ++j) { + a[j].b = bam_init1(); + assert(bam_read1(fp, a[j].b) >= 0); + a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); + } + bgzf_close(fp); + unlink(fnt[i]); + free(fnt[i]); + ks_introsort(bamshuf, c, a); + for (j = 0; j < c; ++j) { + bam_write1(fpw, a[j].b); + bam_destroy1(a[j].b); + } + free(a); + } + bgzf_close(fpw); + free(fnt); free(cnt); +} + +int main_bamshuf(int argc, char *argv[]) +{ + int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0; + while ((c = getopt(argc, argv, "n:l:uO")) >= 0) { + switch (c) { + case 'n': n_files = atoi(optarg); break; + case 'l': clevel = atoi(optarg); break; + case 'u': is_un = 1; break; + case 'O': is_stdout = 1; break; + } + } + if (is_un) clevel = 0; + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] \n\n"); + fprintf(stderr, "Options: -O output to stdout\n"); + fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL); + fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files); + fprintf(stderr, "\n"); + return 1; + } + bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout); + return 0; +} diff --git a/bamtk.c b/bamtk.c index acdf65f..9df7c11 100644 --- a/bamtk.c +++ b/bamtk.c @@ -29,6 +29,7 @@ int main_depth(int argc, char *argv[]); int main_bam2fq(int argc, char *argv[]); int main_pad2unpad(int argc, char *argv[]); int main_bedcov(int argc, char *argv[]); +int main_bamshuf(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); @@ -58,6 +59,7 @@ static int usage() fprintf(stderr, " bedcov read depth per BED region\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); + fprintf(stderr, " bamshuf shuffle and group alignments by name\n"); // fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable fprintf(stderr, "\n"); #ifdef _WIN32 @@ -101,6 +103,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1); else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1); + else if (strcmp(argv[1], "bamshuf") == 0) return main_bamshuf(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); return 1; diff --git a/bgzf.c b/bgzf.c index 7254fa2..880d5af 100644 --- a/bgzf.c +++ b/bgzf.c @@ -401,7 +401,7 @@ static int worker_aux(worker_t *w) w->errcode = 0; for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { int clen = BGZF_MAX_BLOCK_SIZE; - if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) + if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) w->errcode |= BGZF_ERR_ZLIB; memcpy(w->mt->blk[i], w->buf, clen); w->mt->len[i] = clen; @@ -502,7 +502,7 @@ static int mt_flush(BGZF *fp) static int mt_lazy_flush(BGZF *fp) { mtaux_t *mt = (mtaux_t*)fp->mt; - mt_queue(fp); + if (fp->block_offset) mt_queue(fp); if (mt->curr == mt->n_blks) return mt_flush(fp); return -1;