AOBJS= bam_tview.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \
- cut_target.o phase.o bam2depth.o padding.o bedcov.o
+ cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o
PROG= samtools
INCLUDES= -I.
SUBDIRS= . bcftools misc
else return 0;
}
+/*********************************
+ *** Compatibility with htslib ***
+ *********************************/
+
+typedef bam_header_t bam_hdr_t;
+
+#define bam_get_qname(b) bam1_qname(b)
+#define bam_get_cigar(b) bam1_cigar(b)
+
+#define bam_hdr_read(fp) bam_header_read(fp)
+#define bam_hdr_write(fp, h) bam_header_write(fp, h)
+#define bam_hdr_destroy(fp) bam_header_destroy(fp)
#endif
typedef struct { // auxiliary data structure
bamFile fp; // the file handler
bam_iter_t iter; // NULL if a region not specified
- int min_mapQ; // mapQ filter
+ int min_mapQ, min_len; // mapQ filter; length filter
} aux_t;
void *bed_read(const char *fn); // read a BED or position list file
{
aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
- if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+ if (!(b->core.flag&BAM_FUNMAP)) {
+ if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+ else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP;
+ }
return ret;
}
int main_depth(int argc, char *argv[])
#endif
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0;
+ int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
void *bed = 0; // BED data structure
bam_mplp_t mplp;
// parse the command line
- while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
+ while ((n = getopt(argc, argv, "r:b:q:Q:l:")) >= 0) {
switch (n) {
+ case 'l': min_len = atoi(optarg); break; // minimum query length
case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header
case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
case 'q': baseQ = atoi(optarg); break; // base quality threshold
}
}
if (optind == argc) {
- fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
+ fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-l minQLen] [-b in.bed] <in1.bam> [...]\n");
return 1;
}
data[i] = calloc(1, sizeof(aux_t));
data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
data[i]->min_mapQ = mapQ; // set the mapQ filter
+ data[i]->min_len = min_len; // set the qlen filter
htmp = bam_header_read(data[i]->fp); // read the BAM header
if (i == 0) {
h = htmp; // keep the header of the 1st BAM
--- /dev/null
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "sam.h"
+#include "ksort.h"
+
+#define DEF_CLEVEL 1
+
+static inline unsigned hash_Wang(unsigned key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+
+static inline unsigned hash_X31_Wang(const char *s)
+{
+ unsigned h = *s;
+ if (h) {
+ for (++s ; *s; ++s) h = (h << 5) - h + *s;
+ return hash_Wang(h);
+ } else return 0;
+}
+
+typedef struct {
+ unsigned key;
+ bam1_t *b;
+} elem_t;
+
+static inline int elem_lt(elem_t x, elem_t y)
+{
+ if (x.key < y.key) return 1;
+ if (x.key == y.key) {
+ int t;
+ t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b));
+ if (t < 0) return 1;
+ return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3)));
+ } else return 0;
+}
+
+KSORT_INIT(bamshuf, elem_t, elem_lt)
+
+static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout)
+{
+ BGZF *fp, *fpw, **fpt;
+ char **fnt, modew[8];
+ bam1_t *b;
+ int i, l;
+ bam_hdr_t *h;
+ int64_t *cnt;
+
+ // split
+ fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r");
+ assert(fp);
+ h = bam_hdr_read(fp);
+ fnt = (char**)calloc(n_files, sizeof(void*));
+ fpt = (BGZF**)calloc(n_files, sizeof(void*));
+ cnt = (int64_t*)calloc(n_files, 8);
+ l = strlen(pre);
+ for (i = 0; i < n_files; ++i) {
+ fnt[i] = (char*)calloc(l + 10, 1);
+ sprintf(fnt[i], "%s.%.4d.bam", pre, i);
+ fpt[i] = bgzf_open(fnt[i], "w1");
+ bam_hdr_write(fpt[i], h);
+ }
+ b = bam_init1();
+ while (bam_read1(fp, b) >= 0) {
+ uint32_t x;
+ x = hash_X31_Wang(bam_get_qname(b)) % n_files;
+ bam_write1(fpt[x], b);
+ ++cnt[x];
+ }
+ bam_destroy1(b);
+ for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]);
+ free(fpt);
+ bgzf_close(fp);
+ // merge
+ sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL);
+ if (!is_stdout) { // output to a file
+ char *fnw = (char*)calloc(l + 5, 1);
+ sprintf(fnw, "%s.bam", pre);
+ fpw = bgzf_open(fnw, modew);
+ free(fnw);
+ } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout
+ bam_hdr_write(fpw, h);
+ bam_hdr_destroy(h);
+ for (i = 0; i < n_files; ++i) {
+ int64_t j, c = cnt[i];
+ elem_t *a;
+ fp = bgzf_open(fnt[i], "r");
+ bam_hdr_destroy(bam_hdr_read(fp));
+ a = (elem_t*)calloc(c, sizeof(elem_t));
+ for (j = 0; j < c; ++j) {
+ a[j].b = bam_init1();
+ assert(bam_read1(fp, a[j].b) >= 0);
+ a[j].key = hash_X31_Wang(bam_get_qname(a[j].b));
+ }
+ bgzf_close(fp);
+ unlink(fnt[i]);
+ free(fnt[i]);
+ ks_introsort(bamshuf, c, a);
+ for (j = 0; j < c; ++j) {
+ bam_write1(fpw, a[j].b);
+ bam_destroy1(a[j].b);
+ }
+ free(a);
+ }
+ bgzf_close(fpw);
+ free(fnt); free(cnt);
+}
+
+int main_bamshuf(int argc, char *argv[])
+{
+ int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
+ while ((c = getopt(argc, argv, "n:l:uO")) >= 0) {
+ switch (c) {
+ case 'n': n_files = atoi(optarg); break;
+ case 'l': clevel = atoi(optarg); break;
+ case 'u': is_un = 1; break;
+ case 'O': is_stdout = 1; break;
+ }
+ }
+ if (is_un) clevel = 0;
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n");
+ fprintf(stderr, "Options: -O output to stdout\n");
+ fprintf(stderr, " -u uncompressed BAM output\n");
+ fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL);
+ fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files);
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout);
+ return 0;
+}
int main_bam2fq(int argc, char *argv[]);
int main_pad2unpad(int argc, char *argv[]);
int main_bedcov(int argc, char *argv[]);
+int main_bamshuf(int argc, char *argv[]);
int faidx_main(int argc, char *argv[]);
fprintf(stderr, " bedcov read depth per BED region\n");
fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n");
fprintf(stderr, " phase phase heterozygotes\n");
+ fprintf(stderr, " bamshuf shuffle and group alignments by name\n");
// fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable
fprintf(stderr, "\n");
#ifdef _WIN32
else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1);
else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1);
else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1);
+ else if (strcmp(argv[1], "bamshuf") == 0) return main_bamshuf(argc-1, argv+1);
else if (strcmp(argv[1], "pileup") == 0) {
fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
w->errcode = 0;
for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) {
int clen = BGZF_MAX_BLOCK_SIZE;
- if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
+ if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0)
w->errcode |= BGZF_ERR_ZLIB;
memcpy(w->mt->blk[i], w->buf, clen);
w->mt->len[i] = clen;
static int mt_lazy_flush(BGZF *fp)
{
mtaux_t *mt = (mtaux_t*)fp->mt;
- mt_queue(fp);
+ if (fp->block_offset) mt_queue(fp);
if (mt->curr == mt->n_blks)
return mt_flush(fp);
return -1;