X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bam2depth.c;h=87a4c5b6cf379c026c6883d95a44c700cc3d5364;hb=ccb7838cb53bf0ca6917a77f7991d940057c12db;hp=312e264cf62276e353607b9c0324abfa98033bbe;hpb=c72cba53485e07c550bf8f33d97d4fba2bd1dfba;p=samtools.git diff --git a/bam2depth.c b/bam2depth.c index 312e264..87a4c5b 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -2,7 +2,7 @@ * simutaneously, to achieve random access and to use the BED interface. * To compile this program separately, you may: * - * gcc -g -O2 -Wall -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz + * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz */ #include #include @@ -13,6 +13,7 @@ typedef struct { // auxiliary data structure bamFile fp; // the file handler bam_iter_t iter; // NULL if a region not specified + int min_mapQ, min_len; // mapQ filter; length filter } aux_t; void *bed_read(const char *fn); // read a BED or position list file @@ -20,10 +21,15 @@ void bed_destroy(void *_h); // destroy the BED data structure int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps // This function reads a BAM alignment from one BAM file. -static int read_bam(void *data, bam1_t *b) +static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - return aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if (!(b->core.flag&BAM_FUNMAP)) { + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; + } + return ret; } #ifdef _MAIN_BAM2DEPTH @@ -32,7 +38,7 @@ int main(int argc, char *argv[]) int main_depth(int argc, char *argv[]) #endif { - int i, n, tid, beg, end, pos, *n_plp; + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure @@ -41,28 +47,33 @@ int main_depth(int argc, char *argv[]) bam_mplp_t mplp; // parse the command line - while ((n = getopt(argc, argv, "r:b:")) >= 0) { + while ((n = getopt(argc, argv, "r:b:q:Q:l:")) >= 0) { switch (n) { + case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now + case 'q': baseQ = atoi(optarg); break; // base quality threshold + case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-b in.bed] [...]\n"); + fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-l minQLen] [-b in.bed] [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line - data = calloc(n, sizeof(void*)); - beg = 0; end = 1<<30; tid = -1; + data = calloc(n, sizeof(void*)); // data[i] for the i-th input + beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM + data[i]->min_mapQ = mapQ; // set the mapQ filter + data[i]->min_len = min_len; // set the qlen filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { - h = htmp; // keep the header for the 1st BAM + h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully @@ -75,17 +86,19 @@ int main_depth(int argc, char *argv[]) // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM - plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads internal in mplp + plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip - printf("%s\t%d", h->target_name[tid], pos+1); - for (i = 0; i < n; ++i) { + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; - const bam_pileup1_t *p = plp[i]; - for (j = 0; j < n_plp[i]; ++j) // this loop counts #reads having deletions or refskip at tid:pos - if (p->is_del || p->is_refskip) ++m; - printf("\t%d", n_plp[i] - m); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know + if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos + else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + } + printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); }