X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=misc%2Fbamcheck.c;h=3db9cfb55f5d3ed7ba10138384048c94b6dabbc0;hb=12c0954c689d82000572078a756678142f9a7826;hp=7b5fd029469aa2373d950a43fb92d68218d82f8c;hpb=8928103fafd05b9b555f9ae1710031666725867a;p=samtools.git diff --git a/misc/bamcheck.c b/misc/bamcheck.c index 7b5fd02..3db9cfb 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -1,6 +1,6 @@ /* Author: petr.danecek@sanger - gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam + gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread Assumptions, approximations and other issues: - GC-depth graph does not split reads, the starting position determines which bin is incremented. @@ -8,14 +8,16 @@ - coverage distribution ignores softclips and deletions - some stats require sorted BAMs - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. - - The whole reads are used with -t, no splicing is done, no indels or soft clips are - considered, even small overlap is good enough to include the read in the stats. + - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq) + - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar) + counts, no splicing is done, no indels or soft clips are considered, even small overlap is + good enough to include the read in the stats. + */ -#define BAMCHECK_VERSION "2012-03-22" +#define BAMCHECK_VERSION "2012-09-04" #define _ISOC99_SOURCE -#define _GNU_SOURCE #include #include #include @@ -27,6 +29,7 @@ #include "faidx.h" #include "khash.h" #include "sam.h" +#include "sam_header.h" #include "razf.h" #define BWA_MIN_RDLEN 35 @@ -45,13 +48,14 @@ typedef struct uint64_t offset; } faidx1_t; -KHASH_MAP_INIT_STR(s, faidx1_t) -KHASH_MAP_INIT_STR(str, int) +KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) +KHASH_MAP_INIT_STR(kh_bam_tid, int) +KHASH_MAP_INIT_STR(kh_rg, const char *) struct __faidx_t { RAZF *rz; int n, m; char **name; - khash_t(s) *hash; + khash_t(kh_faidx) *hash; }; typedef struct @@ -82,7 +86,6 @@ typedef struct { // Parameters int trim_qual; // bwa trim quality - int rmdup; // Exclude reads marked as duplicates from the stats // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), // insert size histogram holder @@ -99,7 +102,7 @@ typedef struct uint64_t *acgt_cycles; uint64_t *read_lengths; uint64_t *insertions, *deletions; - uint64_t *ins_cycles, *del_cycles; + uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; // The extremes encountered int max_len; // Maximum read length @@ -146,19 +149,24 @@ typedef struct int filter_readlen; // Target regions - int nregions; + int nregions, reg_from,reg_to; regions_t *regions; // Auxiliary data - double sum_qual; // For calculating average quality value - samfile_t *sam; // Unused - faidx_t *fai; // Reference sequence for GC-depth graph - int argc; // Command line arguments to be printed on the output + int flag_require, flag_filter; + double sum_qual; // For calculating average quality value + samfile_t *sam; + khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output char **argv; } stats_t; void error(const char *format, ...); +void bam_init_header_hash(bam_header_t *header); +int is_in_regions(bam1_t *bam_line, stats_t *stats); + // Coverage distribution methods inline int coverage_idx(int min, int max, int n, int step, int depth) @@ -269,6 +277,7 @@ int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) void count_indels(stats_t *stats,bam1_t *bam_line) { int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int is_1st = IS_READ1(bam_line) ? 1 : 0; int icig; int icycle = 0; int read_len = bam_line->core.l_qseq; @@ -282,9 +291,14 @@ void count_indels(stats_t *stats,bam1_t *bam_line) if ( cig==1 ) { - int idx = is_fwd ? icycle : read_len-icycle-1; - if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); - stats->ins_cycles[idx]++; + int idx = is_fwd ? icycle : read_len-icycle; + if ( idx<0 ) + error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); + if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + if ( is_1st ) + stats->ins_cycles_1st[idx]++; + else + stats->ins_cycles_2nd[idx]++; icycle += ncig; if ( ncig<=stats->nindels ) stats->insertions[ncig-1]++; @@ -292,14 +306,19 @@ void count_indels(stats_t *stats,bam1_t *bam_line) } if ( cig==2 ) { - int idx = is_fwd ? icycle : read_len-icycle-1; + int idx = is_fwd ? icycle-1 : read_len-icycle-1; + if ( idx<0 ) continue; // discard meaningless deletions if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); - stats->del_cycles[idx]++; + if ( is_1st ) + stats->del_cycles_1st[idx]++; + else + stats->del_cycles_2nd[idx]++; if ( ncig<=stats->nindels ) stats->deletions[ncig-1]++; continue; } - icycle += ncig; + if ( cig!=3 && cig!=5 ) + icycle += ncig; } } @@ -392,7 +411,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) { - khash_t(s) *h; + khash_t(kh_faidx) *h; khiter_t iter; faidx1_t val; char *chr, c; @@ -402,7 +421,7 @@ void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) chr = stats->sam->header->target_name[tid]; // ID of the sequence name - iter = kh_get(s, h, chr); + iter = kh_get(kh_faidx, h, chr); if (iter == kh_end(h)) error("No such reference sequence [%s]?\n", chr); val = kh_value(h, iter); @@ -515,15 +534,25 @@ void realloc_buffers(stats_t *stats, int seq_len) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); - stats->ins_cycles = realloc(stats->ins_cycles, n*sizeof(uint64_t)); - if ( !stats->ins_cycles ) - error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); - memset(stats->ins_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); - stats->del_cycles = realloc(stats->del_cycles, n*sizeof(uint64_t)); - if ( !stats->del_cycles ) - error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); - memset(stats->del_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); stats->nbases = n; @@ -541,7 +570,19 @@ void realloc_buffers(stats_t *stats, int seq_len) void collect_stats(bam1_t *bam_line, stats_t *stats) { - if ( stats->rmdup && IS_DUP(bam_line) ) + if ( stats->rg_hash ) + { + const uint8_t *rg = bam_aux_get(bam_line, "RG"); + if ( !rg ) return; + khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(stats->rg_hash) ) return; + } + if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) + return; + if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) + return; + + if ( !is_in_regions(bam_line,stats) ) return; int seq_len = bam_line->core.l_qseq; @@ -665,19 +706,47 @@ void collect_stats(bam1_t *bam_line, stats_t *stats) stats->nmismatches += bam_aux2i(nm); // Number of mapped bases from cigar + // Conversion from uint32_t to MIDNSHP + // 012-4-- + // MIDNSHP if ( bam_line->core.n_cigar == 0) error("FIXME: mapped read with no cigar?\n"); - int readlen = seq_len; - for (i=0; icore.n_cigar; i++) + int readlen=seq_len; + if ( stats->regions ) + { + // Count only on-target bases + int iref = bam_line->core.pos + 1; + for (i=0; icore.n_cigar; i++) + { + int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( cig==2 ) readlen += ncig; + else if ( cig==0 ) + { + if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref; + else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to; + if ( ncig<0 ) ncig = 0; + stats->nbases_mapped_cigar += ncig; + iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + else if ( cig==1 ) + { + iref += ncig; + if ( iref>=stats->reg_from && iref<=stats->reg_to ) + stats->nbases_mapped_cigar += ncig; + } + } + } + else { - // Conversion from uint32_t to MIDNSHP - // 01--4-- - // MIDNSHP - if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) - stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; - - if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) - readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + // Count the whole read + for (i=0; icore.n_cigar; i++) + { + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) + stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) + readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } } stats->nbases_mapped += seq_len; @@ -819,23 +888,23 @@ void output_stats(stats_t *stats) printf(" %s",stats->argv[i]); printf("\n"); printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); - printf("SN\tsequences:\t%ld\n", stats->nreads_1st+stats->nreads_2nd); + printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); - printf("SN\t1st fragments:\t%ld\n", stats->nreads_1st); - printf("SN\tlast fragments:\t%ld\n", stats->nreads_2nd); - printf("SN\treads mapped:\t%ld\n", stats->nreads_paired+stats->nreads_unpaired); - printf("SN\treads unmapped:\t%ld\n", stats->nreads_unmapped); - printf("SN\treads unpaired:\t%ld\n", stats->nreads_unpaired); - printf("SN\treads paired:\t%ld\n", stats->nreads_paired); - printf("SN\treads duplicated:\t%ld\n", stats->nreads_dup); - printf("SN\treads MQ0:\t%ld\n", stats->nreads_mq0); - printf("SN\ttotal length:\t%ld\n", stats->total_len); - printf("SN\tbases mapped:\t%ld\n", stats->nbases_mapped); - printf("SN\tbases mapped (cigar):\t%ld\n", stats->nbases_mapped_cigar); - printf("SN\tbases trimmed:\t%ld\n", stats->nbases_trimmed); - printf("SN\tbases duplicated:\t%ld\n", stats->total_len_dup); - printf("SN\tmismatches:\t%ld\n", stats->nmismatches); + printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); + printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); + printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); + printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); + printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); + printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); + printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); + printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); + printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); + printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); + printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); + printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); + printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); + printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches); printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; printf("SN\taverage length:\t%.0f\n", avg_read_length); @@ -843,9 +912,9 @@ void output_stats(stats_t *stats) printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); printf("SN\tinsert size average:\t%.1f\n", avg_isize); printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); - printf("SN\tinward oriented pairs:\t%ld\n", nisize_inward); - printf("SN\toutward oriented pairs:\t%ld\n", nisize_outward); - printf("SN\tpairs with other orientation:\t%ld\n", nisize_other); + printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); + printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); + printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); int ibase,iqual; if ( stats->max_lennbases ) stats->max_len++; @@ -857,7 +926,7 @@ void output_stats(stats_t *stats) printf("FFQ\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->quals_1st[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -868,7 +937,7 @@ void output_stats(stats_t *stats) printf("LFQ\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->quals_2nd[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -882,7 +951,7 @@ void output_stats(stats_t *stats) printf("MPC\t%d",ibase+1); for (iqual=0; iqual<=stats->max_qual; iqual++) { - printf("\t%ld", stats->mpc_buf[ibase*stats->nquals+iqual]); + printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); } printf("\n"); } @@ -892,7 +961,7 @@ void output_stats(stats_t *stats) for (ibase=0; ibasengc; ibase++) { if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; - printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_1st[ibase_prev]); + printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); ibase_prev = ibase; } printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); @@ -900,7 +969,7 @@ void output_stats(stats_t *stats) for (ibase=0; ibasengc; ibase++) { if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; - printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_2nd[ibase_prev]); + printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); ibase_prev = ibase; } printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); @@ -913,37 +982,37 @@ void output_stats(stats_t *stats) } printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); for (isize=1; isizeisize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), - stats->isize_inward[isize],stats->isize_outward[isize],stats->isize_other[isize]); + printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), + (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); int ilen; for (ilen=0; ilenmax_len; ilen++) { if ( stats->read_lengths[ilen]>0 ) - printf("RL\t%d\t%ld\n", ilen,stats->read_lengths[ilen]); + printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); } printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); for (ilen=0; ilennindels; ilen++) { if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) - printf("ID\t%d\t%ld\t%ld\n", ilen+1,stats->insertions[ilen],stats->deletions[ilen]); + printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); } - printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n"); - for (ilen=0; ilennbases; ilen++) + printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); + for (ilen=0; ilen<=stats->nbases; ilen++) { - if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 ) - printf("IC\t%d\t%ld\t%ld\n", ilen+1,stats->ins_cycles[ilen],stats->del_cycles[ilen]); + if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) + printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); } printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); - printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1,stats->cov[0]); + printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); int icov; for (icov=1; icovncov-1; icov++) - printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1,stats->cov[icov]); - printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov[stats->ncov-1]); + printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); + printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); // Calculate average GC content, then sort by GC and depth @@ -980,15 +1049,47 @@ void output_stats(stats_t *stats) } } -void bam_init_header_hash(bam_header_t *header); +size_t mygetline(char **line, size_t *n, FILE *fp) +{ + if (line == NULL || n == NULL || fp == NULL) + { + errno = EINVAL; + return -1; + } + if (*n==0 || !*line) + { + *line = NULL; + *n = 0; + } + + size_t nread=0; + int c; + while ((c=getc(fp))!= EOF && c!='\n') + { + if ( ++nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread-1] = c; + } + if ( nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread] = 0; + return nread>0 ? nread : -1; + +} void init_regions(stats_t *stats, char *file) { khiter_t iter; - khash_t(str) *header_hash; + khash_t(kh_bam_tid) *header_hash; bam_init_header_hash(stats->sam->header); - header_hash = (khash_t(str)*)stats->sam->header->hash; + header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; FILE *fp = fopen(file,"r"); if ( !fp ) error("%s: %s\n",file,strerror(errno)); @@ -998,21 +1099,21 @@ void init_regions(stats_t *stats, char *file) ssize_t nread; int warned = 0; int prev_tid=-1, prev_pos=-1; - while ((nread = getline(&line, &len, fp)) != -1) + while ((nread = mygetline(&line, &len, fp)) != -1) { if ( line[0] == '#' ) continue; int i = 0; while ( i=nread ) error("Could not parse the file: %s\n", file); + if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); line[i] = 0; - iter = kh_get(str, header_hash, line); + iter = kh_get(kh_bam_tid, header_hash, line); int tid = kh_val(header_hash, iter); if ( iter == kh_end(header_hash) ) { if ( !warned ) - fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line); + fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line); warned = 1; continue; } @@ -1046,6 +1147,7 @@ void init_regions(stats_t *stats, char *file) stats->regions[tid].npos++; } if (line) free(line); + if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n"); fclose(fp); } @@ -1066,6 +1168,61 @@ static int fetch_read(const bam1_t *bam_line, void *data) return 1; } +void reset_regions(stats_t *stats) +{ + int i; + for (i=0; inregions; i++) + stats->regions[i].cpos = 0; +} + +int is_in_regions(bam1_t *bam_line, stats_t *stats) +{ + if ( !stats->regions ) return 1; + + if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0; + if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); + + regions_t *reg = &stats->regions[bam_line->core.tid]; + if ( reg->cpos==reg->npos ) return 0; // done for this chr + + // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, + // even small overlap is enough to include the read in the stats. + int i = reg->cpos; + while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; + if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } + if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0; + reg->cpos = i; + stats->reg_from = reg->pos[i].from; + stats->reg_to = reg->pos[i].to; + + return 1; +} + +void init_group_id(stats_t *stats, char *id) +{ + if ( !stats->sam->header->dict ) + stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); + void *iter = stats->sam->header->dict; + const char *key, *val; + int n = 0; + stats->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) + { + khiter_t k = kh_get(kh_rg, stats->rg_hash, key); + if ( k != kh_end(stats->rg_hash) ) + fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + int ret; + k = kh_put(kh_rg, stats->rg_hash, key, &ret); + kh_value(stats->rg_hash, k) = val; + n++; + } + } + if ( !n ) + error("The sample or read group \"%s\" not present.\n", id); +} + void error(const char *format, ...) { @@ -1078,8 +1235,12 @@ void error(const char *format, ...) printf("Options:\n"); printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); + printf(" -f, --required-flag Required flag, 0 for unset [0]\n"); + printf(" -F, --filtering-flag Filtering flag, 0 for unset [0]\n"); + printf(" --GC-depth Bin size for GC-depth graph and the maximum reference length [2e4,6e9]\n"); printf(" -h, --help This help message\n"); printf(" -i, --insert-size Maximum insert size [8000]\n"); + printf(" -I, --id Include only listed read group or sample name\n"); printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); @@ -1102,6 +1263,7 @@ int main(int argc, char *argv[]) { char *targets = NULL; char *bam_fname = NULL; + char *group_id = NULL; samfile_t *sam = NULL; char in_mode[5]; @@ -1115,7 +1277,6 @@ int main(int argc, char *argv[]) stats->isize_main_bulk = 0.99; // There are always outliers at the far end stats->gcd_bin_size = 20000; stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big - stats->nref_seq = stats->gcd_bin_size; stats->rseq_pos = -1; stats->tid = stats->gcd_pos = -1; stats->is_sorted = 1; @@ -1134,26 +1295,40 @@ int main(int argc, char *argv[]) {"help",0,0,'h'}, {"remove-dups",0,0,'d'}, {"sam",0,0,'s'}, - {"ref-seq",0,0,'r'}, - {"coverage",0,0,'c'}, - {"read-length",0,0,'l'}, - {"insert-size",0,0,'i'}, - {"most-inserts",0,0,'m'}, - {"trim-quality",0,0,'q'}, + {"ref-seq",1,0,'r'}, + {"coverage",1,0,'c'}, + {"read-length",1,0,'l'}, + {"insert-size",1,0,'i'}, + {"most-inserts",1,0,'m'}, + {"trim-quality",1,0,'q'}, {"target-regions",0,0,'t'}, + {"required-flag",1,0,'f'}, + {"filtering-flag",0,0,'F'}, + {"id",1,0,'I'}, + {"GC-depth",1,0,1}, {0,0,0,0} }; int opt; - while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 ) + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) { switch (opt) { - case 'd': stats->rmdup=1; break; + case 'f': stats->flag_require=strtol(optarg,0,0); break; + case 'F': stats->flag_filter=strtol(optarg,0,0); break; + case 'd': stats->flag_filter|=BAM_FDUP; break; case 's': strcpy(in_mode, "r"); break; case 'r': stats->fai = fai_load(optarg); if (stats->fai==0) error("Could not load faidx: %s\n", optarg); break; + case 1 : { + float flen,fbin; + if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) + error("Unable to parse --GC-depth %s\n", optarg); + stats->gcd_bin_size = fbin; + stats->ngcd = flen/fbin; + } + break; case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) error("Unable to parse -c %s\n", optarg); break; @@ -1162,6 +1337,7 @@ int main(int argc, char *argv[]) case 'm': stats->isize_main_bulk = atof(optarg); break; case 'q': stats->trim_qual = atoi(optarg); break; case 't': targets = optarg; break; + case 'I': group_id = optarg; break; case '?': case 'h': error(NULL); default: error("Unknown argument: %s\n", optarg); @@ -1194,24 +1370,28 @@ int main(int argc, char *argv[]) if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) error("Failed to open: %s\n", bam_fname); stats->sam = sam; + if ( group_id ) init_group_id(stats, group_id); bam1_t *bam_line = bam_init1(); // .. arrays - stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); - stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); - stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); - stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); - stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); - stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); - stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); - stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); - stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t)); - stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; - stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); - stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); - stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); - stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); - stats->ins_cycles = calloc(stats->nbases,sizeof(uint64_t)); - stats->del_cycles = calloc(stats->nbases,sizeof(uint64_t)); + stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); + stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); + stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); + stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); + stats->nref_seq = stats->gcd_bin_size; + stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t)); + stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; + stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); + stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); + stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); + stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); + stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); if ( targets ) init_regions(stats, targets); @@ -1229,6 +1409,7 @@ int main(int argc, char *argv[]) int tid, beg, end; bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end); if ( tid < 0 ) continue; + reset_regions(stats); bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read); } bam_index_destroy(bam_idx); @@ -1237,25 +1418,7 @@ int main(int argc, char *argv[]) { // Stream through the entire BAM ignoring off-target regions if -t is given while (samread(sam,bam_line) >= 0) - { - if ( stats->regions ) - { - if ( bam_line->core.tid >= stats->nregions ) continue; - if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); - - regions_t *reg = &stats->regions[bam_line->core.tid]; - if ( reg->cpos==reg->npos ) continue; // done for this chr - - // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, - // even small overlap is enough to include the read in the stats. - int i = reg->cpos; - while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; - if ( i>=reg->npos ) { reg->cpos = reg->npos; continue; } - if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) continue; - reg->cpos = i; - } collect_stats(bam_line,stats); - } } round_buffer_flush(stats,-1); @@ -1275,10 +1438,13 @@ int main(int argc, char *argv[]) free(stats->read_lengths); free(stats->insertions); free(stats->deletions); - free(stats->ins_cycles); - free(stats->del_cycles); + free(stats->ins_cycles_1st); + free(stats->ins_cycles_2nd); + free(stats->del_cycles_1st); + free(stats->del_cycles_2nd); destroy_regions(stats); free(stats); + if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); return 0; }