From 12c0954c689d82000572078a756678142f9a7826 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 4 Sep 2012 09:09:19 +0100 Subject: [PATCH] New --GC-depth option to bamcheck. Take into account not so frequent CIGAR codes. Fix in realloc_buffers. Prevent segfault with long command line options; Set xrange correctly in plot-bamcheck coverage.gp --- misc/bamcheck.c | 114 +++++++++++++++++++++++++++++++++------------ misc/plot-bamcheck | 4 +- 2 files changed, 87 insertions(+), 31 deletions(-) diff --git a/misc/bamcheck.c b/misc/bamcheck.c index 26f7c0a..3db9cfb 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -1,6 +1,6 @@ /* Author: petr.danecek@sanger - gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam + gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread Assumptions, approximations and other issues: - GC-depth graph does not split reads, the starting position determines which bin is incremented. @@ -15,7 +15,7 @@ */ -#define BAMCHECK_VERSION "2012-04-24" +#define BAMCHECK_VERSION "2012-09-04" #define _ISOC99_SOURCE #include @@ -29,6 +29,7 @@ #include "faidx.h" #include "khash.h" #include "sam.h" +#include "sam_header.h" #include "razf.h" #define BWA_MIN_RDLEN 35 @@ -47,13 +48,14 @@ typedef struct uint64_t offset; } faidx1_t; -KHASH_MAP_INIT_STR(s, faidx1_t) -KHASH_MAP_INIT_STR(str, int) +KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) +KHASH_MAP_INIT_STR(kh_bam_tid, int) +KHASH_MAP_INIT_STR(kh_rg, const char *) struct __faidx_t { RAZF *rz; int n, m; char **name; - khash_t(s) *hash; + khash_t(kh_faidx) *hash; }; typedef struct @@ -152,10 +154,11 @@ typedef struct // Auxiliary data int flag_require, flag_filter; - double sum_qual; // For calculating average quality value - samfile_t *sam; // Unused - faidx_t *fai; // Reference sequence for GC-depth graph - int argc; // Command line arguments to be printed on the output + double sum_qual; // For calculating average quality value + samfile_t *sam; + khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output char **argv; } stats_t; @@ -289,8 +292,9 @@ void count_indels(stats_t *stats,bam1_t *bam_line) if ( cig==1 ) { int idx = is_fwd ? icycle : read_len-icycle; - if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); - if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d\n", idx,stats->nbases); + if ( idx<0 ) + error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); + if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); if ( is_1st ) stats->ins_cycles_1st[idx]++; else @@ -313,7 +317,8 @@ void count_indels(stats_t *stats,bam1_t *bam_line) stats->deletions[ncig-1]++; continue; } - icycle += ncig; + if ( cig!=3 && cig!=5 ) + icycle += ncig; } } @@ -406,7 +411,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) { - khash_t(s) *h; + khash_t(kh_faidx) *h; khiter_t iter; faidx1_t val; char *chr, c; @@ -416,7 +421,7 @@ void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) chr = stats->sam->header->target_name[tid]; // ID of the sequence name - iter = kh_get(s, h, chr); + iter = kh_get(kh_faidx, h, chr); if (iter == kh_end(h)) error("No such reference sequence [%s]?\n", chr); val = kh_value(h, iter); @@ -532,22 +537,22 @@ void realloc_buffers(stats_t *stats, int seq_len) stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); if ( !stats->ins_cycles_1st ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); - memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t)); + memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); if ( !stats->ins_cycles_2nd ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); - memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t)); + memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); if ( !stats->del_cycles_1st ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); - memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t)); + memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); if ( !stats->del_cycles_2nd ) error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); - memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n+1-stats->nbases)*sizeof(uint64_t)); + memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); stats->nbases = n; @@ -565,6 +570,13 @@ void realloc_buffers(stats_t *stats, int seq_len) void collect_stats(bam1_t *bam_line, stats_t *stats) { + if ( stats->rg_hash ) + { + const uint8_t *rg = bam_aux_get(bam_line, "RG"); + if ( !rg ) return; + khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(stats->rg_hash) ) return; + } if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) return; if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) @@ -1074,10 +1086,10 @@ size_t mygetline(char **line, size_t *n, FILE *fp) void init_regions(stats_t *stats, char *file) { khiter_t iter; - khash_t(str) *header_hash; + khash_t(kh_bam_tid) *header_hash; bam_init_header_hash(stats->sam->header); - header_hash = (khash_t(str)*)stats->sam->header->hash; + header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; FILE *fp = fopen(file,"r"); if ( !fp ) error("%s: %s\n",file,strerror(errno)); @@ -1096,7 +1108,7 @@ void init_regions(stats_t *stats, char *file) if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); line[i] = 0; - iter = kh_get(str, header_hash, line); + iter = kh_get(kh_bam_tid, header_hash, line); int tid = kh_val(header_hash, iter); if ( iter == kh_end(header_hash) ) { @@ -1186,6 +1198,32 @@ int is_in_regions(bam1_t *bam_line, stats_t *stats) return 1; } +void init_group_id(stats_t *stats, char *id) +{ + if ( !stats->sam->header->dict ) + stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); + void *iter = stats->sam->header->dict; + const char *key, *val; + int n = 0; + stats->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) + { + khiter_t k = kh_get(kh_rg, stats->rg_hash, key); + if ( k != kh_end(stats->rg_hash) ) + fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + int ret; + k = kh_put(kh_rg, stats->rg_hash, key, &ret); + kh_value(stats->rg_hash, k) = val; + n++; + } + } + if ( !n ) + error("The sample or read group \"%s\" not present.\n", id); +} + + void error(const char *format, ...) { if ( !format ) @@ -1199,8 +1237,10 @@ void error(const char *format, ...) printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); printf(" -f, --required-flag Required flag, 0 for unset [0]\n"); printf(" -F, --filtering-flag Filtering flag, 0 for unset [0]\n"); + printf(" --GC-depth Bin size for GC-depth graph and the maximum reference length [2e4,6e9]\n"); printf(" -h, --help This help message\n"); printf(" -i, --insert-size Maximum insert size [8000]\n"); + printf(" -I, --id Include only listed read group or sample name\n"); printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); @@ -1223,6 +1263,7 @@ int main(int argc, char *argv[]) { char *targets = NULL; char *bam_fname = NULL; + char *group_id = NULL; samfile_t *sam = NULL; char in_mode[5]; @@ -1236,7 +1277,6 @@ int main(int argc, char *argv[]) stats->isize_main_bulk = 0.99; // There are always outliers at the far end stats->gcd_bin_size = 20000; stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big - stats->nref_seq = stats->gcd_bin_size; stats->rseq_pos = -1; stats->tid = stats->gcd_pos = -1; stats->is_sorted = 1; @@ -1255,19 +1295,21 @@ int main(int argc, char *argv[]) {"help",0,0,'h'}, {"remove-dups",0,0,'d'}, {"sam",0,0,'s'}, - {"ref-seq",0,0,'r'}, - {"coverage",0,0,'c'}, - {"read-length",0,0,'l'}, - {"insert-size",0,0,'i'}, - {"most-inserts",0,0,'m'}, - {"trim-quality",0,0,'q'}, + {"ref-seq",1,0,'r'}, + {"coverage",1,0,'c'}, + {"read-length",1,0,'l'}, + {"insert-size",1,0,'i'}, + {"most-inserts",1,0,'m'}, + {"trim-quality",1,0,'q'}, {"target-regions",0,0,'t'}, - {"required-flag",0,0,'f'}, + {"required-flag",1,0,'f'}, {"filtering-flag",0,0,'F'}, + {"id",1,0,'I'}, + {"GC-depth",1,0,1}, {0,0,0,0} }; int opt; - while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:",loptions,NULL))>0 ) + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) { switch (opt) { @@ -1279,6 +1321,14 @@ int main(int argc, char *argv[]) if (stats->fai==0) error("Could not load faidx: %s\n", optarg); break; + case 1 : { + float flen,fbin; + if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) + error("Unable to parse --GC-depth %s\n", optarg); + stats->gcd_bin_size = fbin; + stats->ngcd = flen/fbin; + } + break; case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) error("Unable to parse -c %s\n", optarg); break; @@ -1287,6 +1337,7 @@ int main(int argc, char *argv[]) case 'm': stats->isize_main_bulk = atof(optarg); break; case 'q': stats->trim_qual = atoi(optarg); break; case 't': targets = optarg; break; + case 'I': group_id = optarg; break; case '?': case 'h': error(NULL); default: error("Unknown argument: %s\n", optarg); @@ -1319,6 +1370,7 @@ int main(int argc, char *argv[]) if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) error("Failed to open: %s\n", bam_fname); stats->sam = sam; + if ( group_id ) init_group_id(stats, group_id); bam1_t *bam_line = bam_init1(); // .. arrays stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); @@ -1329,6 +1381,7 @@ int main(int argc, char *argv[]) stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); + stats->nref_seq = stats->gcd_bin_size; stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t)); stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); @@ -1391,6 +1444,7 @@ int main(int argc, char *argv[]) free(stats->del_cycles_2nd); destroy_regions(stats); free(stats); + if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); return 0; } diff --git a/misc/plot-bamcheck b/misc/plot-bamcheck index d5857f1..1792c6f 100755 --- a/misc/plot-bamcheck +++ b/misc/plot-bamcheck @@ -349,6 +349,7 @@ sub plot_qualities set size 0.4,0.8 unset ytics set y2tics mirror + set yrange [0:$yrange] unset ylabel set xlabel "Cycle (rev reads)" set label "$$args{title}" at screen 0.5,0.95 center @@ -662,7 +663,8 @@ sub plot_coverage my @vals; for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } - my $p99 = percentile(99.8,@vals); + my $i = percentile(99.8,@vals); + my $p99 = $$opts{dat}{COV}[$i][1]; my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); my $fh = $$args{fh}; -- 2.39.2