From 8928103fafd05b9b555f9ae1710031666725867a Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 27 Mar 2012 14:49:59 +0100 Subject: [PATCH] Initial release of bamcheck and plot-bamcheck --- misc/bamcheck.c | 1287 ++++++++++++++++++++++++++++++++++++++++++++ misc/plot-bamcheck | 877 ++++++++++++++++++++++++++++++ 2 files changed, 2164 insertions(+) create mode 100644 misc/bamcheck.c create mode 100755 misc/plot-bamcheck diff --git a/misc/bamcheck.c b/misc/bamcheck.c new file mode 100644 index 0000000..7b5fd02 --- /dev/null +++ b/misc/bamcheck.c @@ -0,0 +1,1287 @@ +/* + Author: petr.danecek@sanger + gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam + + Assumptions, approximations and other issues: + - GC-depth graph does not split reads, the starting position determines which bin is incremented. + There are small overlaps between bins (max readlen-1). However, the bins are big (20k). + - coverage distribution ignores softclips and deletions + - some stats require sorted BAMs + - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. + - The whole reads are used with -t, no splicing is done, no indels or soft clips are + considered, even small overlap is good enough to include the read in the stats. +*/ + +#define BAMCHECK_VERSION "2012-03-22" + +#define _ISOC99_SOURCE +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include "faidx.h" +#include "khash.h" +#include "sam.h" +#include "razf.h" + +#define BWA_MIN_RDLEN 35 +#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) +#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) +#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) +#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) +#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) +#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) +#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) + +typedef struct +{ + int32_t line_len, line_blen; + int64_t len; + uint64_t offset; +} +faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) +KHASH_MAP_INIT_STR(str, int) +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(s) *hash; +}; + +typedef struct +{ + float gc; + uint32_t depth; +} +gc_depth_t; + +// For coverage distribution, a simple pileup +typedef struct +{ + int64_t pos; + int size, start; + int *buffer; +} +round_buffer_t; + +typedef struct { uint32_t from, to; } pos_t; +typedef struct +{ + int npos,mpos,cpos; + pos_t *pos; +} +regions_t; + +typedef struct +{ + // Parameters + int trim_qual; // bwa trim quality + int rmdup; // Exclude reads marked as duplicates from the stats + + // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), + // insert size histogram holder + int nquals; // The number of quality bins + int nbases; // The maximum sequence length the allocated array can hold + int nisize; // The maximum insert size that the allocated array can hold + int ngc; // The size of gc_1st and gc_2nd + int nindels; // The maximum indel length for indel distribution + + // Arrays for the histogram data + uint64_t *quals_1st, *quals_2nd; + uint64_t *gc_1st, *gc_2nd; + uint64_t *isize_inward, *isize_outward, *isize_other; + uint64_t *acgt_cycles; + uint64_t *read_lengths; + uint64_t *insertions, *deletions; + uint64_t *ins_cycles, *del_cycles; + + // The extremes encountered + int max_len; // Maximum read length + int max_qual; // Maximum quality + float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part + int is_sorted; + + // Summary numbers + uint64_t total_len; + uint64_t total_len_dup; + uint64_t nreads_1st; + uint64_t nreads_2nd; + uint64_t nreads_dup; + uint64_t nreads_unmapped; + uint64_t nreads_unpaired; + uint64_t nreads_paired; + uint64_t nreads_mq0; + uint64_t nbases_mapped; + uint64_t nbases_mapped_cigar; + uint64_t nbases_trimmed; // bwa trimmed bases + uint64_t nmismatches; + + // GC-depth related data + uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin + gc_depth_t *gcd; // The GC-depth bins holder + int gcd_bin_size; // The size of GC-depth bin + int32_t tid, gcd_pos; // Position of the current bin + int32_t pos; // Position of the last read + + // Coverage distribution related data + int ncov; // The number of coverage bins + uint64_t *cov; // The coverage frequencies + int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins + round_buffer_t cov_rbuf; // Pileup round buffer + + // Mismatches by read cycle + uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against + int nref_seq; // The size of the buffer + int32_t rseq_pos; // The coordinate of the first base in the buffer + int32_t rseq_len; // The used part of the buffer + uint64_t *mpc_buf; // Mismatches per cycle + + // Filters + int filter_readlen; + + // Target regions + int nregions; + regions_t *regions; + + // Auxiliary data + double sum_qual; // For calculating average quality value + samfile_t *sam; // Unused + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output + char **argv; +} +stats_t; + +void error(const char *format, ...); + +// Coverage distribution methods +inline int coverage_idx(int min, int max, int n, int step, int depth) +{ + if ( depth < min ) + return 0; + + if ( depth > max ) + return n-1; + + return 1 + (depth - min) / step; +} + +inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) +{ + return (offset + (pos-refpos) % size) % size; +} + +void round_buffer_flush(stats_t *stats, int64_t pos) +{ + int ibuf,idp; + + if ( pos==stats->cov_rbuf.pos ) + return; + + int64_t new_pos = pos; + if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) + { + // Flush the whole buffer, but in sequential order, + pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1; + } + + if ( pos < stats->cov_rbuf.pos ) + error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); + + int ifrom = stats->cov_rbuf.start; + int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) + { + if ( !stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + { + if ( !stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); + stats->cov_rbuf.pos = new_pos; +} + +void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) +{ + if ( to-from >= rbuf->size ) + error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); + if ( from < rbuf->pos ) + error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); + + int ifrom,ito,ibuf; + ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); + ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibufsize; ibuf++) + rbuf->buffer[ibuf]++; + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + rbuf->buffer[ibuf]++; +} + +// Calculate the number of bases in the read trimmed by BWA +int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) +{ + if ( lenmax_sum ) + { + max_sum = sum; + // This is the correct way, but bwa clips from some reason one base less + // max_l = l+1; + max_l = l; + } + } + return max_l; +} + + +void count_indels(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int icig; + int icycle = 0; + int read_len = bam_line->core.l_qseq; + for (icig=0; icigcore.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + + if ( cig==1 ) + { + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); + stats->ins_cycles[idx]++; + icycle += ncig; + if ( ncig<=stats->nindels ) + stats->insertions[ncig-1]++; + continue; + } + if ( cig==2 ) + { + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); + stats->del_cycles[idx]++; + if ( ncig<=stats->nindels ) + stats->deletions[ncig-1]++; + continue; + } + icycle += ncig; + } +} + +void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int icig,iread=0,icycle=0; + int iref = bam_line->core.pos - stats->rseq_pos; + int read_len = bam_line->core.l_qseq; + uint8_t *read = bam1_seq(bam_line); + uint8_t *quals = bam1_qual(bam_line); + uint64_t *mpc_buf = stats->mpc_buf; + for (icig=0; icigcore.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==1 ) + { + iread += ncig; + icycle += ncig; + continue; + } + if ( cig==2 ) + { + iref += ncig; + continue; + } + if ( cig==4 ) + { + icycle += ncig; + // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence + // iref += ncig; + iread += ncig; + continue; + } + if ( cig==5 ) + { + icycle += ncig; + continue; + } + if ( cig!=0 ) + error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line)); + + if ( ncig+iref > stats->rseq_len ) + error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1); + + int im; + for (im=0; imrseq_buf[iref]; + + // ---------------15 + // =ACMGRSVTWYHKDBN + if ( cread==15 ) + { + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + idx = idx*stats->nquals; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + else if ( cref && cread && cref!=cread ) + { + uint8_t qual = quals[iread] + 1; + if ( qual>=stats->nquals ) + error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals); + + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + + idx = idx*stats->nquals + qual; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + + iref++; + iread++; + icycle++; + } + } +} + +void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) +{ + khash_t(s) *h; + khiter_t iter; + faidx1_t val; + char *chr, c; + faidx_t *fai = stats->fai; + + h = fai->hash; + chr = stats->sam->header->target_name[tid]; + + // ID of the sequence name + iter = kh_get(s, h, chr); + if (iter == kh_end(h)) + error("No such reference sequence [%s]?\n", chr); + val = kh_value(h, iter); + + // Check the boundaries + if (pos >= val.len) + error("Was the bam file mapped with the reference sequence supplied?" + " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len); + int size = stats->nref_seq; + // The buffer extends beyond the chromosome end. Later the rest will be filled with N's. + if (size+pos > val.len) size = val.len-pos; + + // Position the razf reader + razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET); + + uint8_t *ptr = stats->rseq_buf; + int nread = 0; + while ( nreadrz,&c,1) && !fai->rz->z_err ) + { + if ( !isgraph(c) ) + continue; + + // Conversion between uint8_t coding and ACGT + // -12-4---8------- + // =ACMGRSVTWYHKDBN + if ( c=='A' || c=='a' ) + *ptr = 1; + else if ( c=='C' || c=='c' ) + *ptr = 2; + else if ( c=='G' || c=='g' ) + *ptr = 4; + else if ( c=='T' || c=='t' ) + *ptr = 8; + else + *ptr = 0; + ptr++; + nread++; + } + if ( nread < stats->nref_seq ) + { + memset(ptr,0, stats->nref_seq - nread); + nread = stats->nref_seq; + } + stats->rseq_len = nread; + stats->rseq_pos = pos; + stats->tid = tid; +} + +float fai_gc_content(stats_t *stats) +{ + uint32_t gc,count,c; + int i,size = stats->rseq_len; + + // Count GC content + gc = count = 0; + for (i=0; irseq_buf[i]; + if ( c==2 || c==4 ) + { + gc++; + count++; + } + else if ( c==1 || c==8 ) + count++; + } + return count ? (float)gc/count : 0; +} + + +void realloc_buffers(stats_t *stats, int seq_len) +{ + int n = 2*(1 + seq_len - stats->nbases) + stats->nbases; + + stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_2nd ) + error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + if ( stats->mpc_buf ) + { + stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->mpc_buf ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + } + + stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t)); + if ( !stats->acgt_cycles ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t)); + memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t)); + + stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); + if ( !stats->read_lengths ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t)); + if ( !stats->insertions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t)); + if ( !stats->deletions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->ins_cycles = realloc(stats->ins_cycles, n*sizeof(uint64_t)); + if ( !stats->ins_cycles ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->ins_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles = realloc(stats->del_cycles, n*sizeof(uint64_t)); + if ( !stats->del_cycles ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->del_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->nbases = n; + + // Realloc the coverage distribution buffer + int *rbuffer = calloc(sizeof(int),seq_len*5); + n = stats->cov_rbuf.size-stats->cov_rbuf.start; + memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); + if ( stats->cov_rbuf.start>1 ) + memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start); + stats->cov_rbuf.start = 0; + free(stats->cov_rbuf.buffer); + stats->cov_rbuf.buffer = rbuffer; + stats->cov_rbuf.size = seq_len*5; +} + +void collect_stats(bam1_t *bam_line, stats_t *stats) +{ + if ( stats->rmdup && IS_DUP(bam_line) ) + return; + + int seq_len = bam_line->core.l_qseq; + if ( !seq_len ) return; + if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return; + if ( seq_len >= stats->nbases ) + realloc_buffers(stats,seq_len); + if ( stats->max_lenmax_len = seq_len; + + stats->read_lengths[seq_len]++; + + // Count GC and ACGT per cycle + uint8_t base, *seq = bam1_seq(bam_line); + int gc_count = 0; + int i; + int reverse = IS_REVERSE(bam_line); + for (i=0; i2 ) base=3; + if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) + error("FIXME: acgt_cycles\n"); + stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++; + } + int gc_idx_min = gc_count*(stats->ngc-1)/seq_len; + int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len; + if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1; + + // Determine which array (1st or 2nd read) will these stats go to, + // trim low quality bases from end the same way BWA does, + // fill GC histogram + uint64_t *quals; + uint8_t *bam_quals = bam1_qual(bam_line); + if ( bam_line->core.flag&BAM_FREAD2 ) + { + quals = stats->quals_2nd; + stats->nreads_2nd++; + for (i=gc_idx_min; igc_2nd[i]++; + } + else + { + quals = stats->quals_1st; + stats->nreads_1st++; + for (i=gc_idx_min; igc_1st[i]++; + } + if ( stats->trim_qual>0 ) + stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); + + // Quality histogram and average quality + for (i=0; i=stats->nquals ) + error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals); + if ( qual>stats->max_qual ) + stats->max_qual = qual; + + quals[ i*stats->nquals+qual ]++; + stats->sum_qual += qual; + } + + // Look at the flags and increment appropriate counters (mapped, paired, etc) + if ( IS_UNMAPPED(bam_line) ) + stats->nreads_unmapped++; + else + { + if ( !bam_line->core.qual ) + stats->nreads_mq0++; + + count_indels(stats,bam_line); + + // The insert size is tricky, because for long inserts the libraries are + // prepared differently and the pairs point in other direction. BWA does + // not set the paired flag for them. Similar thing is true also for 454 + // reads. Therefore, do the insert size stats for all mapped reads. + int32_t isize = bam_line->core.isize; + if ( isize<0 ) isize = -isize; + if ( IS_PAIRED(bam_line) && isize!=0 ) + { + stats->nreads_paired++; + if ( isize >= stats->nisize ) + isize=stats->nisize-1; + + int pos_fst = bam_line->core.mpos - bam_line->core.pos; + int is_fst = IS_READ1(bam_line) ? 1 : -1; + int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; + int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; + + if ( is_fwd*is_mfwd>0 ) + stats->isize_other[isize]++; + else if ( is_fst*pos_fst>0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_inward[isize]++; + else + stats->isize_outward[isize]++; + } + else if ( is_fst*pos_fst<0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_outward[isize]++; + else + stats->isize_inward[isize]++; + } + } + else + stats->nreads_unpaired++; + + // Number of mismatches + uint8_t *nm = bam_aux_get(bam_line,"NM"); + if (nm) + stats->nmismatches += bam_aux2i(nm); + + // Number of mapped bases from cigar + if ( bam_line->core.n_cigar == 0) + error("FIXME: mapped read with no cigar?\n"); + int readlen = seq_len; + for (i=0; icore.n_cigar; i++) + { + // Conversion from uint32_t to MIDNSHP + // 01--4-- + // MIDNSHP + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) + stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) + readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + stats->nbases_mapped += seq_len; + + if ( stats->tid==bam_line->core.tid && bam_line->core.pospos ) + stats->is_sorted = 0; + stats->pos = bam_line->core.pos; + + if ( stats->is_sorted ) + { + if ( stats->tid==-1 || stats->tid!=bam_line->core.tid ) + round_buffer_flush(stats,-1); + + // Mismatches per cycle and GC-depth graph + if ( stats->fai ) + { + // First pass, new chromosome or sequence spanning beyond the end of the buffer + if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen ) + { + read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); + + // Get the reference GC content for this bin. Note that in the current implementation + // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the + // expected read length only ~100bp, it shouldn't really matter. + stats->gcd_pos = bam_line->core.pos; + stats->igcd++; + + if ( stats->igcd >= stats->ngcd ) + error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd); + + stats->gcd[ stats->igcd ].gc = fai_gc_content(stats); + } + count_mismatches_per_cycle(stats,bam_line); + } + else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) + { + // First pass or a new chromosome + stats->tid = bam_line->core.tid; + stats->gcd_pos = bam_line->core.pos; + stats->igcd++; + + if ( stats->igcd >= stats->ngcd ) + { + uint32_t n = 2*(1 + stats->ngcd); + stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t)); + if ( !stats->gcd ) + error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n); + memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t)); + } + } + + stats->gcd[ stats->igcd ].depth++; + // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK) + if ( !stats->fai ) + stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len; + + // Coverage distribution graph + round_buffer_flush(stats,bam_line->core.pos); + round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1); + } + } + + stats->total_len += seq_len; + if ( IS_DUP(bam_line) ) + { + stats->total_len_dup += seq_len; + stats->nreads_dup++; + } +} + +// Sort by GC and depth +#define GCD_t(x) ((gc_depth_t *)x) +static int gcd_cmp(const void *a, const void *b) +{ + if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1; + if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1; + if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1; + if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1; + return 0; +} +#undef GCD_t + +float gcd_percentile(gc_depth_t *gcd, int N, int p) +{ + float n,d; + int k; + + n = p*(N+1)/100; + k = n; + if ( k<=0 ) + return gcd[0].depth; + if ( k>=N ) + return gcd[N-1].depth; + + d = n - k; + return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth); +} + +void output_stats(stats_t *stats) +{ + // Calculate average insert size and standard deviation (from the main bulk data only) + int isize, ibulk=0; + uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; + for (isize=1; isizenisize; isize++) + { + // Each pair was counted twice + stats->isize_inward[isize] *= 0.5; + stats->isize_outward[isize] *= 0.5; + stats->isize_other[isize] *= 0.5; + + nisize_inward += stats->isize_inward[isize]; + nisize_outward += stats->isize_outward[isize]; + nisize_other += stats->isize_other[isize]; + nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + } + + double bulk=0, avg_isize=0, sd_isize=0; + for (isize=1; isizenisize; isize++) + { + bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); + + if ( bulk/nisize > stats->isize_main_bulk ) + { + ibulk = isize+1; + nisize = bulk; + break; + } + } + avg_isize /= nisize ? nisize : 1; + for (isize=1; isizeisize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize; + sd_isize = sqrt(sd_isize); + + + printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION); + printf("# The command line was: %s",stats->argv[0]); + int i; + for (i=1; iargc; i++) + printf(" %s",stats->argv[i]); + printf("\n"); + printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); + printf("SN\tsequences:\t%ld\n", stats->nreads_1st+stats->nreads_2nd); + printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); + printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); + printf("SN\t1st fragments:\t%ld\n", stats->nreads_1st); + printf("SN\tlast fragments:\t%ld\n", stats->nreads_2nd); + printf("SN\treads mapped:\t%ld\n", stats->nreads_paired+stats->nreads_unpaired); + printf("SN\treads unmapped:\t%ld\n", stats->nreads_unmapped); + printf("SN\treads unpaired:\t%ld\n", stats->nreads_unpaired); + printf("SN\treads paired:\t%ld\n", stats->nreads_paired); + printf("SN\treads duplicated:\t%ld\n", stats->nreads_dup); + printf("SN\treads MQ0:\t%ld\n", stats->nreads_mq0); + printf("SN\ttotal length:\t%ld\n", stats->total_len); + printf("SN\tbases mapped:\t%ld\n", stats->nbases_mapped); + printf("SN\tbases mapped (cigar):\t%ld\n", stats->nbases_mapped_cigar); + printf("SN\tbases trimmed:\t%ld\n", stats->nbases_trimmed); + printf("SN\tbases duplicated:\t%ld\n", stats->total_len_dup); + printf("SN\tmismatches:\t%ld\n", stats->nmismatches); + printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); + float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; + printf("SN\taverage length:\t%.0f\n", avg_read_length); + printf("SN\tmaximum length:\t%d\n", stats->max_len); + printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); + printf("SN\tinsert size average:\t%.1f\n", avg_isize); + printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); + printf("SN\tinward oriented pairs:\t%ld\n", nisize_inward); + printf("SN\toutward oriented pairs:\t%ld\n", nisize_outward); + printf("SN\tpairs with other orientation:\t%ld\n", nisize_other); + + int ibase,iqual; + if ( stats->max_lennbases ) stats->max_len++; + if ( stats->max_qual+1nquals ) stats->max_qual++; + printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("FFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", stats->quals_1st[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("LFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", stats->quals_2nd[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + if ( stats->mpc_buf ) + { + printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n"); + printf("# is the number of N's and the rest is the number of mismatches\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("MPC\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", stats->mpc_buf[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + } + printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n"); + int ibase_prev = 0; + for (ibase=0; ibasengc; ibase++) + { + if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; + printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_1st[ibase_prev]); + ibase_prev = ibase; + } + printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); + ibase_prev = 0; + for (ibase=0; ibasengc; ibase++) + { + if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; + printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_2nd[ibase_prev]); + ibase_prev = ibase; + } + printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); + for (ibase=0; ibasemax_len; ibase++) + { + uint64_t *ptr = &(stats->acgt_cycles[ibase*4]); + uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3]; + if ( ! sum ) continue; + printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); + } + printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); + for (isize=1; isizeisize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), + stats->isize_inward[isize],stats->isize_outward[isize],stats->isize_other[isize]); + + printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); + int ilen; + for (ilen=0; ilenmax_len; ilen++) + { + if ( stats->read_lengths[ilen]>0 ) + printf("RL\t%d\t%ld\n", ilen,stats->read_lengths[ilen]); + } + + printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); + for (ilen=0; ilennindels; ilen++) + { + if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) + printf("ID\t%d\t%ld\t%ld\n", ilen+1,stats->insertions[ilen],stats->deletions[ilen]); + } + + printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n"); + for (ilen=0; ilennbases; ilen++) + { + if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 ) + printf("IC\t%d\t%ld\t%ld\n", ilen+1,stats->ins_cycles[ilen],stats->del_cycles[ilen]); + } + + printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n"); + printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1,stats->cov[0]); + int icov; + for (icov=1; icovncov-1; icov++) + printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1,stats->cov[icov]); + printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov[stats->ncov-1]); + + + // Calculate average GC content, then sort by GC and depth + printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); + uint32_t igcd; + for (igcd=0; igcdigcd; igcd++) + { + if ( stats->fai ) + stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc); + else + if ( stats->gcd[igcd].depth ) + stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth); + } + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + igcd = 0; + while ( igcd < stats->igcd ) + { + // Calculate percentiles (10,25,50,75,90th) for the current GC content and print + uint32_t nbins=0, itmp=igcd; + float gc = stats->gcd[igcd].gc; + while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + { + nbins++; + itmp++; + } + printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), + gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size + ); + igcd += nbins; + } +} + +void bam_init_header_hash(bam_header_t *header); + +void init_regions(stats_t *stats, char *file) +{ + khiter_t iter; + khash_t(str) *header_hash; + + bam_init_header_hash(stats->sam->header); + header_hash = (khash_t(str)*)stats->sam->header->hash; + + FILE *fp = fopen(file,"r"); + if ( !fp ) error("%s: %s\n",file,strerror(errno)); + + char *line = NULL; + size_t len = 0; + ssize_t nread; + int warned = 0; + int prev_tid=-1, prev_pos=-1; + while ((nread = getline(&line, &len, fp)) != -1) + { + if ( line[0] == '#' ) continue; + + int i = 0; + while ( i=nread ) error("Could not parse the file: %s\n", file); + line[i] = 0; + + iter = kh_get(str, header_hash, line); + int tid = kh_val(header_hash, iter); + if ( iter == kh_end(header_hash) ) + { + if ( !warned ) + fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line); + warned = 1; + continue; + } + + if ( tid >= stats->nregions ) + { + stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); + int j; + for (j=stats->nregions; jnregions+100; j++) + { + stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; + stats->regions[j].pos = NULL; + } + stats->nregions += 100; + } + int npos = stats->regions[tid].npos; + if ( npos >= stats->regions[tid].mpos ) + { + stats->regions[tid].mpos += 1000; + stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); + } + + if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n"); + if ( prev_tid==-1 || prev_tid!=tid ) + { + prev_tid = tid; + prev_pos = stats->regions[tid].pos[npos].from; + } + if ( prev_pos>stats->regions[tid].pos[npos].from ) + error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos); + stats->regions[tid].npos++; + } + if (line) free(line); + fclose(fp); +} + +void destroy_regions(stats_t *stats) +{ + int i; + for (i=0; inregions; i++) + { + if ( !stats->regions[i].mpos ) continue; + free(stats->regions[i].pos); + } + if ( stats->regions ) free(stats->regions); +} + +static int fetch_read(const bam1_t *bam_line, void *data) +{ + collect_stats((bam1_t*)bam_line,(stats_t*)data); + return 1; +} + + +void error(const char *format, ...) +{ + if ( !format ) + { + printf("Version: %s\n", BAMCHECK_VERSION); + printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n"); + printf("Usage: bamcheck [OPTIONS] file.bam\n"); + printf(" bamcheck [OPTIONS] file.bam chr:from-to\n"); + printf("Options:\n"); + printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); + printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); + printf(" -h, --help This help message\n"); + printf(" -i, --insert-size Maximum insert size [8000]\n"); + printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); + printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); + printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); + printf(" -r, --ref-seq Reference sequence (required for GC-depth calculation).\n"); + printf(" -t, --target-regions Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n"); + printf(" -s, --sam Input is SAM\n"); + printf("\n"); + } + else + { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + } + exit(-1); +} + +int main(int argc, char *argv[]) +{ + char *targets = NULL; + char *bam_fname = NULL; + samfile_t *sam = NULL; + char in_mode[5]; + + stats_t *stats = calloc(1,sizeof(stats_t)); + stats->ngc = 200; + stats->nquals = 95; + stats->nbases = 300; + stats->nisize = 8000; + stats->max_len = 30; + stats->max_qual = 40; + stats->isize_main_bulk = 0.99; // There are always outliers at the far end + stats->gcd_bin_size = 20000; + stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big + stats->nref_seq = stats->gcd_bin_size; + stats->rseq_pos = -1; + stats->tid = stats->gcd_pos = -1; + stats->is_sorted = 1; + stats->cov_min = 1; + stats->cov_max = 1000; + stats->cov_step = 1; + stats->argc = argc; + stats->argv = argv; + stats->filter_readlen = -1; + stats->nindels = stats->nbases; + + strcpy(in_mode, "rb"); + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"remove-dups",0,0,'d'}, + {"sam",0,0,'s'}, + {"ref-seq",0,0,'r'}, + {"coverage",0,0,'c'}, + {"read-length",0,0,'l'}, + {"insert-size",0,0,'i'}, + {"most-inserts",0,0,'m'}, + {"trim-quality",0,0,'q'}, + {"target-regions",0,0,'t'}, + {0,0,0,0} + }; + int opt; + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 ) + { + switch (opt) + { + case 'd': stats->rmdup=1; break; + case 's': strcpy(in_mode, "r"); break; + case 'r': stats->fai = fai_load(optarg); + if (stats->fai==0) + error("Could not load faidx: %s\n", optarg); + break; + case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) + error("Unable to parse -c %s\n", optarg); + break; + case 'l': stats->filter_readlen = atoi(optarg); break; + case 'i': stats->nisize = atoi(optarg); break; + case 'm': stats->isize_main_bulk = atof(optarg); break; + case 'q': stats->trim_qual = atoi(optarg); break; + case 't': targets = optarg; break; + case '?': + case 'h': error(NULL); + default: error("Unknown argument: %s\n", optarg); + } + } + if ( optindcov_step > stats->cov_max - stats->cov_min + 1 ) + { + stats->cov_step = stats->cov_max - stats->cov_min; + if ( stats->cov_step <= 0 ) + stats->cov_step = 1; + } + stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step; + stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1; + stats->cov = calloc(sizeof(uint64_t),stats->ncov); + stats->cov_rbuf.size = stats->nbases*5; + stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); + // .. bam + if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) + error("Failed to open: %s\n", bam_fname); + stats->sam = sam; + bam1_t *bam_line = bam_init1(); + // .. arrays + stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); + stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); + stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); + stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); + stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t)); + stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; + stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); + stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); + stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); + stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); + stats->ins_cycles = calloc(stats->nbases,sizeof(uint64_t)); + stats->del_cycles = calloc(stats->nbases,sizeof(uint64_t)); + if ( targets ) + init_regions(stats, targets); + + // Collect statistics + if ( optindsam->header, argv[i], &tid, &beg, &end); + if ( tid < 0 ) continue; + bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read); + } + bam_index_destroy(bam_idx); + } + else + { + // Stream through the entire BAM ignoring off-target regions if -t is given + while (samread(sam,bam_line) >= 0) + { + if ( stats->regions ) + { + if ( bam_line->core.tid >= stats->nregions ) continue; + if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); + + regions_t *reg = &stats->regions[bam_line->core.tid]; + if ( reg->cpos==reg->npos ) continue; // done for this chr + + // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, + // even small overlap is enough to include the read in the stats. + int i = reg->cpos; + while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; + if ( i>=reg->npos ) { reg->cpos = reg->npos; continue; } + if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) continue; + reg->cpos = i; + } + collect_stats(bam_line,stats); + } + } + round_buffer_flush(stats,-1); + + output_stats(stats); + + bam_destroy1(bam_line); + samclose(stats->sam); + if (stats->fai) fai_destroy(stats->fai); + free(stats->cov_rbuf.buffer); free(stats->cov); + free(stats->quals_1st); free(stats->quals_2nd); + free(stats->gc_1st); free(stats->gc_2nd); + free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other); + free(stats->gcd); + free(stats->rseq_buf); + free(stats->mpc_buf); + free(stats->acgt_cycles); + free(stats->read_lengths); + free(stats->insertions); + free(stats->deletions); + free(stats->ins_cycles); + free(stats->del_cycles); + destroy_regions(stats); + free(stats); + + return 0; +} + + + diff --git a/misc/plot-bamcheck b/misc/plot-bamcheck new file mode 100755 index 0000000..adaebe6 --- /dev/null +++ b/misc/plot-bamcheck @@ -0,0 +1,877 @@ +#!/usr/bin/env perl +# +# Author: petr.danecek@sanger +# + +use strict; +use warnings; +use Carp; + +my $opts = parse_params(); +parse_bamcheck($opts); +plot_qualities($opts); +plot_acgt_cycles($opts); +plot_gc($opts); +plot_gc_depth($opts); +plot_isize($opts); +plot_coverage($opts); +plot_mismatches_per_cycle($opts); +plot_indel_dist($opts); +plot_indel_cycles($opts); + +exit; + +#-------------------------------- + +sub error +{ + my (@msg) = @_; + if ( scalar @msg ) { confess @msg; } + die + "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", + " plot-bamcheck -p outdir/ file.bam.bc\n", + "Options:\n", + " -k, --keep-files Do not remove temporary files.\n", + " -p, --prefix The output files prefix, add a slash to create new directory.\n", + " -r, --ref-stats Optional reference stats file with expected GC content (created with -s).\n", + " -s, --do-ref-stats Calculate reference sequence GC for later use with -r\n", + " -t, --targets Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", + " -h, -?, --help This help message.\n", + "\n"; +} + + +sub parse_params +{ + $0 =~ s{^.+/}{}; + my $opts = { args=>join(' ',$0,@ARGV) }; + while (defined(my $arg=shift(@ARGV))) + { + if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } + if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } + if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } + if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } + if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } + error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); + } + if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } + if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } + if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } + if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } + elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } + return $opts; +} + + +# Creates GC stats for either the whole reference or only on target regions for exome QC +sub do_ref_stats +{ + my ($opts) = @_; + + + my %targets = (); + if ( exists($$opts{targets}) ) + { + my ($prev_chr,$prev_pos); + open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my ($chr,$from,$to) = split(/\s+/,$line); + chomp($to); + push @{$targets{$chr}}, $from,$to; + if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } + if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } + $prev_pos = $from; + } + close($fh); + } + + my $_len = 60; # for now do only standard fasta's with 60 bases per line + my %gc_counts = (); + my ($skip_chr,$pos,$ireg,$regions); + open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^>/ ) + { + if ( !scalar %targets ) { next; } + + if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } + if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } + undef $skip_chr; + $pos = 0; + $ireg = 0; + $regions = $targets{$1}; + } + if ( defined $skip_chr ) { next; } + + # Only $_len sized lines are considered and no chopping for target regions. + chomp($line); + my $len = length($line); + if ( $len ne $_len ) { next; } + + if ( scalar %targets ) + { + while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } + $pos += $len; + if ( $ireg==@$regions ) { next; } + if ( $pos < $$regions[$ireg] ) { next; } + } + + my $gc_count = 0; + for (my $i=0; $i<$len; $i++) + { + my $base = substr($line,$i,1); + if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } + } + $gc_counts{$gc_count}++; + } + + print "# Generated by $$opts{args}\n"; + print "# The columns are: GC content bin, normalized frequency\n"; + my $max; + for my $count (values %gc_counts) + { + if ( !defined $max or $count>$max ) { $max=$count; } + } + for my $gc (sort {$a<=>$b} keys %gc_counts) + { + if ( $gc==0 ) { next; } + printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; + } +} + +sub plot +{ + my ($cmdfile) = @_; + my $cmd = "gnuplot $cmdfile"; + system($cmd); + if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } +} + + +sub parse_bamcheck +{ + my ($opts) = @_; + open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); + my $line = <$fh>; + if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } + $$opts{dat}{version} = $1; + while ($line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my @items = split(/\t/,$line); + chomp($items[-1]); + if ( $items[0] eq 'SN' ) + { + $$opts{dat}{$items[1]} = splice(@items,2); + next; + } + push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; + } + close($fh); + + # Check sanity + if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) + { + error("Sanity check failed: no sequences found by bamcheck??\n"); + } +} + +sub older_than +{ + my ($opts,$version) = @_; + my ($year,$month,$day) = split(/-/,$version); + $version = $$opts{dat}{version}; + if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } + if ( $1<$year ) { return 1; } + elsif ( $1>$year ) { return 0; } + if ( $2<$month ) { return 1; } + elsif ( $2>$month ) { return 0; } + if ( $3<$day ) { return 1; } + return 0; +} + +sub get_defaults +{ + my ($opts,$img_fname,%args) = @_; + + if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } + + # Determine the gnuplot script file name + my $gp_file = $img_fname; + $gp_file =~ s{\.[^.]+$}{.gp}; + if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } + + # Determine the default title: + # 5446_6/5446_6.bam.bc.gp -> 5446_6 + # test.aaa.png -> test.aaa + if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } + my $title = $1; + + my $dir = $gp_file; + $dir =~ s{/[^/]+$}{}; + if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } + + my $wh = exists($args{wh}) ? $args{wh} : '600,400'; + + open(my $fh,'>',$gp_file) or error("$gp_file: $!"); + return { + title => $title, + gp => $gp_file, + img => $img_fname, + fh => $fh, + terminal => qq[set terminal png size $wh truecolor], + grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', + }; +} + +sub percentile +{ + my ($p,@vals) = @_; + my $N = 0; + for my $val (@vals) { $N += $val; } + my $n = $p*($N+1)/100.; + my $k = int($n); + my $d = $n-$k; + if ( $k<=0 ) { return 0; } + if ( $k>=$N ) { return scalar @vals-1; } + my $cnt; + for (my $i=0; $i<@vals; $i++) + { + $cnt += $vals[$i]; + if ( $cnt>=$k ) { return $i; } + } + error("FIXME: this should not happen [percentile]\n"); +} + +sub plot_qualities +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } + + my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50; + my $is_paired = $$opts{dat}{'is paired:'}; + + # Average quality per cycle, forward and reverse reads in one plot + my $args = get_defaults($opts,"$$opts{prefix}quals.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Average Quality" + set xlabel "Cycle" + set yrange [0:$yrange] + set title "$$args{title}" + plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ + ]; + my (@fp75,@fp50,@fmean); + my (@lp75,@lp50,@lmean); + my ($fmax,$fmax_qual,$fmax_cycle); + my ($lmax,$lmax_qual,$lmax_cycle); + for my $cycle (@{$$opts{dat}{FFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @fp75, "$$cycle[0]\t$p25\t$p75\n"; + push @fp50, "$$cycle[0]\t$p50\n"; + push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $fmean[-1]; + } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{LFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @lp75, "$$cycle[0]\t$p25\t$p75\n"; + push @lp50, "$$cycle[0]\t$p50\n"; + push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $lmean[-1]; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots + $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); + $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.1,0.1 + set size 0.4,0.8 + set yrange [0:$yrange] + set ylabel "Quality" + set xlabel "Cycle (fwd reads)" + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' + ]; + print $fh join('',@fp75),"end\n"; + print $fh join('',@fp50),"end\n"; + print $fh join('',@fmean),"end\n"; + if ( $is_paired ) + { + print $fh qq[ + set origin 0.55,0.1 + set size 0.4,0.8 + unset ytics + set y2tics mirror + unset ylabel + set xlabel "Cycle (rev reads)" + set label "$$args{title}" at screen 0.5,0.95 center + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' + ]; + print $fh join('',@lp75),"end\n"; + print $fh join('',@lp50),"end\n"; + print $fh join('',@lmean),"end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve + $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); + $fh = $$args{fh}; + my $nquals = @{$$opts{dat}{FFQ}[0]}-1; + my $ncycles = @{$$opts{dat}{FFQ}}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.15,0.52 + set size 0.8,0.4 + set title "$$args{title}" + set ylabel "Frequency (fwd reads)" + set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax + unset xlabel + set xrange [0:$nquals] + set format x "" + ]; + my @plots; + for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } + print $fh "end\n"; + } + if ( $is_paired ) + { + print $fh qq[ + set origin 0.15,0.1 + set size 0.8,0.4 + unset title + unset format + set xtics + set xlabel "Quality" + unset label + set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax + set ylabel "Frequency (rev reads)" + ]; + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) + { + print $fh "$iqual\t$$cycle[$iqual]\n"; + } + print $fh "end\n"; + } + } + close($fh); + plot($$args{gp}); + + + # Heatmap qualitites + $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); + $fh = $$args{fh}; + my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax; + my @ytics; + for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + my $ytics = join(',', @ytics); + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + unset key + unset colorbox + set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) + set cbrange [0:$max] + set yrange [0:$ncycles] + set xrange [0:$nquals] + set view map + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0,0.46 + set size 0.95,0.6 + set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles + set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" + set ylabel "Cycle (fwd reads)" offset character -1,0 + unset ytics + set ytics ($ytics) + unset xtics + set title "$$args{title}" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + @ytics = (); + for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + $ytics = join(',', @ytics); + print $fh qq[ + set origin 0,0.03 + set size 0.95,0.6 + set ylabel "Cycle (rev reads)" offset character -1,0 + set xlabel "Base Quality" + unset title + unset ytics + set ytics ($ytics) + set xrange [0:$nquals] + set xtics + set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 + set cblabel "Number of bases" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_acgt_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "green" + set style line 2 linecolor rgb "red" + set style line 3 linecolor rgb "black" + set style line 4 linecolor rgb "blue" + set style increment user + set ylabel "Base content [%]" + set xlabel "Read Cycle" + set yrange [0:100] + set title "$$args{title}" + plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' + ]; + for my $base (1..4) + { + for my $cycle (@{$$opts{dat}{GCC}}) + { + print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc +{ + my ($opts) = @_; + + my $is_paired = $$opts{dat}{'is paired:'}; + my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); + my $fh = $$args{fh}; + my ($gcl_max,$gcf_max,$lmax,$fmax); + for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } + for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } + my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set title "$$args{title}" + set ylabel "Normalized Frequency" + set xlabel "GC Content [%]" + set yrange [0:1.1] + set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 + plot ] + . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') + . q['-' smooth csplines with lines lc 1 title 'First fragments' ] + . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') + . q[ + ]; + if ( exists($$opts{ref_stats}) ) + { + open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); + while (my $line=<$ref>) { print $fh $line } + close($ref); + print $fh "end\n"; + } + for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc_depth +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } + + # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. + my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); + for my $gc (@{$$opts{dat}{GCD}}) + { + for my $tic (@tics) + { + my $diff = abs($$gc[0]-$$tic{gc}); + if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } + } + } + + my @x2tics; + for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } + my $x2tics = join(',',@x2tics); + + my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Mapped depth" + set xlabel "Percentile of mapped sequence ordered by GC content" + set x2label "GC Content [%]" + set title "$$args{title}" + set x2tics ($x2tics) + set xtics nomirror + set xrange [0.1:99.9] + + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ + '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ + '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' + ]; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_isize +{ + my ($opts) = @_; + + if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } + + my ($isize_max,$isize_cnt); + for my $isize (@{$$opts{dat}{IS}}) + { + if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } + } + + my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set rmargin 5 + set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt + set ylabel "Number of pairs" + set xlabel "Insert Size" + set title "$$args{title}" + plot \\ + '-' with lines lc rgb 'black' title 'All pairs', \\ + '-' with lines title 'Inward', \\ + '-' with lines title 'Outward', \\ + '-' with lines title 'Other' + ]; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_coverage +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } + + my @vals; + for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } + my $p99 = percentile(99.8,@vals); + + my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Number of mapped bases" + set xlabel "Coverage" + set style fill solid border -1 + set title "$$args{title}" + set xrange [:$p99] + plot '-' with lines notitle + ]; + for my $cov (@{$$opts{dat}{COV}}) + { + if ( $$cov[2]==0 ) { next; } + print $fh "$$cov[1]\t$$cov[2]\n"; + } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_mismatches_per_cycle +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } + if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } + + my $nquals = @{$$opts{dat}{MPC}[0]} - 2; + my $ncycles = @{$$opts{dat}{MPC}}; + my ($style,$with); + if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } + else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "#e40000" + set style line 2 linecolor rgb "#ff9f00" + set style line 3 linecolor rgb "#eeee00" + set style line 4 linecolor rgb "#4ebd68" + set style line 5 linecolor rgb "#0061ff" + set style increment user + set key left top + $style + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "$$args{title}" + set xrange [-1:$ncycles] + plot '-' $with ti 'Base Quality>30', \\ + '-' $with ti '30>=Q>20', \\ + '-' $with ti '20>=Q>10', \\ + '-' $with ti '10>=Q', \\ + '-' $with ti "N's" + ]; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_dist +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style increment user + set ylabel "Indel count [log]" + set xlabel "Indel length" + set y2label "Insertions/Deletions ratio" + set log y + set y2tics nomirror + set ytics nomirror + set title "$$args{title}" + plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" + ]; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style increment user + set ylabel "Indel count" + set xlabel "Read Cycle" + set title "$$args{title}" + plot '-' w l ti 'Insertions', '' w l ti 'Deletions' + ]; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + + + + + + +sub has_values +{ + my ($opts,@tags) = @_; + for my $tag (@tags) + { + my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; + chomp($lines[0]); + if ( $lines[0]<2 ) { return 0; } + } + return 1; +} + +sub plot_mismatches_per_cycle_old +{ + my ($opts) = @_; + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; + my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; + chomp($nquals); + chomp($ncycles); + $nquals--; + $ncycles--; + my @gr0_15 = (2..17); + my @gr16_30 = (18..32); + my @gr31_n = (33..$nquals); + my $gr0_15 = '$'. join('+$',@gr0_15); + my $gr16_30 = '$'. join('+$',@gr16_30); + my $gr31_n = '$'. join('+$',@gr31_n); + + open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); + print $fh q[ + set terminal png size 600,400 truecolor font "DejaVuSansMono,9" + set output "] . $$args{img} . q[" + + set key left top + set style data histogram + set style histogram rowstacked + + set grid back lc rgb "#aaaaaa" + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "] . $$args{title} . qq[" + set xrange [-1:$ncycles] + + plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' + ]; + close($fh); + + plot($$args{gp}); +} + + -- 2.39.2