Initial release of bamcheck and plot-bamcheck

author Petr Danecek <pd3@sanger.ac.uk>

Tue, 27 Mar 2012 13:49:59 +0000 (21:49 +0800)

committer pd3 <pd3@sanger.ac.uk>

Tue, 27 Mar 2012 13:54:33 +0000 (21:54 +0800)
author Petr Danecek <pd3@sanger.ac.uk>
Tue, 27 Mar 2012 13:49:59 +0000 (21:49 +0800)
committer pd3 <pd3@sanger.ac.uk>
Tue, 27 Mar 2012 13:54:33 +0000 (21:54 +0800)
diff --git a/misc/bamcheck.c b/misc/bamcheck.c

new file mode 100644 (file)

index 0000000..7b5fd02
--- /dev/null
+++ b/misc/bamcheck.c
@@ -0,0 +1,1287 @@
+/* 
+    Author: petr.danecek@sanger
+    gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
+
+    Assumptions, approximations and other issues:
+        - GC-depth graph does not split reads, the starting position determines which bin is incremented.
+            There are small overlaps between bins (max readlen-1). However, the bins are big (20k).
+        - coverage distribution ignores softclips and deletions
+        - some stats require sorted BAMs
+        - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
+        - The whole reads are used with -t, no splicing is done, no indels or soft clips are 
+            considered, even small overlap is good enough to include the read in the stats.
+*/
+
+#define BAMCHECK_VERSION "2012-03-22"
+
+#define _ISOC99_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <errno.h>
+#include "faidx.h"
+#include "khash.h"
+#include "sam.h"
+#include "razf.h"
+
+#define BWA_MIN_RDLEN 35
+#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP))
+#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP)
+#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE)
+#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE)
+#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1)
+#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2)
+#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP)
+
+typedef struct 
+{
+    int32_t line_len, line_blen;
+    int64_t len;
+    uint64_t offset;
+} 
+faidx1_t;
+KHASH_MAP_INIT_STR(s, faidx1_t)
+KHASH_MAP_INIT_STR(str, int)
+struct __faidx_t {
+    RAZF *rz;
+    int n, m;
+    char **name;
+    khash_t(s) *hash;
+};
+
+typedef struct
+{
+    float gc;
+    uint32_t depth;
+}
+gc_depth_t;
+
+// For coverage distribution, a simple pileup
+typedef struct 
+{
+    int64_t pos;
+    int size, start;
+    int *buffer;
+} 
+round_buffer_t;
+
+typedef struct { uint32_t from, to; } pos_t;
+typedef struct
+{
+    int npos,mpos,cpos;
+    pos_t *pos;
+}
+regions_t;
+
+typedef struct
+{
+    // Parameters
+    int trim_qual;      // bwa trim quality
+    int rmdup;          // Exclude reads marked as duplicates from the stats
+
+    // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
+    //  insert size histogram holder
+    int nquals;         // The number of quality bins 
+    int nbases;         // The maximum sequence length the allocated array can hold
+    int nisize;         // The maximum insert size that the allocated array can hold
+    int ngc;            // The size of gc_1st and gc_2nd
+    int nindels;        // The maximum indel length for indel distribution
+
+    // Arrays for the histogram data
+    uint64_t *quals_1st, *quals_2nd;
+    uint64_t *gc_1st, *gc_2nd;
+    uint64_t *isize_inward, *isize_outward, *isize_other;
+    uint64_t *acgt_cycles;
+    uint64_t *read_lengths;
+    uint64_t *insertions, *deletions;
+    uint64_t *ins_cycles, *del_cycles;
+
+    // The extremes encountered
+    int max_len;            // Maximum read length
+    int max_qual;           // Maximum quality
+    float isize_main_bulk;  // There are always some unrealistically big insert sizes, report only the main part
+    int is_sorted;
+
+    // Summary numbers
+    uint64_t total_len;
+    uint64_t total_len_dup;
+    uint64_t nreads_1st;
+    uint64_t nreads_2nd;
+    uint64_t nreads_dup;
+    uint64_t nreads_unmapped;
+    uint64_t nreads_unpaired;
+    uint64_t nreads_paired;
+    uint64_t nreads_mq0;
+    uint64_t nbases_mapped;
+    uint64_t nbases_mapped_cigar;
+    uint64_t nbases_trimmed;  // bwa trimmed bases
+    uint64_t nmismatches;
+
+    // GC-depth related data
+    uint32_t ngcd, igcd;        // The maximum number of GC depth bins and index of the current bin
+    gc_depth_t *gcd;            // The GC-depth bins holder
+    int gcd_bin_size;           // The size of GC-depth bin
+    int32_t tid, gcd_pos;       // Position of the current bin
+    int32_t pos;                // Position of the last read
+
+    // Coverage distribution related data
+    int ncov;                       // The number of coverage bins
+    uint64_t *cov;                  // The coverage frequencies
+    int cov_min,cov_max,cov_step;   // Minimum, maximum coverage and size of the coverage bins
+    round_buffer_t cov_rbuf;        // Pileup round buffer
+
+    // Mismatches by read cycle 
+    uint8_t *rseq_buf;             // A buffer for reference sequence to check the mismatches against
+    int nref_seq;               // The size of the buffer
+    int32_t rseq_pos;           // The coordinate of the first base in the buffer
+    int32_t rseq_len;           // The used part of the buffer
+    uint64_t *mpc_buf;          // Mismatches per cycle
+
+    // Filters
+    int filter_readlen;
+
+    // Target regions
+    int nregions;
+    regions_t *regions;
+
+    // Auxiliary data
+    double sum_qual;            // For calculating average quality value 
+    samfile_t *sam;             // Unused
+    faidx_t *fai;               // Reference sequence for GC-depth graph
+    int argc;                   // Command line arguments to be printed on the output
+    char **argv;
+}
+stats_t;
+
+void error(const char *format, ...);
+
+// Coverage distribution methods
+inline int coverage_idx(int min, int max, int n, int step, int depth)
+{
+    if ( depth < min )
+        return 0;
+
+    if ( depth > max )
+        return n-1;
+
+    return 1 + (depth - min) / step;
+}
+
+inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos)
+{
+    return (offset + (pos-refpos) % size) % size;
+}
+
+void round_buffer_flush(stats_t *stats, int64_t pos)
+{
+    int ibuf,idp;
+
+    if ( pos==stats->cov_rbuf.pos ) 
+        return;
+
+    int64_t new_pos = pos;
+    if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size )
+    {
+        // Flush the whole buffer, but in sequential order, 
+        pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1;
+    }
+
+    if ( pos < stats->cov_rbuf.pos ) 
+        error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos);
+
+    int ifrom = stats->cov_rbuf.start;
+    int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1);
+    if ( ifrom>ito )
+    {
+        for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++)
+        {
+            if ( !stats->cov_rbuf.buffer[ibuf] )
+                continue;
+            idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+            stats->cov[idp]++;
+            stats->cov_rbuf.buffer[ibuf] = 0;
+        }
+        ifrom = 0;
+    }
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
+    {
+        if ( !stats->cov_rbuf.buffer[ibuf] )
+            continue;
+        idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]);
+        stats->cov[idp]++;
+        stats->cov_rbuf.buffer[ibuf] = 0;
+    }
+    stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos);
+    stats->cov_rbuf.pos   = new_pos;
+}
+
+void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to)
+{
+    if ( to-from >= rbuf->size )
+        error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size);
+    if ( from < rbuf->pos )
+        error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos);
+
+    int ifrom,ito,ibuf;
+    ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from);
+    ito   = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to);
+    if ( ifrom>ito )
+    {
+        for (ibuf=ifrom; ibuf<rbuf->size; ibuf++)
+            rbuf->buffer[ibuf]++;
+        ifrom = 0;
+    }
+    for (ibuf=ifrom; ibuf<=ito; ibuf++)
+        rbuf->buffer[ibuf]++;
+}
+
+// Calculate the number of bases in the read trimmed by BWA
+int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) 
+{
+    if ( len<BWA_MIN_RDLEN ) return 0;
+
+    // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN,
+    //  the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c).
+    int max_trimmed = len - BWA_MIN_RDLEN + 1;
+    int l, sum=0, max_sum=0, max_l=0;
+
+    for (l=0; l<max_trimmed; l++)
+    {
+        sum += trim_qual - quals[ reverse ? l : len-1-l ];
+        if ( sum<0 ) break;
+        if ( sum>max_sum )
+        {
+            max_sum = sum;
+            // This is the correct way, but bwa clips from some reason one base less
+            // max_l   = l+1;
+            max_l   = l;
+        }
+    }
+    return max_l;
+}
+
+
+void count_indels(stats_t *stats,bam1_t *bam_line) 
+{
+    int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+    int icig;
+    int icycle = 0;
+    int read_len = bam_line->core.l_qseq;
+    for (icig=0; icig<bam_line->core.n_cigar; icig++) 
+    {
+        // Conversion from uint32_t to MIDNSHP
+        //  0123456
+        //  MIDNSHP
+        int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+        int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+
+        if ( cig==1 )
+        {
+            int idx = is_fwd ? icycle : read_len-icycle-1;
+            if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
+            stats->ins_cycles[idx]++;
+            icycle += ncig;
+            if ( ncig<=stats->nindels )
+                stats->insertions[ncig-1]++;
+            continue;
+        }
+        if ( cig==2 )
+        {
+            int idx = is_fwd ? icycle : read_len-icycle-1;
+            if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
+            stats->del_cycles[idx]++;
+            if ( ncig<=stats->nindels )
+                stats->deletions[ncig-1]++;
+            continue;
+        }
+        icycle += ncig;
+    }
+}
+
+void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) 
+{
+    int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+    int icig,iread=0,icycle=0;
+    int iref = bam_line->core.pos - stats->rseq_pos;
+    int read_len   = bam_line->core.l_qseq;
+    uint8_t *read  = bam1_seq(bam_line);
+    uint8_t *quals = bam1_qual(bam_line);
+    uint64_t *mpc_buf = stats->mpc_buf;
+    for (icig=0; icig<bam_line->core.n_cigar; icig++) 
+    {
+        // Conversion from uint32_t to MIDNSHP
+        //  0123456
+        //  MIDNSHP
+        int cig  = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK;
+        int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT;
+        if ( cig==1 )
+        {
+            iread  += ncig;
+            icycle += ncig;
+            continue;
+        }
+        if ( cig==2 )
+        {
+            iref += ncig;
+            continue;
+        }
+        if ( cig==4 )
+        {
+            icycle += ncig;
+            // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence
+            //   iref += ncig;
+            iread  += ncig;
+            continue;
+        }
+        if ( cig==5 )
+        {
+            icycle += ncig;
+            continue;
+        }
+        if ( cig!=0 )
+            error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
+       
+        if ( ncig+iref > stats->rseq_len )
+            error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+
+        int im;
+        for (im=0; im<ncig; im++)
+        {
+            uint8_t cread = bam1_seqi(read,iread);
+            uint8_t cref  = stats->rseq_buf[iref];
+
+            // ---------------15
+            // =ACMGRSVTWYHKDBN
+            if ( cread==15 )
+            {
+                int idx = is_fwd ? icycle : read_len-icycle-1;
+                if ( idx>stats->max_len )
+                    error("mpc: %d>%d\n",idx,stats->max_len);
+                idx = idx*stats->nquals;
+                if ( idx>=stats->nquals*stats->nbases )
+                    error("FIXME: mpc_buf overflow\n");
+                mpc_buf[idx]++;
+            }
+            else if ( cref && cread && cref!=cread )
+            {
+                uint8_t qual = quals[iread] + 1;
+                if ( qual>=stats->nquals )
+                    error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
+
+                int idx = is_fwd ? icycle : read_len-icycle-1;
+                if ( idx>stats->max_len )
+                    error("mpc: %d>%d\n",idx,stats->max_len);
+
+                idx = idx*stats->nquals + qual;
+                if ( idx>=stats->nquals*stats->nbases )
+                    error("FIXME: mpc_buf overflow\n");
+                mpc_buf[idx]++;
+            }
+
+            iref++;
+            iread++;
+            icycle++;
+        }
+    }
+}
+
+void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
+{
+    khash_t(s) *h;
+    khiter_t iter;
+    faidx1_t val;
+    char *chr, c;
+    faidx_t *fai = stats->fai;
+
+    h = fai->hash;
+    chr = stats->sam->header->target_name[tid];
+
+    // ID of the sequence name
+    iter = kh_get(s, h, chr);
+    if (iter == kh_end(h)) 
+        error("No such reference sequence [%s]?\n", chr);
+    val = kh_value(h, iter);
+
+    // Check the boundaries
+    if (pos >= val.len)
+        error("Was the bam file mapped with the reference sequence supplied?"
+              " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
+    int size = stats->nref_seq;
+    // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
+    if (size+pos > val.len) size = val.len-pos;
+
+    // Position the razf reader
+    razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET);
+
+    uint8_t *ptr = stats->rseq_buf;
+    int nread = 0;
+    while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err )
+    {
+        if ( !isgraph(c) )
+            continue;
+
+        // Conversion between uint8_t coding and ACGT
+        //      -12-4---8-------
+        //      =ACMGRSVTWYHKDBN
+        if ( c=='A' || c=='a' )
+            *ptr = 1;
+        else if ( c=='C' || c=='c' )
+            *ptr = 2;
+        else if ( c=='G' || c=='g' )
+            *ptr = 4;
+        else if ( c=='T' || c=='t' )
+            *ptr = 8;
+        else
+            *ptr = 0;
+        ptr++;
+        nread++;
+    }
+    if ( nread < stats->nref_seq )
+    {
+        memset(ptr,0, stats->nref_seq - nread);
+        nread = stats->nref_seq;
+    }
+    stats->rseq_len = nread;
+    stats->rseq_pos = pos;
+    stats->tid      = tid;
+}
+
+float fai_gc_content(stats_t *stats)
+{
+    uint32_t gc,count,c;
+    int i,size = stats->rseq_len;
+
+    // Count GC content
+    gc = count = 0;
+    for (i=0; i<size; i++)
+    {
+        c = stats->rseq_buf[i];
+        if ( c==2 || c==4 )
+        {
+            gc++;
+            count++;
+        }
+        else if ( c==1 || c==8 )
+            count++;
+    }
+    return count ? (float)gc/count : 0;
+}
+
+
+void realloc_buffers(stats_t *stats, int seq_len)
+{
+    int n = 2*(1 + seq_len - stats->nbases) + stats->nbases;
+
+    stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t));
+    if ( !stats->quals_1st )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
+    memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
+
+    stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t));
+    if ( !stats->quals_2nd )
+        error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
+    memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
+
+    if ( stats->mpc_buf )
+    {
+        stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t));
+        if ( !stats->mpc_buf )
+            error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t));
+        memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t));
+    }
+
+    stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t));
+    if ( !stats->acgt_cycles )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t));
+    memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t));
+
+    stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t));
+    if ( !stats->read_lengths )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
+    memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+    stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t));
+    if ( !stats->insertions )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
+    memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+    stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t));
+    if ( !stats->deletions )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
+    memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+    stats->ins_cycles = realloc(stats->ins_cycles, n*sizeof(uint64_t));
+    if ( !stats->ins_cycles )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
+    memset(stats->ins_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+    stats->del_cycles = realloc(stats->del_cycles, n*sizeof(uint64_t));
+    if ( !stats->del_cycles )
+        error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
+    memset(stats->del_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+    stats->nbases = n;
+
+    // Realloc the coverage distribution buffer
+    int *rbuffer = calloc(sizeof(int),seq_len*5);
+    n = stats->cov_rbuf.size-stats->cov_rbuf.start;
+    memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n);
+    if ( stats->cov_rbuf.start>1 )
+        memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start);
+    stats->cov_rbuf.start = 0;
+    free(stats->cov_rbuf.buffer);
+    stats->cov_rbuf.buffer = rbuffer;
+    stats->cov_rbuf.size = seq_len*5;
+}
+
+void collect_stats(bam1_t *bam_line, stats_t *stats)
+{
+    if ( stats->rmdup && IS_DUP(bam_line) )
+        return;
+
+    int seq_len = bam_line->core.l_qseq;
+    if ( !seq_len ) return;
+    if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
+    if ( seq_len >= stats->nbases )
+        realloc_buffers(stats,seq_len);
+    if ( stats->max_len<seq_len )
+        stats->max_len = seq_len;
+
+    stats->read_lengths[seq_len]++;
+
+    // Count GC and ACGT per cycle
+    uint8_t base, *seq  = bam1_seq(bam_line);
+    int gc_count  = 0;
+    int i;
+    int reverse = IS_REVERSE(bam_line);
+    for (i=0; i<seq_len; i++)
+    {
+        // Conversion from uint8_t coding to ACGT
+        //      -12-4---8-------
+        //      =ACMGRSVTWYHKDBN
+        //       01 2   3
+        base = bam1_seqi(seq,i);
+        base /= 2;
+        if ( base==1 || base==2 ) gc_count++;
+        else if ( base>2 ) base=3;
+        if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) 
+            error("FIXME: acgt_cycles\n");
+        stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++;
+    }
+    int gc_idx_min = gc_count*(stats->ngc-1)/seq_len;
+    int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len;
+    if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1;
+
+    // Determine which array (1st or 2nd read) will these stats go to,
+    //  trim low quality bases from end the same way BWA does, 
+    //  fill GC histogram
+    uint64_t *quals;
+    uint8_t *bam_quals = bam1_qual(bam_line);
+    if ( bam_line->core.flag&BAM_FREAD2 )
+    {
+        quals  = stats->quals_2nd;
+        stats->nreads_2nd++;
+        for (i=gc_idx_min; i<gc_idx_max; i++)
+            stats->gc_2nd[i]++;
+    }
+    else
+    {
+        quals = stats->quals_1st;
+        stats->nreads_1st++;
+        for (i=gc_idx_min; i<gc_idx_max; i++)
+            stats->gc_1st[i]++;
+    }
+    if ( stats->trim_qual>0 ) 
+        stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse);
+
+    // Quality histogram and average quality
+    for (i=0; i<seq_len; i++)
+    {
+        uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
+        if ( qual>=stats->nquals )
+            error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
+        if ( qual>stats->max_qual )
+            stats->max_qual = qual;
+
+        quals[ i*stats->nquals+qual ]++;
+        stats->sum_qual += qual;
+    }
+
+    // Look at the flags and increment appropriate counters (mapped, paired, etc)
+    if ( IS_UNMAPPED(bam_line) )
+        stats->nreads_unmapped++;
+    else
+    {
+        if ( !bam_line->core.qual )
+            stats->nreads_mq0++;
+
+        count_indels(stats,bam_line);
+
+        // The insert size is tricky, because for long inserts the libraries are
+        // prepared differently and the pairs point in other direction. BWA does
+        // not set the paired flag for them.  Similar thing is true also for 454
+        // reads. Therefore, do the insert size stats for all mapped reads.
+        int32_t isize = bam_line->core.isize;
+        if ( isize<0 ) isize = -isize;
+        if ( IS_PAIRED(bam_line) && isize!=0 )
+        {
+            stats->nreads_paired++;
+            if ( isize >= stats->nisize )
+                isize=stats->nisize-1;
+
+            int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+            int is_fst  = IS_READ1(bam_line) ? 1 : -1;
+            int is_fwd  = IS_REVERSE(bam_line) ? -1 : 1;
+            int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+            if ( is_fwd*is_mfwd>0 )
+                stats->isize_other[isize]++;
+            else if ( is_fst*pos_fst>0 )
+            {
+                if ( is_fst*is_fwd>0 )
+                    stats->isize_inward[isize]++;
+                else
+                    stats->isize_outward[isize]++;
+            }
+            else if ( is_fst*pos_fst<0 )
+            {
+                if ( is_fst*is_fwd>0 )
+                    stats->isize_outward[isize]++;
+                else
+                    stats->isize_inward[isize]++;
+            }
+        }
+        else
+            stats->nreads_unpaired++;
+
+        // Number of mismatches 
+        uint8_t *nm = bam_aux_get(bam_line,"NM");
+        if (nm) 
+            stats->nmismatches += bam_aux2i(nm);
+
+        // Number of mapped bases from cigar 
+        if ( bam_line->core.n_cigar == 0) 
+            error("FIXME: mapped read with no cigar?\n");
+        int readlen = seq_len;
+        for (i=0; i<bam_line->core.n_cigar; i++) 
+        {
+            // Conversion from uint32_t to MIDNSHP
+            //  01--4--
+            //  MIDNSHP
+            if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
+                stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+
+            if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
+                readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+        }
+        stats->nbases_mapped += seq_len;
+
+        if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos )
+            stats->is_sorted = 0;
+        stats->pos = bam_line->core.pos;
+
+        if ( stats->is_sorted )
+        {
+            if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
+                round_buffer_flush(stats,-1);
+
+            // Mismatches per cycle and GC-depth graph
+            if ( stats->fai )
+            {
+                // First pass, new chromosome or sequence spanning beyond the end of the buffer 
+                if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
+                {
+                    read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+
+                    // Get the reference GC content for this bin. Note that in the current implementation
+                    //  the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the 
+                    //  expected read length only ~100bp, it shouldn't really matter.
+                    stats->gcd_pos = bam_line->core.pos;
+                    stats->igcd++;
+
+                    if ( stats->igcd >= stats->ngcd )
+                        error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
+
+                    stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
+                }
+                count_mismatches_per_cycle(stats,bam_line);
+            }
+            else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
+            {
+                // First pass or a new chromosome
+                stats->tid     = bam_line->core.tid;
+                stats->gcd_pos = bam_line->core.pos;
+                stats->igcd++;
+
+                if ( stats->igcd >= stats->ngcd )
+                {
+                    uint32_t n = 2*(1 + stats->ngcd);
+                    stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
+                    if ( !stats->gcd )
+                        error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
+                    memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
+                }
+            }
+
+            stats->gcd[ stats->igcd ].depth++;
+            // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
+            if ( !stats->fai )
+                stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len;
+
+            // Coverage distribution graph
+            round_buffer_flush(stats,bam_line->core.pos);
+            round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1);
+        }
+    }
+
+    stats->total_len += seq_len;
+    if ( IS_DUP(bam_line) )
+    {
+        stats->total_len_dup += seq_len;
+        stats->nreads_dup++;
+    }
+}
+
+// Sort by GC and depth
+#define GCD_t(x) ((gc_depth_t *)x)
+static int gcd_cmp(const void *a, const void *b)
+{
+    if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1;
+    if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1;
+    if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1;
+    if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1;
+    return 0;
+}
+#undef GCD_t
+
+float gcd_percentile(gc_depth_t *gcd, int N, int p)
+{
+    float n,d;
+    int k;
+
+    n = p*(N+1)/100;
+    k = n;
+    if ( k<=0 ) 
+        return gcd[0].depth;
+    if ( k>=N ) 
+        return gcd[N-1].depth;
+
+    d = n - k;
+    return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth);
+}
+
+void output_stats(stats_t *stats)
+{
+    // Calculate average insert size and standard deviation (from the main bulk data only)
+    int isize, ibulk=0;
+    uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
+    for (isize=1; isize<stats->nisize; isize++)
+    {
+        // Each pair was counted twice
+        stats->isize_inward[isize] *= 0.5;
+        stats->isize_outward[isize] *= 0.5;
+        stats->isize_other[isize] *= 0.5;
+
+        nisize_inward += stats->isize_inward[isize];
+        nisize_outward += stats->isize_outward[isize];
+        nisize_other += stats->isize_other[isize];
+        nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
+    }
+
+    double bulk=0, avg_isize=0, sd_isize=0;
+    for (isize=1; isize<stats->nisize; isize++)
+    {
+        bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
+        avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
+
+        if ( bulk/nisize > stats->isize_main_bulk )
+        {
+            ibulk  = isize+1;
+            nisize = bulk;
+            break;
+        }
+    }
+    avg_isize /= nisize ? nisize : 1;
+    for (isize=1; isize<ibulk; isize++)
+        sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize;
+    sd_isize = sqrt(sd_isize);
+
+
+    printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION);
+    printf("# The command line was:  %s",stats->argv[0]);
+    int i;
+    for (i=1; i<stats->argc; i++)
+        printf(" %s",stats->argv[i]);
+    printf("\n");
+    printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+    printf("SN\tsequences:\t%ld\n", stats->nreads_1st+stats->nreads_2nd);
+    printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
+    printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
+    printf("SN\t1st fragments:\t%ld\n", stats->nreads_1st);
+    printf("SN\tlast fragments:\t%ld\n", stats->nreads_2nd);
+    printf("SN\treads mapped:\t%ld\n", stats->nreads_paired+stats->nreads_unpaired);
+    printf("SN\treads unmapped:\t%ld\n", stats->nreads_unmapped);
+    printf("SN\treads unpaired:\t%ld\n", stats->nreads_unpaired);
+    printf("SN\treads paired:\t%ld\n", stats->nreads_paired);
+    printf("SN\treads duplicated:\t%ld\n", stats->nreads_dup);
+    printf("SN\treads MQ0:\t%ld\n", stats->nreads_mq0);
+    printf("SN\ttotal length:\t%ld\n", stats->total_len);
+    printf("SN\tbases mapped:\t%ld\n", stats->nbases_mapped);
+    printf("SN\tbases mapped (cigar):\t%ld\n", stats->nbases_mapped_cigar);
+    printf("SN\tbases trimmed:\t%ld\n", stats->nbases_trimmed);
+    printf("SN\tbases duplicated:\t%ld\n", stats->total_len_dup);
+    printf("SN\tmismatches:\t%ld\n", stats->nmismatches);
+    printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar);
+    float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0;
+    printf("SN\taverage length:\t%.0f\n", avg_read_length);
+    printf("SN\tmaximum length:\t%d\n", stats->max_len);
+    printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0);
+    printf("SN\tinsert size average:\t%.1f\n", avg_isize);
+    printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize);
+    printf("SN\tinward oriented pairs:\t%ld\n", nisize_inward);
+    printf("SN\toutward oriented pairs:\t%ld\n", nisize_outward);
+    printf("SN\tpairs with other orientation:\t%ld\n", nisize_other);
+
+    int ibase,iqual;
+    if ( stats->max_len<stats->nbases ) stats->max_len++;
+    if ( stats->max_qual+1<stats->nquals ) stats->max_qual++;
+    printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n");
+    printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+    for (ibase=0; ibase<stats->max_len; ibase++)
+    {
+        printf("FFQ\t%d",ibase+1);
+        for (iqual=0; iqual<=stats->max_qual; iqual++)
+        {
+            printf("\t%ld", stats->quals_1st[ibase*stats->nquals+iqual]);
+        }
+        printf("\n");
+    }
+    printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n");
+    printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n");
+    for (ibase=0; ibase<stats->max_len; ibase++)
+    {
+        printf("LFQ\t%d",ibase+1);
+        for (iqual=0; iqual<=stats->max_qual; iqual++)
+        {
+            printf("\t%ld", stats->quals_2nd[ibase*stats->nquals+iqual]);
+        }
+        printf("\n");
+    }
+    if ( stats->mpc_buf )
+    {
+        printf("# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n");
+        printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n");
+        printf("# is the number of N's and the rest is the number of mismatches\n");
+        for (ibase=0; ibase<stats->max_len; ibase++)
+        {
+            printf("MPC\t%d",ibase+1);
+            for (iqual=0; iqual<=stats->max_qual; iqual++)
+            {
+                printf("\t%ld", stats->mpc_buf[ibase*stats->nquals+iqual]);
+            }
+            printf("\n");
+        }
+    }
+    printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n");
+    int ibase_prev = 0;
+    for (ibase=0; ibase<stats->ngc; ibase++)
+    {
+        if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue;
+        printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_1st[ibase_prev]);
+        ibase_prev = ibase;
+    }
+    printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n");
+    ibase_prev = 0;
+    for (ibase=0; ibase<stats->ngc; ibase++)
+    {
+        if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue;
+        printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1),stats->gc_2nd[ibase_prev]);
+        ibase_prev = ibase;
+    }
+    printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n");
+    for (ibase=0; ibase<stats->max_len; ibase++)
+    {
+        uint64_t *ptr = &(stats->acgt_cycles[ibase*4]);
+        uint64_t  sum = ptr[0]+ptr[1]+ptr[2]+ptr[3];
+        if ( ! sum ) continue;
+        printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
+    }
+    printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
+    for (isize=1; isize<ibulk; isize++)
+        printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize,(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
+            stats->isize_inward[isize],stats->isize_outward[isize],stats->isize_other[isize]);
+
+    printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n");
+    int ilen;
+    for (ilen=0; ilen<stats->max_len; ilen++)
+    {
+        if ( stats->read_lengths[ilen]>0 )
+            printf("RL\t%d\t%ld\n", ilen,stats->read_lengths[ilen]);
+    }
+
+    printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
+    for (ilen=0; ilen<stats->nindels; ilen++)
+    {
+        if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 )
+            printf("ID\t%d\t%ld\t%ld\n", ilen+1,stats->insertions[ilen],stats->deletions[ilen]);
+    }
+
+    printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n");
+    for (ilen=0; ilen<stats->nbases; ilen++)
+    {
+        if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 )
+            printf("IC\t%d\t%ld\t%ld\n", ilen+1,stats->ins_cycles[ilen],stats->del_cycles[ilen]);
+    }
+
+    printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
+    printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1,stats->cov[0]);
+    int icov;
+    for (icov=1; icov<stats->ncov-1; icov++)
+        printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1,stats->cov[icov]);
+    printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov[stats->ncov-1]);
+
+
+    // Calculate average GC content, then sort by GC and depth
+    printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
+    uint32_t igcd;
+    for (igcd=0; igcd<stats->igcd; igcd++)
+    {
+        if ( stats->fai )
+            stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc);
+        else
+            if ( stats->gcd[igcd].depth ) 
+                stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc / stats->gcd[igcd].depth);
+    }
+    qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp);
+    igcd = 0;
+    while ( igcd < stats->igcd )
+    {
+        // Calculate percentiles (10,25,50,75,90th) for the current GC content and print
+        uint32_t nbins=0, itmp=igcd;
+        float gc = stats->gcd[igcd].gc;
+        while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 )
+        {
+            nbins++;
+            itmp++;
+        }
+        printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1),
+                gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size,
+                gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, 
+                gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, 
+                gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, 
+                gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size 
+              );
+        igcd += nbins;
+    }
+}
+
+void bam_init_header_hash(bam_header_t *header);
+
+void init_regions(stats_t *stats, char *file)
+{
+    khiter_t iter;
+    khash_t(str) *header_hash;
+
+    bam_init_header_hash(stats->sam->header);
+    header_hash = (khash_t(str)*)stats->sam->header->hash;
+
+    FILE *fp = fopen(file,"r");
+    if ( !fp ) error("%s: %s\n",file,strerror(errno));
+
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t nread;
+    int warned = 0;
+    int prev_tid=-1, prev_pos=-1;
+    while ((nread = getline(&line, &len, fp)) != -1) 
+    {
+        if ( line[0] == '#' ) continue;
+
+        int i = 0;
+        while ( i<nread && !isspace(line[i]) ) i++;
+        if ( i>=nread ) error("Could not parse the file: %s\n", file);
+        line[i] = 0;
+
+        iter = kh_get(str, header_hash, line);
+        int tid = kh_val(header_hash, iter);
+        if ( iter == kh_end(header_hash) )
+        {
+            if ( !warned )
+                fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line);
+            warned = 1;
+            continue;
+        }
+
+        if ( tid >= stats->nregions )
+        {
+            stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100));
+            int j;
+            for (j=stats->nregions; j<stats->nregions+100; j++)
+            {
+                stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0;
+                stats->regions[j].pos = NULL;
+            }
+            stats->nregions += 100;
+        }
+        int npos = stats->regions[tid].npos;
+        if ( npos >= stats->regions[tid].mpos )
+        {
+            stats->regions[tid].mpos += 1000;
+            stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
+        }
+
+        if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n");
+        if ( prev_tid==-1 || prev_tid!=tid )
+        {
+            prev_tid = tid;
+            prev_pos = stats->regions[tid].pos[npos].from;
+        }
+        if ( prev_pos>stats->regions[tid].pos[npos].from )
+            error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos);
+        stats->regions[tid].npos++;
+    }
+    if (line) free(line);
+    fclose(fp);
+}
+
+void destroy_regions(stats_t *stats)
+{
+    int i;
+    for (i=0; i<stats->nregions; i++)
+    {
+        if ( !stats->regions[i].mpos ) continue;
+        free(stats->regions[i].pos);
+    }
+    if ( stats->regions ) free(stats->regions);
+}
+
+static int fetch_read(const bam1_t *bam_line, void *data)
+{
+    collect_stats((bam1_t*)bam_line,(stats_t*)data);
+    return 1;
+}
+
+
+void error(const char *format, ...)
+{
+    if ( !format )
+    {
+        printf("Version: %s\n", BAMCHECK_VERSION);
+        printf("About: The program collects statistics from BAM files. The output can be visualized using plot-bamcheck.\n");
+        printf("Usage: bamcheck [OPTIONS] file.bam\n");
+        printf("       bamcheck [OPTIONS] file.bam chr:from-to\n");
+        printf("Options:\n");
+        printf("    -c, --coverage <int>,<int>,<int>    Coverage distribution min,max,step [1,1000,1]\n");
+        printf("    -d, --remove-dups                   Exlude from statistics reads marked as duplicates\n");
+        printf("    -h, --help                          This help message\n");
+        printf("    -i, --insert-size <int>             Maximum insert size [8000]\n");
+        printf("    -l, --read-length <int>             Include in the statistics only reads with the given read length []\n");
+        printf("    -m, --most-inserts <float>          Report only the main part of inserts [0.99]\n");
+        printf("    -q, --trim-quality <int>            The BWA trimming parameter [0]\n");
+        printf("    -r, --ref-seq <file>                Reference sequence (required for GC-depth calculation).\n");
+        printf("    -t, --target-regions <file>         Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
+        printf("    -s, --sam                           Input is SAM\n");
+        printf("\n");
+    }
+    else
+    {
+        va_list ap;
+        va_start(ap, format);
+        vfprintf(stderr, format, ap);
+        va_end(ap);
+    }
+    exit(-1);
+}
+
+int main(int argc, char *argv[])
+{
+    char *targets = NULL;
+    char *bam_fname = NULL;
+    samfile_t *sam = NULL;
+    char in_mode[5];
+
+    stats_t *stats = calloc(1,sizeof(stats_t));
+    stats->ngc    = 200;
+    stats->nquals = 95;
+    stats->nbases = 300;
+    stats->nisize = 8000;
+    stats->max_len   = 30;
+    stats->max_qual  = 40;
+    stats->isize_main_bulk = 0.99;   // There are always outliers at the far end
+    stats->gcd_bin_size = 20000;
+    stats->ngcd         = 3e5;     // 300k of 20k bins is enough to hold a genome 6Gbp big
+    stats->nref_seq     = stats->gcd_bin_size;
+    stats->rseq_pos     = -1;
+    stats->tid = stats->gcd_pos = -1;
+    stats->is_sorted = 1;
+    stats->cov_min  = 1;
+    stats->cov_max  = 1000;
+    stats->cov_step = 1;
+    stats->argc = argc;
+    stats->argv = argv;
+    stats->filter_readlen = -1;
+    stats->nindels = stats->nbases;
+
+    strcpy(in_mode, "rb");
+
+    static struct option loptions[] = 
+    {
+        {"help",0,0,'h'},
+        {"remove-dups",0,0,'d'},
+        {"sam",0,0,'s'},
+        {"ref-seq",0,0,'r'},
+        {"coverage",0,0,'c'},
+        {"read-length",0,0,'l'},
+        {"insert-size",0,0,'i'},
+        {"most-inserts",0,0,'m'},
+        {"trim-quality",0,0,'q'},
+        {"target-regions",0,0,'t'},
+        {0,0,0,0}
+    };
+    int opt;
+    while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 )
+    {
+        switch (opt)
+        {
+            case 'd': stats->rmdup=1; break;
+            case 's': strcpy(in_mode, "r"); break;
+            case 'r': stats->fai = fai_load(optarg); 
+                      if (stats->fai==0) 
+                          error("Could not load faidx: %s\n", optarg); 
+                      break;
+            case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) 
+                          error("Unable to parse -c %s\n", optarg); 
+                      break;
+            case 'l': stats->filter_readlen = atoi(optarg); break;
+            case 'i': stats->nisize = atoi(optarg); break;
+            case 'm': stats->isize_main_bulk = atof(optarg); break;
+            case 'q': stats->trim_qual = atoi(optarg); break;
+            case 't': targets = optarg; break;
+            case '?': 
+            case 'h': error(NULL);
+            default: error("Unknown argument: %s\n", optarg);
+        }
+    }
+    if ( optind<argc )
+        bam_fname = argv[optind++];
+
+    if ( !bam_fname )
+    {
+        if ( isatty(fileno((FILE *)stdin)) )
+            error(NULL);
+        bam_fname = "-";
+    }
+
+    // Init structures
+    //  .. coverage bins and round buffer
+    if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 )
+    {
+        stats->cov_step = stats->cov_max - stats->cov_min;
+        if ( stats->cov_step <= 0 )
+            stats->cov_step = 1;
+    }
+    stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step;
+    stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1;
+    stats->cov = calloc(sizeof(uint64_t),stats->ncov);
+    stats->cov_rbuf.size = stats->nbases*5;
+    stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size);
+    // .. bam
+    if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) 
+        error("Failed to open: %s\n", bam_fname);
+    stats->sam = sam;
+    bam1_t *bam_line = bam_init1();
+    // .. arrays
+    stats->quals_1st     = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+    stats->quals_2nd     = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+    stats->gc_1st        = calloc(stats->ngc,sizeof(uint64_t));
+    stats->gc_2nd        = calloc(stats->ngc,sizeof(uint64_t));
+    stats->isize_inward  = calloc(stats->nisize,sizeof(uint64_t));
+    stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
+    stats->isize_other   = calloc(stats->nisize,sizeof(uint64_t));
+    stats->gcd           = calloc(stats->ngcd,sizeof(gc_depth_t));
+    stats->rseq_buf      = calloc(stats->nref_seq,sizeof(uint8_t));
+    stats->mpc_buf       = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+    stats->acgt_cycles   = calloc(4*stats->nbases,sizeof(uint64_t));
+    stats->read_lengths  = calloc(stats->nbases,sizeof(uint64_t));
+    stats->insertions    = calloc(stats->nbases,sizeof(uint64_t));
+    stats->deletions     = calloc(stats->nbases,sizeof(uint64_t));
+    stats->ins_cycles    = calloc(stats->nbases,sizeof(uint64_t));
+    stats->del_cycles    = calloc(stats->nbases,sizeof(uint64_t));
+    if ( targets )
+        init_regions(stats, targets);
+
+    // Collect statistics
+    if ( optind<argc )
+    {
+        // Collect stats in selected regions only
+        bam_index_t *bam_idx = bam_index_load(bam_fname);
+        if (bam_idx == 0)
+            error("Random alignment retrieval only works for indexed BAM files.\n");
+
+        int i;
+        for (i=optind; i<argc; i++) 
+        {
+            int tid, beg, end;
+            bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
+            if ( tid < 0 ) continue;
+            bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
+        }
+        bam_index_destroy(bam_idx);
+    }
+    else
+    {
+        // Stream through the entire BAM ignoring off-target regions if -t is given
+        while (samread(sam,bam_line) >= 0) 
+        {
+            if ( stats->regions )
+            {
+                if ( bam_line->core.tid >= stats->nregions ) continue;
+                if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
+
+                regions_t *reg = &stats->regions[bam_line->core.tid];
+                if ( reg->cpos==reg->npos ) continue;       // done for this chr
+
+                // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, 
+                //  even small overlap is enough to include the read in the stats.
+                int i = reg->cpos;
+                while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
+                if ( i>=reg->npos ) { reg->cpos = reg->npos; continue; }
+                if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) continue;
+                reg->cpos = i;
+            }
+            collect_stats(bam_line,stats);
+        }
+    }
+    round_buffer_flush(stats,-1);
+
+    output_stats(stats);
+
+    bam_destroy1(bam_line);
+    samclose(stats->sam);
+    if (stats->fai) fai_destroy(stats->fai);
+    free(stats->cov_rbuf.buffer); free(stats->cov);
+    free(stats->quals_1st); free(stats->quals_2nd); 
+    free(stats->gc_1st); free(stats->gc_2nd);
+    free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other);
+    free(stats->gcd);
+    free(stats->rseq_buf);
+    free(stats->mpc_buf);
+    free(stats->acgt_cycles);
+    free(stats->read_lengths);
+    free(stats->insertions);
+    free(stats->deletions);
+    free(stats->ins_cycles);
+    free(stats->del_cycles);
+    destroy_regions(stats);
+    free(stats);
+
+       return 0;
+}
+
+
+
diff --git a/misc/plot-bamcheck b/misc/plot-bamcheck

new file mode 100755 (executable)

index 0000000..adaebe6
--- /dev/null
+++ b/misc/plot-bamcheck
@@ -0,0 +1,877 @@
+#!/usr/bin/env perl
+#
+# Author: petr.danecek@sanger
+#
+
+use strict;
+use warnings;
+use Carp;
+
+my $opts = parse_params();
+parse_bamcheck($opts);
+plot_qualities($opts);
+plot_acgt_cycles($opts);
+plot_gc($opts);
+plot_gc_depth($opts);
+plot_isize($opts);
+plot_coverage($opts);
+plot_mismatches_per_cycle($opts);
+plot_indel_dist($opts);
+plot_indel_cycles($opts);
+
+exit;
+
+#--------------------------------
+
+sub error
+{
+    my (@msg) = @_;
+    if ( scalar @msg ) { confess @msg; }
+    die
+        "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n",
+        "       plot-bamcheck -p outdir/ file.bam.bc\n",
+        "Options:\n",
+        "   -k, --keep-files                    Do not remove temporary files.\n",
+        "   -p, --prefix <path>                 The output files prefix, add a slash to create new directory.\n",
+        "   -r, --ref-stats <file.fa.gc>        Optional reference stats file with expected GC content (created with -s).\n",
+        "   -s, --do-ref-stats <file.fa>        Calculate reference sequence GC for later use with -r\n",
+        "   -t, --targets <file.tab>            Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
+        "   -h, -?, --help                      This help message.\n",
+        "\n";
+}
+
+
+sub parse_params
+{
+    $0 =~ s{^.+/}{};
+    my $opts = { args=>join(' ',$0,@ARGV) };
+    while (defined(my $arg=shift(@ARGV)))
+    {
+        if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
+        if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; }
+        if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; }
+        if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; }
+        if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
+        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+        if ( -e $arg ) { $$opts{bamcheck}=$arg; next; }
+        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
+    }
+    if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; }
+    if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") }
+    if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
+    if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; }
+    elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; }
+    return $opts;
+}
+
+
+# Creates GC stats for either the whole reference or only on target regions for exome QC
+sub do_ref_stats
+{
+    my ($opts) = @_;
+
+
+    my %targets = ();
+    if ( exists($$opts{targets}) )
+    {
+        my ($prev_chr,$prev_pos);
+        open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
+        while (my $line=<$fh>)
+        {
+            if ( $line=~/^#/ ) { next; }
+            my ($chr,$from,$to) = split(/\s+/,$line);
+            chomp($to);
+            push @{$targets{$chr}}, $from,$to;
+            if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
+            if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
+            $prev_pos = $from;
+        }
+        close($fh);
+    }
+
+    my $_len = 60;  # for now do only standard fasta's with 60 bases per line
+    my %gc_counts = ();
+    my ($skip_chr,$pos,$ireg,$regions);
+    open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
+    while (my $line=<$fh>)
+    {
+        if ( $line=~/^>/ )
+        {
+            if ( !scalar %targets ) { next; }
+
+            if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
+            if ( !exists($targets{$1}) ) { $skip_chr=$1; next; }
+            undef $skip_chr;
+            $pos = 0;
+            $ireg = 0;
+            $regions = $targets{$1};
+        }
+        if ( defined $skip_chr ) { next; }
+
+        # Only $_len sized lines are considered and no chopping for target regions.
+        chomp($line);
+        my $len = length($line);
+        if ( $len ne $_len ) { next; }
+
+        if ( scalar %targets )
+        {
+            while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; }
+            $pos += $len;
+            if ( $ireg==@$regions ) { next; }
+            if ( $pos < $$regions[$ireg] ) { next; }
+        }
+
+        my $gc_count = 0;
+        for (my $i=0; $i<$len; $i++)
+        {
+            my $base = substr($line,$i,1);
+            if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; }
+        }
+        $gc_counts{$gc_count}++;
+    }
+
+    print "# Generated by $$opts{args}\n";
+    print "# The columns are: GC content bin, normalized frequency\n";
+    my $max;
+    for my $count (values %gc_counts) 
+    { 
+        if ( !defined $max or $count>$max ) { $max=$count; }
+    }
+    for my $gc (sort {$a<=>$b} keys %gc_counts)
+    {
+        if ( $gc==0 ) { next; }
+        printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max;
+    }
+}
+
+sub plot
+{
+    my ($cmdfile) = @_;
+    my $cmd = "gnuplot $cmdfile";
+    system($cmd);
+    if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
+}
+
+
+sub parse_bamcheck
+{
+    my ($opts) = @_;
+    open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!");
+    my $line = <$fh>;
+    if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); }
+    $$opts{dat}{version} = $1;
+    while ($line=<$fh>)
+    {
+        if ( $line=~/^#/ ) { next; }
+        my @items = split(/\t/,$line);
+        chomp($items[-1]);
+        if ( $items[0] eq 'SN' )
+        {
+            $$opts{dat}{$items[1]} = splice(@items,2);
+            next;
+        }
+        push @{$$opts{dat}{$items[0]}}, [splice(@items,1)];
+    }
+    close($fh);
+
+    # Check sanity
+    if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} )
+    {
+        error("Sanity check failed: no sequences found by bamcheck??\n");
+    }
+}
+
+sub older_than
+{
+    my ($opts,$version) = @_;
+    my ($year,$month,$day) = split(/-/,$version);
+    $version = $$opts{dat}{version};
+    if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; }
+    if ( $1<$year ) { return 1; }
+    elsif ( $1>$year ) { return 0; }
+    if ( $2<$month ) { return 1; }
+    elsif ( $2>$month ) { return 0; }
+    if ( $3<$day ) { return 1; }
+    return 0;
+}
+
+sub get_defaults
+{
+    my ($opts,$img_fname,%args) = @_;
+
+    if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }
+
+    # Determine the gnuplot script file name
+    my $gp_file = $img_fname;
+    $gp_file =~ s{\.[^.]+$}{.gp};
+    if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; }
+
+    # Determine the default title:
+    #       5446_6/5446_6.bam.bc.gp -> 5446_6
+    #       test.aaa.png -> test.aaa
+    if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); }
+    my $title = $1;
+
+    my $dir = $gp_file;
+    $dir =~ s{/[^/]+$}{};
+    if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; }
+
+    my $wh = exists($args{wh}) ? $args{wh} : '600,400';
+
+    open(my $fh,'>',$gp_file) or error("$gp_file: $!");
+    return { 
+        title => $title, 
+        gp    => $gp_file, 
+        img   => $img_fname, 
+        fh    => $fh, 
+        terminal => qq[set terminal png size $wh truecolor],
+        grid  => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
+    };
+}
+
+sub percentile
+{
+    my ($p,@vals) = @_;
+    my $N = 0;
+    for my $val (@vals) { $N += $val; }
+    my $n = $p*($N+1)/100.;
+    my $k = int($n);
+    my $d = $n-$k;
+    if ( $k<=0 ) { return 0; }
+    if ( $k>=$N ) { return scalar @vals-1; }
+    my $cnt;
+    for (my $i=0; $i<@vals; $i++)
+    { 
+        $cnt += $vals[$i]; 
+        if ( $cnt>=$k ) { return $i; }
+    }
+    error("FIXME: this should not happen [percentile]\n");
+}
+
+sub plot_qualities
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; }
+
+    my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50;
+    my $is_paired = $$opts{dat}{'is paired:'};
+    
+    # Average quality per cycle, forward and reverse reads in one plot 
+    my $args = get_defaults($opts,"$$opts{prefix}quals.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set ylabel "Average Quality"
+            set xlabel "Cycle"
+            set yrange [0:$yrange]
+            set title "$$args{title}"
+            plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
+        ];
+    my (@fp75,@fp50,@fmean);
+    my (@lp75,@lp50,@lmean);
+    my ($fmax,$fmax_qual,$fmax_cycle);
+    my ($lmax,$lmax_qual,$lmax_cycle);
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        my $sum=0; my $n=0; 
+        for (my $iqual=1; $iqual<@$cycle; $iqual++) 
+        { 
+            $sum += $$cycle[$iqual]*$iqual; 
+            $n += $$cycle[$iqual]; 
+            if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; }
+        }
+        my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
+        my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
+        my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
+        if ( !$n ) { next; }
+        push @fp75, "$$cycle[0]\t$p25\t$p75\n";
+        push @fp50, "$$cycle[0]\t$p50\n";
+        push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
+        printf $fh $fmean[-1];
+    }
+    print $fh "end\n";
+    if ( $is_paired )
+    {
+        for my $cycle (@{$$opts{dat}{LFQ}})
+        {
+            my $sum=0; my $n=0; 
+            for (my $iqual=1; $iqual<@$cycle; $iqual++) 
+            { 
+                $sum += $$cycle[$iqual]*$iqual; 
+                $n += $$cycle[$iqual]; 
+                if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; }
+            }
+            my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
+            my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
+            my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
+            if ( !$n ) { next; }
+            push @lp75, "$$cycle[0]\t$p25\t$p75\n";
+            push @lp50, "$$cycle[0]\t$p50\n";
+            push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
+            printf $fh $lmean[-1];
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+
+    # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots
+    $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500');
+    $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set multiplot
+            set rmargin 0 
+            set lmargin 0 
+            set tmargin 0 
+            set bmargin 0 
+            set origin 0.1,0.1
+            set size 0.4,0.8
+            set yrange [0:$yrange]
+            set ylabel "Quality"
+            set xlabel "Cycle (fwd reads)"
+            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
+        ];
+    print $fh join('',@fp75),"end\n";
+    print $fh join('',@fp50),"end\n";
+    print $fh join('',@fmean),"end\n";
+    if ( $is_paired )
+    {
+        print $fh qq[
+                set origin 0.55,0.1
+                set size 0.4,0.8
+                unset ytics
+                set y2tics mirror
+                unset ylabel
+                set xlabel "Cycle (rev reads)"
+                set label "$$args{title}" at screen 0.5,0.95 center
+                plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
+            ];
+        print $fh join('',@lp75),"end\n";
+        print $fh join('',@lp50),"end\n";
+        print $fh join('',@lmean),"end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+
+    # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
+    $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600');
+    $fh = $$args{fh};
+    my $nquals = @{$$opts{dat}{FFQ}[0]}-1;
+    my $ncycles = @{$$opts{dat}{FFQ}};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set multiplot
+            set rmargin 0
+            set lmargin 0
+            set tmargin 0
+            set bmargin 0
+            set origin 0.15,0.52
+            set size 0.8,0.4
+            set title "$$args{title}"
+            set ylabel "Frequency (fwd reads)"
+            set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
+            unset xlabel
+            set xrange [0:$nquals]
+            set format x ""
+        ];
+    my @plots;
+    for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] }
+    print $fh "plot ", join(",", @plots), "\n";
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; }
+        print $fh "end\n";
+    }
+    if ( $is_paired )
+    {
+        print $fh qq[
+                set origin 0.15,0.1
+                set size 0.8,0.4
+                unset title
+                unset format
+                set xtics
+                set xlabel "Quality"
+                unset label
+                set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
+                set ylabel "Frequency (rev reads)" 
+            ];
+        print $fh "plot ", join(",", @plots), "\n";
+        for my $cycle (@{$$opts{dat}{LFQ}})
+        {
+            for (my $iqual=1; $iqual<$nquals; $iqual++) 
+            { 
+                print $fh "$iqual\t$$cycle[$iqual]\n"; 
+            }
+            print $fh "end\n";
+        }
+    }
+    close($fh);
+    plot($$args{gp});
+
+
+    # Heatmap qualitites
+    $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
+    $fh = $$args{fh};
+    my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
+    my @ytics;
+    for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
+    my $ytics = join(',', @ytics);
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            unset key
+            unset colorbox
+            set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1)
+            set cbrange [0:$max]
+            set yrange  [0:$ncycles]
+            set xrange  [0:$nquals]
+            set view map
+            set multiplot
+            set rmargin 0 
+            set lmargin 0 
+            set tmargin 0 
+            set bmargin 0 
+            set origin 0,0.46
+            set size 0.95,0.6
+            set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
+            set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
+            set ylabel "Cycle (fwd reads)" offset character -1,0
+            unset ytics
+            set ytics ($ytics)
+            unset xtics
+            set title "$$args{title}"
+            splot '-' matrix with image
+        ];
+    for my $cycle (@{$$opts{dat}{FFQ}})
+    {
+        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
+        print $fh "\n";
+    }
+    print $fh "end\nend\n";
+    @ytics = ();
+    for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
+    $ytics = join(',', @ytics);
+    print $fh qq[
+            set origin 0,0.03
+            set size 0.95,0.6
+            set ylabel "Cycle (rev reads)" offset character -1,0
+            set xlabel "Base Quality"
+            unset title
+            unset ytics
+            set ytics ($ytics)
+            set xrange  [0:$nquals]
+            set xtics
+            set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812
+            set cblabel "Number of bases"
+            splot '-' matrix with image
+        ];
+    for my $cycle (@{$$opts{dat}{LFQ}})
+    {
+        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
+        print $fh "\n";
+    }
+    print $fh "end\nend\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_acgt_cycles
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; }
+
+    my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set style line 1 linecolor rgb "green"
+            set style line 2 linecolor rgb "red"
+            set style line 3 linecolor rgb "black"
+            set style line 4 linecolor rgb "blue"
+            set style increment user
+            set ylabel "Base content [%]" 
+            set xlabel "Read Cycle"
+            set yrange [0:100]
+            set title "$$args{title}"
+            plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
+        ];
+    for my $base (1..4)
+    {
+        for my $cycle (@{$$opts{dat}{GCC}})
+        {
+            print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n";
+        }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_gc
+{
+    my ($opts) = @_;
+
+    my $is_paired = $$opts{dat}{'is paired:'};
+    my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
+    my $fh = $$args{fh};
+    my ($gcl_max,$gcf_max,$lmax,$fmax);
+    for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
+    for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
+    my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax;
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set title "$$args{title}"
+            set ylabel "Normalized Frequency"
+            set xlabel "GC Content [%]"
+            set yrange [0:1.1]
+            set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
+            plot ] 
+                 . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
+                 . q['-' smooth csplines with lines lc 1 title 'First fragments' ] 
+                 . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') 
+                 . q[
+        ];
+    if ( exists($$opts{ref_stats}) )
+    {
+        open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
+        while (my $line=<$ref>) { print $fh $line }
+        close($ref);
+        print $fh "end\n";
+    }
+    for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; }
+    print $fh "end\n";
+    if ( $is_paired )
+    {
+        for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; }
+        print $fh "end\n";
+    }
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_gc_depth
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; }
+
+    # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
+    my @tics = ( {gc=>30},{gc=>40},{gc=>50} );
+    for my $gc (@{$$opts{dat}{GCD}})
+    {
+        for my $tic (@tics)
+        {
+            my $diff = abs($$gc[0]-$$tic{gc});
+            if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; }
+        }
+    }
+
+    my @x2tics;
+    for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; }
+    my $x2tics = join(',',@x2tics);
+    
+    my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set ylabel "Mapped depth" 
+            set xlabel "Percentile of mapped sequence ordered by GC content"
+            set x2label "GC Content [%]"
+            set title "$$args{title}"
+            set x2tics ($x2tics)
+            set xtics nomirror
+            set xrange [0.1:99.9]
+            
+            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
+                 '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
+                 '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
+        ];
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n";
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n";
+    for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_isize
+{
+    my ($opts) = @_;
+
+    if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; }
+
+    my ($isize_max,$isize_cnt);
+    for my $isize (@{$$opts{dat}{IS}})
+    {
+        if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; }
+    }
+    
+    my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set rmargin 5
+            set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
+            set ylabel  "Number of pairs"
+            set xlabel  "Insert Size"
+            set title "$$args{title}"
+            plot \\
+                '-' with lines lc rgb 'black' title 'All pairs', \\
+                '-' with lines title 'Inward', \\
+                '-' with lines title 'Outward', \\
+                '-' with lines title 'Other'
+        ];
+    for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n";
+    for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n";
+    for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n";
+    for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_coverage
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; }
+
+    my @vals;
+    for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; }
+    my $p99 = percentile(99.8,@vals);
+
+    my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set ylabel "Number of mapped bases" 
+            set xlabel "Coverage"
+            set style fill solid border -1
+            set title "$$args{title}"
+            set xrange [:$p99]
+            plot '-' with lines notitle
+        ];
+    for my $cov (@{$$opts{dat}{COV}}) 
+    { 
+        if ( $$cov[2]==0 ) { next; }
+        print $fh "$$cov[1]\t$$cov[2]\n"; 
+    } 
+    print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+sub plot_mismatches_per_cycle
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; }
+    if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); }
+
+    my $nquals = @{$$opts{dat}{MPC}[0]} - 2;
+    my $ncycles = @{$$opts{dat}{MPC}};
+    my ($style,$with);
+    if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
+    else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }
+
+    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+            $$args{terminal}
+            set output "$$args{img}"
+            $$args{grid}
+            set style line 1 linecolor rgb "#e40000"
+            set style line 2 linecolor rgb "#ff9f00"
+            set style line 3 linecolor rgb "#eeee00"
+            set style line 4 linecolor rgb "#4ebd68"
+            set style line 5 linecolor rgb "#0061ff"
+            set style increment user
+            set key left top
+            $style
+            set ylabel "Number of mismatches" 
+            set xlabel "Read Cycle"
+            set style fill solid border -1
+            set title "$$args{title}"
+            set xrange [-1:$ncycles]
+            plot '-' $with ti 'Base Quality>30', \\
+                 '-' $with ti '30>=Q>20', \\
+                 '-' $with ti '20>=Q>10', \\
+                 '-' $with ti '10>=Q', \\
+                 '-' $with ti "N's"
+        ];
+    for my $cycle (@{$$opts{dat}{MPC}}) 
+    { 
+        my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } 
+        print $fh "$sum\n"; 
+    }
+    print $fh "end\n";
+    for my $cycle (@{$$opts{dat}{MPC}}) 
+    { 
+        my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } 
+        print $fh "$sum\n"; 
+    }
+    print $fh "end\n";
+    for my $cycle (@{$$opts{dat}{MPC}}) 
+    { 
+        my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } 
+        print $fh "$sum\n"; 
+    }
+    print $fh "end\n";
+    for my $cycle (@{$$opts{dat}{MPC}}) 
+    { 
+        my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } 
+        print $fh "$sum\n"; 
+    }
+    print $fh "end\n";
+    for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; }
+    print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+sub plot_indel_dist
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; }
+
+    my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+        $$args{terminal}
+        set output "$$args{img}"
+        $$args{grid}
+        set style line 1 linetype 1  linecolor rgb "red"
+        set style line 2 linetype 2  linecolor rgb "black"
+        set style line 3 linetype 3  linecolor rgb "green"
+        set style increment user
+        set ylabel "Indel count [log]" 
+        set xlabel "Indel length"
+        set y2label "Insertions/Deletions ratio"
+        set log y
+        set y2tics nomirror
+        set ytics nomirror
+        set title "$$args{title}"
+        plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
+    ];
+    for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
+    for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
+    for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+sub plot_indel_cycles
+{
+    my ($opts) = @_;
+
+    if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; }
+
+    my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
+    my $fh = $$args{fh};
+    print $fh qq[
+        $$args{terminal}
+        set output "$$args{img}"
+        $$args{grid}
+        set style line 1 linetype 1  linecolor rgb "red"
+        set style line 2 linetype 2  linecolor rgb "black"
+        set style line 3 linetype 3  linecolor rgb "green"
+        set style increment user
+        set ylabel "Indel count" 
+        set xlabel "Read Cycle"
+        set title "$$args{title}"
+        plot '-' w l ti 'Insertions', '' w l ti 'Deletions'
+    ];
+    for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
+    for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
+    close($fh);
+    plot($$args{gp});
+}
+
+
+
+
+
+
+
+sub has_values
+{
+    my ($opts,@tags) = @_;
+    for my $tag (@tags)
+    {
+        my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`;
+        chomp($lines[0]);
+        if ( $lines[0]<2 ) { return 0; }
+    }
+    return 1;
+}
+
+sub plot_mismatches_per_cycle_old
+{
+    my ($opts) = @_;
+
+    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
+    my ($nquals)  = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`;
+    my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`;
+    chomp($nquals);
+    chomp($ncycles);
+    $nquals--;
+    $ncycles--;
+    my @gr0_15  = (2..17);
+    my @gr16_30 = (18..32);
+    my @gr31_n  = (33..$nquals);
+    my $gr0_15  = '$'. join('+$',@gr0_15); 
+    my $gr16_30 = '$'. join('+$',@gr16_30); 
+    my $gr31_n  = '$'. join('+$',@gr31_n); 
+
+    open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!");
+    print $fh q[
+        set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
+        set output "] . $$args{img} . q["
+        
+        set key left top
+        set style data histogram
+        set style histogram rowstacked
+
+        set grid back lc rgb "#aaaaaa"
+        set ylabel "Number of mismatches" 
+        set xlabel "Read Cycle"
+        set style fill solid border -1
+        set title "] . $$args{title} . qq["
+        set xrange [-1:$ncycles]
+        
+        plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q'
+    ];
+    close($fh);
+
+    plot($$args{gp});
+}
+
+
author	Petr Danecek <pd3@sanger.ac.uk>
	Tue, 27 Mar 2012 13:49:59 +0000 (21:49 +0800)
committer	pd3 <pd3@sanger.ac.uk>
	Tue, 27 Mar 2012 13:54:33 +0000 (21:54 +0800)
misc/bamcheck.c	[new file with mode: 0644]	patch \| blob
misc/plot-bamcheck	[new file with mode: 0755]	patch \| blob