/*
Author: petr.danecek@sanger
- gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam
+ gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread
Assumptions, approximations and other issues:
- GC-depth graph does not split reads, the starting position determines which bin is incremented.
- coverage distribution ignores softclips and deletions
- some stats require sorted BAMs
- GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths.
- - The whole reads are used with -t, no splicing is done, no indels or soft clips are
- considered, even small overlap is good enough to include the read in the stats.
+ - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq)
+ - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar)
+ counts, no splicing is done, no indels or soft clips are considered, even small overlap is
+ good enough to include the read in the stats.
+
*/
-#define BAMCHECK_VERSION "2012-03-29"
+#define BAMCHECK_VERSION "2012-09-04"
#define _ISOC99_SOURCE
-#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ctype.h>
#include <getopt.h>
#include <errno.h>
+#include <assert.h>
#include "faidx.h"
#include "khash.h"
#include "sam.h"
+#include "sam_header.h"
#include "razf.h"
#define BWA_MIN_RDLEN 35
uint64_t offset;
}
faidx1_t;
-KHASH_MAP_INIT_STR(s, faidx1_t)
-KHASH_MAP_INIT_STR(str, int)
+KHASH_MAP_INIT_STR(kh_faidx, faidx1_t)
+KHASH_MAP_INIT_STR(kh_bam_tid, int)
+KHASH_MAP_INIT_STR(kh_rg, const char *)
struct __faidx_t {
RAZF *rz;
int n, m;
char **name;
- khash_t(s) *hash;
+ khash_t(kh_faidx) *hash;
};
typedef struct
{
// Parameters
int trim_qual; // bwa trim quality
- int rmdup; // Exclude reads marked as duplicates from the stats
// Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd),
// insert size histogram holder
uint64_t *acgt_cycles;
uint64_t *read_lengths;
uint64_t *insertions, *deletions;
- uint64_t *ins_cycles, *del_cycles;
+ uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
// The extremes encountered
int max_len; // Maximum read length
uint64_t total_len_dup;
uint64_t nreads_1st;
uint64_t nreads_2nd;
+ uint64_t nreads_filtered;
uint64_t nreads_dup;
uint64_t nreads_unmapped;
uint64_t nreads_unpaired;
uint64_t nreads_paired;
+ uint64_t nreads_anomalous;
uint64_t nreads_mq0;
uint64_t nbases_mapped;
uint64_t nbases_mapped_cigar;
uint64_t nbases_trimmed; // bwa trimmed bases
uint64_t nmismatches;
+ uint64_t nreads_QCfailed, nreads_secondary;
// GC-depth related data
uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin
gc_depth_t *gcd; // The GC-depth bins holder
int gcd_bin_size; // The size of GC-depth bin
+ uint32_t gcd_ref_size; // The approximate size of the genome
int32_t tid, gcd_pos; // Position of the current bin
int32_t pos; // Position of the last read
round_buffer_t cov_rbuf; // Pileup round buffer
// Mismatches by read cycle
- uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
- int nref_seq; // The size of the buffer
- int32_t rseq_pos; // The coordinate of the first base in the buffer
- int32_t rseq_len; // The used part of the buffer
- uint64_t *mpc_buf; // Mismatches per cycle
+ uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against
+ int mrseq_buf; // The size of the buffer
+ int32_t rseq_pos; // The coordinate of the first base in the buffer
+ int32_t nrseq_buf; // The used part of the buffer
+ uint64_t *mpc_buf; // Mismatches per cycle
// Filters
int filter_readlen;
// Target regions
- int nregions;
+ int nregions, reg_from,reg_to;
regions_t *regions;
// Auxiliary data
- double sum_qual; // For calculating average quality value
- samfile_t *sam; // Unused
- faidx_t *fai; // Reference sequence for GC-depth graph
- int argc; // Command line arguments to be printed on the output
+ int flag_require, flag_filter;
+ double sum_qual; // For calculating average quality value
+ samfile_t *sam;
+ khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated
+ faidx_t *fai; // Reference sequence for GC-depth graph
+ int argc; // Command line arguments to be printed on the output
char **argv;
}
stats_t;
void error(const char *format, ...);
+void bam_init_header_hash(bam_header_t *header);
+int is_in_regions(bam1_t *bam_line, stats_t *stats);
+
// Coverage distribution methods
inline int coverage_idx(int min, int max, int n, int step, int depth)
void count_indels(stats_t *stats,bam1_t *bam_line)
{
int is_fwd = IS_REVERSE(bam_line) ? 0 : 1;
+ int is_1st = IS_READ1(bam_line) ? 1 : 0;
int icig;
int icycle = 0;
int read_len = bam_line->core.l_qseq;
if ( cig==1 )
{
- int idx = is_fwd ? icycle : read_len-icycle-1;
- if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
- stats->ins_cycles[idx]++;
+ int idx = is_fwd ? icycle : read_len-icycle-ncig;
+ if ( idx<0 )
+ error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
+ if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
+ if ( is_1st )
+ stats->ins_cycles_1st[idx]++;
+ else
+ stats->ins_cycles_2nd[idx]++;
icycle += ncig;
if ( ncig<=stats->nindels )
stats->insertions[ncig-1]++;
}
if ( cig==2 )
{
- int idx = is_fwd ? icycle : read_len-icycle-1;
+ int idx = is_fwd ? icycle-1 : read_len-icycle-1;
+ if ( idx<0 ) continue; // discard meaningless deletions
if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases);
- stats->del_cycles[idx]++;
+ if ( is_1st )
+ stats->del_cycles_1st[idx]++;
+ else
+ stats->del_cycles_2nd[idx]++;
if ( ncig<=stats->nindels )
stats->deletions[ncig-1]++;
continue;
}
- icycle += ncig;
+ if ( cig!=3 && cig!=5 )
+ icycle += ncig;
}
}
icycle += ncig;
continue;
}
+ // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
+ // chunk of refseq in memory. Not very frequent and not noticable in the stats.
+ if ( cig==3 || cig==5 ) continue;
if ( cig!=0 )
- error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
+ error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
- if ( ncig+iref > stats->rseq_len )
- error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+ if ( ncig+iref > stats->nrseq_buf )
+ error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
int im;
for (im=0; im<ncig; im++)
{
uint8_t qual = quals[iread] + 1;
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
int idx = is_fwd ? icycle : read_len-icycle-1;
if ( idx>stats->max_len )
void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
{
- khash_t(s) *h;
+ khash_t(kh_faidx) *h;
khiter_t iter;
faidx1_t val;
char *chr, c;
chr = stats->sam->header->target_name[tid];
// ID of the sequence name
- iter = kh_get(s, h, chr);
+ iter = kh_get(kh_faidx, h, chr);
if (iter == kh_end(h))
error("No such reference sequence [%s]?\n", chr);
val = kh_value(h, iter);
if (pos >= val.len)
error("Was the bam file mapped with the reference sequence supplied?"
" A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
- int size = stats->nref_seq;
+ int size = stats->mrseq_buf;
// The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
if (size+pos > val.len) size = val.len-pos;
ptr++;
nread++;
}
- if ( nread < stats->nref_seq )
+ if ( nread < stats->mrseq_buf )
{
- memset(ptr,0, stats->nref_seq - nread);
- nread = stats->nref_seq;
+ memset(ptr,0, stats->mrseq_buf - nread);
+ nread = stats->mrseq_buf;
}
- stats->rseq_len = nread;
- stats->rseq_pos = pos;
- stats->tid = tid;
+ stats->nrseq_buf = nread;
+ stats->rseq_pos = pos;
+ stats->tid = tid;
}
-float fai_gc_content(stats_t *stats)
+float fai_gc_content(stats_t *stats, int pos, int len)
{
uint32_t gc,count,c;
- int i,size = stats->rseq_len;
+ int i = pos - stats->rseq_pos, ito = i + len;
+ assert( i>=0 && ito<=stats->nrseq_buf );
// Count GC content
gc = count = 0;
- for (i=0; i<size; i++)
+ for (; i<ito; i++)
{
c = stats->rseq_buf[i];
if ( c==2 || c==4 )
return count ? (float)gc/count : 0;
}
+void realloc_rseq_buffer(stats_t *stats)
+{
+ int n = stats->nbases*10;
+ if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
+ if ( stats->mrseq_buf<n )
+ {
+ stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
+ stats->mrseq_buf = n;
+ }
+}
+
+void realloc_gcd_buffer(stats_t *stats, int seq_len)
+{
+ if ( seq_len >= stats->gcd_bin_size )
+ error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
+
+ int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
+ if ( n <= stats->igcd )
+ error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
+
+ if ( n > stats->ngcd )
+ {
+ stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
+ if ( !stats->gcd )
+ error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
+ memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
+ stats->ngcd = n;
+ }
+
+ realloc_rseq_buffer(stats);
+}
void realloc_buffers(stats_t *stats, int seq_len)
{
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
- stats->ins_cycles = realloc(stats->ins_cycles, n*sizeof(uint64_t));
- if ( !stats->ins_cycles )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
- memset(stats->ins_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+ stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t));
+ if ( !stats->ins_cycles_1st )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
- stats->del_cycles = realloc(stats->del_cycles, n*sizeof(uint64_t));
- if ( !stats->del_cycles )
- error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t));
- memset(stats->del_cycles + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t));
+ stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t));
+ if ( !stats->ins_cycles_2nd )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+ stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t));
+ if ( !stats->del_cycles_1st )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
+
+ stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t));
+ if ( !stats->del_cycles_2nd )
+ error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t));
+ memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t));
stats->nbases = n;
free(stats->cov_rbuf.buffer);
stats->cov_rbuf.buffer = rbuffer;
stats->cov_rbuf.size = seq_len*5;
+
+ realloc_rseq_buffer(stats);
}
void collect_stats(bam1_t *bam_line, stats_t *stats)
{
- if ( stats->rmdup && IS_DUP(bam_line) )
+ if ( stats->rg_hash )
+ {
+ const uint8_t *rg = bam_aux_get(bam_line, "RG");
+ if ( !rg ) return;
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end(stats->rg_hash) ) return;
+ }
+ if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
+ {
+ stats->nreads_filtered++;
+ return;
+ }
+ if ( !is_in_regions(bam_line,stats) )
return;
+ if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen )
+ return;
+
+ if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
+ if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
int seq_len = bam_line->core.l_qseq;
if ( !seq_len ) return;
- if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
+
if ( seq_len >= stats->nbases )
realloc_buffers(stats,seq_len);
if ( stats->max_len<seq_len )
{
uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
if ( qual>=stats->nquals )
- error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
+ error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
if ( qual>stats->max_qual )
stats->max_qual = qual;
count_indels(stats,bam_line);
- // The insert size is tricky, because for long inserts the libraries are
- // prepared differently and the pairs point in other direction. BWA does
- // not set the paired flag for them. Similar thing is true also for 454
- // reads. Therefore, do the insert size stats for all mapped reads.
- int32_t isize = bam_line->core.isize;
- if ( isize<0 ) isize = -isize;
- if ( IS_PAIRED(bam_line) && isize!=0 )
+ if ( !IS_PAIRED(bam_line) )
+ stats->nreads_unpaired++;
+ else
{
stats->nreads_paired++;
- if ( isize >= stats->nisize )
- isize=stats->nisize-1;
- int pos_fst = bam_line->core.mpos - bam_line->core.pos;
- int is_fst = IS_READ1(bam_line) ? 1 : -1;
- int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
- int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+ if ( bam_line->core.tid!=bam_line->core.mtid )
+ stats->nreads_anomalous++;
- if ( is_fwd*is_mfwd>0 )
- stats->isize_other[isize]++;
- else if ( is_fst*pos_fst>0 )
- {
- if ( is_fst*is_fwd>0 )
- stats->isize_inward[isize]++;
- else
- stats->isize_outward[isize]++;
- }
- else if ( is_fst*pos_fst<0 )
+ // The insert size is tricky, because for long inserts the libraries are
+ // prepared differently and the pairs point in other direction. BWA does
+ // not set the paired flag for them. Similar thing is true also for 454
+ // reads. Mates mapped to different chromosomes have isize==0.
+ int32_t isize = bam_line->core.isize;
+ if ( isize<0 ) isize = -isize;
+ if ( isize >= stats->nisize )
+ isize = stats->nisize-1;
+ if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
- if ( is_fst*is_fwd>0 )
- stats->isize_outward[isize]++;
- else
- stats->isize_inward[isize]++;
+ int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+ int is_fst = IS_READ1(bam_line) ? 1 : -1;
+ int is_fwd = IS_REVERSE(bam_line) ? -1 : 1;
+ int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+ if ( is_fwd*is_mfwd>0 )
+ stats->isize_other[isize]++;
+ else if ( is_fst*pos_fst>0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_inward[isize]++;
+ else
+ stats->isize_outward[isize]++;
+ }
+ else if ( is_fst*pos_fst<0 )
+ {
+ if ( is_fst*is_fwd>0 )
+ stats->isize_outward[isize]++;
+ else
+ stats->isize_inward[isize]++;
+ }
}
}
- else
- stats->nreads_unpaired++;
// Number of mismatches
uint8_t *nm = bam_aux_get(bam_line,"NM");
stats->nmismatches += bam_aux2i(nm);
// Number of mapped bases from cigar
+ // Conversion from uint32_t to MIDNSHP
+ // 012-4--
+ // MIDNSHP
if ( bam_line->core.n_cigar == 0)
error("FIXME: mapped read with no cigar?\n");
- int readlen = seq_len;
- for (i=0; i<bam_line->core.n_cigar; i++)
+ int readlen=seq_len;
+ if ( stats->regions )
{
- // Conversion from uint32_t to MIDNSHP
- // 01--4--
- // MIDNSHP
- if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
- stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
-
- if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
- readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ // Count only on-target bases
+ int iref = bam_line->core.pos + 1;
+ for (i=0; i<bam_line->core.n_cigar; i++)
+ {
+ int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK;
+ int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ if ( cig==2 ) readlen += ncig;
+ else if ( cig==0 )
+ {
+ if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref;
+ else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to;
+ if ( ncig<0 ) ncig = 0;
+ stats->nbases_mapped_cigar += ncig;
+ iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ }
+ else if ( cig==1 )
+ {
+ iref += ncig;
+ if ( iref>=stats->reg_from && iref<=stats->reg_to )
+ stats->nbases_mapped_cigar += ncig;
+ }
+ }
+ }
+ else
+ {
+ // Count the whole read
+ for (i=0; i<bam_line->core.n_cigar; i++)
+ {
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 )
+ stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 )
+ readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT;
+ }
}
stats->nbases_mapped += seq_len;
if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
round_buffer_flush(stats,-1);
- // Mismatches per cycle and GC-depth graph
+ // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
+ // are not splitted which results in up to seq_len-1 overlaps. The default bin size is
+ // 20kbp, so the effect is negligible.
if ( stats->fai )
{
- // First pass, new chromosome or sequence spanning beyond the end of the buffer
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
+ int inc_ref = 0, inc_gcd = 0;
+ // First pass or new chromosome
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
+ // Read goes beyond the end of the rseq buffer
+ else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ // Read overlaps the next gcd bin
+ else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen )
+ {
+ inc_gcd = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
+ }
+ if ( inc_gcd )
{
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
-
- // Get the reference GC content for this bin. Note that in the current implementation
- // the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the
- // expected read length only ~100bp, it shouldn't really matter.
- stats->gcd_pos = bam_line->core.pos;
stats->igcd++;
-
if ( stats->igcd >= stats->ngcd )
- error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
-
- stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
+ realloc_gcd_buffer(stats, readlen);
+ if ( inc_ref )
+ read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ stats->gcd_pos = bam_line->core.pos;
+ stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
}
+
count_mismatches_per_cycle(stats,bam_line);
}
+ // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
{
// First pass or a new chromosome
stats->tid = bam_line->core.tid;
stats->gcd_pos = bam_line->core.pos;
stats->igcd++;
-
if ( stats->igcd >= stats->ngcd )
- {
- uint32_t n = 2*(1 + stats->ngcd);
- stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
- if ( !stats->gcd )
- error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
- memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
- }
+ realloc_gcd_buffer(stats, readlen);
}
-
stats->gcd[ stats->igcd ].depth++;
// When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
if ( !stats->fai )
// Calculate average insert size and standard deviation (from the main bulk data only)
int isize, ibulk=0;
uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
- for (isize=1; isize<stats->nisize; isize++)
+ for (isize=0; isize<stats->nisize; isize++)
{
// Each pair was counted twice
stats->isize_inward[isize] *= 0.5;
}
double bulk=0, avg_isize=0, sd_isize=0;
- for (isize=1; isize<stats->nisize; isize++)
+ for (isize=0; isize<stats->nisize; isize++)
{
bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
printf(" %s",stats->argv[i]);
printf("\n");
printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+ printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
+ printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
+ printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+ printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);
printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+ printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
int ibase,iqual;
if ( stats->max_len<stats->nbases ) stats->max_len++;
printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
}
printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
- for (isize=1; isize<ibulk; isize++)
+ for (isize=0; isize<ibulk; isize++)
printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
(long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]);
}
- printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions, number of deletions\n");
- for (ilen=0; ilen<stats->nbases; ilen++)
+ printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
+ for (ilen=0; ilen<=stats->nbases; ilen++)
{
- if ( stats->ins_cycles[ilen]>0 || stats->del_cycles[ilen]>0 )
- printf("IC\t%d\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles[ilen], (long)stats->del_cycles[ilen]);
+ // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
+ // the index of the cycle of the first inserted base (also 1-based)
+ if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
+ printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
}
printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
- printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+ if ( stats->cov[0] )
+ printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
int icov;
for (icov=1; icov<stats->ncov-1; icov++)
- printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
- printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
-
+ if ( stats->cov[icov] )
+ printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+ if ( stats->cov[stats->ncov-1] )
+ printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
// Calculate average GC content, then sort by GC and depth
printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
}
}
-void bam_init_header_hash(bam_header_t *header);
+size_t mygetline(char **line, size_t *n, FILE *fp)
+{
+ if (line == NULL || n == NULL || fp == NULL)
+ {
+ errno = EINVAL;
+ return -1;
+ }
+ if (*n==0 || !*line)
+ {
+ *line = NULL;
+ *n = 0;
+ }
+
+ size_t nread=0;
+ int c;
+ while ((c=getc(fp))!= EOF && c!='\n')
+ {
+ if ( ++nread>=*n )
+ {
+ *n += 255;
+ *line = realloc(*line, sizeof(char)*(*n));
+ }
+ (*line)[nread-1] = c;
+ }
+ if ( nread>=*n )
+ {
+ *n += 255;
+ *line = realloc(*line, sizeof(char)*(*n));
+ }
+ (*line)[nread] = 0;
+ return nread>0 ? nread : -1;
+
+}
void init_regions(stats_t *stats, char *file)
{
khiter_t iter;
- khash_t(str) *header_hash;
+ khash_t(kh_bam_tid) *header_hash;
bam_init_header_hash(stats->sam->header);
- header_hash = (khash_t(str)*)stats->sam->header->hash;
+ header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash;
FILE *fp = fopen(file,"r");
if ( !fp ) error("%s: %s\n",file,strerror(errno));
ssize_t nread;
int warned = 0;
int prev_tid=-1, prev_pos=-1;
- while ((nread = getline(&line, &len, fp)) != -1)
+ while ((nread = mygetline(&line, &len, fp)) != -1)
{
if ( line[0] == '#' ) continue;
int i = 0;
while ( i<nread && !isspace(line[i]) ) i++;
- if ( i>=nread ) error("Could not parse the file: %s\n", file);
+ if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line);
line[i] = 0;
- iter = kh_get(str, header_hash, line);
+ iter = kh_get(kh_bam_tid, header_hash, line);
int tid = kh_val(header_hash, iter);
if ( iter == kh_end(header_hash) )
{
if ( !warned )
- fprintf(stderr,"Warning: Some sequences not present in the BAM (%s)\n", line);
+ fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line);
warned = 1;
continue;
}
stats->regions[tid].npos++;
}
if (line) free(line);
+ if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n");
fclose(fp);
}
return 1;
}
+void reset_regions(stats_t *stats)
+{
+ int i;
+ for (i=0; i<stats->nregions; i++)
+ stats->regions[i].cpos = 0;
+}
+
+int is_in_regions(bam1_t *bam_line, stats_t *stats)
+{
+ if ( !stats->regions ) return 1;
+
+ if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0;
+ if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
+
+ regions_t *reg = &stats->regions[bam_line->core.tid];
+ if ( reg->cpos==reg->npos ) return 0; // done for this chr
+
+ // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
+ // even small overlap is enough to include the read in the stats.
+ int i = reg->cpos;
+ while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
+ if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; }
+ if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0;
+ reg->cpos = i;
+ stats->reg_from = reg->pos[i].from;
+ stats->reg_to = reg->pos[i].to;
+
+ return 1;
+}
+
+void init_group_id(stats_t *stats, char *id)
+{
+ if ( !stats->sam->header->dict )
+ stats->sam->header->dict = sam_header_parse2(stats->sam->header->text);
+ void *iter = stats->sam->header->dict;
+ const char *key, *val;
+ int n = 0;
+ stats->rg_hash = kh_init(kh_rg);
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
+ {
+ if ( !strcmp(id,key) || (val && !strcmp(id,val)) )
+ {
+ khiter_t k = kh_get(kh_rg, stats->rg_hash, key);
+ if ( k != kh_end(stats->rg_hash) )
+ fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key);
+ int ret;
+ k = kh_put(kh_rg, stats->rg_hash, key, &ret);
+ kh_value(stats->rg_hash, k) = val;
+ n++;
+ }
+ }
+ if ( !n )
+ error("The sample or read group \"%s\" not present.\n", id);
+}
+
void error(const char *format, ...)
{
printf("Options:\n");
printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n");
printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n");
+ printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n");
+ printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n");
+ printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
printf(" -h, --help This help message\n");
printf(" -i, --insert-size <int> Maximum insert size [8000]\n");
+ printf(" -I, --id <string> Include only listed read group or sample name\n");
printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n");
printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n");
printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n");
{
char *targets = NULL;
char *bam_fname = NULL;
+ char *group_id = NULL;
samfile_t *sam = NULL;
char in_mode[5];
stats_t *stats = calloc(1,sizeof(stats_t));
stats->ngc = 200;
- stats->nquals = 95;
+ stats->nquals = 256;
stats->nbases = 300;
stats->nisize = 8000;
stats->max_len = 30;
stats->max_qual = 40;
stats->isize_main_bulk = 0.99; // There are always outliers at the far end
- stats->gcd_bin_size = 20000;
- stats->ngcd = 3e5; // 300k of 20k bins is enough to hold a genome 6Gbp big
- stats->nref_seq = stats->gcd_bin_size;
+ stats->gcd_bin_size = 20e3;
+ stats->gcd_ref_size = 4.2e9;
stats->rseq_pos = -1;
stats->tid = stats->gcd_pos = -1;
+ stats->igcd = 0;
stats->is_sorted = 1;
stats->cov_min = 1;
stats->cov_max = 1000;
{"help",0,0,'h'},
{"remove-dups",0,0,'d'},
{"sam",0,0,'s'},
- {"ref-seq",0,0,'r'},
- {"coverage",0,0,'c'},
- {"read-length",0,0,'l'},
- {"insert-size",0,0,'i'},
- {"most-inserts",0,0,'m'},
- {"trim-quality",0,0,'q'},
+ {"ref-seq",1,0,'r'},
+ {"coverage",1,0,'c'},
+ {"read-length",1,0,'l'},
+ {"insert-size",1,0,'i'},
+ {"most-inserts",1,0,'m'},
+ {"trim-quality",1,0,'q'},
{"target-regions",0,0,'t'},
+ {"required-flag",1,0,'f'},
+ {"filtering-flag",0,0,'F'},
+ {"id",1,0,'I'},
+ {"GC-depth",1,0,1},
{0,0,0,0}
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 )
{
switch (opt)
{
- case 'd': stats->rmdup=1; break;
+ case 'f': stats->flag_require=strtol(optarg,0,0); break;
+ case 'F': stats->flag_filter=strtol(optarg,0,0); break;
+ case 'd': stats->flag_filter|=BAM_FDUP; break;
case 's': strcpy(in_mode, "r"); break;
case 'r': stats->fai = fai_load(optarg);
if (stats->fai==0)
error("Could not load faidx: %s\n", optarg);
break;
+ case 1 : {
+ float flen,fbin;
+ if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 )
+ error("Unable to parse --GC-depth %s\n", optarg);
+ stats->gcd_bin_size = fbin;
+ stats->gcd_ref_size = flen;
+ }
+ break;
case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 )
error("Unable to parse -c %s\n", optarg);
break;
case 'm': stats->isize_main_bulk = atof(optarg); break;
case 'q': stats->trim_qual = atoi(optarg); break;
case 't': targets = optarg; break;
+ case 'I': group_id = optarg; break;
case '?':
case 'h': error(NULL);
default: error("Unknown argument: %s\n", optarg);
if ((sam = samopen(bam_fname, in_mode, NULL)) == 0)
error("Failed to open: %s\n", bam_fname);
stats->sam = sam;
+ if ( group_id ) init_group_id(stats, group_id);
bam1_t *bam_line = bam_init1();
// .. arrays
- stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
- stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
- stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
- stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
- stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
- stats->rseq_buf = calloc(stats->nref_seq,sizeof(uint8_t));
- stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
- stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
- stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
- stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
- stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
- stats->ins_cycles = calloc(stats->nbases,sizeof(uint64_t));
- stats->del_cycles = calloc(stats->nbases,sizeof(uint64_t));
+ stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
+ stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
+ stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
+ stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t));
+ stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t));
+ stats->isize_other = calloc(stats->nisize,sizeof(uint64_t));
+ stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
+ stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
+ stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t));
+ stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t));
+ stats->insertions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->deletions = calloc(stats->nbases,sizeof(uint64_t));
+ stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
+ stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+ realloc_rseq_buffer(stats);
if ( targets )
init_regions(stats, targets);
int tid, beg, end;
bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end);
if ( tid < 0 ) continue;
+ reset_regions(stats);
bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read);
}
bam_index_destroy(bam_idx);
{
// Stream through the entire BAM ignoring off-target regions if -t is given
while (samread(sam,bam_line) >= 0)
- {
- if ( stats->regions )
- {
- if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) continue;
- if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n");
-
- regions_t *reg = &stats->regions[bam_line->core.tid];
- if ( reg->cpos==reg->npos ) continue; // done for this chr
-
- // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered,
- // even small overlap is enough to include the read in the stats.
- int i = reg->cpos;
- while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++;
- if ( i>=reg->npos ) { reg->cpos = reg->npos; continue; }
- if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) continue;
- reg->cpos = i;
- }
collect_stats(bam_line,stats);
- }
}
round_buffer_flush(stats,-1);
free(stats->read_lengths);
free(stats->insertions);
free(stats->deletions);
- free(stats->ins_cycles);
- free(stats->del_cycles);
+ free(stats->ins_cycles_1st);
+ free(stats->ins_cycles_2nd);
+ free(stats->del_cycles_1st);
+ free(stats->del_cycles_2nd);
destroy_regions(stats);
free(stats);
+ if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
- return 0;
+ return 0;
}